palimpzest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.0.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  import math
4
5
 
5
6
  # NOTE: the answer.mode() call(s) inside of _est_quality() throw a UserWarning when there are multiple
@@ -12,11 +13,9 @@ import warnings
12
13
  from typing import Any
13
14
 
14
15
  import pandas as pd
15
- import scipy.stats as stats
16
16
 
17
17
  from palimpzest.constants import MODEL_CARDS, NAIVE_BYTES_PER_RECORD, GPT_4o_MODEL_CARD, Model
18
- from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost, RecordOpStats
19
- from palimpzest.core.elements.records import DataRecordSet
18
+ from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost, RecordOpStats, SentinelPlanStats
20
19
  from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
21
20
  from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvert
22
21
  from palimpzest.query.operators.convert import LLMConvert
@@ -25,12 +24,13 @@ from palimpzest.query.operators.limit import LimitScanOp
25
24
  from palimpzest.query.operators.physical import PhysicalOperator
26
25
  from palimpzest.query.operators.rag_convert import RAGConvert
27
26
  from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
28
- from palimpzest.query.operators.token_reduction_convert import TokenReducedConvert
29
- from palimpzest.query.optimizer.plan import SentinelPlan
27
+ from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
30
28
  from palimpzest.utils.model_helpers import get_champion_model_name, get_models
31
29
 
32
30
  warnings.simplefilter(action='ignore', category=UserWarning)
33
31
 
32
+ logger = logging.getLogger(__name__)
33
+
34
34
  class BaseCostModel:
35
35
  """
36
36
  This base class contains the interface/abstraction that every CostModel must implement
@@ -64,14 +64,13 @@ class SampleBasedCostModel:
64
64
  """
65
65
  def __init__(
66
66
  self,
67
- sentinel_plan: SentinelPlan,
68
- execution_data: dict[str, dict[str, list[DataRecordSet]]],
67
+ sentinel_plan_stats: SentinelPlanStats,
69
68
  verbose: bool = False,
70
69
  exp_name: str | None = None,
71
70
  ):
72
- # store sentinel plan
73
- self.sentinel_plan = sentinel_plan
74
-
71
+ """
72
+ execution_data is: {logical_op_id: {physical_op_id: [DataRecordSet]}}
73
+ """
75
74
  # store verbose argument
76
75
  self.verbose = verbose
77
76
 
@@ -79,7 +78,7 @@ class SampleBasedCostModel:
79
78
  self.exp_name = exp_name
80
79
 
81
80
  # construct cost, time, quality, and selectivity matrices for each operator set;
82
- self.operator_to_stats = self.compute_operator_stats(execution_data)
81
+ self.operator_to_stats = self.compute_operator_stats(sentinel_plan_stats)
83
82
 
84
83
  # compute set of costed physical op ids from operator_to_stats
85
84
  self.costed_phys_op_ids = set([
@@ -88,30 +87,23 @@ class SampleBasedCostModel:
88
87
  for phys_op_id, _ in phys_op_id_to_stats.items()
89
88
  ])
90
89
 
90
+ logger.info(f"Initialized SampleBasedCostModel with verbose={self.verbose}")
91
+ logger.debug(f"Initialized SampleBasedCostModel with params: {self.__dict__}")
91
92
 
92
93
  def get_costed_phys_op_ids(self):
93
94
  return self.costed_phys_op_ids
94
95
 
95
-
96
- def compute_operator_stats(
97
- self,
98
- execution_data: dict[str, dict[str, list[DataRecordSet]]],
99
- ):
96
+ def compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats) -> dict:
97
+ logger.debug("Computing operator statistics")
100
98
  # flatten the nested dictionary of execution data and pull out fields relevant to cost estimation
101
99
  execution_record_op_stats = []
102
- for idx, (logical_op_id, _, _) in enumerate(self.sentinel_plan):
103
- # initialize variables
104
- upstream_logical_op_id = self.sentinel_plan.logical_op_ids[idx - 1] if idx > 0 else None
105
-
106
- # filter for the execution data from this operator set
107
- op_set_execution_data = execution_data[logical_op_id]
108
-
100
+ for logical_op_id, phys_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
101
+ logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
109
102
  # flatten the execution data into a list of RecordOpStats
110
103
  op_set_execution_data = [
111
104
  record_op_stats
112
- for _, record_sets in op_set_execution_data.items()
113
- for record_set in record_sets
114
- for record_op_stats in record_set.record_op_stats
105
+ for _, op_stats in phys_op_id_to_op_stats.items()
106
+ for record_op_stats in op_stats.record_op_stats_lst
115
107
  ]
116
108
 
117
109
  # add entries from execution data into matrices
@@ -119,7 +111,6 @@ class SampleBasedCostModel:
119
111
  record_op_stats_dict = {
120
112
  "logical_op_id": logical_op_id,
121
113
  "physical_op_id": record_op_stats.op_id,
122
- "upstream_logical_op_id": upstream_logical_op_id,
123
114
  "record_id": record_op_stats.record_id,
124
115
  "record_parent_id": record_op_stats.record_parent_id,
125
116
  "cost_per_record": record_op_stats.cost_per_record,
@@ -138,21 +129,19 @@ class SampleBasedCostModel:
138
129
  # for each physical_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
139
130
  operator_to_stats = {}
140
131
  for logical_op_id, logical_op_df in operator_stats_df.groupby("logical_op_id"):
132
+ logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
141
133
  operator_to_stats[logical_op_id] = {}
142
134
 
143
- # get the logical_op_id of the upstream operator
144
- upstream_logical_op_ids = logical_op_df.upstream_logical_op_id.unique()
145
- assert len(upstream_logical_op_ids) == 1, "More than one upstream logical_op_id"
146
- upstream_logical_op_id = upstream_logical_op_ids[0]
147
-
148
135
  for physical_op_id, physical_op_df in logical_op_df.groupby("physical_op_id"):
149
- # find set of parent records for this operator
150
- num_upstream_records = len(physical_op_df.record_parent_id.unique())
136
+ # compute the number of input records processed by this operator; use source_idx for scan operator(s)
137
+ num_source_records = (
138
+ len(physical_op_df.record_parent_id.unique())
139
+ if not physical_op_df.record_parent_id.isna().all()
140
+ else len(physical_op_df.source_idx.unique())
141
+ )
151
142
 
152
143
  # compute selectivity
153
- selectivity = (
154
- 1.0 if upstream_logical_op_id is None else physical_op_df.passed_operator.sum() / num_upstream_records
155
- )
144
+ selectivity = physical_op_df.passed_operator.sum() / num_source_records
156
145
 
157
146
  operator_to_stats[logical_op_id][physical_op_id] = {
158
147
  "cost": physical_op_df.cost_per_record.mean(),
@@ -165,6 +154,7 @@ class SampleBasedCostModel:
165
154
  if self.exp_name is not None:
166
155
  operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
167
156
 
157
+ logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
168
158
  return operator_to_stats
169
159
 
170
160
 
@@ -176,7 +166,10 @@ class SampleBasedCostModel:
176
166
  # look up physical and logical op ids associated with this physical operator
177
167
  phys_op_id = operator.get_op_id()
178
168
  logical_op_id = operator.logical_op_id
179
- assert self.operator_to_stats.get(logical_op_id).get(phys_op_id) is not None, f"No execution data for {str(operator)}"
169
+ physical_op_to_stats = self.operator_to_stats.get(logical_op_id)
170
+ assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
171
+ assert physical_op_to_stats.get(phys_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
172
+ logger.debug(f"Calling __call__ for {str(operator)}")
180
173
 
181
174
  # look up stats for this operation
182
175
  est_cost_per_record = self.operator_to_stats[logical_op_id][phys_op_id]["cost"]
@@ -210,7 +203,10 @@ class SampleBasedCostModel:
210
203
  op_quality = op_estimates.quality
211
204
 
212
205
  # construct and return op estimates
213
- return PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
206
+ plan_cost = PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
207
+ logger.debug(f"Done calling __call__ for {str(operator)}")
208
+ logger.debug(f"Plan cost: {plan_cost}")
209
+ return plan_cost
214
210
 
215
211
 
216
212
  class CostModel(BaseCostModel):
@@ -223,7 +219,6 @@ class CostModel(BaseCostModel):
223
219
  self,
224
220
  sample_execution_data: list[RecordOpStats] | None = None,
225
221
  available_models: list[Model] | None = None,
226
- confidence_level: float = 0.90,
227
222
  ) -> None:
228
223
  if sample_execution_data is None:
229
224
  sample_execution_data = []
@@ -242,107 +237,54 @@ class CostModel(BaseCostModel):
242
237
  # set available models
243
238
  self.available_models = available_models
244
239
 
245
- # set confidence level for CI estimates
246
- self.conf_level = confidence_level
247
-
248
240
  # compute per-operator estimates
249
241
  self.operator_estimates = self._compute_operator_estimates()
250
242
 
251
243
  # compute set of costed physical op ids from operator_to_stats
252
244
  self.costed_phys_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
245
+ logger.info("Initialized CostModel.")
246
+ logger.debug(f"Initialized CostModel with params: {self.__dict__}")
253
247
 
254
248
  def get_costed_phys_op_ids(self):
255
249
  return self.costed_phys_op_ids
256
250
 
257
- def _compute_ci(self, sample_mean: float, n_samples: int, std_dev: float) -> tuple[float, float]:
251
+ def _compute_mean(self, df: pd.DataFrame, col: str, model_name: str | None = None) -> float:
258
252
  """
259
- Compute confidence interval (for non-proportion quantities) given the sample mean, number of samples,
260
- and sample std. deviation at the CostModel's given confidence level. We use a t-distribution for
261
- computing the interval as many sample estimates in PZ may have few samples.
262
- """
263
- ci = stats.t.interval(
264
- confidence=self.conf_level, # Confidence level
265
- df=n_samples - 1, # Degrees of freedom
266
- loc=sample_mean, # Sample mean
267
- scale=std_dev, # Standard deviation estimate
268
- )
269
- return ci
270
-
271
- def _compute_proportion_ci(self, sample_prop: float, n_samples: int) -> tuple[float, float]:
272
- """
273
- Compute confidence interval for proportion quantities (i.e. selectivity) given the sample proportion
274
- and the number of samples. We use the normal distribution for computing the interval here, for reasons
275
- summarized by this post: https://stats.stackexchange.com/a/411727.
276
- """
277
- if sample_prop == 0.0 or sample_prop == 1.0:
278
- return (sample_prop, sample_prop)
279
-
280
- scaling_factor = math.sqrt((sample_prop * (1 - sample_prop)) / n_samples)
281
- lower_bound, upper_bound = stats.norm.interval(
282
- confidence=self.conf_level, # Confidence level
283
- loc=sample_prop, # Sample proportion
284
- scale=scaling_factor, # Scaling factor
285
- )
286
- lower_bound = max(lower_bound, 0.0)
287
- upper_bound = max(upper_bound, 1.0)
288
-
289
- return (lower_bound, upper_bound)
290
-
291
- def _compute_mean_and_ci(self, df: pd.DataFrame, col: str, model_name: str | None = None, non_negative_lb: bool = False) -> tuple[float, float, float]:
292
- """
293
- Compute the mean and CI for the given column and dataframe. If the model_name is provided, filter
253
+ Compute the mean for the given column and dataframe. If the model_name is provided, filter
294
254
  for the subset of rows belonging to the model.
295
255
  """
296
256
  # use model-specific estimate if possible
297
257
  if model_name is not None:
298
258
  model_df = df[df.model_name == model_name]
299
259
  if not model_df.empty:
300
- col_mean = model_df[col].mean()
301
- col_lb, col_ub = self._compute_ci(
302
- sample_mean=col_mean,
303
- n_samples=model_df[col].notna().sum(),
304
- std_dev=model_df[col].std(),
305
- )
306
- if non_negative_lb:
307
- col_lb = max(col_lb, 0.0)
260
+ return model_df[col].mean()
308
261
 
309
- return col_mean, col_lb, col_ub
310
-
311
- # compute aggregate
312
- col_mean = df[col].mean()
313
- col_lb, col_ub = self._compute_ci(
314
- sample_mean=col_mean,
315
- n_samples=df[col].notna().sum(),
316
- std_dev=df[col].std(),
317
- )
318
- if non_negative_lb:
319
- col_lb = max(col_lb, 0.0)
320
-
321
- return col_mean, col_lb, col_ub
262
+ # compute aggregate mean across all models
263
+ return df[col].mean()
322
264
 
323
265
  def _est_time_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
324
266
  """
325
267
  Given sample cost data observations for a specific operation, compute the mean and CI
326
268
  for the time per record.
327
269
  """
328
- return self._compute_mean_and_ci(df=op_df, col="time_per_record", model_name=model_name, non_negative_lb=True)
270
+ return self._compute_mean(df=op_df, col="time_per_record", model_name=model_name)
329
271
 
330
272
  def _est_cost_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
331
273
  """
332
274
  Given sample cost data observations for a specific operation, compute the mean and CI
333
275
  for the cost per record.
334
276
  """
335
- return self._compute_mean_and_ci(df=op_df, col="cost_per_record", model_name=model_name, non_negative_lb=True)
277
+ return self._compute_mean(df=op_df, col="cost_per_record", model_name=model_name)
336
278
 
337
- def _est_tokens_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[tuple[float, float, float], tuple[float, float, float]]:
279
+ def _est_tokens_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float]:
338
280
  """
339
281
  Given sample cost data observations for a specific operation, compute the mean and CI
340
282
  for the total input tokens and total output tokens.
341
283
  """
342
- total_input_tokens_tuple = self._compute_mean_and_ci(df=op_df, col="total_input_tokens", model_name=model_name, non_negative_lb=True)
343
- total_output_tokens_tuple = self._compute_mean_and_ci(df=op_df, col="total_output_tokens", model_name=model_name, non_negative_lb=True)
284
+ total_input_tokens = self._compute_mean(df=op_df, col="total_input_tokens", model_name=model_name)
285
+ total_output_tokens = self._compute_mean(df=op_df, col="total_output_tokens", model_name=model_name)
344
286
 
345
- return total_input_tokens_tuple, total_output_tokens_tuple
287
+ return total_input_tokens, total_output_tokens
346
288
 
347
289
  def _est_cardinality(self, op_df: pd.DataFrame, model_name: str | None = None) -> float:
348
290
  """
@@ -382,18 +324,8 @@ class CostModel(BaseCostModel):
382
324
  plan_ids = model_op_df.plan_id.unique().tolist()
383
325
  num_output_records = df[df.source_op_id.isin(op_ids) & df.plan_id.isin(plan_ids)].shape[0]
384
326
 
385
- # estimate the selectivity / fan-out and compute bounds
386
- est_selectivity = num_output_records / num_input_records
387
- if is_filter_op:
388
- est_selectivity_lb, est_selectivity_ub = self._compute_proportion_ci(est_selectivity, n_samples=num_input_records)
389
-
390
- # for now, if we are doing a convert operation w/fan-out then the assumptions of _compute_proportion_ci
391
- # do not hold; until we have a better method for estimating bounds, just set them to the estimate
392
- else:
393
- est_selectivity_lb = est_selectivity
394
- est_selectivity_ub = est_selectivity
395
-
396
- return est_selectivity, est_selectivity_lb, est_selectivity_ub
327
+ # estimate the selectivity / fan-out
328
+ return num_output_records / num_input_records
397
329
 
398
330
  # otherwise average selectivity across all ops
399
331
  num_input_records = op_df.shape[0]
@@ -406,18 +338,8 @@ class CostModel(BaseCostModel):
406
338
  op_ids = op_df.op_id.unique().tolist()
407
339
  num_output_records = df[df.source_op_id.isin(op_ids)].shape[0]
408
340
 
409
- # estimate the selectivity / fan-out and compute bounds
410
- est_selectivity = num_output_records / num_input_records
411
- if is_filter_op:
412
- est_selectivity_lb, est_selectivity_ub = self._compute_proportion_ci(est_selectivity, n_samples=num_input_records)
413
-
414
- # for now, if we are doing a convert operation w/fan-out then the assumptions of _compute_proportion_ci
415
- # do not hold; until we have a better method for estimating bounds, just set them to the estimate
416
- else:
417
- est_selectivity_lb = est_selectivity
418
- est_selectivity_ub = est_selectivity
419
-
420
- return est_selectivity, est_selectivity_lb, est_selectivity_ub
341
+ # estimate the selectivity / fan-out
342
+ return num_output_records / num_input_records
421
343
 
422
344
  def _compute_quality(self, row):
423
345
  # compute accuracy for filter
@@ -491,10 +413,7 @@ class CostModel(BaseCostModel):
491
413
  total_answers = model_df.num_answers.sum() if not model_df.empty else op_df.num_answers.sum()
492
414
  est_quality = num_correct / total_answers
493
415
 
494
- # compute CI on the proportion of correct answers
495
- est_quality_lb, est_quality_ub = self._compute_proportion_ci(est_quality, n_samples=total_answers)
496
-
497
- return est_quality, est_quality_lb, est_quality_ub
416
+ return est_quality
498
417
 
499
418
  def _compute_operator_estimates(self) -> dict[str, Any] | None:
500
419
  """
@@ -532,64 +451,36 @@ class CostModel(BaseCostModel):
532
451
  # model_names = op_df.model_name.unique().tolist()
533
452
  estimates = {model_name: None for model_name in model_names}
534
453
  for model_name in model_names:
535
- time_per_record, time_per_record_lb, time_per_record_ub = self._est_time_per_record(op_df, model_name=model_name)
536
- cost_per_record, cost_per_record_lb, cost_per_record_ub = self._est_cost_per_record(op_df, model_name=model_name)
537
- input_tokens_tup, output_tokens_tup = self._est_tokens_per_record(op_df, model_name=model_name)
538
- selectivity, selectivity_lb, selectivity_ub = self._est_selectivity(self.sample_execution_data_df, op_df, model_name=model_name)
539
- quality, quality_lb, quality_ub = self._est_quality(op_df, model_name=model_name)
540
-
454
+ time_per_record = self._est_time_per_record(op_df, model_name=model_name)
455
+ cost_per_record = self._est_cost_per_record(op_df, model_name=model_name)
456
+ input_tokens, output_tokens = self._est_tokens_per_record(op_df, model_name=model_name)
457
+ selectivity = self._est_selectivity(self.sample_execution_data_df, op_df, model_name=model_name)
458
+ quality = self._est_quality(op_df, model_name=model_name)
459
+
541
460
  model_estimates = {
542
461
  "time_per_record": time_per_record,
543
- "time_per_record_lower_bound": time_per_record_lb,
544
- "time_per_record_upper_bound": time_per_record_ub,
545
462
  "cost_per_record": cost_per_record,
546
- "cost_per_record_lower_bound": cost_per_record_lb,
547
- "cost_per_record_upper_bound": cost_per_record_ub,
548
- "total_input_tokens": input_tokens_tup[0],
549
- "total_input_tokens_lower_bound": input_tokens_tup[1],
550
- "total_input_tokens_upper_bound": input_tokens_tup[2],
551
- "total_output_tokens": output_tokens_tup[0],
552
- "total_output_tokens_lower_bound": output_tokens_tup[1],
553
- "total_output_tokens_upper_bound": output_tokens_tup[2],
463
+ "total_input_tokens": input_tokens,
464
+ "total_output_tokens": output_tokens,
554
465
  "selectivity": selectivity,
555
- "selectivity_lower_bound": selectivity_lb,
556
- "selectivity_upper_bound": selectivity_ub,
557
466
  "quality": quality,
558
- "quality_lower_bound": quality_lb,
559
- "quality_upper_bound": quality_ub,
560
467
  }
561
468
  estimates[model_name] = model_estimates
562
469
 
563
470
  # TODO pre-compute lists of op_names in groups
564
471
  elif op_name in ["NonLLMFilter"]:
565
- time_per_record, time_per_record_lb, time_per_record_ub = self._est_time_per_record(op_df)
566
- selectivity, selectivity_lb, selectivity_ub = self._est_selectivity(self.sample_execution_data_df, op_df)
567
- estimates = {
568
- "time_per_record": time_per_record,
569
- "time_per_record_lower_bound": time_per_record_lb,
570
- "time_per_record_upper_bound": time_per_record_ub,
571
- "selectivity": selectivity,
572
- "selectivity_lower_bound": selectivity_lb,
573
- "selectivity_upper_bound": selectivity_ub,
574
- }
472
+ time_per_record = self._est_time_per_record(op_df)
473
+ selectivity = self._est_selectivity(self.sample_execution_data_df, op_df)
474
+ estimates = {"time_per_record": time_per_record, "selectivity": selectivity}
575
475
 
576
476
  elif op_name in ["MarshalAndScanDataOp", "CacheScanDataOp", "LimitScanOp", "CountAggregateOp", "AverageAggregateOp"]:
577
- time_per_record, time_per_record_lb, time_per_record_ub = self._est_time_per_record(op_df)
578
- estimates = {
579
- "time_per_record": time_per_record,
580
- "time_per_record_lower_bound": time_per_record_lb,
581
- "time_per_record_upper_bound": time_per_record_ub,
582
- }
477
+ time_per_record = self._est_time_per_record(op_df)
478
+ estimates = {"time_per_record": time_per_record}
583
479
 
584
480
  elif op_name in ["ApplyGroupByOp"]:
585
- time_per_record, time_per_record_lb, time_per_record_ub = self._est_time_per_record(op_df)
481
+ time_per_record = self._est_time_per_record(op_df)
586
482
  cardinality = self._est_cardinality(op_df)
587
- estimates = {
588
- "time_per_record": time_per_record,
589
- "time_per_record_lower_bound": time_per_record_lb,
590
- "time_per_record_upper_bound": time_per_record_ub,
591
- "cardinality": cardinality,
592
- }
483
+ estimates = {"time_per_record": time_per_record, "cardinality": cardinality}
593
484
 
594
485
  operator_estimates[op_id] = estimates
595
486
 
@@ -598,6 +489,7 @@ class CostModel(BaseCostModel):
598
489
  def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
599
490
  # get identifier for operation which is unique within sentinel plan but consistent across sentinels
600
491
  op_id = operator.get_op_id()
492
+ logger.debug(f"Calling __call__ for {str(operator)} with op_id: {op_id}")
601
493
 
602
494
  # initialize estimates of operator metrics based on naive (but sometimes precise) logic
603
495
  if isinstance(operator, MarshalAndScanDataOp):
@@ -633,8 +525,6 @@ class CostModel(BaseCostModel):
633
525
  if sample_op_estimates is not None and op_id in sample_op_estimates:
634
526
  if isinstance(operator, (MarshalAndScanDataOp, CacheScanDataOp)):
635
527
  op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
636
- op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
637
- op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
638
528
 
639
529
  elif isinstance(operator, ApplyGroupByOp):
640
530
  # NOTE: in theory we should also treat this cardinality est. as a random variable, but in practice we will
@@ -646,48 +536,24 @@ class CostModel(BaseCostModel):
646
536
  # actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
647
537
  # the input cardinality (where the initial input cardinality from the datareader is known).
648
538
  op_estimates.cardinality = sample_op_estimates[op_id]["cardinality"]
649
- op_estimates.cardinality_lower_bound = op_estimates.cardinality
650
- op_estimates.cardinality_upper_bound = op_estimates.cardinality
651
539
  op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
652
- op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
653
- op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
654
540
 
655
541
  elif isinstance(operator, (CountAggregateOp, AverageAggregateOp)): # noqa: SIM114
656
542
  op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
657
- op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
658
- op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
659
543
 
660
544
  elif isinstance(operator, LimitScanOp):
661
545
  op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
662
- op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
663
- op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
664
546
 
665
547
  elif isinstance(operator, NonLLMFilter):
666
548
  op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id]["selectivity"]
667
- op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id]["selectivity_lower_bound"]
668
- op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id]["selectivity_upper_bound"]
669
-
670
549
  op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
671
- op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
672
- op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
673
550
 
674
551
  elif isinstance(operator, LLMFilter):
675
552
  model_name = operator.model.value
676
553
  op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
677
- op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id][model_name]["selectivity_lower_bound"]
678
- op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id][model_name]["selectivity_upper_bound"]
679
-
680
554
  op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
681
- op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id][model_name]["time_per_record_lower_bound"]
682
- op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id][model_name]["time_per_record_upper_bound"]
683
-
684
555
  op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
685
- op_estimates.cost_per_record_lower_bound = sample_op_estimates[op_id][model_name]["cost_per_record_lower_bound"]
686
- op_estimates.cost_per_record_upper_bound = sample_op_estimates[op_id][model_name]["cost_per_record_upper_bound"]
687
-
688
556
  op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
689
- op_estimates.quality_lower_bound = sample_op_estimates[op_id][model_name]["quality_lower_bound"]
690
- op_estimates.quality_upper_bound = sample_op_estimates[op_id][model_name]["quality_upper_bound"]
691
557
 
692
558
  elif isinstance(operator, LLMConvert):
693
559
  # TODO: EVEN BETTER: do similarity match (e.g. largest param intersection, more exotic techniques);
@@ -696,60 +562,28 @@ class CostModel(BaseCostModel):
696
562
  # NOTE: code synthesis does not have a model attribute
697
563
  model_name = operator.model.value if hasattr(operator, "model") else None
698
564
  op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
699
- op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id][model_name]["selectivity_lower_bound"]
700
- op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id][model_name]["selectivity_upper_bound"]
701
-
702
565
  op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
703
- op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id][model_name]["time_per_record_lower_bound"]
704
- op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id][model_name]["time_per_record_upper_bound"]
705
-
706
566
  op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
707
- op_estimates.cost_per_record_lower_bound = sample_op_estimates[op_id][model_name]["cost_per_record_lower_bound"]
708
- op_estimates.cost_per_record_upper_bound = sample_op_estimates[op_id][model_name]["cost_per_record_upper_bound"]
709
-
710
567
  op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
711
- op_estimates.quality_lower_bound = sample_op_estimates[op_id][model_name]["quality_lower_bound"]
712
- op_estimates.quality_upper_bound = sample_op_estimates[op_id][model_name]["quality_upper_bound"]
713
568
 
714
569
  # NOTE: if code synth. fails, this will turn into ConventionalQuery calls to GPT-3.5,
715
570
  # which would wildly mess up estimate of time and cost per-record
716
571
  # do code synthesis adjustment
717
572
  if isinstance(operator, CodeSynthesisConvert):
718
573
  op_estimates.time_per_record = 1e-5
719
- op_estimates.time_per_record_lower_bound = op_estimates.time_per_record
720
- op_estimates.time_per_record_upper_bound = op_estimates.time_per_record
721
574
  op_estimates.cost_per_record = 1e-4
722
- op_estimates.cost_per_record_lower_bound = op_estimates.cost_per_record
723
- op_estimates.cost_per_record_upper_bound = op_estimates.cost_per_record
724
575
  op_estimates.quality = op_estimates.quality * (GPT_4o_MODEL_CARD["code"] / 100.0)
725
- op_estimates.quality_lower_bound = op_estimates.quality_lower_bound * (GPT_4o_MODEL_CARD["code"] / 100.0)
726
- op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * (GPT_4o_MODEL_CARD["code"] / 100.0)
727
576
 
728
577
  # token reduction adjustment
729
- if isinstance(operator, TokenReducedConvert):
578
+ if isinstance(operator, TokenReducedConvertBonded):
730
579
  total_input_tokens = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens"]
731
580
  total_output_tokens = sample_op_estimates[op_id][model_name]["total_output_tokens"]
732
581
  op_estimates.cost_per_record = (
733
582
  MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
734
583
  + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
735
584
  )
736
- total_input_tokens_lb = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens_lower_bound"]
737
- total_output_tokens_lb = sample_op_estimates[op_id][model_name]["total_output_tokens_lower_bound"]
738
- op_estimates.cost_per_record_lower_bound = (
739
- MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_lb
740
- + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_lb
741
- )
742
- total_input_tokens_ub = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens_upper_bound"]
743
- total_output_tokens_ub = sample_op_estimates[op_id][model_name]["total_output_tokens_upper_bound"]
744
- op_estimates.cost_per_record_upper_bound = (
745
- MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_ub
746
- + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_ub
747
- )
748
-
749
585
  op_estimates.quality = op_estimates.quality * math.sqrt(math.sqrt(operator.token_budget))
750
- op_estimates.quality_lower_bound = op_estimates.quality_lower_bound * math.sqrt(math.sqrt(operator.token_budget))
751
- op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * math.sqrt(math.sqrt(operator.token_budget))
752
-
586
+
753
587
  # rag convert adjustment
754
588
  if isinstance(operator, RAGConvert):
755
589
  total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
@@ -758,22 +592,7 @@ class CostModel(BaseCostModel):
758
592
  MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
759
593
  + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
760
594
  )
761
- total_input_tokens_lb = operator.num_chunks_per_field * operator.chunk_size
762
- total_output_tokens_lb = sample_op_estimates[op_id][model_name]["total_output_tokens_lower_bound"]
763
- op_estimates.cost_per_record_lower_bound = (
764
- MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_lb
765
- + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_lb
766
- )
767
- total_input_tokens_ub = operator.num_chunks_per_field * operator.chunk_size
768
- total_output_tokens_ub = sample_op_estimates[op_id][model_name]["total_output_tokens_upper_bound"]
769
- op_estimates.cost_per_record_upper_bound = (
770
- MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_ub
771
- + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_ub
772
- )
773
-
774
595
  op_estimates.quality = op_estimates.quality * operator.naive_quality_adjustment
775
- op_estimates.quality_lower_bound = op_estimates.quality_lower_bound * operator.naive_quality_adjustment
776
- op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * operator.naive_quality_adjustment
777
596
 
778
597
  else:
779
598
  raise Exception("Unknown operator")
@@ -783,26 +602,14 @@ class CostModel(BaseCostModel):
783
602
  op_cost = op_estimates.cost_per_record * source_op_estimates.cardinality
784
603
  op_quality = op_estimates.quality
785
604
 
786
- # compute bounds on total time and cost estimates for this operator
787
- op_cost_lower_bound = op_estimates.cost_per_record_lower_bound * source_op_estimates.cardinality_lower_bound
788
- op_cost_upper_bound = op_estimates.cost_per_record_upper_bound * source_op_estimates.cardinality_upper_bound
789
- op_time_lower_bound = op_estimates.time_per_record_lower_bound * source_op_estimates.cardinality_lower_bound
790
- op_time_upper_bound = op_estimates.time_per_record_upper_bound * source_op_estimates.cardinality_upper_bound
791
- op_quality_lower_bound = op_estimates.quality_lower_bound
792
- op_quality_upper_bound = op_estimates.quality_upper_bound
793
-
794
605
  # create and return PlanCost object for this op's statistics
795
606
  op_plan_cost = PlanCost(
796
607
  cost=op_cost,
797
608
  time=op_time,
798
609
  quality=op_quality,
799
610
  op_estimates=op_estimates,
800
- cost_lower_bound=op_cost_lower_bound,
801
- cost_upper_bound=op_cost_upper_bound,
802
- time_lower_bound=op_time_lower_bound,
803
- time_upper_bound=op_time_upper_bound,
804
- quality_lower_bound=op_quality_lower_bound,
805
- quality_upper_bound=op_quality_upper_bound,
806
611
  )
612
+ logger.debug(f"Done calling __call__ for {str(operator)} with op_id: {op_id}")
613
+ logger.debug(f"Plan cost: {op_plan_cost}")
807
614
 
808
615
  return op_plan_cost