palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.21.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
-
5
- # NOTE: the answer.mode() call(s) inside of _est_quality() throw a UserWarning when there are multiple
6
- # answers to a convert with the same mode. This is because pandas tries to sort the answers
7
- # before returning them, but since answer is a column of dicts the '<' operator fails on dicts.
8
- # For now, we can simply ignore the warning b/c we pick an answer at random anyways if there are
9
- # multiple w/the same count, but in the future we may want to cast the 'dict' --> 'str' or compute
10
- # the mode on a per-field basis.
11
4
  import warnings
12
- from typing import Any
13
5
 
14
6
  import pandas as pd
15
7
 
16
- from palimpzest.constants import MODEL_CARDS, NAIVE_BYTES_PER_RECORD, GPT_4o_MODEL_CARD, Model
17
- from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost, RecordOpStats, SentinelPlanStats
18
- from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
19
- from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvert
20
- from palimpzest.query.operators.convert import LLMConvert
21
- from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
22
- from palimpzest.query.operators.limit import LimitScanOp
8
+ from palimpzest.constants import NAIVE_BYTES_PER_RECORD
9
+ from palimpzest.core.models import OperatorCostEstimates, PlanCost, SentinelPlanStats
10
+ from palimpzest.query.operators.join import JoinOp
23
11
  from palimpzest.query.operators.physical import PhysicalOperator
24
- from palimpzest.query.operators.rag_convert import RAGConvert
25
- from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
26
- from palimpzest.utils.model_helpers import get_champion_model_name, get_models
12
+ from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp, ScanPhysicalOp
27
13
 
28
14
  warnings.simplefilter(action='ignore', category=UserWarning)
29
15
 
@@ -62,7 +48,7 @@ class SampleBasedCostModel:
62
48
  """
63
49
  def __init__(
64
50
  self,
65
- sentinel_plan_stats: SentinelPlanStats,
51
+ sentinel_plan_stats: SentinelPlanStats | None = None,
66
52
  verbose: bool = False,
67
53
  exp_name: str | None = None,
68
54
  ):
@@ -73,25 +59,34 @@ class SampleBasedCostModel:
73
59
  self.exp_name = exp_name
74
60
 
75
61
  # construct cost, time, quality, and selectivity matrices for each operator set;
76
- self.operator_to_stats = self.compute_operator_stats(sentinel_plan_stats)
77
- self.costed_full_op_ids = set([
62
+ self.operator_to_stats = self._compute_operator_stats(sentinel_plan_stats)
63
+ self.costed_full_op_ids = None if self.operator_to_stats is None else set([
78
64
  full_op_id
79
65
  for _, full_op_id_to_stats in self.operator_to_stats.items()
80
66
  for full_op_id in full_op_id_to_stats
81
67
  ])
82
68
 
69
+ # if there is a logical operator with no samples; add all of its op ids to costed_full_op_ids;
70
+ # this will lead to the cost model applying the naive cost estimates for all physical op ids
71
+ # in this logical operator (I think?)
72
+ # TODO
73
+
83
74
  logger.info(f"Initialized SampleBasedCostModel with verbose={self.verbose}")
84
75
  logger.debug(f"Initialized SampleBasedCostModel with params: {self.__dict__}")
85
76
 
86
77
  def get_costed_full_op_ids(self):
87
78
  return self.costed_full_op_ids
88
79
 
89
- def compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats) -> dict:
80
+ def _compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats | None) -> dict:
90
81
  logger.debug("Computing operator statistics")
82
+ # if no stats are provided, simply return None
83
+ if sentinel_plan_stats is None:
84
+ return None
85
+
91
86
  # flatten the nested dictionary of execution data and pull out fields relevant to cost estimation
92
87
  execution_record_op_stats = []
93
- for logical_op_id, full_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
94
- logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
88
+ for unique_logical_op_id, full_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
89
+ logger.debug(f"Computing operator statistics for logical_op_id: {unique_logical_op_id}")
95
90
  # flatten the execution data into a list of RecordOpStats
96
91
  op_set_execution_data = [
97
92
  record_op_stats
@@ -102,17 +97,17 @@ class SampleBasedCostModel:
102
97
  # add entries from execution data into matrices
103
98
  for record_op_stats in op_set_execution_data:
104
99
  record_op_stats_dict = {
105
- "logical_op_id": logical_op_id,
100
+ "unique_logical_op_id": unique_logical_op_id,
106
101
  "full_op_id": record_op_stats.full_op_id,
107
102
  "record_id": record_op_stats.record_id,
108
- "record_parent_id": record_op_stats.record_parent_id,
103
+ "record_parent_ids": record_op_stats.record_parent_ids,
109
104
  "cost_per_record": record_op_stats.cost_per_record,
110
105
  "time_per_record": record_op_stats.time_per_record,
111
106
  "quality": record_op_stats.quality,
112
107
  "passed_operator": record_op_stats.passed_operator,
113
- "source_idx": record_op_stats.record_source_idx, # TODO: remove
114
- "op_details": record_op_stats.op_details, # TODO: remove
115
- "answer": record_op_stats.answer, # TODO: remove
108
+ "source_indices": record_op_stats.record_source_indices, # TODO: remove
109
+ "op_details": record_op_stats.op_details, # TODO: remove
110
+ "answer": record_op_stats.answer, # TODO: remove
116
111
  }
117
112
  execution_record_op_stats.append(record_op_stats_dict)
118
113
 
@@ -121,22 +116,22 @@ class SampleBasedCostModel:
121
116
 
122
117
  # for each full_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
123
118
  operator_to_stats = {}
124
- for logical_op_id, logical_op_df in operator_stats_df.groupby("logical_op_id"):
125
- logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
126
- operator_to_stats[logical_op_id] = {}
119
+ for unique_logical_op_id, logical_op_df in operator_stats_df.groupby("unique_logical_op_id"):
120
+ logger.debug(f"Computing operator statistics for unique_logical_op_id: {unique_logical_op_id}")
121
+ operator_to_stats[unique_logical_op_id] = {}
127
122
 
128
123
  for full_op_id, physical_op_df in logical_op_df.groupby("full_op_id"):
129
- # compute the number of input records processed by this operator; use source_idx for scan operator(s)
124
+ # compute the number of input records processed by this operator; use source_indices for scan operator(s)
130
125
  num_source_records = (
131
- len(physical_op_df.record_parent_id.unique())
132
- if not physical_op_df.record_parent_id.isna().all()
133
- else len(physical_op_df.source_idx.unique())
126
+ physical_op_df.record_parent_ids.apply(tuple).nunique()
127
+ if not physical_op_df.record_parent_ids.isna().all()
128
+ else physical_op_df.source_indices.apply(tuple).nunique()
134
129
  )
135
130
 
136
131
  # compute selectivity
137
132
  selectivity = physical_op_df.passed_operator.sum() / num_source_records
138
133
 
139
- operator_to_stats[logical_op_id][full_op_id] = {
134
+ operator_to_stats[unique_logical_op_id][full_op_id] = {
140
135
  "cost": physical_op_df.cost_per_record.mean(),
141
136
  "time": physical_op_df.time_per_record.mean(),
142
137
  "quality": physical_op_df.quality.mean(),
@@ -150,347 +145,18 @@ class SampleBasedCostModel:
150
145
  logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
151
146
  return operator_to_stats
152
147
 
153
-
154
- def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
155
- # NOTE: some physical operators may not have any sample execution data in this cost model;
156
- # these physical operators are filtered out of the Optimizer, thus we can assume that
157
- # we will have execution data for each operator passed into __call__; nevertheless, we
158
- # still perform a sanity check
159
- # look up physical and logical op ids associated with this physical operator
160
- full_op_id = operator.get_full_op_id()
161
- logical_op_id = operator.logical_op_id
162
- physical_op_to_stats = self.operator_to_stats.get(logical_op_id)
163
- assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
164
- assert physical_op_to_stats.get(full_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
165
- logger.debug(f"Calling __call__ for {str(operator)}")
166
-
167
- # look up stats for this operation
168
- est_cost_per_record = self.operator_to_stats[logical_op_id][full_op_id]["cost"]
169
- est_time_per_record = self.operator_to_stats[logical_op_id][full_op_id]["time"]
170
- est_quality = self.operator_to_stats[logical_op_id][full_op_id]["quality"]
171
- est_selectivity = self.operator_to_stats[logical_op_id][full_op_id]["selectivity"]
172
-
173
- # create source_op_estimates for scan operators if they are not provided
174
- if isinstance(operator, ScanPhysicalOp):
175
- # get handle to scan operator and pre-compute its size (number of records)
176
- datareader_len = len(operator.datareader)
177
-
178
- source_op_estimates = OperatorCostEstimates(
179
- cardinality=datareader_len,
180
- time_per_record=0.0,
181
- cost_per_record=0.0,
182
- quality=1.0,
183
- )
184
-
185
- # generate new set of OperatorCostEstimates
186
- op_estimates = OperatorCostEstimates(
187
- cardinality=est_selectivity * source_op_estimates.cardinality,
188
- time_per_record=est_time_per_record,
189
- cost_per_record=est_cost_per_record,
190
- quality=est_quality,
191
- )
192
-
193
- # compute estimates for this operator
194
- op_time = op_estimates.time_per_record * source_op_estimates.cardinality
195
- op_cost = op_estimates.cost_per_record * source_op_estimates.cardinality
196
- op_quality = op_estimates.quality
197
-
198
- # construct and return op estimates
199
- plan_cost = PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
200
- logger.debug(f"Done calling __call__ for {str(operator)}")
201
- logger.debug(f"Plan cost: {plan_cost}")
202
- return plan_cost
203
-
204
-
205
- class CostModel(BaseCostModel):
206
- """
207
- This class takes in a list of RecordOpStats and performs cost estimation on a given operator
208
- by taking the average of any sample execution that the CostModel has for that operator. If no
209
- such data exists, it returns a naive estimate.
210
- """
211
- def __init__(
212
- self,
213
- sample_execution_data: list[RecordOpStats] | None = None,
214
- available_models: list[Model] | None = None,
215
- ) -> None:
216
- if sample_execution_data is None:
217
- sample_execution_data = []
218
- if available_models is None:
219
- available_models = []
220
-
221
- # construct full dataset of samples
222
- self.sample_execution_data_df = (
223
- pd.DataFrame(sample_execution_data)
224
- if len(sample_execution_data) > 0
225
- else None
226
- )
227
- # df contains a column called record_state, that sometimes contain a dict
228
- # we want to extract the keys from the dict and create a new column for each key
229
-
230
- # set available models
231
- self.available_models = available_models
232
-
233
- # compute per-operator estimates
234
- self.operator_estimates = self._compute_operator_estimates()
235
-
236
- # compute set of costed full op ids from operator_to_stats
237
- self.costed_full_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
238
- logger.info("Initialized CostModel.")
239
- logger.debug(f"Initialized CostModel with params: {self.__dict__}")
240
-
241
- def get_costed_full_op_ids(self):
242
- return self.costed_full_op_ids
243
-
244
- def _compute_mean(self, df: pd.DataFrame, col: str, model_name: str | None = None) -> float:
245
- """
246
- Compute the mean for the given column and dataframe. If the model_name is provided, filter
247
- for the subset of rows belonging to the model.
248
- """
249
- # use model-specific estimate if possible
250
- if model_name is not None:
251
- model_df = df[df.model_name == model_name]
252
- if not model_df.empty:
253
- return model_df[col].mean()
254
-
255
- # compute aggregate mean across all models
256
- return df[col].mean()
257
-
258
- def _est_time_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
259
- """
260
- Given sample cost data observations for a specific operation, compute the mean and CI
261
- for the time per record.
262
- """
263
- return self._compute_mean(df=op_df, col="time_per_record", model_name=model_name)
264
-
265
- def _est_cost_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
266
- """
267
- Given sample cost data observations for a specific operation, compute the mean and CI
268
- for the cost per record.
269
- """
270
- return self._compute_mean(df=op_df, col="cost_per_record", model_name=model_name)
271
-
272
- def _est_tokens_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float]:
273
- """
274
- Given sample cost data observations for a specific operation, compute the mean and CI
275
- for the total input tokens and total output tokens.
276
- """
277
- total_input_tokens = self._compute_mean(df=op_df, col="total_input_tokens", model_name=model_name)
278
- total_output_tokens = self._compute_mean(df=op_df, col="total_output_tokens", model_name=model_name)
279
-
280
- return total_input_tokens, total_output_tokens
281
-
282
- def _est_cardinality(self, op_df: pd.DataFrame, model_name: str | None = None) -> float:
283
- """
284
- Given sample cost data observations for a specific operation, compute the number of
285
- rows output by the operation.
286
-
287
- NOTE: right now, this should only be used by the ApplyGroupByOp as a way to gauge the
288
- number of output groups. Using this to estimate, e.g. the cardinality of a filter,
289
- convert, or base scan will lead to wildly inaccurate results because the absolute value
290
- of these cardinalities will simply be a reflection of the sample size.
291
-
292
- For those operations, we use the `_est_selectivity` function to estimate the operator's
293
- selectivity, which we can apply to an est. of the operator's input cardinality.
294
- """
295
- return op_df.shape[0] / len(op_df.plan_id.unique())
296
-
297
- def _est_selectivity(self, df: pd.DataFrame, op_df: pd.DataFrame, model_name: str | None = None) -> float:
298
- """
299
- Given sample cost data observations for the plan and a specific operation, compute
300
- the ratio of records between this operator and its source operator.
301
- """
302
- # compute whether or not this operation is a filter
303
- is_filter_op = "filter" in str(op_df.op_name.iloc[0]).lower()
304
-
305
- # use model-specific estimate if possible
306
- if model_name is not None:
307
- model_op_df = op_df[op_df.model_name == model_name]
308
- if not model_op_df.empty:
309
- num_input_records = model_op_df.shape[0]
310
-
311
- # get subset of records that were the source to this operator
312
- num_output_records = None
313
- if is_filter_op:
314
- num_output_records = model_op_df.passed_operator.sum()
315
- else:
316
- full_op_ids = model_op_df.full_op_id.unique().tolist()
317
- plan_ids = model_op_df.plan_id.unique().tolist()
318
- num_output_records = df[df.source_full_op_id.isin(full_op_ids) & df.plan_id.isin(plan_ids)].shape[0]
319
-
320
- # estimate the selectivity / fan-out
321
- return num_output_records / num_input_records
322
-
323
- # otherwise average selectivity across all ops
324
- num_input_records = op_df.shape[0]
325
-
326
- # get subset of records that were the source to this operator
327
- num_output_records = None
328
- if is_filter_op:
329
- num_output_records = op_df.passed_operator.sum()
330
- else:
331
- full_op_ids = op_df.full_op_id.unique().tolist()
332
- num_output_records = df[df.source_full_op_id.isin(full_op_ids)].shape[0]
333
-
334
- # estimate the selectivity / fan-out
335
- return num_output_records / num_input_records
336
-
337
- def _compute_quality(self, row):
338
- # compute accuracy for filter
339
- if "filter" in row["op_name"].lower():
340
- row["correct"] = int(row["answer"] == row["accepted_answer"])
341
- row["num_answers"] = 1
342
- return row
343
-
344
- # otherwise, compute recall on a per-key basis
345
- try:
346
- # we'll measure recall on accepted_answer, as extraneous info is often not an issue
347
- answer = row["answer"]
348
- accepted_answer = row["accepted_answer"]
349
- correct = 0
350
- for key, value in accepted_answer.items():
351
- if key in answer and answer[key] == value:
352
- correct += 1
353
-
354
- row["correct"] = correct
355
- row["num_answers"] = len(accepted_answer.keys())
356
- return row
357
-
358
- except Exception as e:
359
- print(f"WARNING: error decoding answer or accepted_answer: {str(e)}")
360
- row["correct"] = 0
361
- row["num_answers"] = 1
362
- return row
363
-
364
- def _est_quality(self, op_df: pd.DataFrame, model_name: str | None = None) -> float:
365
- """
366
- Given sample cost data observations for a specific operation, compute the an estimate
367
- of the quality of its outputs by using GPT-4 as a champion model.
368
- """
369
- # get unique set of records
370
- record_ids = op_df.record_id.unique()
371
-
372
- # get champion model name
373
- vision = ("image_operation" in op_df.columns and op_df.image_operation.any())
374
- champion_model_name = get_champion_model_name(self.available_models, vision)
375
-
376
- # compute champion's answer (per-record) across all models; fall-back to most common answer if champion is not present
377
- record_id_to_answer = {}
378
- for record_id in record_ids:
379
- record_df = op_df[op_df.record_id == record_id]
380
- champion_most_common_answer = record_df[
381
- record_df.model_name == champion_model_name
382
- ].answer.mode()
383
- all_models_most_common_answer = record_df.answer.mode()
384
-
385
- if not champion_most_common_answer.empty:
386
- record_id_to_answer[record_id] = champion_most_common_answer.iloc[0]
387
- elif not all_models_most_common_answer.empty:
388
- record_id_to_answer[record_id] = all_models_most_common_answer.iloc[0]
389
- else:
390
- record_id_to_answer[record_id] = None
391
-
392
- # compute accepted answers and clean all answers
393
- pd.options.mode.chained_assignment = None # turn off copy warnings
394
- op_df.loc[:, "accepted_answer"] = op_df.record_id.apply(lambda id: record_id_to_answer[id])
395
- op_df = op_df.apply(lambda row: self._compute_quality(row), axis=1)
396
-
397
- # get subset of observations for model_name and estimate quality w/fraction of answers that match accepted answer
398
- model_df = (
399
- op_df[op_df.model_name == model_name]
400
- if model_name is not None
401
- else op_df[op_df.model_name.isna()]
402
- )
403
-
404
- # compute quality as the fraction of answers which are correct (recall on expected output)
405
- num_correct = model_df.correct.sum() if not model_df.empty else op_df.correct.sum()
406
- total_answers = model_df.num_answers.sum() if not model_df.empty else op_df.num_answers.sum()
407
- est_quality = num_correct / total_answers
408
-
409
- return est_quality
410
-
411
- def _compute_operator_estimates(self) -> dict[str, Any] | None:
412
- """
413
- Compute per-operator estimates of runtime, cost, and quality.
414
- """
415
- # if we don't have sample execution data, we cannot compute per-operator estimates
416
- if self.sample_execution_data_df is None:
417
- return None
418
-
419
- # get the set of operator ids for which we have sample data
420
- full_op_ids = self.sample_execution_data_df.full_op_id.unique()
421
-
422
- # compute estimates of runtime, cost, and quality (and intermediates like cardinality) for every operator
423
- operator_estimates = {}
424
- for full_op_id in full_op_ids:
425
- # filter for subset of sample execution data related to this operation
426
- op_df = self.sample_execution_data_df[
427
- self.sample_execution_data_df.full_op_id == full_op_id
428
- ]
429
-
430
- # skip computing an estimate if we didn't capture any sampling data for this operator
431
- # (this can happen if/when upstream filter operation(s) filter out all records)
432
- if op_df.empty:
433
- continue
434
-
435
- # initialize estimates
436
- estimates = {}
437
-
438
- # get the op_name for this operation
439
- model_name = op_df.model_name.iloc[0] if op_df.model_name.iloc[0] is not None else None
440
- op_name = str(op_df.op_name.iloc[0])
441
- if model_name is not None:
442
- # compute estimates per-model, and add None which forces computation of avg. across all models
443
- model_names = [m.value for m in get_models(include_vision=True)] + [None]
444
- # model_names = op_df.model_name.unique().tolist()
445
- estimates = {model_name: None for model_name in model_names}
446
- for model_name in model_names:
447
- time_per_record = self._est_time_per_record(op_df, model_name=model_name)
448
- cost_per_record = self._est_cost_per_record(op_df, model_name=model_name)
449
- input_tokens, output_tokens = self._est_tokens_per_record(op_df, model_name=model_name)
450
- selectivity = self._est_selectivity(self.sample_execution_data_df, op_df, model_name=model_name)
451
- quality = self._est_quality(op_df, model_name=model_name)
452
-
453
- model_estimates = {
454
- "time_per_record": time_per_record,
455
- "cost_per_record": cost_per_record,
456
- "total_input_tokens": input_tokens,
457
- "total_output_tokens": output_tokens,
458
- "selectivity": selectivity,
459
- "quality": quality,
460
- }
461
- estimates[model_name] = model_estimates
462
-
463
- # TODO pre-compute lists of op_names in groups
464
- elif op_name in ["NonLLMFilter"]:
465
- time_per_record = self._est_time_per_record(op_df)
466
- selectivity = self._est_selectivity(self.sample_execution_data_df, op_df)
467
- estimates = {"time_per_record": time_per_record, "selectivity": selectivity}
468
-
469
- elif op_name in ["MarshalAndScanDataOp", "CacheScanDataOp", "LimitScanOp", "CountAggregateOp", "AverageAggregateOp"]:
470
- time_per_record = self._est_time_per_record(op_df)
471
- estimates = {"time_per_record": time_per_record}
472
-
473
- elif op_name in ["ApplyGroupByOp"]:
474
- time_per_record = self._est_time_per_record(op_df)
475
- cardinality = self._est_cardinality(op_df)
476
- estimates = {"time_per_record": time_per_record, "cardinality": cardinality}
477
-
478
- operator_estimates[full_op_id] = estimates
479
-
480
- return operator_estimates
481
-
482
- def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
483
- # get identifier for operation which is unique within sentinel plan but consistent across sentinels
148
+ def _compute_naive_plan_cost(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None, right_source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
149
+ # get identifier for operator which is unique within sentinel plan but consistent across sentinels
484
150
  full_op_id = operator.get_full_op_id()
485
151
  logger.debug(f"Calling __call__ for {str(operator)} with full_op_id: {full_op_id}")
486
152
 
487
153
  # initialize estimates of operator metrics based on naive (but sometimes precise) logic
488
154
  if isinstance(operator, MarshalAndScanDataOp):
489
155
  # get handle to scan operator and pre-compute its size (number of records)
490
- datareader_len = len(operator.datareader)
156
+ datasource_len = len(operator.datasource)
491
157
 
492
158
  source_op_estimates = OperatorCostEstimates(
493
- cardinality=datareader_len,
159
+ cardinality=datasource_len,
494
160
  time_per_record=0.0,
495
161
  cost_per_record=0.0,
496
162
  quality=1.0,
@@ -498,88 +164,30 @@ class CostModel(BaseCostModel):
498
164
 
499
165
  op_estimates = operator.naive_cost_estimates(source_op_estimates, input_record_size_in_bytes=NAIVE_BYTES_PER_RECORD)
500
166
 
501
- elif isinstance(operator, CacheScanDataOp):
502
- datareader_len = len(operator.datareader)
503
-
167
+ elif isinstance(operator, ContextScanOp):
504
168
  source_op_estimates = OperatorCostEstimates(
505
- cardinality=datareader_len,
169
+ cardinality=1.0,
506
170
  time_per_record=0.0,
507
171
  cost_per_record=0.0,
508
172
  quality=1.0,
509
173
  )
510
174
 
511
- op_estimates = operator.naive_cost_estimates(source_op_estimates, input_record_size_in_bytes=NAIVE_BYTES_PER_RECORD)
175
+ op_estimates = operator.naive_cost_estimates(source_op_estimates)
176
+
177
+ elif isinstance(operator, JoinOp):
178
+ op_estimates = operator.naive_cost_estimates(source_op_estimates, right_source_op_estimates)
512
179
 
513
180
  else:
514
181
  op_estimates = operator.naive_cost_estimates(source_op_estimates)
515
182
 
516
- # if we have sample execution data, update naive estimates with more informed ones
517
- sample_op_estimates = self.operator_estimates
518
- if sample_op_estimates is not None and full_op_id in sample_op_estimates:
519
- if isinstance(operator, (MarshalAndScanDataOp, CacheScanDataOp)):
520
- op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
521
-
522
- elif isinstance(operator, ApplyGroupByOp):
523
- # NOTE: in theory we should also treat this cardinality est. as a random variable, but in practice we will
524
- # have K samples of the number of groups produced by the groupby operator, where K is the number of
525
- # plans we generate sample data with. For now, we will simply use the estimate without bounds.
526
- #
527
- # NOTE: this cardinality is the only cardinality we estimate directly b/c we can observe how many groups are
528
- # produced by the groupby in our sample and assume it may generalize to the full workload. To estimate
529
- # actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
530
- # the input cardinality (where the initial input cardinality from the datareader is known).
531
- op_estimates.cardinality = sample_op_estimates[full_op_id]["cardinality"]
532
- op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
533
-
534
- elif isinstance(operator, (CountAggregateOp, AverageAggregateOp)): # noqa: SIM114
535
- op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
536
-
537
- elif isinstance(operator, LimitScanOp):
538
- op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
539
-
540
- elif isinstance(operator, NonLLMFilter):
541
- op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id]["selectivity"]
542
- op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
543
-
544
- elif isinstance(operator, LLMFilter):
545
- model_name = operator.model.value
546
- op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
547
- op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
548
- op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
549
- op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]
550
-
551
- elif isinstance(operator, LLMConvert):
552
- # NOTE: code synthesis does not have a model attribute
553
- model_name = operator.model.value if hasattr(operator, "model") else None
554
- op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
555
- op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
556
- op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
557
- op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]
558
-
559
- # NOTE: if code synth. fails, this will turn into ConventionalQuery calls to GPT-3.5,
560
- # which would wildly mess up estimate of time and cost per-record
561
- # do code synthesis adjustment
562
- if isinstance(operator, CodeSynthesisConvert):
563
- op_estimates.time_per_record = 1e-5
564
- op_estimates.cost_per_record = 1e-4
565
- op_estimates.quality = op_estimates.quality * (GPT_4o_MODEL_CARD["code"] / 100.0)
566
-
567
- # rag convert adjustment
568
- if isinstance(operator, RAGConvert):
569
- total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
570
- total_output_tokens = sample_op_estimates[full_op_id][model_name]["total_output_tokens"]
571
- op_estimates.cost_per_record = (
572
- MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
573
- + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
574
- )
575
- op_estimates.quality = op_estimates.quality * operator.naive_quality_adjustment
576
-
577
- else:
578
- raise Exception("Unknown operator")
579
-
580
183
  # compute estimates for this operator
581
- op_time = op_estimates.time_per_record * source_op_estimates.cardinality
582
- op_cost = op_estimates.cost_per_record * source_op_estimates.cardinality
184
+ est_input_cardinality = (
185
+ source_op_estimates.cardinality * right_source_op_estimates.cardinality
186
+ if isinstance(operator, JoinOp)
187
+ else source_op_estimates.cardinality
188
+ )
189
+ op_time = op_estimates.time_per_record * est_input_cardinality
190
+ op_cost = op_estimates.cost_per_record * est_input_cardinality
583
191
  op_quality = op_estimates.quality
584
192
 
585
193
  # create and return PlanCost object for this op's statistics
@@ -593,3 +201,62 @@ class CostModel(BaseCostModel):
593
201
  logger.debug(f"Plan cost: {op_plan_cost}")
594
202
 
595
203
  return op_plan_cost
204
+
205
+ def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None, right_source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
206
+ # for non-sentinel execution, we use naive estimates
207
+ full_op_id = operator.get_full_op_id()
208
+ unique_logical_op_id = operator.unique_logical_op_id
209
+ if self.operator_to_stats is None or unique_logical_op_id not in self.operator_to_stats:
210
+ return self._compute_naive_plan_cost(operator, source_op_estimates, right_source_op_estimates)
211
+
212
+ # NOTE: some physical operators may not have any sample execution data in this cost model;
213
+ # these physical operators are filtered out of the Optimizer, thus we can assume that
214
+ # we will have execution data for each operator passed into __call__; nevertheless, we
215
+ # still perform a sanity check
216
+ # look up physical and logical op ids associated with this physical operator
217
+ physical_op_to_stats = self.operator_to_stats.get(unique_logical_op_id)
218
+ assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
219
+ assert physical_op_to_stats.get(full_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
220
+ logger.debug(f"Calling __call__ for {str(operator)}")
221
+
222
+ # look up stats for this operation
223
+ est_cost_per_record = self.operator_to_stats[unique_logical_op_id][full_op_id]["cost"]
224
+ est_time_per_record = self.operator_to_stats[unique_logical_op_id][full_op_id]["time"]
225
+ est_quality = self.operator_to_stats[unique_logical_op_id][full_op_id]["quality"]
226
+ est_selectivity = self.operator_to_stats[unique_logical_op_id][full_op_id]["selectivity"]
227
+
228
+ # create source_op_estimates for scan operators if they are not provided
229
+ if isinstance(operator, ScanPhysicalOp):
230
+ # get handle to scan operator and pre-compute its size (number of records)
231
+ datasource_len = len(operator.datasource)
232
+
233
+ source_op_estimates = OperatorCostEstimates(
234
+ cardinality=datasource_len,
235
+ time_per_record=0.0,
236
+ cost_per_record=0.0,
237
+ quality=1.0,
238
+ )
239
+
240
+ # generate new set of OperatorCostEstimates
241
+ est_input_cardinality = (
242
+ source_op_estimates.cardinality * right_source_op_estimates.cardinality
243
+ if isinstance(operator, JoinOp)
244
+ else source_op_estimates.cardinality
245
+ )
246
+ op_estimates = OperatorCostEstimates(
247
+ cardinality=est_selectivity * est_input_cardinality,
248
+ time_per_record=est_time_per_record,
249
+ cost_per_record=est_cost_per_record,
250
+ quality=est_quality,
251
+ )
252
+
253
+ # compute estimates for this operator
254
+ op_time = op_estimates.time_per_record * est_input_cardinality
255
+ op_cost = op_estimates.cost_per_record * est_input_cardinality
256
+ op_quality = op_estimates.quality
257
+
258
+ # construct and return op estimates
259
+ plan_cost = PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
260
+ logger.debug(f"Done calling __call__ for {str(operator)}")
261
+ logger.debug(f"Plan cost: {plan_cost}")
262
+ return plan_cost