palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.20.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -1,29 +1,15 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
|
|
5
|
-
# NOTE: the answer.mode() call(s) inside of _est_quality() throw a UserWarning when there are multiple
|
|
6
|
-
# answers to a convert with the same mode. This is because pandas tries to sort the answers
|
|
7
|
-
# before returning them, but since answer is a column of dicts the '<' operator fails on dicts.
|
|
8
|
-
# For now, we can simply ignore the warning b/c we pick an answer at random anyways if there are
|
|
9
|
-
# multiple w/the same count, but in the future we may want to cast the 'dict' --> 'str' or compute
|
|
10
|
-
# the mode on a per-field basis.
|
|
11
4
|
import warnings
|
|
12
|
-
from typing import Any
|
|
13
5
|
|
|
14
6
|
import pandas as pd
|
|
15
7
|
|
|
16
|
-
from palimpzest.constants import
|
|
17
|
-
from palimpzest.core.
|
|
18
|
-
from palimpzest.query.operators.
|
|
19
|
-
from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvert
|
|
20
|
-
from palimpzest.query.operators.convert import LLMConvert
|
|
21
|
-
from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
|
|
22
|
-
from palimpzest.query.operators.limit import LimitScanOp
|
|
8
|
+
from palimpzest.constants import NAIVE_BYTES_PER_RECORD
|
|
9
|
+
from palimpzest.core.models import OperatorCostEstimates, PlanCost, SentinelPlanStats
|
|
10
|
+
from palimpzest.query.operators.join import JoinOp
|
|
23
11
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
24
|
-
from palimpzest.query.operators.
|
|
25
|
-
from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
|
|
26
|
-
from palimpzest.utils.model_helpers import get_champion_model_name, get_models
|
|
12
|
+
from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp, ScanPhysicalOp
|
|
27
13
|
|
|
28
14
|
warnings.simplefilter(action='ignore', category=UserWarning)
|
|
29
15
|
|
|
@@ -62,7 +48,7 @@ class SampleBasedCostModel:
|
|
|
62
48
|
"""
|
|
63
49
|
def __init__(
|
|
64
50
|
self,
|
|
65
|
-
sentinel_plan_stats: SentinelPlanStats,
|
|
51
|
+
sentinel_plan_stats: SentinelPlanStats | None = None,
|
|
66
52
|
verbose: bool = False,
|
|
67
53
|
exp_name: str | None = None,
|
|
68
54
|
):
|
|
@@ -73,25 +59,34 @@ class SampleBasedCostModel:
|
|
|
73
59
|
self.exp_name = exp_name
|
|
74
60
|
|
|
75
61
|
# construct cost, time, quality, and selectivity matrices for each operator set;
|
|
76
|
-
self.operator_to_stats = self.
|
|
77
|
-
self.costed_full_op_ids = set([
|
|
62
|
+
self.operator_to_stats = self._compute_operator_stats(sentinel_plan_stats)
|
|
63
|
+
self.costed_full_op_ids = None if self.operator_to_stats is None else set([
|
|
78
64
|
full_op_id
|
|
79
65
|
for _, full_op_id_to_stats in self.operator_to_stats.items()
|
|
80
66
|
for full_op_id in full_op_id_to_stats
|
|
81
67
|
])
|
|
82
68
|
|
|
69
|
+
# if there is a logical operator with no samples; add all of its op ids to costed_full_op_ids;
|
|
70
|
+
# this will lead to the cost model applying the naive cost estimates for all physical op ids
|
|
71
|
+
# in this logical operator (I think?)
|
|
72
|
+
# TODO
|
|
73
|
+
|
|
83
74
|
logger.info(f"Initialized SampleBasedCostModel with verbose={self.verbose}")
|
|
84
75
|
logger.debug(f"Initialized SampleBasedCostModel with params: {self.__dict__}")
|
|
85
76
|
|
|
86
77
|
def get_costed_full_op_ids(self):
|
|
87
78
|
return self.costed_full_op_ids
|
|
88
79
|
|
|
89
|
-
def
|
|
80
|
+
def _compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats | None) -> dict:
|
|
90
81
|
logger.debug("Computing operator statistics")
|
|
82
|
+
# if no stats are provided, simply return None
|
|
83
|
+
if sentinel_plan_stats is None:
|
|
84
|
+
return None
|
|
85
|
+
|
|
91
86
|
# flatten the nested dictionary of execution data and pull out fields relevant to cost estimation
|
|
92
87
|
execution_record_op_stats = []
|
|
93
|
-
for
|
|
94
|
-
logger.debug(f"Computing operator statistics for logical_op_id: {
|
|
88
|
+
for unique_logical_op_id, full_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
|
|
89
|
+
logger.debug(f"Computing operator statistics for logical_op_id: {unique_logical_op_id}")
|
|
95
90
|
# flatten the execution data into a list of RecordOpStats
|
|
96
91
|
op_set_execution_data = [
|
|
97
92
|
record_op_stats
|
|
@@ -102,17 +97,17 @@ class SampleBasedCostModel:
|
|
|
102
97
|
# add entries from execution data into matrices
|
|
103
98
|
for record_op_stats in op_set_execution_data:
|
|
104
99
|
record_op_stats_dict = {
|
|
105
|
-
"
|
|
100
|
+
"unique_logical_op_id": unique_logical_op_id,
|
|
106
101
|
"full_op_id": record_op_stats.full_op_id,
|
|
107
102
|
"record_id": record_op_stats.record_id,
|
|
108
|
-
"
|
|
103
|
+
"record_parent_ids": record_op_stats.record_parent_ids,
|
|
109
104
|
"cost_per_record": record_op_stats.cost_per_record,
|
|
110
105
|
"time_per_record": record_op_stats.time_per_record,
|
|
111
106
|
"quality": record_op_stats.quality,
|
|
112
107
|
"passed_operator": record_op_stats.passed_operator,
|
|
113
|
-
"
|
|
114
|
-
"op_details": record_op_stats.op_details,
|
|
115
|
-
"answer": record_op_stats.answer,
|
|
108
|
+
"source_indices": record_op_stats.record_source_indices, # TODO: remove
|
|
109
|
+
"op_details": record_op_stats.op_details, # TODO: remove
|
|
110
|
+
"answer": record_op_stats.answer, # TODO: remove
|
|
116
111
|
}
|
|
117
112
|
execution_record_op_stats.append(record_op_stats_dict)
|
|
118
113
|
|
|
@@ -121,22 +116,22 @@ class SampleBasedCostModel:
|
|
|
121
116
|
|
|
122
117
|
# for each full_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
|
|
123
118
|
operator_to_stats = {}
|
|
124
|
-
for
|
|
125
|
-
logger.debug(f"Computing operator statistics for
|
|
126
|
-
operator_to_stats[
|
|
119
|
+
for unique_logical_op_id, logical_op_df in operator_stats_df.groupby("unique_logical_op_id"):
|
|
120
|
+
logger.debug(f"Computing operator statistics for unique_logical_op_id: {unique_logical_op_id}")
|
|
121
|
+
operator_to_stats[unique_logical_op_id] = {}
|
|
127
122
|
|
|
128
123
|
for full_op_id, physical_op_df in logical_op_df.groupby("full_op_id"):
|
|
129
|
-
# compute the number of input records processed by this operator; use
|
|
124
|
+
# compute the number of input records processed by this operator; use source_indices for scan operator(s)
|
|
130
125
|
num_source_records = (
|
|
131
|
-
|
|
132
|
-
if not physical_op_df.
|
|
133
|
-
else
|
|
126
|
+
physical_op_df.record_parent_ids.apply(tuple).nunique()
|
|
127
|
+
if not physical_op_df.record_parent_ids.isna().all()
|
|
128
|
+
else physical_op_df.source_indices.apply(tuple).nunique()
|
|
134
129
|
)
|
|
135
130
|
|
|
136
131
|
# compute selectivity
|
|
137
132
|
selectivity = physical_op_df.passed_operator.sum() / num_source_records
|
|
138
133
|
|
|
139
|
-
operator_to_stats[
|
|
134
|
+
operator_to_stats[unique_logical_op_id][full_op_id] = {
|
|
140
135
|
"cost": physical_op_df.cost_per_record.mean(),
|
|
141
136
|
"time": physical_op_df.time_per_record.mean(),
|
|
142
137
|
"quality": physical_op_df.quality.mean(),
|
|
@@ -150,347 +145,18 @@ class SampleBasedCostModel:
|
|
|
150
145
|
logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
|
|
151
146
|
return operator_to_stats
|
|
152
147
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
# NOTE: some physical operators may not have any sample execution data in this cost model;
|
|
156
|
-
# these physical operators are filtered out of the Optimizer, thus we can assume that
|
|
157
|
-
# we will have execution data for each operator passed into __call__; nevertheless, we
|
|
158
|
-
# still perform a sanity check
|
|
159
|
-
# look up physical and logical op ids associated with this physical operator
|
|
160
|
-
full_op_id = operator.get_full_op_id()
|
|
161
|
-
logical_op_id = operator.logical_op_id
|
|
162
|
-
physical_op_to_stats = self.operator_to_stats.get(logical_op_id)
|
|
163
|
-
assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
|
|
164
|
-
assert physical_op_to_stats.get(full_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
|
|
165
|
-
logger.debug(f"Calling __call__ for {str(operator)}")
|
|
166
|
-
|
|
167
|
-
# look up stats for this operation
|
|
168
|
-
est_cost_per_record = self.operator_to_stats[logical_op_id][full_op_id]["cost"]
|
|
169
|
-
est_time_per_record = self.operator_to_stats[logical_op_id][full_op_id]["time"]
|
|
170
|
-
est_quality = self.operator_to_stats[logical_op_id][full_op_id]["quality"]
|
|
171
|
-
est_selectivity = self.operator_to_stats[logical_op_id][full_op_id]["selectivity"]
|
|
172
|
-
|
|
173
|
-
# create source_op_estimates for scan operators if they are not provided
|
|
174
|
-
if isinstance(operator, ScanPhysicalOp):
|
|
175
|
-
# get handle to scan operator and pre-compute its size (number of records)
|
|
176
|
-
datareader_len = len(operator.datareader)
|
|
177
|
-
|
|
178
|
-
source_op_estimates = OperatorCostEstimates(
|
|
179
|
-
cardinality=datareader_len,
|
|
180
|
-
time_per_record=0.0,
|
|
181
|
-
cost_per_record=0.0,
|
|
182
|
-
quality=1.0,
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
# generate new set of OperatorCostEstimates
|
|
186
|
-
op_estimates = OperatorCostEstimates(
|
|
187
|
-
cardinality=est_selectivity * source_op_estimates.cardinality,
|
|
188
|
-
time_per_record=est_time_per_record,
|
|
189
|
-
cost_per_record=est_cost_per_record,
|
|
190
|
-
quality=est_quality,
|
|
191
|
-
)
|
|
192
|
-
|
|
193
|
-
# compute estimates for this operator
|
|
194
|
-
op_time = op_estimates.time_per_record * source_op_estimates.cardinality
|
|
195
|
-
op_cost = op_estimates.cost_per_record * source_op_estimates.cardinality
|
|
196
|
-
op_quality = op_estimates.quality
|
|
197
|
-
|
|
198
|
-
# construct and return op estimates
|
|
199
|
-
plan_cost = PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
|
|
200
|
-
logger.debug(f"Done calling __call__ for {str(operator)}")
|
|
201
|
-
logger.debug(f"Plan cost: {plan_cost}")
|
|
202
|
-
return plan_cost
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
class CostModel(BaseCostModel):
|
|
206
|
-
"""
|
|
207
|
-
This class takes in a list of RecordOpStats and performs cost estimation on a given operator
|
|
208
|
-
by taking the average of any sample execution that the CostModel has for that operator. If no
|
|
209
|
-
such data exists, it returns a naive estimate.
|
|
210
|
-
"""
|
|
211
|
-
def __init__(
|
|
212
|
-
self,
|
|
213
|
-
sample_execution_data: list[RecordOpStats] | None = None,
|
|
214
|
-
available_models: list[Model] | None = None,
|
|
215
|
-
) -> None:
|
|
216
|
-
if sample_execution_data is None:
|
|
217
|
-
sample_execution_data = []
|
|
218
|
-
if available_models is None:
|
|
219
|
-
available_models = []
|
|
220
|
-
|
|
221
|
-
# construct full dataset of samples
|
|
222
|
-
self.sample_execution_data_df = (
|
|
223
|
-
pd.DataFrame(sample_execution_data)
|
|
224
|
-
if len(sample_execution_data) > 0
|
|
225
|
-
else None
|
|
226
|
-
)
|
|
227
|
-
# df contains a column called record_state, that sometimes contain a dict
|
|
228
|
-
# we want to extract the keys from the dict and create a new column for each key
|
|
229
|
-
|
|
230
|
-
# set available models
|
|
231
|
-
self.available_models = available_models
|
|
232
|
-
|
|
233
|
-
# compute per-operator estimates
|
|
234
|
-
self.operator_estimates = self._compute_operator_estimates()
|
|
235
|
-
|
|
236
|
-
# compute set of costed full op ids from operator_to_stats
|
|
237
|
-
self.costed_full_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
|
|
238
|
-
logger.info("Initialized CostModel.")
|
|
239
|
-
logger.debug(f"Initialized CostModel with params: {self.__dict__}")
|
|
240
|
-
|
|
241
|
-
def get_costed_full_op_ids(self):
|
|
242
|
-
return self.costed_full_op_ids
|
|
243
|
-
|
|
244
|
-
def _compute_mean(self, df: pd.DataFrame, col: str, model_name: str | None = None) -> float:
|
|
245
|
-
"""
|
|
246
|
-
Compute the mean for the given column and dataframe. If the model_name is provided, filter
|
|
247
|
-
for the subset of rows belonging to the model.
|
|
248
|
-
"""
|
|
249
|
-
# use model-specific estimate if possible
|
|
250
|
-
if model_name is not None:
|
|
251
|
-
model_df = df[df.model_name == model_name]
|
|
252
|
-
if not model_df.empty:
|
|
253
|
-
return model_df[col].mean()
|
|
254
|
-
|
|
255
|
-
# compute aggregate mean across all models
|
|
256
|
-
return df[col].mean()
|
|
257
|
-
|
|
258
|
-
def _est_time_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
|
|
259
|
-
"""
|
|
260
|
-
Given sample cost data observations for a specific operation, compute the mean and CI
|
|
261
|
-
for the time per record.
|
|
262
|
-
"""
|
|
263
|
-
return self._compute_mean(df=op_df, col="time_per_record", model_name=model_name)
|
|
264
|
-
|
|
265
|
-
def _est_cost_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
|
|
266
|
-
"""
|
|
267
|
-
Given sample cost data observations for a specific operation, compute the mean and CI
|
|
268
|
-
for the cost per record.
|
|
269
|
-
"""
|
|
270
|
-
return self._compute_mean(df=op_df, col="cost_per_record", model_name=model_name)
|
|
271
|
-
|
|
272
|
-
def _est_tokens_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float]:
|
|
273
|
-
"""
|
|
274
|
-
Given sample cost data observations for a specific operation, compute the mean and CI
|
|
275
|
-
for the total input tokens and total output tokens.
|
|
276
|
-
"""
|
|
277
|
-
total_input_tokens = self._compute_mean(df=op_df, col="total_input_tokens", model_name=model_name)
|
|
278
|
-
total_output_tokens = self._compute_mean(df=op_df, col="total_output_tokens", model_name=model_name)
|
|
279
|
-
|
|
280
|
-
return total_input_tokens, total_output_tokens
|
|
281
|
-
|
|
282
|
-
def _est_cardinality(self, op_df: pd.DataFrame, model_name: str | None = None) -> float:
|
|
283
|
-
"""
|
|
284
|
-
Given sample cost data observations for a specific operation, compute the number of
|
|
285
|
-
rows output by the operation.
|
|
286
|
-
|
|
287
|
-
NOTE: right now, this should only be used by the ApplyGroupByOp as a way to gauge the
|
|
288
|
-
number of output groups. Using this to estimate, e.g. the cardinality of a filter,
|
|
289
|
-
convert, or base scan will lead to wildly inaccurate results because the absolute value
|
|
290
|
-
of these cardinalities will simply be a reflection of the sample size.
|
|
291
|
-
|
|
292
|
-
For those operations, we use the `_est_selectivity` function to estimate the operator's
|
|
293
|
-
selectivity, which we can apply to an est. of the operator's input cardinality.
|
|
294
|
-
"""
|
|
295
|
-
return op_df.shape[0] / len(op_df.plan_id.unique())
|
|
296
|
-
|
|
297
|
-
def _est_selectivity(self, df: pd.DataFrame, op_df: pd.DataFrame, model_name: str | None = None) -> float:
|
|
298
|
-
"""
|
|
299
|
-
Given sample cost data observations for the plan and a specific operation, compute
|
|
300
|
-
the ratio of records between this operator and its source operator.
|
|
301
|
-
"""
|
|
302
|
-
# compute whether or not this operation is a filter
|
|
303
|
-
is_filter_op = "filter" in str(op_df.op_name.iloc[0]).lower()
|
|
304
|
-
|
|
305
|
-
# use model-specific estimate if possible
|
|
306
|
-
if model_name is not None:
|
|
307
|
-
model_op_df = op_df[op_df.model_name == model_name]
|
|
308
|
-
if not model_op_df.empty:
|
|
309
|
-
num_input_records = model_op_df.shape[0]
|
|
310
|
-
|
|
311
|
-
# get subset of records that were the source to this operator
|
|
312
|
-
num_output_records = None
|
|
313
|
-
if is_filter_op:
|
|
314
|
-
num_output_records = model_op_df.passed_operator.sum()
|
|
315
|
-
else:
|
|
316
|
-
full_op_ids = model_op_df.full_op_id.unique().tolist()
|
|
317
|
-
plan_ids = model_op_df.plan_id.unique().tolist()
|
|
318
|
-
num_output_records = df[df.source_full_op_id.isin(full_op_ids) & df.plan_id.isin(plan_ids)].shape[0]
|
|
319
|
-
|
|
320
|
-
# estimate the selectivity / fan-out
|
|
321
|
-
return num_output_records / num_input_records
|
|
322
|
-
|
|
323
|
-
# otherwise average selectivity across all ops
|
|
324
|
-
num_input_records = op_df.shape[0]
|
|
325
|
-
|
|
326
|
-
# get subset of records that were the source to this operator
|
|
327
|
-
num_output_records = None
|
|
328
|
-
if is_filter_op:
|
|
329
|
-
num_output_records = op_df.passed_operator.sum()
|
|
330
|
-
else:
|
|
331
|
-
full_op_ids = op_df.full_op_id.unique().tolist()
|
|
332
|
-
num_output_records = df[df.source_full_op_id.isin(full_op_ids)].shape[0]
|
|
333
|
-
|
|
334
|
-
# estimate the selectivity / fan-out
|
|
335
|
-
return num_output_records / num_input_records
|
|
336
|
-
|
|
337
|
-
def _compute_quality(self, row):
|
|
338
|
-
# compute accuracy for filter
|
|
339
|
-
if "filter" in row["op_name"].lower():
|
|
340
|
-
row["correct"] = int(row["answer"] == row["accepted_answer"])
|
|
341
|
-
row["num_answers"] = 1
|
|
342
|
-
return row
|
|
343
|
-
|
|
344
|
-
# otherwise, compute recall on a per-key basis
|
|
345
|
-
try:
|
|
346
|
-
# we'll measure recall on accepted_answer, as extraneous info is often not an issue
|
|
347
|
-
answer = row["answer"]
|
|
348
|
-
accepted_answer = row["accepted_answer"]
|
|
349
|
-
correct = 0
|
|
350
|
-
for key, value in accepted_answer.items():
|
|
351
|
-
if key in answer and answer[key] == value:
|
|
352
|
-
correct += 1
|
|
353
|
-
|
|
354
|
-
row["correct"] = correct
|
|
355
|
-
row["num_answers"] = len(accepted_answer.keys())
|
|
356
|
-
return row
|
|
357
|
-
|
|
358
|
-
except Exception as e:
|
|
359
|
-
print(f"WARNING: error decoding answer or accepted_answer: {str(e)}")
|
|
360
|
-
row["correct"] = 0
|
|
361
|
-
row["num_answers"] = 1
|
|
362
|
-
return row
|
|
363
|
-
|
|
364
|
-
def _est_quality(self, op_df: pd.DataFrame, model_name: str | None = None) -> float:
|
|
365
|
-
"""
|
|
366
|
-
Given sample cost data observations for a specific operation, compute the an estimate
|
|
367
|
-
of the quality of its outputs by using GPT-4 as a champion model.
|
|
368
|
-
"""
|
|
369
|
-
# get unique set of records
|
|
370
|
-
record_ids = op_df.record_id.unique()
|
|
371
|
-
|
|
372
|
-
# get champion model name
|
|
373
|
-
vision = ("image_operation" in op_df.columns and op_df.image_operation.any())
|
|
374
|
-
champion_model_name = get_champion_model_name(self.available_models, vision)
|
|
375
|
-
|
|
376
|
-
# compute champion's answer (per-record) across all models; fall-back to most common answer if champion is not present
|
|
377
|
-
record_id_to_answer = {}
|
|
378
|
-
for record_id in record_ids:
|
|
379
|
-
record_df = op_df[op_df.record_id == record_id]
|
|
380
|
-
champion_most_common_answer = record_df[
|
|
381
|
-
record_df.model_name == champion_model_name
|
|
382
|
-
].answer.mode()
|
|
383
|
-
all_models_most_common_answer = record_df.answer.mode()
|
|
384
|
-
|
|
385
|
-
if not champion_most_common_answer.empty:
|
|
386
|
-
record_id_to_answer[record_id] = champion_most_common_answer.iloc[0]
|
|
387
|
-
elif not all_models_most_common_answer.empty:
|
|
388
|
-
record_id_to_answer[record_id] = all_models_most_common_answer.iloc[0]
|
|
389
|
-
else:
|
|
390
|
-
record_id_to_answer[record_id] = None
|
|
391
|
-
|
|
392
|
-
# compute accepted answers and clean all answers
|
|
393
|
-
pd.options.mode.chained_assignment = None # turn off copy warnings
|
|
394
|
-
op_df.loc[:, "accepted_answer"] = op_df.record_id.apply(lambda id: record_id_to_answer[id])
|
|
395
|
-
op_df = op_df.apply(lambda row: self._compute_quality(row), axis=1)
|
|
396
|
-
|
|
397
|
-
# get subset of observations for model_name and estimate quality w/fraction of answers that match accepted answer
|
|
398
|
-
model_df = (
|
|
399
|
-
op_df[op_df.model_name == model_name]
|
|
400
|
-
if model_name is not None
|
|
401
|
-
else op_df[op_df.model_name.isna()]
|
|
402
|
-
)
|
|
403
|
-
|
|
404
|
-
# compute quality as the fraction of answers which are correct (recall on expected output)
|
|
405
|
-
num_correct = model_df.correct.sum() if not model_df.empty else op_df.correct.sum()
|
|
406
|
-
total_answers = model_df.num_answers.sum() if not model_df.empty else op_df.num_answers.sum()
|
|
407
|
-
est_quality = num_correct / total_answers
|
|
408
|
-
|
|
409
|
-
return est_quality
|
|
410
|
-
|
|
411
|
-
def _compute_operator_estimates(self) -> dict[str, Any] | None:
|
|
412
|
-
"""
|
|
413
|
-
Compute per-operator estimates of runtime, cost, and quality.
|
|
414
|
-
"""
|
|
415
|
-
# if we don't have sample execution data, we cannot compute per-operator estimates
|
|
416
|
-
if self.sample_execution_data_df is None:
|
|
417
|
-
return None
|
|
418
|
-
|
|
419
|
-
# get the set of operator ids for which we have sample data
|
|
420
|
-
full_op_ids = self.sample_execution_data_df.full_op_id.unique()
|
|
421
|
-
|
|
422
|
-
# compute estimates of runtime, cost, and quality (and intermediates like cardinality) for every operator
|
|
423
|
-
operator_estimates = {}
|
|
424
|
-
for full_op_id in full_op_ids:
|
|
425
|
-
# filter for subset of sample execution data related to this operation
|
|
426
|
-
op_df = self.sample_execution_data_df[
|
|
427
|
-
self.sample_execution_data_df.full_op_id == full_op_id
|
|
428
|
-
]
|
|
429
|
-
|
|
430
|
-
# skip computing an estimate if we didn't capture any sampling data for this operator
|
|
431
|
-
# (this can happen if/when upstream filter operation(s) filter out all records)
|
|
432
|
-
if op_df.empty:
|
|
433
|
-
continue
|
|
434
|
-
|
|
435
|
-
# initialize estimates
|
|
436
|
-
estimates = {}
|
|
437
|
-
|
|
438
|
-
# get the op_name for this operation
|
|
439
|
-
model_name = op_df.model_name.iloc[0] if op_df.model_name.iloc[0] is not None else None
|
|
440
|
-
op_name = str(op_df.op_name.iloc[0])
|
|
441
|
-
if model_name is not None:
|
|
442
|
-
# compute estimates per-model, and add None which forces computation of avg. across all models
|
|
443
|
-
model_names = [m.value for m in get_models(include_vision=True)] + [None]
|
|
444
|
-
# model_names = op_df.model_name.unique().tolist()
|
|
445
|
-
estimates = {model_name: None for model_name in model_names}
|
|
446
|
-
for model_name in model_names:
|
|
447
|
-
time_per_record = self._est_time_per_record(op_df, model_name=model_name)
|
|
448
|
-
cost_per_record = self._est_cost_per_record(op_df, model_name=model_name)
|
|
449
|
-
input_tokens, output_tokens = self._est_tokens_per_record(op_df, model_name=model_name)
|
|
450
|
-
selectivity = self._est_selectivity(self.sample_execution_data_df, op_df, model_name=model_name)
|
|
451
|
-
quality = self._est_quality(op_df, model_name=model_name)
|
|
452
|
-
|
|
453
|
-
model_estimates = {
|
|
454
|
-
"time_per_record": time_per_record,
|
|
455
|
-
"cost_per_record": cost_per_record,
|
|
456
|
-
"total_input_tokens": input_tokens,
|
|
457
|
-
"total_output_tokens": output_tokens,
|
|
458
|
-
"selectivity": selectivity,
|
|
459
|
-
"quality": quality,
|
|
460
|
-
}
|
|
461
|
-
estimates[model_name] = model_estimates
|
|
462
|
-
|
|
463
|
-
# TODO pre-compute lists of op_names in groups
|
|
464
|
-
elif op_name in ["NonLLMFilter"]:
|
|
465
|
-
time_per_record = self._est_time_per_record(op_df)
|
|
466
|
-
selectivity = self._est_selectivity(self.sample_execution_data_df, op_df)
|
|
467
|
-
estimates = {"time_per_record": time_per_record, "selectivity": selectivity}
|
|
468
|
-
|
|
469
|
-
elif op_name in ["MarshalAndScanDataOp", "CacheScanDataOp", "LimitScanOp", "CountAggregateOp", "AverageAggregateOp"]:
|
|
470
|
-
time_per_record = self._est_time_per_record(op_df)
|
|
471
|
-
estimates = {"time_per_record": time_per_record}
|
|
472
|
-
|
|
473
|
-
elif op_name in ["ApplyGroupByOp"]:
|
|
474
|
-
time_per_record = self._est_time_per_record(op_df)
|
|
475
|
-
cardinality = self._est_cardinality(op_df)
|
|
476
|
-
estimates = {"time_per_record": time_per_record, "cardinality": cardinality}
|
|
477
|
-
|
|
478
|
-
operator_estimates[full_op_id] = estimates
|
|
479
|
-
|
|
480
|
-
return operator_estimates
|
|
481
|
-
|
|
482
|
-
def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
|
|
483
|
-
# get identifier for operation which is unique within sentinel plan but consistent across sentinels
|
|
148
|
+
def _compute_naive_plan_cost(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None, right_source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
|
|
149
|
+
# get identifier for operator which is unique within sentinel plan but consistent across sentinels
|
|
484
150
|
full_op_id = operator.get_full_op_id()
|
|
485
151
|
logger.debug(f"Calling __call__ for {str(operator)} with full_op_id: {full_op_id}")
|
|
486
152
|
|
|
487
153
|
# initialize estimates of operator metrics based on naive (but sometimes precise) logic
|
|
488
154
|
if isinstance(operator, MarshalAndScanDataOp):
|
|
489
155
|
# get handle to scan operator and pre-compute its size (number of records)
|
|
490
|
-
|
|
156
|
+
datasource_len = len(operator.datasource)
|
|
491
157
|
|
|
492
158
|
source_op_estimates = OperatorCostEstimates(
|
|
493
|
-
cardinality=
|
|
159
|
+
cardinality=datasource_len,
|
|
494
160
|
time_per_record=0.0,
|
|
495
161
|
cost_per_record=0.0,
|
|
496
162
|
quality=1.0,
|
|
@@ -498,88 +164,30 @@ class CostModel(BaseCostModel):
|
|
|
498
164
|
|
|
499
165
|
op_estimates = operator.naive_cost_estimates(source_op_estimates, input_record_size_in_bytes=NAIVE_BYTES_PER_RECORD)
|
|
500
166
|
|
|
501
|
-
elif isinstance(operator,
|
|
502
|
-
datareader_len = len(operator.datareader)
|
|
503
|
-
|
|
167
|
+
elif isinstance(operator, ContextScanOp):
|
|
504
168
|
source_op_estimates = OperatorCostEstimates(
|
|
505
|
-
cardinality=
|
|
169
|
+
cardinality=1.0,
|
|
506
170
|
time_per_record=0.0,
|
|
507
171
|
cost_per_record=0.0,
|
|
508
172
|
quality=1.0,
|
|
509
173
|
)
|
|
510
174
|
|
|
511
|
-
op_estimates = operator.naive_cost_estimates(source_op_estimates
|
|
175
|
+
op_estimates = operator.naive_cost_estimates(source_op_estimates)
|
|
176
|
+
|
|
177
|
+
elif isinstance(operator, JoinOp):
|
|
178
|
+
op_estimates = operator.naive_cost_estimates(source_op_estimates, right_source_op_estimates)
|
|
512
179
|
|
|
513
180
|
else:
|
|
514
181
|
op_estimates = operator.naive_cost_estimates(source_op_estimates)
|
|
515
182
|
|
|
516
|
-
# if we have sample execution data, update naive estimates with more informed ones
|
|
517
|
-
sample_op_estimates = self.operator_estimates
|
|
518
|
-
if sample_op_estimates is not None and full_op_id in sample_op_estimates:
|
|
519
|
-
if isinstance(operator, (MarshalAndScanDataOp, CacheScanDataOp)):
|
|
520
|
-
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
521
|
-
|
|
522
|
-
elif isinstance(operator, ApplyGroupByOp):
|
|
523
|
-
# NOTE: in theory we should also treat this cardinality est. as a random variable, but in practice we will
|
|
524
|
-
# have K samples of the number of groups produced by the groupby operator, where K is the number of
|
|
525
|
-
# plans we generate sample data with. For now, we will simply use the estimate without bounds.
|
|
526
|
-
#
|
|
527
|
-
# NOTE: this cardinality is the only cardinality we estimate directly b/c we can observe how many groups are
|
|
528
|
-
# produced by the groupby in our sample and assume it may generalize to the full workload. To estimate
|
|
529
|
-
# actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
|
|
530
|
-
# the input cardinality (where the initial input cardinality from the datareader is known).
|
|
531
|
-
op_estimates.cardinality = sample_op_estimates[full_op_id]["cardinality"]
|
|
532
|
-
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
533
|
-
|
|
534
|
-
elif isinstance(operator, (CountAggregateOp, AverageAggregateOp)): # noqa: SIM114
|
|
535
|
-
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
536
|
-
|
|
537
|
-
elif isinstance(operator, LimitScanOp):
|
|
538
|
-
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
539
|
-
|
|
540
|
-
elif isinstance(operator, NonLLMFilter):
|
|
541
|
-
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id]["selectivity"]
|
|
542
|
-
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
543
|
-
|
|
544
|
-
elif isinstance(operator, LLMFilter):
|
|
545
|
-
model_name = operator.model.value
|
|
546
|
-
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
|
|
547
|
-
op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
|
|
548
|
-
op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
|
|
549
|
-
op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]
|
|
550
|
-
|
|
551
|
-
elif isinstance(operator, LLMConvert):
|
|
552
|
-
# NOTE: code synthesis does not have a model attribute
|
|
553
|
-
model_name = operator.model.value if hasattr(operator, "model") else None
|
|
554
|
-
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
|
|
555
|
-
op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
|
|
556
|
-
op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
|
|
557
|
-
op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]
|
|
558
|
-
|
|
559
|
-
# NOTE: if code synth. fails, this will turn into ConventionalQuery calls to GPT-3.5,
|
|
560
|
-
# which would wildly mess up estimate of time and cost per-record
|
|
561
|
-
# do code synthesis adjustment
|
|
562
|
-
if isinstance(operator, CodeSynthesisConvert):
|
|
563
|
-
op_estimates.time_per_record = 1e-5
|
|
564
|
-
op_estimates.cost_per_record = 1e-4
|
|
565
|
-
op_estimates.quality = op_estimates.quality * (GPT_4o_MODEL_CARD["code"] / 100.0)
|
|
566
|
-
|
|
567
|
-
# rag convert adjustment
|
|
568
|
-
if isinstance(operator, RAGConvert):
|
|
569
|
-
total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
|
|
570
|
-
total_output_tokens = sample_op_estimates[full_op_id][model_name]["total_output_tokens"]
|
|
571
|
-
op_estimates.cost_per_record = (
|
|
572
|
-
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
|
|
573
|
-
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
|
|
574
|
-
)
|
|
575
|
-
op_estimates.quality = op_estimates.quality * operator.naive_quality_adjustment
|
|
576
|
-
|
|
577
|
-
else:
|
|
578
|
-
raise Exception("Unknown operator")
|
|
579
|
-
|
|
580
183
|
# compute estimates for this operator
|
|
581
|
-
|
|
582
|
-
|
|
184
|
+
est_input_cardinality = (
|
|
185
|
+
source_op_estimates.cardinality * right_source_op_estimates.cardinality
|
|
186
|
+
if isinstance(operator, JoinOp)
|
|
187
|
+
else source_op_estimates.cardinality
|
|
188
|
+
)
|
|
189
|
+
op_time = op_estimates.time_per_record * est_input_cardinality
|
|
190
|
+
op_cost = op_estimates.cost_per_record * est_input_cardinality
|
|
583
191
|
op_quality = op_estimates.quality
|
|
584
192
|
|
|
585
193
|
# create and return PlanCost object for this op's statistics
|
|
@@ -593,3 +201,62 @@ class CostModel(BaseCostModel):
|
|
|
593
201
|
logger.debug(f"Plan cost: {op_plan_cost}")
|
|
594
202
|
|
|
595
203
|
return op_plan_cost
|
|
204
|
+
|
|
205
|
+
def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None, right_source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
|
|
206
|
+
# for non-sentinel execution, we use naive estimates
|
|
207
|
+
full_op_id = operator.get_full_op_id()
|
|
208
|
+
unique_logical_op_id = operator.unique_logical_op_id
|
|
209
|
+
if self.operator_to_stats is None or unique_logical_op_id not in self.operator_to_stats:
|
|
210
|
+
return self._compute_naive_plan_cost(operator, source_op_estimates, right_source_op_estimates)
|
|
211
|
+
|
|
212
|
+
# NOTE: some physical operators may not have any sample execution data in this cost model;
|
|
213
|
+
# these physical operators are filtered out of the Optimizer, thus we can assume that
|
|
214
|
+
# we will have execution data for each operator passed into __call__; nevertheless, we
|
|
215
|
+
# still perform a sanity check
|
|
216
|
+
# look up physical and logical op ids associated with this physical operator
|
|
217
|
+
physical_op_to_stats = self.operator_to_stats.get(unique_logical_op_id)
|
|
218
|
+
assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
|
|
219
|
+
assert physical_op_to_stats.get(full_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
|
|
220
|
+
logger.debug(f"Calling __call__ for {str(operator)}")
|
|
221
|
+
|
|
222
|
+
# look up stats for this operation
|
|
223
|
+
est_cost_per_record = self.operator_to_stats[unique_logical_op_id][full_op_id]["cost"]
|
|
224
|
+
est_time_per_record = self.operator_to_stats[unique_logical_op_id][full_op_id]["time"]
|
|
225
|
+
est_quality = self.operator_to_stats[unique_logical_op_id][full_op_id]["quality"]
|
|
226
|
+
est_selectivity = self.operator_to_stats[unique_logical_op_id][full_op_id]["selectivity"]
|
|
227
|
+
|
|
228
|
+
# create source_op_estimates for scan operators if they are not provided
|
|
229
|
+
if isinstance(operator, ScanPhysicalOp):
|
|
230
|
+
# get handle to scan operator and pre-compute its size (number of records)
|
|
231
|
+
datasource_len = len(operator.datasource)
|
|
232
|
+
|
|
233
|
+
source_op_estimates = OperatorCostEstimates(
|
|
234
|
+
cardinality=datasource_len,
|
|
235
|
+
time_per_record=0.0,
|
|
236
|
+
cost_per_record=0.0,
|
|
237
|
+
quality=1.0,
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
# generate new set of OperatorCostEstimates
|
|
241
|
+
est_input_cardinality = (
|
|
242
|
+
source_op_estimates.cardinality * right_source_op_estimates.cardinality
|
|
243
|
+
if isinstance(operator, JoinOp)
|
|
244
|
+
else source_op_estimates.cardinality
|
|
245
|
+
)
|
|
246
|
+
op_estimates = OperatorCostEstimates(
|
|
247
|
+
cardinality=est_selectivity * est_input_cardinality,
|
|
248
|
+
time_per_record=est_time_per_record,
|
|
249
|
+
cost_per_record=est_cost_per_record,
|
|
250
|
+
quality=est_quality,
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
# compute estimates for this operator
|
|
254
|
+
op_time = op_estimates.time_per_record * est_input_cardinality
|
|
255
|
+
op_cost = op_estimates.cost_per_record * est_input_cardinality
|
|
256
|
+
op_quality = op_estimates.quality
|
|
257
|
+
|
|
258
|
+
# construct and return op estimates
|
|
259
|
+
plan_cost = PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
|
|
260
|
+
logger.debug(f"Done calling __call__ for {str(operator)}")
|
|
261
|
+
logger.debug(f"Plan cost: {plan_cost}")
|
|
262
|
+
return plan_cost
|