palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
- palimpzest-0.7.0.dist-info/RECORD +96 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.3.dist-info/RECORD +0 -87
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
import math
|
|
4
5
|
|
|
5
6
|
# NOTE: the answer.mode() call(s) inside of _est_quality() throw a UserWarning when there are multiple
|
|
@@ -12,11 +13,9 @@ import warnings
|
|
|
12
13
|
from typing import Any
|
|
13
14
|
|
|
14
15
|
import pandas as pd
|
|
15
|
-
import scipy.stats as stats
|
|
16
16
|
|
|
17
17
|
from palimpzest.constants import MODEL_CARDS, NAIVE_BYTES_PER_RECORD, GPT_4o_MODEL_CARD, Model
|
|
18
|
-
from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost, RecordOpStats
|
|
19
|
-
from palimpzest.core.elements.records import DataRecordSet
|
|
18
|
+
from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost, RecordOpStats, SentinelPlanStats
|
|
20
19
|
from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
|
|
21
20
|
from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvert
|
|
22
21
|
from palimpzest.query.operators.convert import LLMConvert
|
|
@@ -25,12 +24,13 @@ from palimpzest.query.operators.limit import LimitScanOp
|
|
|
25
24
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
26
25
|
from palimpzest.query.operators.rag_convert import RAGConvert
|
|
27
26
|
from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
|
|
28
|
-
from palimpzest.query.operators.token_reduction_convert import
|
|
29
|
-
from palimpzest.query.optimizer.plan import SentinelPlan
|
|
27
|
+
from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
|
|
30
28
|
from palimpzest.utils.model_helpers import get_champion_model_name, get_models
|
|
31
29
|
|
|
32
30
|
warnings.simplefilter(action='ignore', category=UserWarning)
|
|
33
31
|
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
34
|
class BaseCostModel:
|
|
35
35
|
"""
|
|
36
36
|
This base class contains the interface/abstraction that every CostModel must implement
|
|
@@ -64,14 +64,13 @@ class SampleBasedCostModel:
|
|
|
64
64
|
"""
|
|
65
65
|
def __init__(
|
|
66
66
|
self,
|
|
67
|
-
|
|
68
|
-
execution_data: dict[str, dict[str, list[DataRecordSet]]],
|
|
67
|
+
sentinel_plan_stats: SentinelPlanStats,
|
|
69
68
|
verbose: bool = False,
|
|
70
69
|
exp_name: str | None = None,
|
|
71
70
|
):
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
71
|
+
"""
|
|
72
|
+
sentinel_plan_stats.operator_stats is: {logical_op_id: {physical_op_id: op_stats}}, where each op_stats has a record_op_stats_lst
|
|
73
|
+
"""
|
|
75
74
|
# store verbose argument
|
|
76
75
|
self.verbose = verbose
|
|
77
76
|
|
|
@@ -79,7 +78,7 @@ class SampleBasedCostModel:
|
|
|
79
78
|
self.exp_name = exp_name
|
|
80
79
|
|
|
81
80
|
# construct cost, time, quality, and selectivity matrices for each operator set;
|
|
82
|
-
self.operator_to_stats = self.compute_operator_stats(
|
|
81
|
+
self.operator_to_stats = self.compute_operator_stats(sentinel_plan_stats)
|
|
83
82
|
|
|
84
83
|
# compute set of costed physical op ids from operator_to_stats
|
|
85
84
|
self.costed_phys_op_ids = set([
|
|
@@ -88,30 +87,23 @@ class SampleBasedCostModel:
|
|
|
88
87
|
for phys_op_id, _ in phys_op_id_to_stats.items()
|
|
89
88
|
])
|
|
90
89
|
|
|
90
|
+
logger.info(f"Initialized SampleBasedCostModel with verbose={self.verbose}")
|
|
91
|
+
logger.debug(f"Initialized SampleBasedCostModel with params: {self.__dict__}")
|
|
91
92
|
|
|
92
93
|
def get_costed_phys_op_ids(self):
|
|
93
94
|
return self.costed_phys_op_ids
|
|
94
95
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
self,
|
|
98
|
-
execution_data: dict[str, dict[str, list[DataRecordSet]]],
|
|
99
|
-
):
|
|
96
|
+
def compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats) -> dict:
|
|
97
|
+
logger.debug("Computing operator statistics")
|
|
100
98
|
# flatten the nested dictionary of execution data and pull out fields relevant to cost estimation
|
|
101
99
|
execution_record_op_stats = []
|
|
102
|
-
for
|
|
103
|
-
|
|
104
|
-
upstream_logical_op_id = self.sentinel_plan.logical_op_ids[idx - 1] if idx > 0 else None
|
|
105
|
-
|
|
106
|
-
# filter for the execution data from this operator set
|
|
107
|
-
op_set_execution_data = execution_data[logical_op_id]
|
|
108
|
-
|
|
100
|
+
for logical_op_id, phys_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
|
|
101
|
+
logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
|
|
109
102
|
# flatten the execution data into a list of RecordOpStats
|
|
110
103
|
op_set_execution_data = [
|
|
111
104
|
record_op_stats
|
|
112
|
-
for _,
|
|
113
|
-
for
|
|
114
|
-
for record_op_stats in record_set.record_op_stats
|
|
105
|
+
for _, op_stats in phys_op_id_to_op_stats.items()
|
|
106
|
+
for record_op_stats in op_stats.record_op_stats_lst
|
|
115
107
|
]
|
|
116
108
|
|
|
117
109
|
# add entries from execution data into matrices
|
|
@@ -119,7 +111,6 @@ class SampleBasedCostModel:
|
|
|
119
111
|
record_op_stats_dict = {
|
|
120
112
|
"logical_op_id": logical_op_id,
|
|
121
113
|
"physical_op_id": record_op_stats.op_id,
|
|
122
|
-
"upstream_logical_op_id": upstream_logical_op_id,
|
|
123
114
|
"record_id": record_op_stats.record_id,
|
|
124
115
|
"record_parent_id": record_op_stats.record_parent_id,
|
|
125
116
|
"cost_per_record": record_op_stats.cost_per_record,
|
|
@@ -138,21 +129,19 @@ class SampleBasedCostModel:
|
|
|
138
129
|
# for each physical_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
|
|
139
130
|
operator_to_stats = {}
|
|
140
131
|
for logical_op_id, logical_op_df in operator_stats_df.groupby("logical_op_id"):
|
|
132
|
+
logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
|
|
141
133
|
operator_to_stats[logical_op_id] = {}
|
|
142
134
|
|
|
143
|
-
# get the logical_op_id of the upstream operator
|
|
144
|
-
upstream_logical_op_ids = logical_op_df.upstream_logical_op_id.unique()
|
|
145
|
-
assert len(upstream_logical_op_ids) == 1, "More than one upstream logical_op_id"
|
|
146
|
-
upstream_logical_op_id = upstream_logical_op_ids[0]
|
|
147
|
-
|
|
148
135
|
for physical_op_id, physical_op_df in logical_op_df.groupby("physical_op_id"):
|
|
149
|
-
#
|
|
150
|
-
|
|
136
|
+
# compute the number of input records processed by this operator; use source_idx for scan operator(s)
|
|
137
|
+
num_source_records = (
|
|
138
|
+
len(physical_op_df.record_parent_id.unique())
|
|
139
|
+
if not physical_op_df.record_parent_id.isna().all()
|
|
140
|
+
else len(physical_op_df.source_idx.unique())
|
|
141
|
+
)
|
|
151
142
|
|
|
152
143
|
# compute selectivity
|
|
153
|
-
selectivity = (
|
|
154
|
-
1.0 if upstream_logical_op_id is None else physical_op_df.passed_operator.sum() / num_upstream_records
|
|
155
|
-
)
|
|
144
|
+
selectivity = physical_op_df.passed_operator.sum() / num_source_records
|
|
156
145
|
|
|
157
146
|
operator_to_stats[logical_op_id][physical_op_id] = {
|
|
158
147
|
"cost": physical_op_df.cost_per_record.mean(),
|
|
@@ -165,6 +154,7 @@ class SampleBasedCostModel:
|
|
|
165
154
|
if self.exp_name is not None:
|
|
166
155
|
operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
|
|
167
156
|
|
|
157
|
+
logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
|
|
168
158
|
return operator_to_stats
|
|
169
159
|
|
|
170
160
|
|
|
@@ -176,7 +166,10 @@ class SampleBasedCostModel:
|
|
|
176
166
|
# look up physical and logical op ids associated with this physical operator
|
|
177
167
|
phys_op_id = operator.get_op_id()
|
|
178
168
|
logical_op_id = operator.logical_op_id
|
|
179
|
-
|
|
169
|
+
physical_op_to_stats = self.operator_to_stats.get(logical_op_id)
|
|
170
|
+
assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
|
|
171
|
+
assert physical_op_to_stats.get(phys_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
|
|
172
|
+
logger.debug(f"Calling __call__ for {str(operator)}")
|
|
180
173
|
|
|
181
174
|
# look up stats for this operation
|
|
182
175
|
est_cost_per_record = self.operator_to_stats[logical_op_id][phys_op_id]["cost"]
|
|
@@ -210,7 +203,10 @@ class SampleBasedCostModel:
|
|
|
210
203
|
op_quality = op_estimates.quality
|
|
211
204
|
|
|
212
205
|
# construct and return op estimates
|
|
213
|
-
|
|
206
|
+
plan_cost = PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
|
|
207
|
+
logger.debug(f"Done calling __call__ for {str(operator)}")
|
|
208
|
+
logger.debug(f"Plan cost: {plan_cost}")
|
|
209
|
+
return plan_cost
|
|
214
210
|
|
|
215
211
|
|
|
216
212
|
class CostModel(BaseCostModel):
|
|
@@ -223,7 +219,6 @@ class CostModel(BaseCostModel):
|
|
|
223
219
|
self,
|
|
224
220
|
sample_execution_data: list[RecordOpStats] | None = None,
|
|
225
221
|
available_models: list[Model] | None = None,
|
|
226
|
-
confidence_level: float = 0.90,
|
|
227
222
|
) -> None:
|
|
228
223
|
if sample_execution_data is None:
|
|
229
224
|
sample_execution_data = []
|
|
@@ -242,107 +237,54 @@ class CostModel(BaseCostModel):
|
|
|
242
237
|
# set available models
|
|
243
238
|
self.available_models = available_models
|
|
244
239
|
|
|
245
|
-
# set confidence level for CI estimates
|
|
246
|
-
self.conf_level = confidence_level
|
|
247
|
-
|
|
248
240
|
# compute per-operator estimates
|
|
249
241
|
self.operator_estimates = self._compute_operator_estimates()
|
|
250
242
|
|
|
251
243
|
# compute set of costed physical op ids from operator_to_stats
|
|
252
244
|
self.costed_phys_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
|
|
245
|
+
logger.info("Initialized CostModel.")
|
|
246
|
+
logger.debug(f"Initialized CostModel with params: {self.__dict__}")
|
|
253
247
|
|
|
254
248
|
def get_costed_phys_op_ids(self):
|
|
255
249
|
return self.costed_phys_op_ids
|
|
256
250
|
|
|
257
|
-
def
|
|
251
|
+
def _compute_mean(self, df: pd.DataFrame, col: str, model_name: str | None = None) -> float:
|
|
258
252
|
"""
|
|
259
|
-
Compute
|
|
260
|
-
and sample std. deviation at the CostModel's given confidence level. We use a t-distribution for
|
|
261
|
-
computing the interval as many sample estimates in PZ may have few samples.
|
|
262
|
-
"""
|
|
263
|
-
ci = stats.t.interval(
|
|
264
|
-
confidence=self.conf_level, # Confidence level
|
|
265
|
-
df=n_samples - 1, # Degrees of freedom
|
|
266
|
-
loc=sample_mean, # Sample mean
|
|
267
|
-
scale=std_dev, # Standard deviation estimate
|
|
268
|
-
)
|
|
269
|
-
return ci
|
|
270
|
-
|
|
271
|
-
def _compute_proportion_ci(self, sample_prop: float, n_samples: int) -> tuple[float, float]:
|
|
272
|
-
"""
|
|
273
|
-
Compute confidence interval for proportion quantities (i.e. selectivity) given the sample proportion
|
|
274
|
-
and the number of samples. We use the normal distribution for computing the interval here, for reasons
|
|
275
|
-
summarized by this post: https://stats.stackexchange.com/a/411727.
|
|
276
|
-
"""
|
|
277
|
-
if sample_prop == 0.0 or sample_prop == 1.0:
|
|
278
|
-
return (sample_prop, sample_prop)
|
|
279
|
-
|
|
280
|
-
scaling_factor = math.sqrt((sample_prop * (1 - sample_prop)) / n_samples)
|
|
281
|
-
lower_bound, upper_bound = stats.norm.interval(
|
|
282
|
-
confidence=self.conf_level, # Confidence level
|
|
283
|
-
loc=sample_prop, # Sample proportion
|
|
284
|
-
scale=scaling_factor, # Scaling factor
|
|
285
|
-
)
|
|
286
|
-
lower_bound = max(lower_bound, 0.0)
|
|
287
|
-
upper_bound = max(upper_bound, 1.0)
|
|
288
|
-
|
|
289
|
-
return (lower_bound, upper_bound)
|
|
290
|
-
|
|
291
|
-
def _compute_mean_and_ci(self, df: pd.DataFrame, col: str, model_name: str | None = None, non_negative_lb: bool = False) -> tuple[float, float, float]:
|
|
292
|
-
"""
|
|
293
|
-
Compute the mean and CI for the given column and dataframe. If the model_name is provided, filter
|
|
253
|
+
Compute the mean for the given column and dataframe. If the model_name is provided, filter
|
|
294
254
|
for the subset of rows belonging to the model.
|
|
295
255
|
"""
|
|
296
256
|
# use model-specific estimate if possible
|
|
297
257
|
if model_name is not None:
|
|
298
258
|
model_df = df[df.model_name == model_name]
|
|
299
259
|
if not model_df.empty:
|
|
300
|
-
|
|
301
|
-
col_lb, col_ub = self._compute_ci(
|
|
302
|
-
sample_mean=col_mean,
|
|
303
|
-
n_samples=model_df[col].notna().sum(),
|
|
304
|
-
std_dev=model_df[col].std(),
|
|
305
|
-
)
|
|
306
|
-
if non_negative_lb:
|
|
307
|
-
col_lb = max(col_lb, 0.0)
|
|
260
|
+
return model_df[col].mean()
|
|
308
261
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
# compute aggregate
|
|
312
|
-
col_mean = df[col].mean()
|
|
313
|
-
col_lb, col_ub = self._compute_ci(
|
|
314
|
-
sample_mean=col_mean,
|
|
315
|
-
n_samples=df[col].notna().sum(),
|
|
316
|
-
std_dev=df[col].std(),
|
|
317
|
-
)
|
|
318
|
-
if non_negative_lb:
|
|
319
|
-
col_lb = max(col_lb, 0.0)
|
|
320
|
-
|
|
321
|
-
return col_mean, col_lb, col_ub
|
|
262
|
+
# compute aggregate mean across all models
|
|
263
|
+
return df[col].mean()
|
|
322
264
|
|
|
323
265
|
def _est_time_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
|
|
324
266
|
"""
|
|
325
267
|
Given sample cost data observations for a specific operation, compute the mean
|
|
326
268
|
for the time per record.
|
|
327
269
|
"""
|
|
328
|
-
return self.
|
|
270
|
+
return self._compute_mean(df=op_df, col="time_per_record", model_name=model_name)
|
|
329
271
|
|
|
330
272
|
def _est_cost_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
|
|
331
273
|
"""
|
|
332
274
|
Given sample cost data observations for a specific operation, compute the mean
|
|
333
275
|
for the cost per record.
|
|
334
276
|
"""
|
|
335
|
-
return self.
|
|
277
|
+
return self._compute_mean(df=op_df, col="cost_per_record", model_name=model_name)
|
|
336
278
|
|
|
337
|
-
def _est_tokens_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[
|
|
279
|
+
def _est_tokens_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float]:
|
|
338
280
|
"""
|
|
339
281
|
Given sample cost data observations for a specific operation, compute the means
|
|
340
282
|
for the total input tokens and total output tokens.
|
|
341
283
|
"""
|
|
342
|
-
|
|
343
|
-
|
|
284
|
+
total_input_tokens = self._compute_mean(df=op_df, col="total_input_tokens", model_name=model_name)
|
|
285
|
+
total_output_tokens = self._compute_mean(df=op_df, col="total_output_tokens", model_name=model_name)
|
|
344
286
|
|
|
345
|
-
return
|
|
287
|
+
return total_input_tokens, total_output_tokens
|
|
346
288
|
|
|
347
289
|
def _est_cardinality(self, op_df: pd.DataFrame, model_name: str | None = None) -> float:
|
|
348
290
|
"""
|
|
@@ -382,18 +324,8 @@ class CostModel(BaseCostModel):
|
|
|
382
324
|
plan_ids = model_op_df.plan_id.unique().tolist()
|
|
383
325
|
num_output_records = df[df.source_op_id.isin(op_ids) & df.plan_id.isin(plan_ids)].shape[0]
|
|
384
326
|
|
|
385
|
-
# estimate the selectivity / fan-out
|
|
386
|
-
|
|
387
|
-
if is_filter_op:
|
|
388
|
-
est_selectivity_lb, est_selectivity_ub = self._compute_proportion_ci(est_selectivity, n_samples=num_input_records)
|
|
389
|
-
|
|
390
|
-
# for now, if we are doing a convert operation w/fan-out then the assumptions of _compute_proportion_ci
|
|
391
|
-
# do not hold; until we have a better method for estimating bounds, just set them to the estimate
|
|
392
|
-
else:
|
|
393
|
-
est_selectivity_lb = est_selectivity
|
|
394
|
-
est_selectivity_ub = est_selectivity
|
|
395
|
-
|
|
396
|
-
return est_selectivity, est_selectivity_lb, est_selectivity_ub
|
|
327
|
+
# estimate the selectivity / fan-out
|
|
328
|
+
return num_output_records / num_input_records
|
|
397
329
|
|
|
398
330
|
# otherwise average selectivity across all ops
|
|
399
331
|
num_input_records = op_df.shape[0]
|
|
@@ -406,18 +338,8 @@ class CostModel(BaseCostModel):
|
|
|
406
338
|
op_ids = op_df.op_id.unique().tolist()
|
|
407
339
|
num_output_records = df[df.source_op_id.isin(op_ids)].shape[0]
|
|
408
340
|
|
|
409
|
-
# estimate the selectivity / fan-out
|
|
410
|
-
|
|
411
|
-
if is_filter_op:
|
|
412
|
-
est_selectivity_lb, est_selectivity_ub = self._compute_proportion_ci(est_selectivity, n_samples=num_input_records)
|
|
413
|
-
|
|
414
|
-
# for now, if we are doing a convert operation w/fan-out then the assumptions of _compute_proportion_ci
|
|
415
|
-
# do not hold; until we have a better method for estimating bounds, just set them to the estimate
|
|
416
|
-
else:
|
|
417
|
-
est_selectivity_lb = est_selectivity
|
|
418
|
-
est_selectivity_ub = est_selectivity
|
|
419
|
-
|
|
420
|
-
return est_selectivity, est_selectivity_lb, est_selectivity_ub
|
|
341
|
+
# estimate the selectivity / fan-out
|
|
342
|
+
return num_output_records / num_input_records
|
|
421
343
|
|
|
422
344
|
def _compute_quality(self, row):
|
|
423
345
|
# compute accuracy for filter
|
|
@@ -491,10 +413,7 @@ class CostModel(BaseCostModel):
|
|
|
491
413
|
total_answers = model_df.num_answers.sum() if not model_df.empty else op_df.num_answers.sum()
|
|
492
414
|
est_quality = num_correct / total_answers
|
|
493
415
|
|
|
494
|
-
|
|
495
|
-
est_quality_lb, est_quality_ub = self._compute_proportion_ci(est_quality, n_samples=total_answers)
|
|
496
|
-
|
|
497
|
-
return est_quality, est_quality_lb, est_quality_ub
|
|
416
|
+
return est_quality
|
|
498
417
|
|
|
499
418
|
def _compute_operator_estimates(self) -> dict[str, Any] | None:
|
|
500
419
|
"""
|
|
@@ -532,64 +451,36 @@ class CostModel(BaseCostModel):
|
|
|
532
451
|
# model_names = op_df.model_name.unique().tolist()
|
|
533
452
|
estimates = {model_name: None for model_name in model_names}
|
|
534
453
|
for model_name in model_names:
|
|
535
|
-
time_per_record
|
|
536
|
-
cost_per_record
|
|
537
|
-
|
|
538
|
-
selectivity
|
|
539
|
-
quality
|
|
540
|
-
|
|
454
|
+
time_per_record = self._est_time_per_record(op_df, model_name=model_name)
|
|
455
|
+
cost_per_record = self._est_cost_per_record(op_df, model_name=model_name)
|
|
456
|
+
input_tokens, output_tokens = self._est_tokens_per_record(op_df, model_name=model_name)
|
|
457
|
+
selectivity = self._est_selectivity(self.sample_execution_data_df, op_df, model_name=model_name)
|
|
458
|
+
quality = self._est_quality(op_df, model_name=model_name)
|
|
459
|
+
|
|
541
460
|
model_estimates = {
|
|
542
461
|
"time_per_record": time_per_record,
|
|
543
|
-
"time_per_record_lower_bound": time_per_record_lb,
|
|
544
|
-
"time_per_record_upper_bound": time_per_record_ub,
|
|
545
462
|
"cost_per_record": cost_per_record,
|
|
546
|
-
"
|
|
547
|
-
"
|
|
548
|
-
"total_input_tokens": input_tokens_tup[0],
|
|
549
|
-
"total_input_tokens_lower_bound": input_tokens_tup[1],
|
|
550
|
-
"total_input_tokens_upper_bound": input_tokens_tup[2],
|
|
551
|
-
"total_output_tokens": output_tokens_tup[0],
|
|
552
|
-
"total_output_tokens_lower_bound": output_tokens_tup[1],
|
|
553
|
-
"total_output_tokens_upper_bound": output_tokens_tup[2],
|
|
463
|
+
"total_input_tokens": input_tokens,
|
|
464
|
+
"total_output_tokens": output_tokens,
|
|
554
465
|
"selectivity": selectivity,
|
|
555
|
-
"selectivity_lower_bound": selectivity_lb,
|
|
556
|
-
"selectivity_upper_bound": selectivity_ub,
|
|
557
466
|
"quality": quality,
|
|
558
|
-
"quality_lower_bound": quality_lb,
|
|
559
|
-
"quality_upper_bound": quality_ub,
|
|
560
467
|
}
|
|
561
468
|
estimates[model_name] = model_estimates
|
|
562
469
|
|
|
563
470
|
# TODO pre-compute lists of op_names in groups
|
|
564
471
|
elif op_name in ["NonLLMFilter"]:
|
|
565
|
-
time_per_record
|
|
566
|
-
selectivity
|
|
567
|
-
estimates = {
|
|
568
|
-
"time_per_record": time_per_record,
|
|
569
|
-
"time_per_record_lower_bound": time_per_record_lb,
|
|
570
|
-
"time_per_record_upper_bound": time_per_record_ub,
|
|
571
|
-
"selectivity": selectivity,
|
|
572
|
-
"selectivity_lower_bound": selectivity_lb,
|
|
573
|
-
"selectivity_upper_bound": selectivity_ub,
|
|
574
|
-
}
|
|
472
|
+
time_per_record = self._est_time_per_record(op_df)
|
|
473
|
+
selectivity = self._est_selectivity(self.sample_execution_data_df, op_df)
|
|
474
|
+
estimates = {"time_per_record": time_per_record, "selectivity": selectivity}
|
|
575
475
|
|
|
576
476
|
elif op_name in ["MarshalAndScanDataOp", "CacheScanDataOp", "LimitScanOp", "CountAggregateOp", "AverageAggregateOp"]:
|
|
577
|
-
time_per_record
|
|
578
|
-
estimates = {
|
|
579
|
-
"time_per_record": time_per_record,
|
|
580
|
-
"time_per_record_lower_bound": time_per_record_lb,
|
|
581
|
-
"time_per_record_upper_bound": time_per_record_ub,
|
|
582
|
-
}
|
|
477
|
+
time_per_record = self._est_time_per_record(op_df)
|
|
478
|
+
estimates = {"time_per_record": time_per_record}
|
|
583
479
|
|
|
584
480
|
elif op_name in ["ApplyGroupByOp"]:
|
|
585
|
-
time_per_record
|
|
481
|
+
time_per_record = self._est_time_per_record(op_df)
|
|
586
482
|
cardinality = self._est_cardinality(op_df)
|
|
587
|
-
estimates = {
|
|
588
|
-
"time_per_record": time_per_record,
|
|
589
|
-
"time_per_record_lower_bound": time_per_record_lb,
|
|
590
|
-
"time_per_record_upper_bound": time_per_record_ub,
|
|
591
|
-
"cardinality": cardinality,
|
|
592
|
-
}
|
|
483
|
+
estimates = {"time_per_record": time_per_record, "cardinality": cardinality}
|
|
593
484
|
|
|
594
485
|
operator_estimates[op_id] = estimates
|
|
595
486
|
|
|
@@ -598,6 +489,7 @@ class CostModel(BaseCostModel):
|
|
|
598
489
|
def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
|
|
599
490
|
# get identifier for operation which is unique within sentinel plan but consistent across sentinels
|
|
600
491
|
op_id = operator.get_op_id()
|
|
492
|
+
logger.debug(f"Calling __call__ for {str(operator)} with op_id: {op_id}")
|
|
601
493
|
|
|
602
494
|
# initialize estimates of operator metrics based on naive (but sometimes precise) logic
|
|
603
495
|
if isinstance(operator, MarshalAndScanDataOp):
|
|
@@ -633,8 +525,6 @@ class CostModel(BaseCostModel):
|
|
|
633
525
|
if sample_op_estimates is not None and op_id in sample_op_estimates:
|
|
634
526
|
if isinstance(operator, (MarshalAndScanDataOp, CacheScanDataOp)):
|
|
635
527
|
op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
|
|
636
|
-
op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
|
|
637
|
-
op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
|
|
638
528
|
|
|
639
529
|
elif isinstance(operator, ApplyGroupByOp):
|
|
640
530
|
# NOTE: in theory we should also treat this cardinality est. as a random variable, but in practice we will
|
|
@@ -646,48 +536,24 @@ class CostModel(BaseCostModel):
|
|
|
646
536
|
# actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
|
|
647
537
|
# the input cardinality (where the initial input cardinality from the datareader is known).
|
|
648
538
|
op_estimates.cardinality = sample_op_estimates[op_id]["cardinality"]
|
|
649
|
-
op_estimates.cardinality_lower_bound = op_estimates.cardinality
|
|
650
|
-
op_estimates.cardinality_upper_bound = op_estimates.cardinality
|
|
651
539
|
op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
|
|
652
|
-
op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
|
|
653
|
-
op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
|
|
654
540
|
|
|
655
541
|
elif isinstance(operator, (CountAggregateOp, AverageAggregateOp)): # noqa: SIM114
|
|
656
542
|
op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
|
|
657
|
-
op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
|
|
658
|
-
op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
|
|
659
543
|
|
|
660
544
|
elif isinstance(operator, LimitScanOp):
|
|
661
545
|
op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
|
|
662
|
-
op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
|
|
663
|
-
op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
|
|
664
546
|
|
|
665
547
|
elif isinstance(operator, NonLLMFilter):
|
|
666
548
|
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id]["selectivity"]
|
|
667
|
-
op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id]["selectivity_lower_bound"]
|
|
668
|
-
op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id]["selectivity_upper_bound"]
|
|
669
|
-
|
|
670
549
|
op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
|
|
671
|
-
op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
|
|
672
|
-
op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
|
|
673
550
|
|
|
674
551
|
elif isinstance(operator, LLMFilter):
|
|
675
552
|
model_name = operator.model.value
|
|
676
553
|
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
|
|
677
|
-
op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id][model_name]["selectivity_lower_bound"]
|
|
678
|
-
op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id][model_name]["selectivity_upper_bound"]
|
|
679
|
-
|
|
680
554
|
op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
|
|
681
|
-
op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id][model_name]["time_per_record_lower_bound"]
|
|
682
|
-
op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id][model_name]["time_per_record_upper_bound"]
|
|
683
|
-
|
|
684
555
|
op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
|
|
685
|
-
op_estimates.cost_per_record_lower_bound = sample_op_estimates[op_id][model_name]["cost_per_record_lower_bound"]
|
|
686
|
-
op_estimates.cost_per_record_upper_bound = sample_op_estimates[op_id][model_name]["cost_per_record_upper_bound"]
|
|
687
|
-
|
|
688
556
|
op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
|
|
689
|
-
op_estimates.quality_lower_bound = sample_op_estimates[op_id][model_name]["quality_lower_bound"]
|
|
690
|
-
op_estimates.quality_upper_bound = sample_op_estimates[op_id][model_name]["quality_upper_bound"]
|
|
691
557
|
|
|
692
558
|
elif isinstance(operator, LLMConvert):
|
|
693
559
|
# TODO: EVEN BETTER: do similarity match (e.g. largest param intersection, more exotic techniques);
|
|
@@ -696,60 +562,28 @@ class CostModel(BaseCostModel):
|
|
|
696
562
|
# NOTE: code synthesis does not have a model attribute
|
|
697
563
|
model_name = operator.model.value if hasattr(operator, "model") else None
|
|
698
564
|
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
|
|
699
|
-
op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id][model_name]["selectivity_lower_bound"]
|
|
700
|
-
op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id][model_name]["selectivity_upper_bound"]
|
|
701
|
-
|
|
702
565
|
op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
|
|
703
|
-
op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id][model_name]["time_per_record_lower_bound"]
|
|
704
|
-
op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id][model_name]["time_per_record_upper_bound"]
|
|
705
|
-
|
|
706
566
|
op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
|
|
707
|
-
op_estimates.cost_per_record_lower_bound = sample_op_estimates[op_id][model_name]["cost_per_record_lower_bound"]
|
|
708
|
-
op_estimates.cost_per_record_upper_bound = sample_op_estimates[op_id][model_name]["cost_per_record_upper_bound"]
|
|
709
|
-
|
|
710
567
|
op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
|
|
711
|
-
op_estimates.quality_lower_bound = sample_op_estimates[op_id][model_name]["quality_lower_bound"]
|
|
712
|
-
op_estimates.quality_upper_bound = sample_op_estimates[op_id][model_name]["quality_upper_bound"]
|
|
713
568
|
|
|
714
569
|
# NOTE: if code synth. fails, this will turn into ConventionalQuery calls to GPT-3.5,
|
|
715
570
|
# which would wildly mess up estimate of time and cost per-record
|
|
716
571
|
# do code synthesis adjustment
|
|
717
572
|
if isinstance(operator, CodeSynthesisConvert):
|
|
718
573
|
op_estimates.time_per_record = 1e-5
|
|
719
|
-
op_estimates.time_per_record_lower_bound = op_estimates.time_per_record
|
|
720
|
-
op_estimates.time_per_record_upper_bound = op_estimates.time_per_record
|
|
721
574
|
op_estimates.cost_per_record = 1e-4
|
|
722
|
-
op_estimates.cost_per_record_lower_bound = op_estimates.cost_per_record
|
|
723
|
-
op_estimates.cost_per_record_upper_bound = op_estimates.cost_per_record
|
|
724
575
|
op_estimates.quality = op_estimates.quality * (GPT_4o_MODEL_CARD["code"] / 100.0)
|
|
725
|
-
op_estimates.quality_lower_bound = op_estimates.quality_lower_bound * (GPT_4o_MODEL_CARD["code"] / 100.0)
|
|
726
|
-
op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * (GPT_4o_MODEL_CARD["code"] / 100.0)
|
|
727
576
|
|
|
728
577
|
# token reduction adjustment
|
|
729
|
-
if isinstance(operator,
|
|
578
|
+
if isinstance(operator, TokenReducedConvertBonded):
|
|
730
579
|
total_input_tokens = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens"]
|
|
731
580
|
total_output_tokens = sample_op_estimates[op_id][model_name]["total_output_tokens"]
|
|
732
581
|
op_estimates.cost_per_record = (
|
|
733
582
|
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
|
|
734
583
|
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
|
|
735
584
|
)
|
|
736
|
-
total_input_tokens_lb = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens_lower_bound"]
|
|
737
|
-
total_output_tokens_lb = sample_op_estimates[op_id][model_name]["total_output_tokens_lower_bound"]
|
|
738
|
-
op_estimates.cost_per_record_lower_bound = (
|
|
739
|
-
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_lb
|
|
740
|
-
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_lb
|
|
741
|
-
)
|
|
742
|
-
total_input_tokens_ub = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens_upper_bound"]
|
|
743
|
-
total_output_tokens_ub = sample_op_estimates[op_id][model_name]["total_output_tokens_upper_bound"]
|
|
744
|
-
op_estimates.cost_per_record_upper_bound = (
|
|
745
|
-
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_ub
|
|
746
|
-
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_ub
|
|
747
|
-
)
|
|
748
|
-
|
|
749
585
|
op_estimates.quality = op_estimates.quality * math.sqrt(math.sqrt(operator.token_budget))
|
|
750
|
-
|
|
751
|
-
op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * math.sqrt(math.sqrt(operator.token_budget))
|
|
752
|
-
|
|
586
|
+
|
|
753
587
|
# rag convert adjustment
|
|
754
588
|
if isinstance(operator, RAGConvert):
|
|
755
589
|
total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
|
|
@@ -758,22 +592,7 @@ class CostModel(BaseCostModel):
|
|
|
758
592
|
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
|
|
759
593
|
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
|
|
760
594
|
)
|
|
761
|
-
total_input_tokens_lb = operator.num_chunks_per_field * operator.chunk_size
|
|
762
|
-
total_output_tokens_lb = sample_op_estimates[op_id][model_name]["total_output_tokens_lower_bound"]
|
|
763
|
-
op_estimates.cost_per_record_lower_bound = (
|
|
764
|
-
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_lb
|
|
765
|
-
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_lb
|
|
766
|
-
)
|
|
767
|
-
total_input_tokens_ub = operator.num_chunks_per_field * operator.chunk_size
|
|
768
|
-
total_output_tokens_ub = sample_op_estimates[op_id][model_name]["total_output_tokens_upper_bound"]
|
|
769
|
-
op_estimates.cost_per_record_upper_bound = (
|
|
770
|
-
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_ub
|
|
771
|
-
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_ub
|
|
772
|
-
)
|
|
773
|
-
|
|
774
595
|
op_estimates.quality = op_estimates.quality * operator.naive_quality_adjustment
|
|
775
|
-
op_estimates.quality_lower_bound = op_estimates.quality_lower_bound * operator.naive_quality_adjustment
|
|
776
|
-
op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * operator.naive_quality_adjustment
|
|
777
596
|
|
|
778
597
|
else:
|
|
779
598
|
raise Exception("Unknown operator")
|
|
@@ -783,26 +602,14 @@ class CostModel(BaseCostModel):
|
|
|
783
602
|
op_cost = op_estimates.cost_per_record * source_op_estimates.cardinality
|
|
784
603
|
op_quality = op_estimates.quality
|
|
785
604
|
|
|
786
|
-
# compute bounds on total time and cost estimates for this operator
|
|
787
|
-
op_cost_lower_bound = op_estimates.cost_per_record_lower_bound * source_op_estimates.cardinality_lower_bound
|
|
788
|
-
op_cost_upper_bound = op_estimates.cost_per_record_upper_bound * source_op_estimates.cardinality_upper_bound
|
|
789
|
-
op_time_lower_bound = op_estimates.time_per_record_lower_bound * source_op_estimates.cardinality_lower_bound
|
|
790
|
-
op_time_upper_bound = op_estimates.time_per_record_upper_bound * source_op_estimates.cardinality_upper_bound
|
|
791
|
-
op_quality_lower_bound = op_estimates.quality_lower_bound
|
|
792
|
-
op_quality_upper_bound = op_estimates.quality_upper_bound
|
|
793
|
-
|
|
794
605
|
# create and return PlanCost object for this op's statistics
|
|
795
606
|
op_plan_cost = PlanCost(
|
|
796
607
|
cost=op_cost,
|
|
797
608
|
time=op_time,
|
|
798
609
|
quality=op_quality,
|
|
799
610
|
op_estimates=op_estimates,
|
|
800
|
-
cost_lower_bound=op_cost_lower_bound,
|
|
801
|
-
cost_upper_bound=op_cost_upper_bound,
|
|
802
|
-
time_lower_bound=op_time_lower_bound,
|
|
803
|
-
time_upper_bound=op_time_upper_bound,
|
|
804
|
-
quality_lower_bound=op_quality_lower_bound,
|
|
805
|
-
quality_upper_bound=op_quality_upper_bound,
|
|
806
611
|
)
|
|
612
|
+
logger.debug(f"Done calling __call__ for {str(operator)} with op_id: {op_id}")
|
|
613
|
+
logger.debug(f"Plan cost: {op_plan_cost}")
|
|
807
614
|
|
|
808
615
|
return op_plan_cost
|