palimpzest 0.7.6__py3-none-any.whl → 0.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/constants.py +113 -75
- palimpzest/core/data/dataclasses.py +55 -38
- palimpzest/core/elements/index.py +5 -15
- palimpzest/core/elements/records.py +1 -1
- palimpzest/prompts/prompt_factory.py +1 -1
- palimpzest/query/execution/all_sample_execution_strategy.py +216 -0
- palimpzest/query/execution/execution_strategy.py +4 -4
- palimpzest/query/execution/execution_strategy_type.py +7 -1
- palimpzest/query/execution/mab_execution_strategy.py +184 -72
- palimpzest/query/execution/parallel_execution_strategy.py +182 -15
- palimpzest/query/execution/single_threaded_execution_strategy.py +21 -21
- palimpzest/query/generators/api_client_factory.py +6 -7
- palimpzest/query/generators/generators.py +5 -8
- palimpzest/query/operators/aggregate.py +4 -3
- palimpzest/query/operators/convert.py +1 -1
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/map.py +1 -1
- palimpzest/query/operators/physical.py +8 -4
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/retrieve.py +7 -23
- palimpzest/query/operators/scan.py +1 -1
- palimpzest/query/optimizer/cost_model.py +54 -62
- palimpzest/query/optimizer/optimizer.py +2 -6
- palimpzest/query/optimizer/plan.py +4 -4
- palimpzest/query/optimizer/primitives.py +1 -1
- palimpzest/query/optimizer/rules.py +8 -26
- palimpzest/query/optimizer/tasks.py +3 -3
- palimpzest/query/processor/processing_strategy_type.py +2 -2
- palimpzest/query/processor/sentinel_processor.py +0 -2
- palimpzest/sets.py +2 -3
- palimpzest/utils/generation_helpers.py +1 -1
- palimpzest/utils/model_helpers.py +27 -9
- palimpzest/utils/progress.py +81 -72
- {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/METADATA +4 -2
- {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/RECORD +39 -38
- {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/WHEEL +1 -1
- {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/top_level.txt +0 -0
|
@@ -87,7 +87,7 @@ class ScanPhysicalOp(PhysicalOperator, ABC):
|
|
|
87
87
|
record_parent_id=dr.parent_id,
|
|
88
88
|
record_source_idx=dr.source_idx,
|
|
89
89
|
record_state=dr.to_dict(include_bytes=False),
|
|
90
|
-
|
|
90
|
+
full_op_id=self.get_full_op_id(),
|
|
91
91
|
logical_op_id=self.logical_op_id,
|
|
92
92
|
op_name=self.op_name(),
|
|
93
93
|
time_per_record=(end_time - start_time),
|
|
@@ -43,11 +43,11 @@ class BaseCostModel:
|
|
|
43
43
|
"""
|
|
44
44
|
pass
|
|
45
45
|
|
|
46
|
-
def
|
|
46
|
+
def get_costed_full_op_ids(self) -> set[str]:
|
|
47
47
|
"""
|
|
48
|
-
Return the set of
|
|
48
|
+
Return the set of full op ids which the cost model has cost estimates for.
|
|
49
49
|
"""
|
|
50
|
-
raise NotImplementedError("Calling
|
|
50
|
+
raise NotImplementedError("Calling get_costed_full_op_ids from abstract method")
|
|
51
51
|
|
|
52
52
|
def __call__(self, operator: PhysicalOperator) -> PlanCost:
|
|
53
53
|
"""
|
|
@@ -66,9 +66,6 @@ class SampleBasedCostModel:
|
|
|
66
66
|
verbose: bool = False,
|
|
67
67
|
exp_name: str | None = None,
|
|
68
68
|
):
|
|
69
|
-
"""
|
|
70
|
-
execution_data is: {logical_op_id: {physical_op_id: [DataRecordSet]}}
|
|
71
|
-
"""
|
|
72
69
|
# store verbose argument
|
|
73
70
|
self.verbose = verbose
|
|
74
71
|
|
|
@@ -77,30 +74,28 @@ class SampleBasedCostModel:
|
|
|
77
74
|
|
|
78
75
|
# construct cost, time, quality, and selectivity matrices for each operator set;
|
|
79
76
|
self.operator_to_stats = self.compute_operator_stats(sentinel_plan_stats)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
for _, phys_op_id_to_stats in self.operator_to_stats.items()
|
|
85
|
-
for phys_op_id, _ in phys_op_id_to_stats.items()
|
|
77
|
+
self.costed_full_op_ids = set([
|
|
78
|
+
full_op_id
|
|
79
|
+
for _, full_op_id_to_stats in self.operator_to_stats.items()
|
|
80
|
+
for full_op_id in full_op_id_to_stats
|
|
86
81
|
])
|
|
87
82
|
|
|
88
83
|
logger.info(f"Initialized SampleBasedCostModel with verbose={self.verbose}")
|
|
89
84
|
logger.debug(f"Initialized SampleBasedCostModel with params: {self.__dict__}")
|
|
90
85
|
|
|
91
|
-
def
|
|
92
|
-
return self.
|
|
86
|
+
def get_costed_full_op_ids(self):
|
|
87
|
+
return self.costed_full_op_ids
|
|
93
88
|
|
|
94
89
|
def compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats) -> dict:
|
|
95
90
|
logger.debug("Computing operator statistics")
|
|
96
91
|
# flatten the nested dictionary of execution data and pull out fields relevant to cost estimation
|
|
97
92
|
execution_record_op_stats = []
|
|
98
|
-
for logical_op_id,
|
|
93
|
+
for logical_op_id, full_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
|
|
99
94
|
logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
|
|
100
95
|
# flatten the execution data into a list of RecordOpStats
|
|
101
96
|
op_set_execution_data = [
|
|
102
97
|
record_op_stats
|
|
103
|
-
for _, op_stats in
|
|
98
|
+
for _, op_stats in full_op_id_to_op_stats.items()
|
|
104
99
|
for record_op_stats in op_stats.record_op_stats_lst
|
|
105
100
|
]
|
|
106
101
|
|
|
@@ -108,7 +103,7 @@ class SampleBasedCostModel:
|
|
|
108
103
|
for record_op_stats in op_set_execution_data:
|
|
109
104
|
record_op_stats_dict = {
|
|
110
105
|
"logical_op_id": logical_op_id,
|
|
111
|
-
"
|
|
106
|
+
"full_op_id": record_op_stats.full_op_id,
|
|
112
107
|
"record_id": record_op_stats.record_id,
|
|
113
108
|
"record_parent_id": record_op_stats.record_parent_id,
|
|
114
109
|
"cost_per_record": record_op_stats.cost_per_record,
|
|
@@ -124,13 +119,13 @@ class SampleBasedCostModel:
|
|
|
124
119
|
# convert flattened execution data into dataframe
|
|
125
120
|
operator_stats_df = pd.DataFrame(execution_record_op_stats)
|
|
126
121
|
|
|
127
|
-
# for each
|
|
122
|
+
# for each full_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
|
|
128
123
|
operator_to_stats = {}
|
|
129
124
|
for logical_op_id, logical_op_df in operator_stats_df.groupby("logical_op_id"):
|
|
130
125
|
logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
|
|
131
126
|
operator_to_stats[logical_op_id] = {}
|
|
132
127
|
|
|
133
|
-
for
|
|
128
|
+
for full_op_id, physical_op_df in logical_op_df.groupby("full_op_id"):
|
|
134
129
|
# compute the number of input records processed by this operator; use source_idx for scan operator(s)
|
|
135
130
|
num_source_records = (
|
|
136
131
|
len(physical_op_df.record_parent_id.unique())
|
|
@@ -138,10 +133,10 @@ class SampleBasedCostModel:
|
|
|
138
133
|
else len(physical_op_df.source_idx.unique())
|
|
139
134
|
)
|
|
140
135
|
|
|
141
|
-
# compute selectivity
|
|
136
|
+
# compute selectivity
|
|
142
137
|
selectivity = physical_op_df.passed_operator.sum() / num_source_records
|
|
143
138
|
|
|
144
|
-
operator_to_stats[logical_op_id][
|
|
139
|
+
operator_to_stats[logical_op_id][full_op_id] = {
|
|
145
140
|
"cost": physical_op_df.cost_per_record.mean(),
|
|
146
141
|
"time": physical_op_df.time_per_record.mean(),
|
|
147
142
|
"quality": physical_op_df.quality.mean(),
|
|
@@ -162,18 +157,18 @@ class SampleBasedCostModel:
|
|
|
162
157
|
# we will have execution data for each operator passed into __call__; nevertheless, we
|
|
163
158
|
# still perform a sanity check
|
|
164
159
|
# look up physical and logical op ids associated with this physical operator
|
|
165
|
-
|
|
160
|
+
full_op_id = operator.get_full_op_id()
|
|
166
161
|
logical_op_id = operator.logical_op_id
|
|
167
162
|
physical_op_to_stats = self.operator_to_stats.get(logical_op_id)
|
|
168
163
|
assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
|
|
169
|
-
assert physical_op_to_stats.get(
|
|
164
|
+
assert physical_op_to_stats.get(full_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
|
|
170
165
|
logger.debug(f"Calling __call__ for {str(operator)}")
|
|
171
166
|
|
|
172
167
|
# look up stats for this operation
|
|
173
|
-
est_cost_per_record = self.operator_to_stats[logical_op_id][
|
|
174
|
-
est_time_per_record = self.operator_to_stats[logical_op_id][
|
|
175
|
-
est_quality = self.operator_to_stats[logical_op_id][
|
|
176
|
-
est_selectivity = self.operator_to_stats[logical_op_id][
|
|
168
|
+
est_cost_per_record = self.operator_to_stats[logical_op_id][full_op_id]["cost"]
|
|
169
|
+
est_time_per_record = self.operator_to_stats[logical_op_id][full_op_id]["time"]
|
|
170
|
+
est_quality = self.operator_to_stats[logical_op_id][full_op_id]["quality"]
|
|
171
|
+
est_selectivity = self.operator_to_stats[logical_op_id][full_op_id]["selectivity"]
|
|
177
172
|
|
|
178
173
|
# create source_op_estimates for scan operators if they are not provided
|
|
179
174
|
if isinstance(operator, ScanPhysicalOp):
|
|
@@ -238,13 +233,13 @@ class CostModel(BaseCostModel):
|
|
|
238
233
|
# compute per-operator estimates
|
|
239
234
|
self.operator_estimates = self._compute_operator_estimates()
|
|
240
235
|
|
|
241
|
-
# compute set of costed
|
|
242
|
-
self.
|
|
236
|
+
# compute set of costed full op ids from operator_to_stats
|
|
237
|
+
self.costed_full_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
|
|
243
238
|
logger.info("Initialized CostModel.")
|
|
244
239
|
logger.debug(f"Initialized CostModel with params: {self.__dict__}")
|
|
245
240
|
|
|
246
|
-
def
|
|
247
|
-
return self.
|
|
241
|
+
def get_costed_full_op_ids(self):
|
|
242
|
+
return self.costed_full_op_ids
|
|
248
243
|
|
|
249
244
|
def _compute_mean(self, df: pd.DataFrame, col: str, model_name: str | None = None) -> float:
|
|
250
245
|
"""
|
|
@@ -318,9 +313,9 @@ class CostModel(BaseCostModel):
|
|
|
318
313
|
if is_filter_op:
|
|
319
314
|
num_output_records = model_op_df.passed_operator.sum()
|
|
320
315
|
else:
|
|
321
|
-
|
|
316
|
+
full_op_ids = model_op_df.full_op_id.unique().tolist()
|
|
322
317
|
plan_ids = model_op_df.plan_id.unique().tolist()
|
|
323
|
-
num_output_records = df[df.
|
|
318
|
+
num_output_records = df[df.source_full_op_id.isin(full_op_ids) & df.plan_id.isin(plan_ids)].shape[0]
|
|
324
319
|
|
|
325
320
|
# estimate the selectivity / fan-out
|
|
326
321
|
return num_output_records / num_input_records
|
|
@@ -333,8 +328,8 @@ class CostModel(BaseCostModel):
|
|
|
333
328
|
if is_filter_op:
|
|
334
329
|
num_output_records = op_df.passed_operator.sum()
|
|
335
330
|
else:
|
|
336
|
-
|
|
337
|
-
num_output_records = df[df.
|
|
331
|
+
full_op_ids = op_df.full_op_id.unique().tolist()
|
|
332
|
+
num_output_records = df[df.source_full_op_id.isin(full_op_ids)].shape[0]
|
|
338
333
|
|
|
339
334
|
# estimate the selectivity / fan-out
|
|
340
335
|
return num_output_records / num_input_records
|
|
@@ -422,14 +417,14 @@ class CostModel(BaseCostModel):
|
|
|
422
417
|
return None
|
|
423
418
|
|
|
424
419
|
# get the set of operator ids for which we have sample data
|
|
425
|
-
|
|
420
|
+
full_op_ids = self.sample_execution_data_df.full_op_id.unique()
|
|
426
421
|
|
|
427
422
|
# compute estimates of runtime, cost, and quality (and intermediates like cardinality) for every operator
|
|
428
423
|
operator_estimates = {}
|
|
429
|
-
for
|
|
424
|
+
for full_op_id in full_op_ids:
|
|
430
425
|
# filter for subset of sample execution data related to this operation
|
|
431
426
|
op_df = self.sample_execution_data_df[
|
|
432
|
-
self.sample_execution_data_df.
|
|
427
|
+
self.sample_execution_data_df.full_op_id == full_op_id
|
|
433
428
|
]
|
|
434
429
|
|
|
435
430
|
# skip computing an estimate if we didn't capture any sampling data for this operator
|
|
@@ -480,14 +475,14 @@ class CostModel(BaseCostModel):
|
|
|
480
475
|
cardinality = self._est_cardinality(op_df)
|
|
481
476
|
estimates = {"time_per_record": time_per_record, "cardinality": cardinality}
|
|
482
477
|
|
|
483
|
-
operator_estimates[
|
|
478
|
+
operator_estimates[full_op_id] = estimates
|
|
484
479
|
|
|
485
480
|
return operator_estimates
|
|
486
481
|
|
|
487
482
|
def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
|
|
488
483
|
# get identifier for operation which is unique within sentinel plan but consistent across sentinels
|
|
489
|
-
|
|
490
|
-
logger.debug(f"Calling __call__ for {str(operator)} with
|
|
484
|
+
full_op_id = operator.get_full_op_id()
|
|
485
|
+
logger.debug(f"Calling __call__ for {str(operator)} with full_op_id: {full_op_id}")
|
|
491
486
|
|
|
492
487
|
# initialize estimates of operator metrics based on naive (but sometimes precise) logic
|
|
493
488
|
if isinstance(operator, MarshalAndScanDataOp):
|
|
@@ -520,9 +515,9 @@ class CostModel(BaseCostModel):
|
|
|
520
515
|
|
|
521
516
|
# if we have sample execution data, update naive estimates with more informed ones
|
|
522
517
|
sample_op_estimates = self.operator_estimates
|
|
523
|
-
if sample_op_estimates is not None and
|
|
518
|
+
if sample_op_estimates is not None and full_op_id in sample_op_estimates:
|
|
524
519
|
if isinstance(operator, (MarshalAndScanDataOp, CacheScanDataOp)):
|
|
525
|
-
op_estimates.time_per_record = sample_op_estimates[
|
|
520
|
+
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
526
521
|
|
|
527
522
|
elif isinstance(operator, ApplyGroupByOp):
|
|
528
523
|
# NOTE: in theory we should also treat this cardinality est. as a random variable, but in practice we will
|
|
@@ -533,36 +528,33 @@ class CostModel(BaseCostModel):
|
|
|
533
528
|
# produced by the groupby in our sample and assume it may generalize to the full workload. To estimate
|
|
534
529
|
# actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
|
|
535
530
|
# the input cardinality (where the initial input cardinality from the datareader is known).
|
|
536
|
-
op_estimates.cardinality = sample_op_estimates[
|
|
537
|
-
op_estimates.time_per_record = sample_op_estimates[
|
|
531
|
+
op_estimates.cardinality = sample_op_estimates[full_op_id]["cardinality"]
|
|
532
|
+
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
538
533
|
|
|
539
534
|
elif isinstance(operator, (CountAggregateOp, AverageAggregateOp)): # noqa: SIM114
|
|
540
|
-
op_estimates.time_per_record = sample_op_estimates[
|
|
535
|
+
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
541
536
|
|
|
542
537
|
elif isinstance(operator, LimitScanOp):
|
|
543
|
-
op_estimates.time_per_record = sample_op_estimates[
|
|
538
|
+
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
544
539
|
|
|
545
540
|
elif isinstance(operator, NonLLMFilter):
|
|
546
|
-
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[
|
|
547
|
-
op_estimates.time_per_record = sample_op_estimates[
|
|
541
|
+
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id]["selectivity"]
|
|
542
|
+
op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
|
|
548
543
|
|
|
549
544
|
elif isinstance(operator, LLMFilter):
|
|
550
545
|
model_name = operator.model.value
|
|
551
|
-
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[
|
|
552
|
-
op_estimates.time_per_record = sample_op_estimates[
|
|
553
|
-
op_estimates.cost_per_record = sample_op_estimates[
|
|
554
|
-
op_estimates.quality = sample_op_estimates[
|
|
546
|
+
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
|
|
547
|
+
op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
|
|
548
|
+
op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
|
|
549
|
+
op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]
|
|
555
550
|
|
|
556
551
|
elif isinstance(operator, LLMConvert):
|
|
557
|
-
# TODO: EVEN BETTER: do similarity match (e.g. largest param intersection, more exotic techniques);
|
|
558
|
-
# another heuristic: logical_op_id-->subclass_physical_op_id-->specific_physical_op_id-->most_param_match_physical_op_id
|
|
559
|
-
# TODO: instead of [op_id][model_name] --> [logical_op_id][physical_op_id]
|
|
560
552
|
# NOTE: code synthesis does not have a model attribute
|
|
561
553
|
model_name = operator.model.value if hasattr(operator, "model") else None
|
|
562
|
-
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[
|
|
563
|
-
op_estimates.time_per_record = sample_op_estimates[
|
|
564
|
-
op_estimates.cost_per_record = sample_op_estimates[
|
|
565
|
-
op_estimates.quality = sample_op_estimates[
|
|
554
|
+
op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
|
|
555
|
+
op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
|
|
556
|
+
op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
|
|
557
|
+
op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]
|
|
566
558
|
|
|
567
559
|
# NOTE: if code synth. fails, this will turn into ConventionalQuery calls to GPT-3.5,
|
|
568
560
|
# which would wildly mess up estimate of time and cost per-record
|
|
@@ -575,7 +567,7 @@ class CostModel(BaseCostModel):
|
|
|
575
567
|
# rag convert adjustment
|
|
576
568
|
if isinstance(operator, RAGConvert):
|
|
577
569
|
total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
|
|
578
|
-
total_output_tokens = sample_op_estimates[
|
|
570
|
+
total_output_tokens = sample_op_estimates[full_op_id][model_name]["total_output_tokens"]
|
|
579
571
|
op_estimates.cost_per_record = (
|
|
580
572
|
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
|
|
581
573
|
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
|
|
@@ -597,7 +589,7 @@ class CostModel(BaseCostModel):
|
|
|
597
589
|
quality=op_quality,
|
|
598
590
|
op_estimates=op_estimates,
|
|
599
591
|
)
|
|
600
|
-
logger.debug(f"Done calling __call__ for {str(operator)} with
|
|
592
|
+
logger.debug(f"Done calling __call__ for {str(operator)} with full_op_id: {full_op_id}")
|
|
601
593
|
logger.debug(f"Plan cost: {op_plan_cost}")
|
|
602
594
|
|
|
603
595
|
return op_plan_cost
|
|
@@ -103,9 +103,6 @@ class Optimizer:
|
|
|
103
103
|
# store the cost model
|
|
104
104
|
self.cost_model = cost_model
|
|
105
105
|
|
|
106
|
-
# store the set of physical operators for which our cost model has cost estimates
|
|
107
|
-
self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()
|
|
108
|
-
|
|
109
106
|
# mapping from each group id to its Group object
|
|
110
107
|
self.groups = {}
|
|
111
108
|
|
|
@@ -189,7 +186,6 @@ class Optimizer:
|
|
|
189
186
|
|
|
190
187
|
def update_cost_model(self, cost_model: CostModel):
|
|
191
188
|
self.cost_model = cost_model
|
|
192
|
-
self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()
|
|
193
189
|
|
|
194
190
|
def get_physical_op_params(self):
|
|
195
191
|
return {
|
|
@@ -338,7 +334,7 @@ class Optimizer:
|
|
|
338
334
|
# compute all properties including this operations'
|
|
339
335
|
all_properties = deepcopy(input_group_properties)
|
|
340
336
|
if isinstance(op, FilteredScan):
|
|
341
|
-
# NOTE: we could use op.
|
|
337
|
+
# NOTE: we could use op.get_full_op_id() here, but storing filter strings makes
|
|
342
338
|
# debugging a bit easier as you can read which filters are in the Group
|
|
343
339
|
op_filter_str = op.filter.get_filter_str()
|
|
344
340
|
if "filters" in all_properties:
|
|
@@ -464,7 +460,7 @@ class Optimizer:
|
|
|
464
460
|
elif isinstance(task, OptimizeLogicalExpression):
|
|
465
461
|
new_tasks = task.perform(self.transformation_rules, self.implementation_rules)
|
|
466
462
|
elif isinstance(task, ApplyRule):
|
|
467
|
-
context = {"
|
|
463
|
+
context = {"costed_full_op_ids": self.cost_model.get_costed_full_op_ids()}
|
|
468
464
|
new_tasks = task.perform(
|
|
469
465
|
self.groups, self.expressions, context=context, **self.get_physical_op_params()
|
|
470
466
|
)
|
|
@@ -53,7 +53,7 @@ class PhysicalPlan(Plan):
|
|
|
53
53
|
|
|
54
54
|
Two different PhysicalPlan instances with the identical lists of operators will have equivalent plan_ids.
|
|
55
55
|
"""
|
|
56
|
-
hash_str = str(tuple(op.
|
|
56
|
+
hash_str = str(tuple(op.get_full_op_id() for op in self.operators))
|
|
57
57
|
return hash_for_id(hash_str)
|
|
58
58
|
|
|
59
59
|
def __eq__(self, other):
|
|
@@ -103,9 +103,9 @@ class SentinelPlan(Plan):
|
|
|
103
103
|
assert isinstance(operator_sets[0][0], ScanPhysicalOp), "first operator set must be a scan"
|
|
104
104
|
assert all(len(op_set) > 0 for op_set in operator_sets), "every operator set must have at least one operator"
|
|
105
105
|
|
|
106
|
-
# store operator_sets and logical_op_ids; sort operator_sets internally by
|
|
106
|
+
# store operator_sets and logical_op_ids; sort operator_sets internally by full_op_id
|
|
107
107
|
self.operator_sets = operator_sets
|
|
108
|
-
self.operator_sets = [sorted(op_set, key=lambda op: op.
|
|
108
|
+
self.operator_sets = [sorted(op_set, key=lambda op: op.get_full_op_id()) for op_set in self.operator_sets]
|
|
109
109
|
self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
|
|
110
110
|
self.plan_id = self.compute_plan_id()
|
|
111
111
|
|
|
@@ -117,7 +117,7 @@ class SentinelPlan(Plan):
|
|
|
117
117
|
"""
|
|
118
118
|
hash_str = ""
|
|
119
119
|
for logical_op_id, op_set in zip(self.logical_op_ids, self.operator_sets):
|
|
120
|
-
hash_str += f"{logical_op_id} {tuple(op.
|
|
120
|
+
hash_str += f"{logical_op_id} {tuple(op.get_full_op_id() for op in op_set)} "
|
|
121
121
|
return hash_for_id(hash_str)
|
|
122
122
|
|
|
123
123
|
def __eq__(self, other):
|
|
@@ -43,7 +43,7 @@ class Expression:
|
|
|
43
43
|
return self.operator == other.operator and self.input_group_ids == other.input_group_ids
|
|
44
44
|
|
|
45
45
|
def __str__(self):
|
|
46
|
-
op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.
|
|
46
|
+
op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_full_op_id()
|
|
47
47
|
return str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
|
|
48
48
|
|
|
49
49
|
def __hash__(self):
|
|
@@ -2,7 +2,7 @@ import logging
|
|
|
2
2
|
from copy import deepcopy
|
|
3
3
|
from itertools import combinations
|
|
4
4
|
|
|
5
|
-
from palimpzest.constants import AggFunc, Cardinality,
|
|
5
|
+
from palimpzest.constants import AggFunc, Cardinality, PromptStrategy
|
|
6
6
|
from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
|
|
7
7
|
from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvertSingle
|
|
8
8
|
from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
|
|
@@ -285,9 +285,6 @@ class LLMConvertBondedRule(ImplementationRule):
|
|
|
285
285
|
}
|
|
286
286
|
)
|
|
287
287
|
|
|
288
|
-
# NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
|
|
289
|
-
# thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
|
|
290
|
-
#
|
|
291
288
|
# identify models which can be used strictly for text or strictly for images
|
|
292
289
|
vision_models = set(get_vision_models())
|
|
293
290
|
text_models = set(get_models())
|
|
@@ -322,10 +319,10 @@ class LLMConvertBondedRule(ImplementationRule):
|
|
|
322
319
|
# skip this model if:
|
|
323
320
|
# 1. this is a pure vision model and we're not doing an image conversion, or
|
|
324
321
|
# 2. this is a pure text model and we're doing an image conversion, or
|
|
325
|
-
# 3. this is a vision model hosted by Together (i.e.
|
|
322
|
+
# 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
|
|
326
323
|
first_criteria = model in pure_vision_models and not is_image_conversion
|
|
327
324
|
second_criteria = model in pure_text_models and is_image_conversion
|
|
328
|
-
third_criteria = model
|
|
325
|
+
third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
|
|
329
326
|
if first_criteria or second_criteria or third_criteria:
|
|
330
327
|
continue
|
|
331
328
|
|
|
@@ -465,9 +462,6 @@ class RAGConvertRule(ImplementationRule):
|
|
|
465
462
|
}
|
|
466
463
|
)
|
|
467
464
|
|
|
468
|
-
# NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
|
|
469
|
-
# thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
|
|
470
|
-
#
|
|
471
465
|
# identify models which can be used strictly for text or strictly for images
|
|
472
466
|
vision_models = set(get_vision_models())
|
|
473
467
|
text_models = set(get_models())
|
|
@@ -536,9 +530,6 @@ class MixtureOfAgentsConvertRule(ImplementationRule):
|
|
|
536
530
|
}
|
|
537
531
|
)
|
|
538
532
|
|
|
539
|
-
# NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
|
|
540
|
-
# thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
|
|
541
|
-
#
|
|
542
533
|
# identify models which can be used strictly for text or strictly for images
|
|
543
534
|
vision_models = set(get_vision_models())
|
|
544
535
|
text_models = set(get_models())
|
|
@@ -560,7 +551,7 @@ class MixtureOfAgentsConvertRule(ImplementationRule):
|
|
|
560
551
|
)
|
|
561
552
|
proposer_model_set, is_image_conversion = text_models, False
|
|
562
553
|
if num_image_fields > 1 or list_image_field:
|
|
563
|
-
proposer_model_set = [model for model in vision_models if model
|
|
554
|
+
proposer_model_set = [model for model in vision_models if not model.is_llama_model()]
|
|
564
555
|
is_image_conversion = True
|
|
565
556
|
elif num_image_fields == 1:
|
|
566
557
|
proposer_model_set = vision_models
|
|
@@ -636,9 +627,6 @@ class CriticAndRefineConvertRule(ImplementationRule):
|
|
|
636
627
|
}
|
|
637
628
|
)
|
|
638
629
|
|
|
639
|
-
# NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
|
|
640
|
-
# thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
|
|
641
|
-
#
|
|
642
630
|
# identify models which can be used strictly for text or strictly for images
|
|
643
631
|
vision_models = set(get_vision_models())
|
|
644
632
|
text_models = set(get_models())
|
|
@@ -674,10 +662,10 @@ class CriticAndRefineConvertRule(ImplementationRule):
|
|
|
674
662
|
# skip this model if:
|
|
675
663
|
# 1. this is a pure vision model and we're not doing an image conversion, or
|
|
676
664
|
# 2. this is a pure text model and we're doing an image conversion, or
|
|
677
|
-
# 3. this is a vision model hosted by Together (i.e.
|
|
665
|
+
# 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
|
|
678
666
|
first_criteria = model in pure_vision_models and not is_image_conversion
|
|
679
667
|
second_criteria = model in pure_text_models and is_image_conversion
|
|
680
|
-
third_criteria = model
|
|
668
|
+
third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
|
|
681
669
|
if first_criteria or second_criteria or third_criteria:
|
|
682
670
|
continue
|
|
683
671
|
|
|
@@ -750,9 +738,6 @@ class SplitConvertRule(ImplementationRule):
|
|
|
750
738
|
}
|
|
751
739
|
)
|
|
752
740
|
|
|
753
|
-
# NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
|
|
754
|
-
# thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
|
|
755
|
-
#
|
|
756
741
|
# identify models which can be used strictly for text or strictly for images
|
|
757
742
|
vision_models = set(get_vision_models())
|
|
758
743
|
text_models = set(get_models())
|
|
@@ -911,9 +896,6 @@ class LLMFilterRule(ImplementationRule):
|
|
|
911
896
|
}
|
|
912
897
|
)
|
|
913
898
|
|
|
914
|
-
# NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
|
|
915
|
-
# thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
|
|
916
|
-
#
|
|
917
899
|
# identify models which can be used strictly for text or strictly for images
|
|
918
900
|
vision_models = set(get_vision_models())
|
|
919
901
|
text_models = set(get_models())
|
|
@@ -948,10 +930,10 @@ class LLMFilterRule(ImplementationRule):
|
|
|
948
930
|
# skip this model if:
|
|
949
931
|
# 1. this is a pure vision model and we're not doing an image filter, or
|
|
950
932
|
# 2. this is a pure text model and we're doing an image filter, or
|
|
951
|
-
# 3. this is a vision model hosted by Together (i.e.
|
|
933
|
+
# 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
|
|
952
934
|
first_criteria = model in pure_vision_models and not is_image_filter
|
|
953
935
|
second_criteria = model in pure_text_models and is_image_filter
|
|
954
|
-
third_criteria = model
|
|
936
|
+
third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
|
|
955
937
|
if first_criteria or second_criteria or third_criteria:
|
|
956
938
|
continue
|
|
957
939
|
|
|
@@ -235,9 +235,9 @@ class ApplyRule(Task):
|
|
|
235
235
|
# apply implementation rule
|
|
236
236
|
new_expressions = self.rule.substitute(self.logical_expression, **physical_op_params)
|
|
237
237
|
new_expressions = [expr for expr in new_expressions if expr.get_expr_id() not in expressions]
|
|
238
|
-
|
|
239
|
-
if
|
|
240
|
-
new_expressions = [expr for expr in new_expressions if expr.operator.
|
|
238
|
+
costed_full_op_ids = context['costed_full_op_ids']
|
|
239
|
+
if costed_full_op_ids is not None:
|
|
240
|
+
new_expressions = [expr for expr in new_expressions if expr.operator.get_full_op_id() in costed_full_op_ids]
|
|
241
241
|
expressions.update({expr.get_expr_id(): expr for expr in new_expressions})
|
|
242
242
|
group.physical_expressions.update(new_expressions)
|
|
243
243
|
|
|
@@ -17,9 +17,9 @@ class ProcessingStrategyType(Enum):
|
|
|
17
17
|
Returns a list of valid execution strategies for the given processing strategy.
|
|
18
18
|
"""
|
|
19
19
|
if self == ProcessingStrategyType.SENTINEL or self == ProcessingStrategyType.NO_SENTINEL:
|
|
20
|
-
return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
|
|
20
|
+
return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL, ExecutionStrategyType.SEQUENTIAL_PARALLEL]
|
|
21
21
|
elif self == ProcessingStrategyType.STREAMING:
|
|
22
|
-
return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
|
|
22
|
+
return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL, ExecutionStrategyType.SEQUENTIAL_PARALLEL]
|
|
23
23
|
|
|
24
24
|
def is_sentinel_strategy(self) -> bool:
|
|
25
25
|
"""
|
|
@@ -33,8 +33,6 @@ class SentinelQueryProcessor(QueryProcessor):
|
|
|
33
33
|
"""
|
|
34
34
|
Generates and returns a SentinelPlan for the given dataset.
|
|
35
35
|
"""
|
|
36
|
-
# TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up
|
|
37
|
-
|
|
38
36
|
# create a new optimizer and update its strategy to SENTINEL
|
|
39
37
|
optimizer = self.optimizer.deepcopy_clean()
|
|
40
38
|
optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
|
palimpzest/sets.py
CHANGED
|
@@ -5,7 +5,6 @@ from typing import Callable
|
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
7
7
|
from chromadb.api.models.Collection import Collection
|
|
8
|
-
from ragatouille.RAGPretrainedModel import RAGPretrainedModel
|
|
9
8
|
|
|
10
9
|
from palimpzest.constants import AggFunc, Cardinality
|
|
11
10
|
from palimpzest.core.data.datareaders import DataReader
|
|
@@ -35,7 +34,7 @@ class Set:
|
|
|
35
34
|
agg_func: AggFunc | None = None,
|
|
36
35
|
group_by: GroupBySig | None = None,
|
|
37
36
|
project_cols: list[str] | None = None,
|
|
38
|
-
index: Collection |
|
|
37
|
+
index: Collection | None = None,
|
|
39
38
|
search_func: Callable | None = None,
|
|
40
39
|
search_attr: str | None = None,
|
|
41
40
|
output_attrs: list[dict] | None = None,
|
|
@@ -340,7 +339,7 @@ class Dataset(Set):
|
|
|
340
339
|
|
|
341
340
|
def retrieve(
|
|
342
341
|
self,
|
|
343
|
-
index: Collection
|
|
342
|
+
index: Collection,
|
|
344
343
|
search_attr: str,
|
|
345
344
|
output_attrs: list[dict] | type[Schema],
|
|
346
345
|
search_func: Callable | None = None,
|
|
@@ -12,7 +12,7 @@ def get_json_from_answer(answer: str, model: Model, cardinality: Cardinality) ->
|
|
|
12
12
|
and optimistically searches for the substring containing the JSON object.
|
|
13
13
|
"""
|
|
14
14
|
# model-specific trimming for LLAMA3 responses
|
|
15
|
-
if model
|
|
15
|
+
if model.is_llama_model():
|
|
16
16
|
answer = answer.split("---")[0]
|
|
17
17
|
answer = answer.replace("True", "true")
|
|
18
18
|
answer = answer.replace("False", "false")
|
|
@@ -9,10 +9,18 @@ def get_vision_models() -> list[Model]:
|
|
|
9
9
|
"""
|
|
10
10
|
models = []
|
|
11
11
|
if os.getenv("OPENAI_API_KEY") is not None:
|
|
12
|
-
|
|
12
|
+
openai_vision_models = [
|
|
13
|
+
model for model in Model
|
|
14
|
+
if model.is_openai_model() and model.is_vision_model()
|
|
15
|
+
]
|
|
16
|
+
models.extend(openai_vision_models)
|
|
13
17
|
|
|
14
18
|
if os.getenv("TOGETHER_API_KEY") is not None:
|
|
15
|
-
|
|
19
|
+
together_vision_models = [
|
|
20
|
+
model for model in Model
|
|
21
|
+
if model.is_together_model() and model.is_vision_model()
|
|
22
|
+
]
|
|
23
|
+
models.extend(together_vision_models)
|
|
16
24
|
|
|
17
25
|
return models
|
|
18
26
|
|
|
@@ -23,10 +31,16 @@ def get_models(include_vision: bool = False) -> list[Model]:
|
|
|
23
31
|
"""
|
|
24
32
|
models = []
|
|
25
33
|
if os.getenv("OPENAI_API_KEY") is not None:
|
|
26
|
-
|
|
34
|
+
openai_models = [model for model in Model if model.is_openai_model()]
|
|
35
|
+
models.extend(openai_models)
|
|
27
36
|
|
|
28
37
|
if os.getenv("TOGETHER_API_KEY") is not None:
|
|
29
|
-
|
|
38
|
+
together_models = [model for model in Model if model.is_together_model()]
|
|
39
|
+
if not include_vision:
|
|
40
|
+
together_models = [
|
|
41
|
+
model for model in together_models if not model.is_vision_model()
|
|
42
|
+
]
|
|
43
|
+
models.extend(together_models)
|
|
30
44
|
|
|
31
45
|
if include_vision:
|
|
32
46
|
vision_models = get_vision_models()
|
|
@@ -36,17 +50,21 @@ def get_models(include_vision: bool = False) -> list[Model]:
|
|
|
36
50
|
|
|
37
51
|
# The order is the priority of the model
|
|
38
52
|
TEXT_MODEL_PRIORITY = [
|
|
53
|
+
# Model.o1,
|
|
39
54
|
Model.GPT_4o,
|
|
40
55
|
Model.GPT_4o_MINI,
|
|
41
|
-
Model.
|
|
56
|
+
Model.LLAMA3_3_70B,
|
|
42
57
|
Model.MIXTRAL,
|
|
43
|
-
Model.
|
|
58
|
+
Model.DEEPSEEK_V3,
|
|
59
|
+
Model.LLAMA3_2_3B,
|
|
60
|
+
Model.LLAMA3_1_8B,
|
|
61
|
+
Model.DEEPSEEK_R1_DISTILL_QWEN_1_5B,
|
|
44
62
|
]
|
|
45
63
|
|
|
46
64
|
VISION_MODEL_PRIORITY = [
|
|
47
|
-
Model.
|
|
48
|
-
Model.
|
|
49
|
-
Model.
|
|
65
|
+
Model.GPT_4o,
|
|
66
|
+
Model.GPT_4o_MINI,
|
|
67
|
+
Model.LLAMA3_2_90B_V,
|
|
50
68
|
]
|
|
51
69
|
def get_champion_model(available_models, vision=False):
|
|
52
70
|
# Select appropriate priority list based on task
|