palimpzest 0.7.6__py3-none-any.whl → 0.7.8__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (39)
  1. palimpzest/constants.py +113 -75
  2. palimpzest/core/data/dataclasses.py +55 -38
  3. palimpzest/core/elements/index.py +5 -15
  4. palimpzest/core/elements/records.py +1 -1
  5. palimpzest/prompts/prompt_factory.py +1 -1
  6. palimpzest/query/execution/all_sample_execution_strategy.py +216 -0
  7. palimpzest/query/execution/execution_strategy.py +4 -4
  8. palimpzest/query/execution/execution_strategy_type.py +7 -1
  9. palimpzest/query/execution/mab_execution_strategy.py +184 -72
  10. palimpzest/query/execution/parallel_execution_strategy.py +182 -15
  11. palimpzest/query/execution/single_threaded_execution_strategy.py +21 -21
  12. palimpzest/query/generators/api_client_factory.py +6 -7
  13. palimpzest/query/generators/generators.py +5 -8
  14. palimpzest/query/operators/aggregate.py +4 -3
  15. palimpzest/query/operators/convert.py +1 -1
  16. palimpzest/query/operators/filter.py +1 -1
  17. palimpzest/query/operators/limit.py +1 -1
  18. palimpzest/query/operators/map.py +1 -1
  19. palimpzest/query/operators/physical.py +8 -4
  20. palimpzest/query/operators/project.py +1 -1
  21. palimpzest/query/operators/retrieve.py +7 -23
  22. palimpzest/query/operators/scan.py +1 -1
  23. palimpzest/query/optimizer/cost_model.py +54 -62
  24. palimpzest/query/optimizer/optimizer.py +2 -6
  25. palimpzest/query/optimizer/plan.py +4 -4
  26. palimpzest/query/optimizer/primitives.py +1 -1
  27. palimpzest/query/optimizer/rules.py +8 -26
  28. palimpzest/query/optimizer/tasks.py +3 -3
  29. palimpzest/query/processor/processing_strategy_type.py +2 -2
  30. palimpzest/query/processor/sentinel_processor.py +0 -2
  31. palimpzest/sets.py +2 -3
  32. palimpzest/utils/generation_helpers.py +1 -1
  33. palimpzest/utils/model_helpers.py +27 -9
  34. palimpzest/utils/progress.py +81 -72
  35. {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/METADATA +4 -2
  36. {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/RECORD +39 -38
  37. {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/WHEEL +1 -1
  38. {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/licenses/LICENSE +0 -0
  39. {palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/top_level.txt +0 -0
palimpzest/query/operators/scan.py CHANGED
@@ -87,7 +87,7 @@ class ScanPhysicalOp(PhysicalOperator, ABC):
  record_parent_id=dr.parent_id,
  record_source_idx=dr.source_idx,
  record_state=dr.to_dict(include_bytes=False),
- op_id=self.get_op_id(),
+ full_op_id=self.get_full_op_id(),
  logical_op_id=self.logical_op_id,
  op_name=self.op_name(),
  time_per_record=(end_time - start_time),
palimpzest/query/optimizer/cost_model.py CHANGED
@@ -43,11 +43,11 @@ class BaseCostModel:
  """
  pass

- def get_costed_phys_op_ids(self) -> set[str]:
+ def get_costed_full_op_ids(self) -> set[str]:
  """
- Return the set of physical op ids which the cost model has cost estimates for.
+ Return the set of full op ids which the cost model has cost estimates for.
  """
- raise NotImplementedError("Calling get_costed_phys_op_ids from abstract method")
+ raise NotImplementedError("Calling get_costed_full_op_ids from abstract method")

  def __call__(self, operator: PhysicalOperator) -> PlanCost:
  """
@@ -66,9 +66,6 @@ class SampleBasedCostModel:
  verbose: bool = False,
  exp_name: str | None = None,
  ):
- """
- execution_data is: {logical_op_id: {physical_op_id: [DataRecordSet]}}
- """
  # store verbose argument
  self.verbose = verbose

@@ -77,30 +74,28 @@ class SampleBasedCostModel:

  # construct cost, time, quality, and selectivity matrices for each operator set;
  self.operator_to_stats = self.compute_operator_stats(sentinel_plan_stats)
-
- # compute set of costed physical op ids from operator_to_stats
- self.costed_phys_op_ids = set([
- phys_op_id
- for _, phys_op_id_to_stats in self.operator_to_stats.items()
- for phys_op_id, _ in phys_op_id_to_stats.items()
+ self.costed_full_op_ids = set([
+ full_op_id
+ for _, full_op_id_to_stats in self.operator_to_stats.items()
+ for full_op_id in full_op_id_to_stats
  ])

  logger.info(f"Initialized SampleBasedCostModel with verbose={self.verbose}")
  logger.debug(f"Initialized SampleBasedCostModel with params: {self.__dict__}")

- def get_costed_phys_op_ids(self):
- return self.costed_phys_op_ids
+ def get_costed_full_op_ids(self):
+ return self.costed_full_op_ids

  def compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats) -> dict:
  logger.debug("Computing operator statistics")
  # flatten the nested dictionary of execution data and pull out fields relevant to cost estimation
  execution_record_op_stats = []
- for logical_op_id, phys_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
+ for logical_op_id, full_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
  logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
  # flatten the execution data into a list of RecordOpStats
  op_set_execution_data = [
  record_op_stats
- for _, op_stats in phys_op_id_to_op_stats.items()
+ for _, op_stats in full_op_id_to_op_stats.items()
  for record_op_stats in op_stats.record_op_stats_lst
  ]

@@ -108,7 +103,7 @@ class SampleBasedCostModel:
  for record_op_stats in op_set_execution_data:
  record_op_stats_dict = {
  "logical_op_id": logical_op_id,
- "physical_op_id": record_op_stats.op_id,
+ "full_op_id": record_op_stats.full_op_id,
  "record_id": record_op_stats.record_id,
  "record_parent_id": record_op_stats.record_parent_id,
  "cost_per_record": record_op_stats.cost_per_record,
@@ -124,13 +119,13 @@ class SampleBasedCostModel:
  # convert flattened execution data into dataframe
  operator_stats_df = pd.DataFrame(execution_record_op_stats)

- # for each physical_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
+ # for each full_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
  operator_to_stats = {}
  for logical_op_id, logical_op_df in operator_stats_df.groupby("logical_op_id"):
  logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
  operator_to_stats[logical_op_id] = {}

- for physical_op_id, physical_op_df in logical_op_df.groupby("physical_op_id"):
+ for full_op_id, physical_op_df in logical_op_df.groupby("full_op_id"):
  # compute the number of input records processed by this operator; use source_idx for scan operator(s)
  num_source_records = (
  len(physical_op_df.record_parent_id.unique())
@@ -138,10 +133,10 @@ class SampleBasedCostModel:
  else len(physical_op_df.source_idx.unique())
  )

- # compute selectivity
+ # compute selectivity
  selectivity = physical_op_df.passed_operator.sum() / num_source_records

- operator_to_stats[logical_op_id][physical_op_id] = {
+ operator_to_stats[logical_op_id][full_op_id] = {
  "cost": physical_op_df.cost_per_record.mean(),
  "time": physical_op_df.time_per_record.mean(),
  "quality": physical_op_df.quality.mean(),
@@ -162,18 +157,18 @@ class SampleBasedCostModel:
  # we will have execution data for each operator passed into __call__; nevertheless, we
  # still perform a sanity check
  # look up physical and logical op ids associated with this physical operator
- phys_op_id = operator.get_op_id()
+ full_op_id = operator.get_full_op_id()
  logical_op_id = operator.logical_op_id
  physical_op_to_stats = self.operator_to_stats.get(logical_op_id)
  assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
- assert physical_op_to_stats.get(phys_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
+ assert physical_op_to_stats.get(full_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
  logger.debug(f"Calling __call__ for {str(operator)}")

  # look up stats for this operation
- est_cost_per_record = self.operator_to_stats[logical_op_id][phys_op_id]["cost"]
- est_time_per_record = self.operator_to_stats[logical_op_id][phys_op_id]["time"]
- est_quality = self.operator_to_stats[logical_op_id][phys_op_id]["quality"]
- est_selectivity = self.operator_to_stats[logical_op_id][phys_op_id]["selectivity"]
+ est_cost_per_record = self.operator_to_stats[logical_op_id][full_op_id]["cost"]
+ est_time_per_record = self.operator_to_stats[logical_op_id][full_op_id]["time"]
+ est_quality = self.operator_to_stats[logical_op_id][full_op_id]["quality"]
+ est_selectivity = self.operator_to_stats[logical_op_id][full_op_id]["selectivity"]

  # create source_op_estimates for scan operators if they are not provided
  if isinstance(operator, ScanPhysicalOp):
@@ -238,13 +233,13 @@ class CostModel(BaseCostModel):
  # compute per-operator estimates
  self.operator_estimates = self._compute_operator_estimates()

- # compute set of costed physical op ids from operator_to_stats
- self.costed_phys_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
+ # compute set of costed full op ids from operator_to_stats
+ self.costed_full_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
  logger.info("Initialized CostModel.")
  logger.debug(f"Initialized CostModel with params: {self.__dict__}")

- def get_costed_phys_op_ids(self):
- return self.costed_phys_op_ids
+ def get_costed_full_op_ids(self):
+ return self.costed_full_op_ids

  def _compute_mean(self, df: pd.DataFrame, col: str, model_name: str | None = None) -> float:
  """
@@ -318,9 +313,9 @@ class CostModel(BaseCostModel):
  if is_filter_op:
  num_output_records = model_op_df.passed_operator.sum()
  else:
- op_ids = model_op_df.op_id.unique().tolist()
+ full_op_ids = model_op_df.full_op_id.unique().tolist()
  plan_ids = model_op_df.plan_id.unique().tolist()
- num_output_records = df[df.source_op_id.isin(op_ids) & df.plan_id.isin(plan_ids)].shape[0]
+ num_output_records = df[df.source_full_op_id.isin(full_op_ids) & df.plan_id.isin(plan_ids)].shape[0]

  # estimate the selectivity / fan-out
  return num_output_records / num_input_records
@@ -333,8 +328,8 @@ class CostModel(BaseCostModel):
  if is_filter_op:
  num_output_records = op_df.passed_operator.sum()
  else:
- op_ids = op_df.op_id.unique().tolist()
- num_output_records = df[df.source_op_id.isin(op_ids)].shape[0]
+ full_op_ids = op_df.full_op_id.unique().tolist()
+ num_output_records = df[df.source_full_op_id.isin(full_op_ids)].shape[0]

  # estimate the selectivity / fan-out
  return num_output_records / num_input_records
@@ -422,14 +417,14 @@ class CostModel(BaseCostModel):
  return None

  # get the set of operator ids for which we have sample data
- op_ids = self.sample_execution_data_df.op_id.unique()
+ full_op_ids = self.sample_execution_data_df.full_op_id.unique()

  # compute estimates of runtime, cost, and quality (and intermediates like cardinality) for every operator
  operator_estimates = {}
- for op_id in op_ids:
+ for full_op_id in full_op_ids:
  # filter for subset of sample execution data related to this operation
  op_df = self.sample_execution_data_df[
- self.sample_execution_data_df.op_id == op_id
+ self.sample_execution_data_df.full_op_id == full_op_id
  ]

  # skip computing an estimate if we didn't capture any sampling data for this operator
@@ -480,14 +475,14 @@ class CostModel(BaseCostModel):
  cardinality = self._est_cardinality(op_df)
  estimates = {"time_per_record": time_per_record, "cardinality": cardinality}

- operator_estimates[op_id] = estimates
+ operator_estimates[full_op_id] = estimates

  return operator_estimates

  def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
  # get identifier for operation which is unique within sentinel plan but consistent across sentinels
- op_id = operator.get_op_id()
- logger.debug(f"Calling __call__ for {str(operator)} with op_id: {op_id}")
+ full_op_id = operator.get_full_op_id()
+ logger.debug(f"Calling __call__ for {str(operator)} with full_op_id: {full_op_id}")

  # initialize estimates of operator metrics based on naive (but sometimes precise) logic
  if isinstance(operator, MarshalAndScanDataOp):
@@ -520,9 +515,9 @@ class CostModel(BaseCostModel):

  # if we have sample execution data, update naive estimates with more informed ones
  sample_op_estimates = self.operator_estimates
- if sample_op_estimates is not None and op_id in sample_op_estimates:
+ if sample_op_estimates is not None and full_op_id in sample_op_estimates:
  if isinstance(operator, (MarshalAndScanDataOp, CacheScanDataOp)):
- op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+ op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]

  elif isinstance(operator, ApplyGroupByOp):
  # NOTE: in theory we should also treat this cardinality est. as a random variable, but in practice we will
@@ -533,36 +528,33 @@ class CostModel(BaseCostModel):
  # produced by the groupby in our sample and assume it may generalize to the full workload. To estimate
  # actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
  # the input cardinality (where the initial input cardinality from the datareader is known).
- op_estimates.cardinality = sample_op_estimates[op_id]["cardinality"]
- op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+ op_estimates.cardinality = sample_op_estimates[full_op_id]["cardinality"]
+ op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]

  elif isinstance(operator, (CountAggregateOp, AverageAggregateOp)): # noqa: SIM114
- op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+ op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]

  elif isinstance(operator, LimitScanOp):
- op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+ op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]

  elif isinstance(operator, NonLLMFilter):
- op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id]["selectivity"]
- op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+ op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id]["selectivity"]
+ op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]

  elif isinstance(operator, LLMFilter):
  model_name = operator.model.value
- op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
- op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
- op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
- op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
+ op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
+ op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
+ op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
+ op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]

  elif isinstance(operator, LLMConvert):
- # TODO: EVEN BETTER: do similarity match (e.g. largest param intersection, more exotic techniques);
- # another heuristic: logical_op_id-->subclass_physical_op_id-->specific_physical_op_id-->most_param_match_physical_op_id
- # TODO: instead of [op_id][model_name] --> [logical_op_id][physical_op_id]
  # NOTE: code synthesis does not have a model attribute
  model_name = operator.model.value if hasattr(operator, "model") else None
- op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
- op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
- op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
- op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
+ op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
+ op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
+ op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
+ op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]

  # NOTE: if code synth. fails, this will turn into ConventionalQuery calls to GPT-3.5,
  # which would wildly mess up estimate of time and cost per-record
@@ -575,7 +567,7 @@ class CostModel(BaseCostModel):
  # rag convert adjustment
  if isinstance(operator, RAGConvert):
  total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
- total_output_tokens = sample_op_estimates[op_id][model_name]["total_output_tokens"]
+ total_output_tokens = sample_op_estimates[full_op_id][model_name]["total_output_tokens"]
  op_estimates.cost_per_record = (
  MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
  + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
@@ -597,7 +589,7 @@ class CostModel(BaseCostModel):
  quality=op_quality,
  op_estimates=op_estimates,
  )
- logger.debug(f"Done calling __call__ for {str(operator)} with op_id: {op_id}")
+ logger.debug(f"Done calling __call__ for {str(operator)} with full_op_id: {full_op_id}")
  logger.debug(f"Plan cost: {op_plan_cost}")

  return op_plan_cost
palimpzest/query/optimizer/optimizer.py CHANGED
@@ -103,9 +103,6 @@ class Optimizer:
  # store the cost model
  self.cost_model = cost_model

- # store the set of physical operators for which our cost model has cost estimates
- self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()
-
  # mapping from each group id to its Group object
  self.groups = {}

@@ -189,7 +186,6 @@ class Optimizer:

  def update_cost_model(self, cost_model: CostModel):
  self.cost_model = cost_model
- self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()

  def get_physical_op_params(self):
  return {
@@ -338,7 +334,7 @@ class Optimizer:
  # compute all properties including this operations'
  all_properties = deepcopy(input_group_properties)
  if isinstance(op, FilteredScan):
- # NOTE: we could use op.get_op_id() here, but storing filter strings makes
+ # NOTE: we could use op.get_full_op_id() here, but storing filter strings makes
  # debugging a bit easier as you can read which filters are in the Group
  op_filter_str = op.filter.get_filter_str()
  if "filters" in all_properties:
@@ -464,7 +460,7 @@ class Optimizer:
  elif isinstance(task, OptimizeLogicalExpression):
  new_tasks = task.perform(self.transformation_rules, self.implementation_rules)
  elif isinstance(task, ApplyRule):
- context = {"costed_phys_op_ids": self.costed_phys_op_ids}
+ context = {"costed_full_op_ids": self.cost_model.get_costed_full_op_ids()}
  new_tasks = task.perform(
  self.groups, self.expressions, context=context, **self.get_physical_op_params()
  )
palimpzest/query/optimizer/plan.py CHANGED
@@ -53,7 +53,7 @@ class PhysicalPlan(Plan):

  Two different PhysicalPlan instances with the identical lists of operators will have equivalent plan_ids.
  """
- hash_str = str(tuple(op.get_op_id() for op in self.operators))
+ hash_str = str(tuple(op.get_full_op_id() for op in self.operators))
  return hash_for_id(hash_str)

  def __eq__(self, other):
@@ -103,9 +103,9 @@ class SentinelPlan(Plan):
  assert isinstance(operator_sets[0][0], ScanPhysicalOp), "first operator set must be a scan"
  assert all(len(op_set) > 0 for op_set in operator_sets), "every operator set must have at least one operator"

- # store operator_sets and logical_op_ids; sort operator_sets internally by op_id
+ # store operator_sets and logical_op_ids; sort operator_sets internally by full_op_id
  self.operator_sets = operator_sets
- self.operator_sets = [sorted(op_set, key=lambda op: op.get_op_id()) for op_set in self.operator_sets]
+ self.operator_sets = [sorted(op_set, key=lambda op: op.get_full_op_id()) for op_set in self.operator_sets]
  self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
  self.plan_id = self.compute_plan_id()

@@ -117,7 +117,7 @@ class SentinelPlan(Plan):
  """
  hash_str = ""
  for logical_op_id, op_set in zip(self.logical_op_ids, self.operator_sets):
- hash_str += f"{logical_op_id} {tuple(op.get_op_id() for op in op_set)} "
+ hash_str += f"{logical_op_id} {tuple(op.get_full_op_id() for op in op_set)} "
  return hash_for_id(hash_str)

  def __eq__(self, other):
palimpzest/query/optimizer/primitives.py CHANGED
@@ -43,7 +43,7 @@ class Expression:
  return self.operator == other.operator and self.input_group_ids == other.input_group_ids

  def __str__(self):
- op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_op_id()
+ op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_full_op_id()
  return str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))

  def __hash__(self):
palimpzest/query/optimizer/rules.py CHANGED
@@ -2,7 +2,7 @@ import logging
  from copy import deepcopy
  from itertools import combinations

- from palimpzest.constants import AggFunc, Cardinality, Model, PromptStrategy
+ from palimpzest.constants import AggFunc, Cardinality, PromptStrategy
  from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
  from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvertSingle
  from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
@@ -285,9 +285,6 @@ class LLMConvertBondedRule(ImplementationRule):
  }
  )

- # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
- # thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
- #
  # identify models which can be used strictly for text or strictly for images
  vision_models = set(get_vision_models())
  text_models = set(get_models())
@@ -322,10 +319,10 @@ class LLMConvertBondedRule(ImplementationRule):
  # skip this model if:
  # 1. this is a pure vision model and we're not doing an image conversion, or
  # 2. this is a pure text model and we're doing an image conversion, or
- # 3. this is a vision model hosted by Together (i.e. LLAMA3_V) and there is more than one image field
+ # 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
  first_criteria = model in pure_vision_models and not is_image_conversion
  second_criteria = model in pure_text_models and is_image_conversion
- third_criteria = model == Model.LLAMA3_V and (num_image_fields > 1 or list_image_field)
+ third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
  if first_criteria or second_criteria or third_criteria:
  continue

@@ -465,9 +462,6 @@ class RAGConvertRule(ImplementationRule):
  }
  )

- # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
- # thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
- #
  # identify models which can be used strictly for text or strictly for images
  vision_models = set(get_vision_models())
  text_models = set(get_models())
@@ -536,9 +530,6 @@ class MixtureOfAgentsConvertRule(ImplementationRule):
  }
  )

- # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
- # thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
- #
  # identify models which can be used strictly for text or strictly for images
  vision_models = set(get_vision_models())
  text_models = set(get_models())
@@ -560,7 +551,7 @@ class MixtureOfAgentsConvertRule(ImplementationRule):
  )
  proposer_model_set, is_image_conversion = text_models, False
  if num_image_fields > 1 or list_image_field:
- proposer_model_set = [model for model in vision_models if model != Model.LLAMA3_V]
+ proposer_model_set = [model for model in vision_models if not model.is_llama_model()]
  is_image_conversion = True
  elif num_image_fields == 1:
  proposer_model_set = vision_models
@@ -636,9 +627,6 @@ class CriticAndRefineConvertRule(ImplementationRule):
  }
  )

- # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
- # thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
- #
  # identify models which can be used strictly for text or strictly for images
  vision_models = set(get_vision_models())
  text_models = set(get_models())
@@ -674,10 +662,10 @@ class CriticAndRefineConvertRule(ImplementationRule):
  # skip this model if:
  # 1. this is a pure vision model and we're not doing an image conversion, or
  # 2. this is a pure text model and we're doing an image conversion, or
- # 3. this is a vision model hosted by Together (i.e. LLAMA3_V) and there is more than one image field
+ # 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
  first_criteria = model in pure_vision_models and not is_image_conversion
  second_criteria = model in pure_text_models and is_image_conversion
- third_criteria = model == Model.LLAMA3_V and (num_image_fields > 1 or list_image_field)
+ third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
  if first_criteria or second_criteria or third_criteria:
  continue

@@ -750,9 +738,6 @@ class SplitConvertRule(ImplementationRule):
  }
  )

- # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
- # thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
- #
  # identify models which can be used strictly for text or strictly for images
  vision_models = set(get_vision_models())
  text_models = set(get_models())
@@ -911,9 +896,6 @@ class LLMFilterRule(ImplementationRule):
  }
  )

- # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
- # thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
- #
  # identify models which can be used strictly for text or strictly for images
  vision_models = set(get_vision_models())
  text_models = set(get_models())
@@ -948,10 +930,10 @@ class LLMFilterRule(ImplementationRule):
  # skip this model if:
  # 1. this is a pure vision model and we're not doing an image filter, or
  # 2. this is a pure text model and we're doing an image filter, or
- # 3. this is a vision model hosted by Together (i.e. LLAMA3_V) and there is more than one image field
+ # 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
  first_criteria = model in pure_vision_models and not is_image_filter
  second_criteria = model in pure_text_models and is_image_filter
- third_criteria = model == Model.LLAMA3_V and (num_image_fields > 1 or list_image_field)
+ third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
  if first_criteria or second_criteria or third_criteria:
  continue

palimpzest/query/optimizer/tasks.py CHANGED
@@ -235,9 +235,9 @@ class ApplyRule(Task):
  # apply implementation rule
  new_expressions = self.rule.substitute(self.logical_expression, **physical_op_params)
  new_expressions = [expr for expr in new_expressions if expr.get_expr_id() not in expressions]
- costed_phys_op_ids = context['costed_phys_op_ids']
- if costed_phys_op_ids is not None:
- new_expressions = [expr for expr in new_expressions if expr.operator.get_op_id() in costed_phys_op_ids]
+ costed_full_op_ids = context['costed_full_op_ids']
+ if costed_full_op_ids is not None:
+ new_expressions = [expr for expr in new_expressions if expr.operator.get_full_op_id() in costed_full_op_ids]
  expressions.update({expr.get_expr_id(): expr for expr in new_expressions})
  group.physical_expressions.update(new_expressions)

palimpzest/query/processor/processing_strategy_type.py CHANGED
@@ -17,9 +17,9 @@ class ProcessingStrategyType(Enum):
  Returns a list of valid execution strategies for the given processing strategy.
  """
  if self == ProcessingStrategyType.SENTINEL or self == ProcessingStrategyType.NO_SENTINEL:
- return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
+ return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL, ExecutionStrategyType.SEQUENTIAL_PARALLEL]
  elif self == ProcessingStrategyType.STREAMING:
- return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
+ return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL, ExecutionStrategyType.SEQUENTIAL_PARALLEL]

  def is_sentinel_strategy(self) -> bool:
  """
palimpzest/query/processor/sentinel_processor.py CHANGED
@@ -33,8 +33,6 @@ class SentinelQueryProcessor(QueryProcessor):
  """
  Generates and returns a SentinelPlan for the given dataset.
  """
- # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up
-
  # create a new optimizer and update its strategy to SENTINEL
  optimizer = self.optimizer.deepcopy_clean()
  optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
palimpzest/sets.py CHANGED
@@ -5,7 +5,6 @@ from typing import Callable

  import pandas as pd
  from chromadb.api.models.Collection import Collection
- from ragatouille.RAGPretrainedModel import RAGPretrainedModel

  from palimpzest.constants import AggFunc, Cardinality
  from palimpzest.core.data.datareaders import DataReader
@@ -35,7 +34,7 @@ class Set:
  agg_func: AggFunc | None = None,
  group_by: GroupBySig | None = None,
  project_cols: list[str] | None = None,
- index: Collection | RAGPretrainedModel | None = None,
+ index: Collection | None = None,
  search_func: Callable | None = None,
  search_attr: str | None = None,
  output_attrs: list[dict] | None = None,
@@ -340,7 +339,7 @@ class Dataset(Set):

  def retrieve(
  self,
- index: Collection | RAGPretrainedModel,
+ index: Collection,
  search_attr: str,
  output_attrs: list[dict] | type[Schema],
  search_func: Callable | None = None,
palimpzest/utils/generation_helpers.py CHANGED
@@ -12,7 +12,7 @@ def get_json_from_answer(answer: str, model: Model, cardinality: Cardinality) ->
  and optimistically searches for the substring containing the JSON object.
  """
  # model-specific trimming for LLAMA3 responses
- if model in [Model.LLAMA3, Model.LLAMA3_V]:
+ if model.is_llama_model():
  answer = answer.split("---")[0]
  answer = answer.replace("True", "true")
  answer = answer.replace("False", "false")
palimpzest/utils/model_helpers.py CHANGED
@@ -9,10 +9,18 @@ def get_vision_models() -> list[Model]:
  """
  models = []
  if os.getenv("OPENAI_API_KEY") is not None:
- models.extend([Model.GPT_4o_V, Model.GPT_4o_MINI_V])
+ openai_vision_models = [
+ model for model in Model
+ if model.is_openai_model() and model.is_vision_model()
+ ]
+ models.extend(openai_vision_models)

  if os.getenv("TOGETHER_API_KEY") is not None:
- models.extend([Model.LLAMA3_V])
+ together_vision_models = [
+ model for model in Model
+ if model.is_together_model() and model.is_vision_model()
+ ]
+ models.extend(together_vision_models)

  return models

@@ -23,10 +31,16 @@ def get_models(include_vision: bool = False) -> list[Model]:
  """
  models = []
  if os.getenv("OPENAI_API_KEY") is not None:
- models.extend([Model.GPT_4o, Model.GPT_4o_MINI])
+ openai_models = [model for model in Model if model.is_openai_model()]
+ models.extend(openai_models)

  if os.getenv("TOGETHER_API_KEY") is not None:
- models.extend([Model.LLAMA3, Model.MIXTRAL, Model.DEEPSEEK])
+ together_models = [model for model in Model if model.is_together_model()]
+ if not include_vision:
+ together_models = [
+ model for model in together_models if not model.is_vision_model()
+ ]
+ models.extend(together_models)

  if include_vision:
  vision_models = get_vision_models()
@@ -36,17 +50,21 @@ def get_models(include_vision: bool = False) -> list[Model]:

  # The order is the priority of the model
  TEXT_MODEL_PRIORITY = [
+ # Model.o1,
  Model.GPT_4o,
  Model.GPT_4o_MINI,
- Model.LLAMA3,
+ Model.LLAMA3_3_70B,
  Model.MIXTRAL,
- Model.DEEPSEEK,
+ Model.DEEPSEEK_V3,
+ Model.LLAMA3_2_3B,
+ Model.LLAMA3_1_8B,
+ Model.DEEPSEEK_R1_DISTILL_QWEN_1_5B,
  ]

  VISION_MODEL_PRIORITY = [
- Model.GPT_4o_V,
- Model.GPT_4o_MINI_V,
- Model.LLAMA3_V,
+ Model.GPT_4o,
+ Model.GPT_4o_MINI,
+ Model.LLAMA3_2_90B_V,
  ]
  def get_champion_model(available_models, vision=False):
  # Select appropriate priority list based on task
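
For context on how the reworked helpers above are typically used, here is a minimal usage sketch; which models each call returns depends on the API keys present in the environment (OPENAI_API_KEY and/or TOGETHER_API_KEY), so the comments describe intent rather than exact output.

# Minimal usage sketch for the helpers changed in this diff.
from palimpzest.utils.model_helpers import get_models, get_vision_models

text_models = get_models()                     # models enabled for text tasks
vision_models = get_vision_models()            # vision-capable models
all_models = get_models(include_vision=True)   # text models plus vision models
print(len(text_models), len(vision_models), len(all_models))
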