palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.1.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from copy import deepcopy
4
5
 
5
6
  from palimpzest.constants import Model
@@ -14,6 +15,7 @@ from palimpzest.query.operators.logical import (
14
15
  GroupByAggregate,
15
16
  LimitScan,
16
17
  LogicalOperator,
18
+ MapScan,
17
19
  Project,
18
20
  RetrieveScan,
19
21
  )
@@ -22,22 +24,17 @@ from palimpzest.query.optimizer import (
22
24
  TRANSFORMATION_RULES,
23
25
  )
24
26
  from palimpzest.query.optimizer.cost_model import CostModel
25
- from palimpzest.query.optimizer.optimizer_strategy import (
26
- OptimizationStrategyType,
27
- OptimizerStrategyRegistry,
28
- )
27
+ from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
29
28
  from palimpzest.query.optimizer.plan import PhysicalPlan
30
29
  from palimpzest.query.optimizer.primitives import Group, LogicalExpression
31
30
  from palimpzest.query.optimizer.rules import (
32
31
  CodeSynthesisConvertRule,
33
32
  CriticAndRefineConvertRule,
34
33
  LLMConvertBondedRule,
35
- LLMConvertConventionalRule,
36
34
  MixtureOfAgentsConvertRule,
37
35
  RAGConvertRule,
36
+ SplitConvertRule,
38
37
  TokenReducedConvertBondedRule,
39
- TokenReducedConvertConventionalRule,
40
- TokenReducedConvertRule,
41
38
  )
42
39
  from palimpzest.query.optimizer.tasks import (
43
40
  ApplyRule,
@@ -48,7 +45,9 @@ from palimpzest.query.optimizer.tasks import (
48
45
  )
49
46
  from palimpzest.sets import Dataset, Set
50
47
  from palimpzest.utils.hash_helpers import hash_for_serialized_dict
51
- from palimpzest.utils.model_helpers import get_champion_model, get_code_champion_model, get_conventional_fallback_model
48
+ from palimpzest.utils.model_helpers import get_champion_model, get_code_champion_model, get_fallback_model
49
+
50
+ logger = logging.getLogger(__name__)
52
51
 
53
52
 
54
53
  def get_node_uid(node: Dataset | DataReader) -> str:
@@ -86,22 +85,21 @@ class Optimizer:
86
85
  self,
87
86
  policy: Policy,
88
87
  cost_model: CostModel,
89
- no_cache: bool = False,
88
+ available_models: list[Model],
89
+ cache: bool = False,
90
90
  verbose: bool = False,
91
- available_models: list[Model] | None = None,
92
91
  allow_bonded_query: bool = True,
93
- allow_conventional_query: bool = False,
94
92
  allow_code_synth: bool = False,
95
93
  allow_token_reduction: bool = False,
96
94
  allow_rag_reduction: bool = False,
97
95
  allow_mixtures: bool = True,
98
96
  allow_critic: bool = False,
99
- optimization_strategy_type: OptimizationStrategyType = OptimizationStrategyType.PARETO,
97
+ allow_split_merge: bool = False,
98
+ optimizer_strategy: OptimizationStrategyType = OptimizationStrategyType.PARETO,
100
99
  use_final_op_quality: bool = False, # TODO: make this func(plan) -> final_quality
100
+ **kwargs,
101
101
  ):
102
102
  # store the policy
103
- if available_models is None or len(available_models) == 0:
104
- available_models = []
105
103
  self.policy = policy
106
104
 
107
105
  # store the cost model
@@ -123,36 +121,38 @@ class Optimizer:
123
121
  self.implementation_rules = IMPLEMENTATION_RULES
124
122
  self.transformation_rules = TRANSFORMATION_RULES
125
123
 
126
- self.strategy = OptimizerStrategyRegistry.get_strategy(optimization_strategy_type.value)
124
+ # get the strategy class associated with the optimizer strategy
125
+ optimizer_strategy_cls = optimizer_strategy.value
126
+ self.strategy = optimizer_strategy_cls()
127
127
 
128
- # if we are doing SENTINEL / NONE optimization; remove transformation rules
129
- if optimization_strategy_type in [OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE]:
128
+ # remove transformation rules for optimization strategies which do not require them
129
+ if optimizer_strategy.no_transformation():
130
130
  self.transformation_rules = []
131
131
 
132
132
  # if we are not performing optimization, set available models to be single model
133
133
  # and remove all optimizations (except for bonded queries)
134
- if optimization_strategy_type == OptimizationStrategyType.NONE:
134
+ if optimizer_strategy == OptimizationStrategyType.NONE:
135
135
  self.allow_bonded_query = True
136
- self.allow_conventional_query = False
137
136
  self.allow_code_synth = False
138
137
  self.allow_token_reduction = False
139
138
  self.allow_rag_reduction = False
140
139
  self.allow_mixtures = False
141
140
  self.allow_critic = False
141
+ self.allow_split_merge = False
142
142
  self.available_models = [available_models[0]]
143
143
 
144
144
  # store optimization hyperparameters
145
- self.no_cache = no_cache
145
+ self.cache = cache
146
146
  self.verbose = verbose
147
147
  self.available_models = available_models
148
148
  self.allow_bonded_query = allow_bonded_query
149
- self.allow_conventional_query = allow_conventional_query
150
149
  self.allow_code_synth = allow_code_synth
151
150
  self.allow_token_reduction = allow_token_reduction
152
151
  self.allow_rag_reduction = allow_rag_reduction
153
152
  self.allow_mixtures = allow_mixtures
154
153
  self.allow_critic = allow_critic
155
- self.optimization_strategy_type = optimization_strategy_type
154
+ self.allow_split_merge = allow_split_merge
155
+ self.optimizer_strategy = optimizer_strategy
156
156
  self.use_final_op_quality = use_final_op_quality
157
157
 
158
158
  # prune implementation rules based on boolean flags
@@ -163,13 +163,6 @@ class Optimizer:
163
163
  if rule not in [LLMConvertBondedRule, TokenReducedConvertBondedRule]
164
164
  ]
165
165
 
166
- if not self.allow_conventional_query:
167
- self.implementation_rules = [
168
- rule
169
- for rule in self.implementation_rules
170
- if rule not in [LLMConvertConventionalRule, TokenReducedConvertConventionalRule]
171
- ]
172
-
173
166
  if not self.allow_code_synth:
174
167
  self.implementation_rules = [
175
168
  rule for rule in self.implementation_rules if not issubclass(rule, CodeSynthesisConvertRule)
@@ -177,7 +170,7 @@ class Optimizer:
177
170
 
178
171
  if not self.allow_token_reduction:
179
172
  self.implementation_rules = [
180
- rule for rule in self.implementation_rules if not issubclass(rule, TokenReducedConvertRule)
173
+ rule for rule in self.implementation_rules if not issubclass(rule, TokenReducedConvertBondedRule)
181
174
  ]
182
175
 
183
176
  if not self.allow_rag_reduction:
@@ -187,8 +180,7 @@ class Optimizer:
187
180
 
188
181
  if not self.allow_mixtures:
189
182
  self.implementation_rules = [
190
- rule for rule in self.implementation_rules
191
- if not issubclass(rule, MixtureOfAgentsConvertRule)
183
+ rule for rule in self.implementation_rules if not issubclass(rule, MixtureOfAgentsConvertRule)
192
184
  ]
193
185
 
194
186
  if not self.allow_critic:
@@ -196,8 +188,17 @@ class Optimizer:
196
188
  rule for rule in self.implementation_rules if not issubclass(rule, CriticAndRefineConvertRule)
197
189
  ]
198
190
 
191
+ if not self.allow_split_merge:
192
+ self.implementation_rules = [
193
+ rule for rule in self.implementation_rules if not issubclass(rule, SplitConvertRule)
194
+ ]
195
+
196
+ logger.info(f"Initialized Optimizer with verbose={self.verbose}")
197
+ logger.debug(f"Initialized Optimizer with params: {self.__dict__}")
198
+
199
199
  def update_cost_model(self, cost_model: CostModel):
200
200
  self.cost_model = cost_model
201
+ self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()
201
202
 
202
203
  def get_physical_op_params(self):
203
204
  return {
@@ -205,38 +206,41 @@ class Optimizer:
205
206
  "available_models": self.available_models,
206
207
  "champion_model": get_champion_model(self.available_models),
207
208
  "code_champion_model": get_code_champion_model(self.available_models),
208
- "conventional_fallback_model": get_conventional_fallback_model(self.available_models),
209
+ "fallback_model": get_fallback_model(self.available_models),
209
210
  }
210
211
 
211
212
  def deepcopy_clean(self):
212
213
  optimizer = Optimizer(
213
214
  policy=self.policy,
214
215
  cost_model=CostModel(),
215
- no_cache=self.no_cache,
216
+ cache=self.cache,
216
217
  verbose=self.verbose,
217
218
  available_models=self.available_models,
218
219
  allow_bonded_query=self.allow_bonded_query,
219
- allow_conventional_query=self.allow_conventional_query,
220
220
  allow_code_synth=self.allow_code_synth,
221
221
  allow_token_reduction=self.allow_token_reduction,
222
222
  allow_rag_reduction=self.allow_rag_reduction,
223
223
  allow_mixtures=self.allow_mixtures,
224
224
  allow_critic=self.allow_critic,
225
- optimization_strategy_type=self.optimization_strategy_type,
225
+ allow_split_merge=self.allow_split_merge,
226
+ optimizer_strategy=self.optimizer_strategy,
226
227
  use_final_op_quality=self.use_final_op_quality,
227
228
  )
228
229
  return optimizer
229
-
230
- def update_strategy(self, optimizer_strategy_type: OptimizationStrategyType):
231
- self.optimization_strategy_type = optimizer_strategy_type
232
- self.strategy = OptimizerStrategyRegistry.get_strategy(optimizer_strategy_type.value)
233
-
230
+
231
+ def update_strategy(self, optimizer_strategy: OptimizationStrategyType):
232
+ self.optimizer_strategy = optimizer_strategy
233
+ optimizer_strategy_cls = optimizer_strategy.value
234
+ self.strategy = optimizer_strategy_cls()
235
+
234
236
  def construct_group_tree(self, dataset_nodes: list[Set]) -> tuple[list[int], dict[str, Field], dict[str, set[str]]]:
235
237
  # get node, output_schema, and input_schema (if applicable)
238
+ logger.debug(f"Constructing group tree for dataset_nodes: {dataset_nodes}")
239
+
236
240
  node = dataset_nodes[-1]
237
241
  output_schema = node.schema
238
242
  input_schema = dataset_nodes[-2].schema if len(dataset_nodes) > 1 else None
239
-
243
+
240
244
  ### convert node --> Group ###
241
245
  uid = get_node_uid(node)
242
246
 
@@ -244,7 +248,7 @@ class Optimizer:
244
248
  op: LogicalOperator | None = None
245
249
 
246
250
  # TODO: add cache scan when we add caching back to PZ
247
- # if not self.no_cache:
251
+ # if self.cache:
248
252
  # op = CacheScan(datareader=node, output_schema=output_schema)
249
253
  if isinstance(node, DataReader):
250
254
  op = BaseScan(datareader=node, output_schema=output_schema)
@@ -291,9 +295,9 @@ class Optimizer:
291
295
  index=node._index,
292
296
  search_func=node._search_func,
293
297
  search_attr=node._search_attr,
294
- output_attr=node._output_attr,
298
+ output_attrs=node._output_attrs,
295
299
  k=node._k,
296
- target_cache_id=uid
300
+ target_cache_id=uid,
297
301
  )
298
302
  elif output_schema != input_schema:
299
303
  op = ConvertScan(
@@ -304,6 +308,13 @@ class Optimizer:
304
308
  depends_on=node._depends_on,
305
309
  target_cache_id=uid,
306
310
  )
311
+ elif output_schema == input_schema and node._udf is not None:
312
+ op = MapScan(
313
+ input_schema=input_schema,
314
+ output_schema=output_schema,
315
+ udf=node._udf,
316
+ target_cache_id=uid,
317
+ )
307
318
  # some legacy plans may have a useless convert; for now we simply skip it
308
319
  elif output_schema == input_schema:
309
320
  return self.construct_group_tree(dataset_nodes[:-1]) if len(dataset_nodes) > 1 else ([], {}, {})
@@ -319,7 +330,9 @@ class Optimizer:
319
330
  )
320
331
 
321
332
  # compute the fields added by this operation and all fields
322
- input_group_short_field_names = list(map(lambda full_field: full_field.split(".")[-1], input_group_fields.keys()))
333
+ input_group_short_field_names = list(
334
+ map(lambda full_field: full_field.split(".")[-1], input_group_fields.keys())
335
+ )
323
336
  new_fields = {
324
337
  field_name: field
325
338
  for field_name, field in op.output_schema.field_map(unique=True, id=uid).items()
@@ -329,9 +342,7 @@ class Optimizer:
329
342
 
330
343
  # compute the set of (short) field names this operation depends on
331
344
  depends_on_field_names = (
332
- {}
333
- if isinstance(node, DataReader)
334
- else {field_name.split(".")[-1] for field_name in node._depends_on}
345
+ {} if isinstance(node, DataReader) else {field_name.split(".")[-1] for field_name in node._depends_on}
335
346
  )
336
347
 
337
348
  # compute all properties including this operations'
@@ -351,7 +362,7 @@ class Optimizer:
351
362
  all_properties["limits"].add(op_limit_str)
352
363
  else:
353
364
  all_properties["limits"] = set([op_limit_str])
354
-
365
+
355
366
  elif isinstance(op, Project):
356
367
  op_project_str = op.get_logical_op_id()
357
368
  if "projects" in all_properties:
@@ -359,6 +370,13 @@ class Optimizer:
359
370
  else:
360
371
  all_properties["projects"] = set([op_project_str])
361
372
 
373
+ elif isinstance(op, MapScan):
374
+ op_udf_str = op.udf.__name__
375
+ if "udfs" in all_properties:
376
+ all_properties["udfs"].add(op_udf_str)
377
+ else:
378
+ all_properties["udfs"] = set([op_udf_str])
379
+
362
380
  # construct the logical expression and group
363
381
  logical_expression = LogicalExpression(
364
382
  operator=op,
@@ -378,13 +396,16 @@ class Optimizer:
378
396
  # add the expression and group to the optimizer's expressions and groups and return
379
397
  self.expressions[logical_expression.get_expr_id()] = logical_expression
380
398
  self.groups[group.group_id] = group
399
+ logger.debug(f"Constructed group tree for dataset_nodes: {dataset_nodes}")
400
+ logger.debug(f"Group: {group.group_id}, {all_fields}, {all_properties}")
381
401
 
382
402
  return [group.group_id], all_fields, all_properties
383
403
 
384
404
  def convert_query_plan_to_group_tree(self, query_plan: Dataset) -> str:
405
+ logger.debug(f"Converting query plan to group tree for query_plan: {query_plan}")
385
406
  # Obtain ordered list of datasets
386
407
  dataset_nodes: list[Dataset | DataReader] = []
387
- node = deepcopy(query_plan)
408
+ node = query_plan.copy()
388
409
 
389
410
  # NOTE: the very first node will be a DataReader; the rest will be Dataset
390
411
  while isinstance(node, Dataset):
@@ -427,7 +448,8 @@ class Optimizer:
427
448
  # check that final_group_id is a singleton
428
449
  assert len(final_group_id) == 1
429
450
  final_group_id = final_group_id[0]
430
-
451
+ logger.debug(f"Converted query plan to group tree for query_plan: {query_plan}")
452
+ logger.debug(f"Final group id: {final_group_id}")
431
453
  return final_group_id
432
454
 
433
455
  def heuristic_optimization(self, group_id: int) -> None:
@@ -437,6 +459,8 @@ class Optimizer:
437
459
  pass
438
460
 
439
461
  def search_optimization_space(self, group_id: int) -> None:
462
+ logger.debug(f"Searching optimization space for group_id: {group_id}")
463
+
440
464
  # begin the search for an optimal plan with a task to optimize the final group
441
465
  initial_task = OptimizeGroup(group_id)
442
466
  self.tasks_stack.append(initial_task)
@@ -451,18 +475,23 @@ class Optimizer:
451
475
  new_tasks = task.perform(self.transformation_rules, self.implementation_rules)
452
476
  elif isinstance(task, ApplyRule):
453
477
  context = {"costed_phys_op_ids": self.costed_phys_op_ids}
454
- new_tasks = task.perform(self.groups, self.expressions, context=context, **self.get_physical_op_params())
478
+ new_tasks = task.perform(
479
+ self.groups, self.expressions, context=context, **self.get_physical_op_params()
480
+ )
455
481
  elif isinstance(task, OptimizePhysicalExpression):
456
- context = {"optimization_strategy_type": self.optimization_strategy_type}
482
+ context = {"optimizer_strategy": self.optimizer_strategy}
457
483
  new_tasks = task.perform(self.cost_model, self.groups, self.policy, context=context)
458
484
 
459
485
  self.tasks_stack.extend(new_tasks)
460
486
 
461
- def optimize(self, query_plan: Dataset, policy: Policy | None = None) -> list[PhysicalPlan]:
487
+ logger.debug(f"Done searching optimization space for group_id: {group_id}")
488
+
489
+ def optimize(self, query_plan: Dataset) -> list[PhysicalPlan]:
462
490
  """
463
491
  The optimize function takes in an initial query plan and searches the space of
464
492
  logical and physical plans in order to cost and produce a (near) optimal physical plan.
465
493
  """
494
+ logger.info(f"Optimizing query plan: {query_plan}")
466
495
  # compute the initial group tree for the user plan
467
496
  final_group_id = self.convert_query_plan_to_group_tree(query_plan)
468
497
 
@@ -472,6 +501,6 @@ class Optimizer:
472
501
 
473
502
  # search the optimization space by applying logical and physical transformations to the initial group tree
474
503
  self.search_optimization_space(final_group_id)
475
-
476
- return self.strategy.get_optimal_plans(self.groups, final_group_id, policy, self.use_final_op_quality)
477
-
504
+ logger.info(f"Getting optimal plans for final group id: {final_group_id}")
505
+
506
+ return self.strategy.get_optimal_plans(self.groups, final_group_id, self.policy, self.use_final_op_quality)
@@ -1,24 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from abc import ABC, abstractmethod
4
- from copy import deepcopy
5
- from enum import Enum
6
5
 
7
6
  from palimpzest.policy import Policy
8
7
  from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
9
8
 
10
-
11
- class OptimizationStrategyType(str, Enum):
12
- """
13
- OptimizationStrategyType determines which (set of) plan(s) the Optimizer
14
- will return to the Execution layer.
15
- """
16
- GREEDY = "greedy"
17
- CONFIDENCE_INTERVAL = "confidence-interval"
18
- PARETO = "pareto"
19
- SENTINEL = "sentinel"
20
- NONE = "none"
21
- AUTO = "auto"
9
+ logger = logging.getLogger(__name__)
22
10
 
23
11
 
24
12
  class OptimizationStrategy(ABC):
@@ -27,11 +15,6 @@ class OptimizationStrategy(ABC):
27
15
  """Strategy decides how to search through the groups for optimal plan(s)"""
28
16
  pass
29
17
 
30
- @classmethod
31
- def get_strategy(cls, strategy_type: str) -> OptimizationStrategy:
32
- """Factory method to create strategy instances"""
33
- return OptimizerStrategyRegistry.get_strategy(strategy_type)
34
-
35
18
  def normalize_final_plans(self, plans: list[PhysicalPlan]) -> list[PhysicalPlan]:
36
19
  """
37
20
  For each plan in `plans`, this function enforces that the input schema of every
@@ -47,7 +30,7 @@ class OptimizationStrategy(ABC):
47
30
  for plan in plans:
48
31
  normalized_ops = []
49
32
  for idx, op in enumerate(plan.operators):
50
- op_copy = deepcopy(op)
33
+ op_copy = op.copy()
51
34
  if idx == 0:
52
35
  normalized_ops.append(op_copy)
53
36
  else:
@@ -79,7 +62,12 @@ class GreedyStrategy(OptimizationStrategy):
79
62
  return PhysicalPlan.from_ops_and_sub_plan([best_phys_expr.operator], input_best_phys_plan, best_phys_expr.plan_cost)
80
63
 
81
64
  def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
82
- return [self._get_greedy_physical_plan(groups, final_group_id)]
65
+ logger.info(f"Getting greedy optimal plans for final group id: {final_group_id}")
66
+ plans = [self._get_greedy_physical_plan(groups, final_group_id)]
67
+ logger.info(f"Greedy optimal plans: {plans}")
68
+ logger.info(f"Done getting greedy optimal plans for final group id: {final_group_id}")
69
+
70
+ return plans
83
71
 
84
72
 
85
73
  class ParetoStrategy(OptimizationStrategy):
@@ -127,8 +115,9 @@ class ParetoStrategy(OptimizationStrategy):
127
115
  pareto_optimal_plans.append(plan)
128
116
 
129
117
  return pareto_optimal_plans
130
-
118
+
131
119
  def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
120
+ logger.info(f"Getting pareto optimal plans for final group id: {final_group_id}")
132
121
  # compute all of the pareto optimal physical plans
133
122
  plans = self._get_candidate_pareto_physical_plans(groups, final_group_id, policy)
134
123
 
@@ -138,7 +127,6 @@ class ParetoStrategy(OptimizationStrategy):
138
127
  plan.plan_cost.quality = plan.plan_cost.op_estimates.quality
139
128
 
140
129
  # filter pareto optimal plans for ones which satisfy policy constraint (if at least one of them does)
141
- # import pdb; pdb.set_trace()
142
130
  if any([policy.constraint(plan.plan_cost) for plan in plans]):
143
131
  plans = [plan for plan in plans if policy.constraint(plan.plan_cost)]
144
132
 
@@ -148,6 +136,8 @@ class ParetoStrategy(OptimizationStrategy):
148
136
  optimal_plan = optimal_plan if policy.choose(optimal_plan.plan_cost, plan.plan_cost) else plan
149
137
 
150
138
  plans = [optimal_plan]
139
+ logger.info(f"Pareto optimal plans: {plans}")
140
+ logger.info(f"Done getting pareto optimal plans for final group id: {final_group_id}")
151
141
  return plans
152
142
 
153
143
 
@@ -177,7 +167,11 @@ class SentinelStrategy(OptimizationStrategy):
177
167
  return SentinelPlan.from_ops_and_sub_plan([phys_op_set], best_phys_subplan)
178
168
 
179
169
  def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[SentinelPlan]:
180
- return [self._get_sentinel_plan(groups, final_group_id)]
170
+ logger.info(f"Getting sentinel optimal plans for final group id: {final_group_id}")
171
+ plans = [self._get_sentinel_plan(groups, final_group_id)]
172
+ logger.info(f"Sentinel optimal plans: {plans}")
173
+ logger.info(f"Done getting sentinel optimal plans for final group id: {final_group_id}")
174
+ return plans
181
175
 
182
176
 
183
177
  class NoOptimizationStrategy(GreedyStrategy):
@@ -186,76 +180,3 @@ class NoOptimizationStrategy(GreedyStrategy):
186
180
  logical transformations or optimizations. It uses the same get_optimal_plans logic as the
187
181
  GreedyOptimizationStrategy.
188
182
  """
189
-
190
-
191
- class ConfidenceIntervalStrategy(OptimizationStrategy):
192
- def _get_confidence_interval_optimal_plans(self, groups: dict, group_id: int) -> list[PhysicalPlan]:
193
- """
194
- Return all physical plans whose upper bound on the primary policy metric is greater than the
195
- best plan's lower bound on the primary policy metric (subject to satisfying the policy constraint).
196
-
197
- The OptimizePhysicalExpression task guarantees that each group's `ci_best_physical_expressions`
198
- maintains a list of expressions with overlapping CI's on the primary policy metric (while also
199
- satisfying the policy constraint).
200
-
201
- This function computes the cross-product of all such expressions across all groups.
202
- """
203
- # get all the physical expressions which could be the best for this group
204
- best_phys_exprs = groups[group_id].ci_best_physical_expressions
205
-
206
- best_plans = []
207
- for phys_expr in best_phys_exprs:
208
- # if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
209
- # create the physical plan and append it to the best_plans for this group
210
- if len(phys_expr.input_group_ids) == 0:
211
- plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=phys_expr.plan_cost)
212
- best_plans.append(plan)
213
-
214
- # otherwise, get the best physical plan(s) for this group's inputs
215
- else:
216
- # TODO: need to handle joins
217
- best_phys_subplans = [PhysicalPlan(operators=[])]
218
- for input_group_id in phys_expr.input_group_ids:
219
- input_best_phys_plans = self._get_confidence_interval_optimal_plans(groups, input_group_id)
220
- best_phys_subplans = [
221
- PhysicalPlan.from_ops_and_sub_plan(subplan.operators, input_subplan, subplan.plan_cost)
222
- for subplan in best_phys_subplans
223
- for input_subplan in input_best_phys_plans
224
- ]
225
-
226
- # add this operator to best physical plan and return
227
- for subplan in best_phys_subplans:
228
- plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, phys_expr.plan_cost)
229
- best_plans.append(plan)
230
-
231
- return best_plans
232
-
233
- def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
234
- # TODO: fix this to properly handle multiple potential plans
235
- raise Exception("NotImplementedError")
236
- # plans = self._get_confidence_interval_optimal_plans(final_group_id)
237
-
238
- class AutoOptimizationStrategy(OptimizationStrategy):
239
- def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
240
- raise NotImplementedError("Auto optimization strategy not implemented")
241
-
242
-
243
- class OptimizerStrategyRegistry:
244
- """Registry to map strategy types to their implementations"""
245
-
246
- _strategies: dict[str, type[OptimizationStrategy]] = {
247
- OptimizationStrategyType.GREEDY.value: GreedyStrategy,
248
- OptimizationStrategyType.CONFIDENCE_INTERVAL.value: ConfidenceIntervalStrategy,
249
- OptimizationStrategyType.PARETO.value: ParetoStrategy,
250
- OptimizationStrategyType.SENTINEL.value: SentinelStrategy,
251
- OptimizationStrategyType.NONE.value: NoOptimizationStrategy,
252
- OptimizationStrategyType.AUTO.value: AutoOptimizationStrategy,
253
- }
254
-
255
- @classmethod
256
- def get_strategy(cls, strategy_type: str) -> OptimizationStrategy:
257
- """Get strategy instance by type"""
258
- strategy_class = cls._strategies.get(strategy_type)
259
- if not strategy_class:
260
- raise ValueError(f"Unknown optimization strategy: {strategy_type}")
261
- return strategy_class()
@@ -0,0 +1,37 @@
1
+ from enum import Enum
2
+
3
+ from palimpzest.query.optimizer.optimizer_strategy import (
4
+ GreedyStrategy,
5
+ NoOptimizationStrategy,
6
+ ParetoStrategy,
7
+ SentinelStrategy,
8
+ )
9
+
10
+
11
+ class OptimizationStrategyType(Enum):
12
+ """
13
+ OptimizationStrategyType determines which (set of) plan(s) the Optimizer
14
+ will return to the Execution layer.
15
+ """
16
+ GREEDY = GreedyStrategy
17
+ PARETO = ParetoStrategy
18
+ SENTINEL = SentinelStrategy
19
+ NONE = NoOptimizationStrategy
20
+
21
+ def no_transformation(self) -> bool:
22
+ """
23
+ Return True if this optimization strategy does not transform the logical plan.
24
+ """
25
+ return self in [OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE]
26
+
27
+ def is_pareto(self) -> bool:
28
+ """
29
+ Return True if this optimization strategy uses Pareto optimization.
30
+ """
31
+ return self == OptimizationStrategyType.PARETO
32
+
33
+ def is_not_pareto(self) -> bool:
34
+ """
35
+ Return True if this optimization strategy does not use Pareto optimization.
36
+ """
37
+ return not self.is_pareto()
@@ -106,8 +106,7 @@ class SentinelPlan(Plan):
106
106
  # store operator_sets and logical_op_ids; sort operator_sets internally by op_id
107
107
  self.operator_sets = operator_sets
108
108
  self.operator_sets = [sorted(op_set, key=lambda op: op.get_op_id()) for op_set in self.operator_sets]
109
- self.logical_op_ids = [op_set[0].logical_op_id for op_set in operator_sets]
110
- self.logical_op_names = [op_set[0].logical_op_name for op_set in operator_sets]
109
+ self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
111
110
  self.plan_id = self.compute_plan_id()
112
111
 
113
112
  def compute_plan_id(self) -> str:
@@ -151,7 +150,7 @@ class SentinelPlan(Plan):
151
150
  return self.logical_op_ids[slice], self.operator_sets[slice]
152
151
 
153
152
  def __iter__(self):
154
- yield from zip(self.logical_op_ids, self.logical_op_names, self.operator_sets)
153
+ yield from zip(self.logical_op_ids, self.operator_sets)
155
154
 
156
155
  def __len__(self):
157
156
  return len(self.logical_op_ids)
@@ -42,9 +42,12 @@ class Expression:
42
42
  def __eq__(self, other):
43
43
  return self.operator == other.operator and self.input_group_ids == other.input_group_ids
44
44
 
45
- def __hash__(self):
45
+ def __str__(self):
46
46
  op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_op_id()
47
- hash_str = str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
47
+ return str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
48
+
49
+ def __hash__(self):
50
+ hash_str = self.__str__()
48
51
  hash_id = int(hash_for_id(hash_str), 16)
49
52
  return hash_id
50
53
 
@@ -80,7 +83,6 @@ class Group:
80
83
  self.explored = False
81
84
  self.best_physical_expression: PhysicalExpression | None = None
82
85
  self.pareto_optimal_physical_expressions: list[PhysicalExpression] | None = None
83
- self.ci_best_physical_expressions: list[PhysicalExpression] | None = None
84
86
  self.optimized = False
85
87
 
86
88
  # properties of the Group which distinguish it from groups w/identical fields,