palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +343 -209
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +639 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +62 -6
  19. palimpzest/prompts/filter_prompts.py +51 -6
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
  22. palimpzest/prompts/prompt_factory.py +375 -47
  23. palimpzest/prompts/split_proposer_prompts.py +1 -1
  24. palimpzest/prompts/util_phrases.py +5 -0
  25. palimpzest/prompts/validator.py +239 -0
  26. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  27. palimpzest/query/execution/execution_strategy.py +210 -317
  28. palimpzest/query/execution/execution_strategy_type.py +5 -7
  29. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  30. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  31. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  32. palimpzest/query/generators/generators.py +160 -331
  33. palimpzest/query/operators/__init__.py +15 -5
  34. palimpzest/query/operators/aggregate.py +50 -33
  35. palimpzest/query/operators/compute.py +201 -0
  36. palimpzest/query/operators/convert.py +33 -19
  37. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  38. palimpzest/query/operators/distinct.py +62 -0
  39. palimpzest/query/operators/filter.py +26 -16
  40. palimpzest/query/operators/join.py +403 -0
  41. palimpzest/query/operators/limit.py +3 -3
  42. palimpzest/query/operators/logical.py +205 -77
  43. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  44. palimpzest/query/operators/physical.py +27 -21
  45. palimpzest/query/operators/project.py +3 -3
  46. palimpzest/query/operators/rag_convert.py +7 -7
  47. palimpzest/query/operators/retrieve.py +9 -9
  48. palimpzest/query/operators/scan.py +81 -42
  49. palimpzest/query/operators/search.py +524 -0
  50. palimpzest/query/operators/split_convert.py +10 -8
  51. palimpzest/query/optimizer/__init__.py +7 -9
  52. palimpzest/query/optimizer/cost_model.py +108 -441
  53. palimpzest/query/optimizer/optimizer.py +123 -181
  54. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  55. palimpzest/query/optimizer/plan.py +352 -67
  56. palimpzest/query/optimizer/primitives.py +43 -19
  57. palimpzest/query/optimizer/rules.py +484 -646
  58. palimpzest/query/optimizer/tasks.py +127 -58
  59. palimpzest/query/processor/config.py +42 -76
  60. palimpzest/query/processor/query_processor.py +73 -18
  61. palimpzest/query/processor/query_processor_factory.py +46 -38
  62. palimpzest/schemabuilder/schema_builder.py +15 -28
  63. palimpzest/utils/model_helpers.py +32 -77
  64. palimpzest/utils/progress.py +114 -102
  65. palimpzest/validator/__init__.py +0 -0
  66. palimpzest/validator/validator.py +306 -0
  67. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
  68. palimpzest-0.8.1.dist-info/RECORD +95 -0
  69. palimpzest/core/lib/fields.py +0 -141
  70. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  71. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  72. palimpzest/query/generators/api_client_factory.py +0 -30
  73. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  74. palimpzest/query/operators/map.py +0 -130
  75. palimpzest/query/processor/nosentinel_processor.py +0 -33
  76. palimpzest/query/processor/processing_strategy_type.py +0 -28
  77. palimpzest/query/processor/sentinel_processor.py +0 -88
  78. palimpzest/query/processor/streaming_processor.py +0 -149
  79. palimpzest/sets.py +0 -405
  80. palimpzest/utils/datareader_helpers.py +0 -61
  81. palimpzest/utils/demo_helpers.py +0 -75
  82. palimpzest/utils/field_helpers.py +0 -69
  83. palimpzest/utils/generation_helpers.py +0 -69
  84. palimpzest/utils/sandbox.py +0 -183
  85. palimpzest-0.7.21.dist-info/RECORD +0 -95
  86. palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
  88. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
  89. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
@@ -3,8 +3,10 @@ from __future__ import annotations
3
3
  import logging
4
4
  from typing import Any
5
5
 
6
- from palimpzest.core.data.dataclasses import PlanCost
6
+ from palimpzest.core.models import PlanCost
7
7
  from palimpzest.policy import Policy
8
+ from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
9
+ from palimpzest.query.operators.join import JoinOp
8
10
  from palimpzest.query.optimizer.cost_model import BaseCostModel
9
11
  from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
10
12
  from palimpzest.query.optimizer.primitives import Expression, Group
@@ -120,14 +122,15 @@ class OptimizeLogicalExpression(Task):
120
122
 
121
123
  def perform(
122
124
  self,
123
- transformation_rules: list[TransformationRule],
124
- implementation_rules: list[ImplementationRule],
125
+ transformation_rules: list[type[TransformationRule]],
126
+ implementation_rules: list[type[ImplementationRule]],
125
127
  context: dict[str, Any] | None = None,
126
128
  ) -> list[Task]:
127
129
  logger.debug(f"Optimizing logical expression {self.logical_expression}")
128
- # if we're exploring, only apply transformation rules
129
130
  if context is None:
130
131
  context = {}
132
+
133
+ # if we're exploring, only apply transformation rules
131
134
  rules = transformation_rules if self.exploring else transformation_rules + implementation_rules
132
135
 
133
136
  # filter out rules that have already been applied to logical expression
@@ -170,7 +173,7 @@ class ApplyRule(Task):
170
173
  - schedule OptimizePhysicalExpression tasks
171
174
  """
172
175
 
173
- def __init__(self, rule: Rule, logical_expression: Expression, exploring: bool = False):
176
+ def __init__(self, rule: type[Rule], logical_expression: Expression, exploring: bool = False):
174
177
  self.rule = rule
175
178
  self.logical_expression = logical_expression
176
179
  self.exploring = exploring
@@ -183,16 +186,13 @@ class ApplyRule(Task):
183
186
  **physical_op_params,
184
187
  ) -> tuple[list[Task], int]:
185
188
  logger.debug(f"Applying rule {self.rule} to logical expression {self.logical_expression}")
186
-
187
- # check if rule has already been applied to this logical expression; return [] if so
188
189
  if context is None:
189
190
  context = {}
191
+
192
+ # check if rule has already been applied to this logical expression; return [] if so
190
193
  if self.rule.get_rule_id() in self.logical_expression.rules_applied:
191
194
  return []
192
195
 
193
- # MAYBE ?TODO?: iterate over bindings for logical expression and rule?
194
- # perhaps some rules can be applied more than once to an expression?
195
-
196
196
  # get the group of the logical expression
197
197
  group_id = self.logical_expression.group_id
198
198
  group = groups[group_id]
@@ -206,8 +206,8 @@ class ApplyRule(Task):
206
206
  )
207
207
 
208
208
  # filter out any expressions which are duplicates (i.e. they've been previously computed)
209
- new_expressions = [expr for expr in new_expressions if expr.get_expr_id() not in expressions]
210
- expressions.update({expr.get_expr_id(): expr for expr in new_expressions})
209
+ new_expressions = [expr for expr in new_expressions if expr.expr_id not in expressions]
210
+ expressions.update({expr.expr_id: expr for expr in new_expressions})
211
211
 
212
212
  # add all new groups to the groups mapping
213
213
  for group in new_groups:
@@ -234,11 +234,11 @@ class ApplyRule(Task):
234
234
  else:
235
235
  # apply implementation rule
236
236
  new_expressions = self.rule.substitute(self.logical_expression, **physical_op_params)
237
- new_expressions = [expr for expr in new_expressions if expr.get_expr_id() not in expressions]
237
+ new_expressions = [expr for expr in new_expressions if expr.expr_id not in expressions]
238
238
  costed_full_op_ids = context['costed_full_op_ids']
239
239
  if costed_full_op_ids is not None:
240
240
  new_expressions = [expr for expr in new_expressions if expr.operator.get_full_op_id() in costed_full_op_ids]
241
- expressions.update({expr.get_expr_id(): expr for expr in new_expressions})
241
+ expressions.update({expr.expr_id: expr for expr in new_expressions})
242
242
  group.physical_expressions.update(new_expressions)
243
243
 
244
244
  # create new task
@@ -412,8 +412,9 @@ class OptimizePhysicalExpression(Task):
412
412
  if context is None:
413
413
  context = {}
414
414
 
415
- # get the optimizer strategy (type) from the context
415
+ # get the optimizer strategy (type) and the execution strategy (type) from the context
416
416
  optimizer_strategy: OptimizationStrategyType = context['optimizer_strategy']
417
+ execution_strategy: ExecutionStrategyType = context['execution_strategy']
417
418
 
418
419
  # return if we've already computed the cost of this physical expression
419
420
  if optimizer_strategy.is_pareto() and self.physical_expression.pareto_optimal_plan_costs is not None:
@@ -422,57 +423,90 @@ class OptimizePhysicalExpression(Task):
422
423
  if optimizer_strategy.is_not_pareto() and self.physical_expression.plan_cost is not None:
423
424
  return []
424
425
 
425
- # for expressions with an input group, compute the input plan cost(s)
426
- best_input_plan_cost = PlanCost(cost=0, time=0, quality=1)
427
- input_plan_costs = [PlanCost(cost=0, time=0, quality=1)]
426
+ # for expressions with input group(s), compute the input plan cost(s)
427
+ best_input_plan_costs = {}
428
+ pareto_optimal_input_plan_costs = {}
428
429
  if len(self.physical_expression.input_group_ids) > 0:
429
- # get the input group
430
- input_group_id = self.physical_expression.input_group_ids[0] # TODO: need to handle joins
431
- input_group = groups[input_group_id]
432
-
433
- # compute the input plan cost or list of input plan costs
434
430
  new_tasks = []
435
- if optimizer_strategy.is_not_pareto() and input_group.best_physical_expression is not None:
436
- # TODO: apply policy constraint here
437
- best_input_plan_cost = input_group.best_physical_expression.plan_cost
438
-
439
- elif optimizer_strategy.is_pareto() and input_group.pareto_optimal_physical_expressions is not None:
440
- # TODO: apply policy constraint here
441
- input_plan_costs = []
442
- for pareto_physical_expression in input_group.pareto_optimal_physical_expressions:
443
- plan_costs = list(map(lambda tup: tup[0], pareto_physical_expression.pareto_optimal_plan_costs))
444
- input_plan_costs.extend(plan_costs)
445
-
446
- # NOTE: this list will not necessarily be pareto-optimal, as a plan cost on the pareto frontier of
447
- # one pareto_optimal_physical_expression might be dominated by the plan cost on another physical
448
- # expression's pareto frontier; we handle this below by taking the pareto frontier of all_possible_plan_costs
449
- # de-duplicate equivalent plan costs; we will still reconstruct plans with equivalent cost in optimizer.py
450
- input_plan_costs = list(set(input_plan_costs))
451
-
452
- else:
453
- task = OptimizeGroup(input_group_id)
454
- new_tasks.append(task)
431
+ for input_group_id in self.physical_expression.input_group_ids:
432
+ # get the input group
433
+ input_group = groups[input_group_id]
434
+
435
+ # compute the input plan cost or list of input plan costs
436
+ if optimizer_strategy.is_not_pareto() and input_group.best_physical_expression is not None:
437
+ # TODO: apply policy constraint here
438
+ best_input_plan_costs[input_group_id] = input_group.best_physical_expression.plan_cost
439
+
440
+ elif optimizer_strategy.is_pareto() and input_group.pareto_optimal_physical_expressions is not None:
441
+ # TODO: apply policy constraint here
442
+ input_plan_costs = []
443
+ for pareto_physical_expression in input_group.pareto_optimal_physical_expressions:
444
+ plan_costs = list(map(lambda tup: tup[0], pareto_physical_expression.pareto_optimal_plan_costs))
445
+ input_plan_costs.extend(plan_costs)
446
+
447
+ # NOTE: this list will not necessarily be pareto-optimal, as a plan cost on the pareto frontier of
448
+ # one pareto_optimal_physical_expression might be dominated by the plan cost on another physical
449
+ # expression's pareto frontier; we handle this below by taking the pareto frontier of all_possible_plan_costs
450
+ # de-duplicate equivalent plan costs; we will still reconstruct plans with equivalent cost in optimizer.py
451
+ pareto_optimal_input_plan_costs[input_group_id] = list(set(input_plan_costs))
452
+
453
+ else:
454
+ task = OptimizeGroup(input_group_id)
455
+ new_tasks.append(task)
455
456
 
456
457
  # if not all input groups have been costed, we need to compute these first and then retry this task
457
458
  if len(new_tasks) > 0:
458
459
  return [self] + new_tasks
459
460
 
461
+ # once all input groups have been costed, compute the cost of this physical expression
460
462
  group = groups[self.physical_expression.group_id]
461
463
  if optimizer_strategy.is_pareto():
462
464
  # compute all possible plan costs for this physical expression given the pareto optimal input plan costs
463
465
  all_possible_plan_costs = []
464
- for input_plan_cost in input_plan_costs:
465
- op_plan_cost = cost_model(self.physical_expression.operator, input_plan_cost.op_estimates)
466
+ if isinstance(self.physical_expression.operator, JoinOp):
467
+ assert len(self.physical_expression.input_group_ids) == 2, "Join operator must have exactly two input groups."
468
+
469
+ # get the best input plan costs for both inputs
470
+ left_input_group_id, right_input_group_id = self.physical_expression.input_group_ids
471
+ left_best_input_plan_cost = pareto_optimal_input_plan_costs[left_input_group_id]
472
+ right_best_input_plan_cost = pareto_optimal_input_plan_costs[right_input_group_id]
473
+ for left_input_plan_cost in left_best_input_plan_cost:
474
+ for right_input_plan_cost in right_best_input_plan_cost:
475
+ # compute the cost of this operator given the input plan costs
476
+ op_plan_cost = cost_model(
477
+ self.physical_expression.operator,
478
+ left_input_plan_cost.op_estimates,
479
+ right_input_plan_cost.op_estimates,
480
+ )
481
+
482
+ # compute the total cost for this physical expression by summing its operator's PlanCost
483
+ # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
484
+ execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
485
+ full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy)
486
+ full_plan_cost.op_estimates = op_plan_cost.op_estimates
487
+ all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))
466
488
 
467
- # compute the total cost for this physical expression by summing its operator's PlanCost
468
- # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
469
- full_plan_cost = op_plan_cost + input_plan_cost
470
- full_plan_cost.op_estimates = op_plan_cost.op_estimates
471
- all_possible_plan_costs.append((full_plan_cost, input_plan_cost))
489
+ else:
490
+ assert len(self.physical_expression.input_group_ids) < 2, "Non-join operator must have zero or one input groups."
491
+
492
+ input_plan_costs = [PlanCost(cost=0, time=0, quality=1)]
493
+ if len(self.physical_expression.input_group_ids) == 1:
494
+ input_group_id = self.physical_expression.input_group_ids[0]
495
+ input_plan_costs = pareto_optimal_input_plan_costs[input_group_id]
496
+
497
+ # get the pareto-optimal input plan costs for the single input
498
+ for input_plan_cost in input_plan_costs:
499
+ op_plan_cost = cost_model(self.physical_expression.operator, input_plan_cost.op_estimates)
500
+
501
+ # compute the total cost for this physical expression by summing its operator's PlanCost
502
+ # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
503
+ full_plan_cost = op_plan_cost + input_plan_cost
504
+ full_plan_cost.op_estimates = op_plan_cost.op_estimates
505
+ all_possible_plan_costs.append((full_plan_cost, (input_plan_cost, None)))
472
506
 
473
507
  # reduce the set of possible plan costs to the subset which are pareto-optimal
474
508
  pareto_optimal_plan_costs = []
475
- for idx, (plan_cost, input_plan_cost) in enumerate(all_possible_plan_costs):
509
+ for idx, (plan_cost, input_plan_cost_tuple) in enumerate(all_possible_plan_costs):
476
510
  pareto_optimal = True
477
511
 
478
512
  # check if any other_expr dominates expr
@@ -487,7 +521,7 @@ class OptimizePhysicalExpression(Task):
487
521
 
488
522
  # add expr to pareto frontier if it's not dominated
489
523
  if pareto_optimal:
490
- pareto_optimal_plan_costs.append((plan_cost, input_plan_cost))
524
+ pareto_optimal_plan_costs.append((plan_cost, input_plan_cost_tuple))
491
525
 
492
526
  # set the pareto frontier of plan costs which can be obtained by this physical expression
493
527
  self.physical_expression.pareto_optimal_plan_costs = pareto_optimal_plan_costs
@@ -496,13 +530,48 @@ class OptimizePhysicalExpression(Task):
496
530
  group = self.update_pareto_optimal_physical_expressions(group, policy)
497
531
 
498
532
  else:
499
- # otherwise, compute the cost of this operator given the optimal input plan cost
500
- op_plan_cost = cost_model(self.physical_expression.operator, best_input_plan_cost.op_estimates)
501
533
 
502
- # compute the total cost for this physical expression by summing its operator's PlanCost
503
- # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
504
- full_plan_cost = op_plan_cost + best_input_plan_cost
505
- full_plan_cost.op_estimates = op_plan_cost.op_estimates
534
+ # otherwise, compute the cost of this operator given the optimal input plan cost(s)
535
+ full_plan_cost = None
536
+ if isinstance(self.physical_expression.operator, JoinOp):
537
+ assert len(self.physical_expression.input_group_ids) == 2, "Join operator must have exactly two input groups."
538
+
539
+ # get the best input plan costs for both inputs
540
+ left_input_group_id, right_input_group_id = self.physical_expression.input_group_ids
541
+ left_best_input_plan_cost = best_input_plan_costs[left_input_group_id]
542
+ right_best_input_plan_cost = best_input_plan_costs[right_input_group_id]
543
+
544
+ # compute the cost of this operator given the best input plan costs
545
+ op_plan_cost = cost_model(
546
+ self.physical_expression.operator,
547
+ left_best_input_plan_cost.op_estimates,
548
+ right_best_input_plan_cost.op_estimates,
549
+ )
550
+
551
+ # compute the total cost for this physical expression by summing its operator's PlanCost
552
+ # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
553
+ execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
554
+ full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy)
555
+ full_plan_cost.op_estimates = op_plan_cost.op_estimates
556
+
557
+ else:
558
+ assert len(self.physical_expression.input_group_ids) < 2, "Non-join operator must have zero or one input groups."
559
+
560
+ # get the best input plan cost for the single input
561
+ best_input_plan_cost = PlanCost(cost=0, time=0, quality=1)
562
+ if len(self.physical_expression.input_group_ids) == 1:
563
+ input_group_id = self.physical_expression.input_group_ids[0]
564
+ best_input_plan_cost = best_input_plan_costs[input_group_id]
565
+
566
+ # compute the cost of this operator given the best input plan cost
567
+ op_plan_cost = cost_model(self.physical_expression.operator, best_input_plan_cost.op_estimates)
568
+
569
+ # compute the total cost for this physical expression by summing its operator's PlanCost
570
+ # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
571
+ full_plan_cost = op_plan_cost + best_input_plan_cost
572
+ full_plan_cost.op_estimates = op_plan_cost.op_estimates
573
+
574
+ # set the plan cost for this physical expression
506
575
  self.physical_expression.plan_cost = full_plan_cost
507
576
 
508
577
  # update the best physical expression for the group
@@ -1,86 +1,52 @@
1
- import json
2
- from dataclasses import dataclass, field
1
+ from pydantic import BaseModel, ConfigDict, Field
3
2
 
4
3
  from palimpzest.constants import Model
5
- from palimpzest.core.data.datareaders import DataReader
6
4
  from palimpzest.policy import MaxQuality, Policy
7
5
 
8
6
 
9
- # TODO: Separate out the config for the Optimizer, ExecutionStrategy, and QueryProcessor
10
7
  # TODO: Add description for each field.
11
- @dataclass
12
- class QueryProcessorConfig:
8
+ class QueryProcessorConfig(BaseModel):
13
9
  """Shared context for query processors"""
14
- processing_strategy: str = field(default="auto") # substituted with ProcessingStrategyType
15
- execution_strategy: str = field(default="sequential") # substituted with ExecutionStrategyType
16
- sentinel_execution_strategy: str | None = field(default="auto") # substituted with SentinelExecutionStrategyType
17
- optimizer_strategy: str = field(default="pareto") # substituted with OptimizationStrategyType
18
-
19
- val_datasource: DataReader | None = field(default=None)
20
-
21
- policy: Policy = field(default_factory=MaxQuality)
22
- scan_start_idx: int = field(default=0)
23
- num_samples: int = field(default=None)
24
- cache: bool = field(default=False) # NOTE: until we properly implement caching, let's set the default to False
25
- verbose: bool = field(default=False)
26
- progress: bool = field(default=True)
27
- available_models: list[Model] | None = field(default=None)
28
-
29
- max_workers: int | None = field(default=None)
30
-
31
- allow_bonded_query: bool = field(default=True)
32
- allow_model_selection: bool = field(default=True)
33
- allow_code_synth: bool = field(default=False)
34
- allow_rag_reduction: bool = field(default=True)
35
- allow_mixtures: bool = field(default=True)
36
- allow_critic: bool = field(default=True)
37
- allow_split_merge: bool = field(default=False)
38
- use_final_op_quality: bool = field(default=False)
39
-
40
- kwargs: dict = field(default_factory=dict)
10
+ model_config = ConfigDict(arbitrary_types_allowed=True)
11
+
12
+ # execution and optimization flags
13
+ execution_strategy: str = Field(default="parallel") # substituted with ExecutionStrategyType
14
+ sentinel_execution_strategy: str | None = Field(default="auto") # substituted with SentinelExecutionStrategyType
15
+ optimizer_strategy: str = Field(default="pareto") # substituted with OptimizationStrategyType
16
+
17
+ # general execution flags
18
+ policy: Policy = Field(default_factory=MaxQuality)
19
+ scan_start_idx: int = Field(default=0)
20
+ num_samples: int = Field(default=None)
21
+ verbose: bool = Field(default=False)
22
+ progress: bool = Field(default=True)
23
+ available_models: list[Model] | None = Field(default=None)
24
+ remove_models: list[Model] | None = Field(default=None)
25
+ max_workers: int | None = Field(default=64)
26
+ join_parallelism: int = Field(default=64)
27
+ batch_size: int | None = Field(default=None)
28
+ reasoning_effort: str | None = Field(default=None) # Gemini: "disable", "low", "medium", "high"
29
+ use_vertex: bool = Field(default=True) # Whether to use Vertex models for Gemini or Google models
30
+ gemini_credentials_path: str | None = Field(default=None) # Path to Gemini credentials file
31
+ api_base: str | None = Field(default=None) # API base URL for vLLM
32
+
33
+ # operator flags
34
+ allow_bonded_query: bool = Field(default=True)
35
+ allow_model_selection: bool = Field(default=True)
36
+ allow_rag_reduction: bool = Field(default=True)
37
+ allow_mixtures: bool = Field(default=True)
38
+ allow_critic: bool = Field(default=True)
39
+ allow_split_merge: bool = Field(default=False)
40
+ use_final_op_quality: bool = Field(default=False)
41
+
42
+ # sentinel optimization flags
43
+ k: int = Field(default=5)
44
+ j: int = Field(default=5)
45
+ sample_budget: int = Field(default=100)
46
+ seed: int = Field(default=42)
47
+ exp_name: str | None = Field(default=None)
48
+ priors: dict | None = Field(default=None)
41
49
 
42
50
  def to_dict(self) -> dict:
43
51
  """Convert the config to a dict representation."""
44
- return {
45
- "processing_strategy": self.processing_strategy,
46
- "execution_strategy": self.execution_strategy,
47
- "sentinel_execution_strategy": self.sentinel_execution_strategy,
48
- "optimizer_strategy": self.optimizer_strategy,
49
- "val_datasource": self.val_datasource,
50
- "policy": self.policy,
51
- "scan_start_idx": self.scan_start_idx,
52
- "num_samples": self.num_samples,
53
- "cache": self.cache,
54
- "verbose": self.verbose,
55
- "progress": self.progress,
56
- "available_models": self.available_models,
57
- "max_workers": self.max_workers,
58
- "allow_bonded_query": self.allow_bonded_query,
59
- "allow_model_selection": self.allow_model_selection,
60
- "allow_code_synth": self.allow_code_synth,
61
- "allow_rag_reduction": self.allow_rag_reduction,
62
- "allow_mixtures": self.allow_mixtures,
63
- "allow_critic": self.allow_critic,
64
- "allow_split_merge": self.allow_split_merge,
65
- "use_final_op_quality": self.use_final_op_quality,
66
- **self.kwargs,
67
- }
68
-
69
- def to_json_str(self):
70
- """Convert the config to a JSON string representation."""
71
- config_dict = self.to_dict()
72
- config_dict["val_datasource"] = (
73
- None if self.val_datasource is None else self.val_datasource.serialize()
74
- )
75
- config_dict["policy"] = self.policy.to_json_str()
76
- for strategy in ["processing_strategy", "execution_strategy", "sentinel_execution_strategy", "optimizer_strategy"]:
77
- config_dict[strategy] = str(config_dict[strategy])
78
-
79
- return json.dumps(config_dict, indent=2)
80
-
81
- def update(self, **kwargs) -> None:
82
- for key, value in kwargs.items():
83
- if hasattr(self, key):
84
- setattr(self, key, value)
85
-
86
- self.kwargs.update(kwargs)
52
+ return self.model_dump()
@@ -1,15 +1,16 @@
1
1
  import logging
2
- from abc import abstractmethod
3
2
 
4
- from palimpzest.core.data.dataclasses import PlanStats
5
- from palimpzest.core.data.datareaders import DataReader
3
+ from palimpzest.core.data.dataset import Dataset
6
4
  from palimpzest.core.elements.records import DataRecord, DataRecordCollection
5
+ from palimpzest.core.models import ExecutionStats, PlanStats
7
6
  from palimpzest.policy import Policy
8
7
  from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
8
+ from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
9
9
  from palimpzest.query.optimizer.optimizer import Optimizer
10
- from palimpzest.sets import Dataset
10
+ from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
11
+ from palimpzest.query.optimizer.plan import SentinelPlan
11
12
  from palimpzest.utils.hash_helpers import hash_for_id
12
- from palimpzest.utils.model_helpers import get_models
13
+ from palimpzest.validator.validator import Validator
13
14
 
14
15
  logger = logging.getLogger(__name__)
15
16
 
@@ -27,15 +28,15 @@ class QueryProcessor:
27
28
  execution_strategy: ExecutionStrategy,
28
29
  sentinel_execution_strategy: SentinelExecutionStrategy | None,
29
30
  num_samples: int | None = None,
30
- val_datasource: DataReader | None = None,
31
+ train_dataset: dict[str, Dataset] | None = None,
32
+ validator: Validator | None = None,
31
33
  scan_start_idx: int = 0,
32
- cache: bool = False,
33
34
  verbose: bool = False,
34
35
  progress: bool = True,
35
36
  max_workers: int | None = None,
36
37
  policy: Policy | None = None,
37
38
  available_models: list[str] | None = None,
38
- **kwargs,
39
+ **kwargs, # needed in order to provide compatibility with QueryProcessorConfig
39
40
  ):
40
41
  """
41
42
  Initialize QueryProcessor with optional custom components.
@@ -48,20 +49,15 @@ class QueryProcessor:
48
49
  self.optimizer = optimizer
49
50
  self.execution_strategy = execution_strategy
50
51
  self.sentinel_execution_strategy = sentinel_execution_strategy
51
-
52
52
  self.num_samples = num_samples
53
- self.val_datasource = val_datasource
53
+ self.train_dataset = train_dataset
54
+ self.validator = validator
54
55
  self.scan_start_idx = scan_start_idx
55
- self.cache = cache
56
56
  self.verbose = verbose
57
57
  self.progress = progress
58
58
  self.max_workers = max_workers
59
-
60
59
  self.policy = policy
61
-
62
60
  self.available_models = available_models
63
- if self.available_models is None or len(self.available_models) == 0:
64
- self.available_models = get_models(include_vision=True)
65
61
 
66
62
  if self.verbose:
67
63
  print("Available models: ", self.available_models)
@@ -80,6 +76,26 @@ class QueryProcessor:
80
76
 
81
77
  return hash_for_id(id_str)
82
78
 
79
+ def _create_sentinel_plan(self, train_dataset: dict[str, Dataset] | None) -> SentinelPlan:
80
+ """
81
+ Generates and returns a SentinelPlan for the given dataset.
82
+ """
83
+ # create a new optimizer and update its strategy to SENTINEL
84
+ optimizer = self.optimizer.deepcopy_clean()
85
+ optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
86
+
87
+ # create copy of dataset, but change its root Dataset(s) to the validation Dataset(s)
88
+ dataset = self.dataset.copy()
89
+ if train_dataset is not None:
90
+ dataset._set_root_datasets(train_dataset)
91
+ dataset._generate_unique_logical_op_ids()
92
+
93
+ # get the sentinel plan for the given dataset
94
+ sentinel_plans = optimizer.optimize(dataset)
95
+ sentinel_plan = sentinel_plans[0]
96
+
97
+ return sentinel_plan
98
+
83
99
  def _execute_best_plan(self, dataset: Dataset, optimizer: Optimizer) -> tuple[list[DataRecord], list[PlanStats]]:
84
100
  # get the optimal plan according to the optimizer
85
101
  plans = optimizer.optimize(dataset)
@@ -91,7 +107,46 @@ class QueryProcessor:
91
107
  # return the output records and plan stats
92
108
  return records, [plan_stats]
93
109
 
94
- # TODO: consider to support dry_run.
95
- @abstractmethod
96
110
  def execute(self) -> DataRecordCollection:
97
- raise NotImplementedError("Abstract method to be overwritten by sub-classes")
111
+ logger.info(f"Executing {self.__class__.__name__}")
112
+
113
+ # create execution stats
114
+ execution_stats = ExecutionStats(execution_id=self.execution_id())
115
+ execution_stats.start()
116
+
117
+ # if the user provides a train_dataset or validator, we perform optimization
118
+ if self.train_dataset is not None or self.validator is not None:
119
+ # create sentinel plan
120
+ sentinel_plan = self._create_sentinel_plan(self.train_dataset)
121
+
122
+ # generate sample execution data
123
+ if self.train_dataset is not None:
124
+ sentinel_plan_stats = self.sentinel_execution_strategy.execute_sentinel_plan(sentinel_plan, self.train_dataset, self.validator)
125
+
126
+ else:
127
+ train_dataset = self.dataset._get_root_datasets()
128
+ sentinel_plan_stats = self.sentinel_execution_strategy.execute_sentinel_plan(sentinel_plan, train_dataset, self.validator)
129
+
130
+ # update the execution stats to account for the work done in optimization
131
+ execution_stats.add_plan_stats(sentinel_plan_stats)
132
+ execution_stats.finish_optimization()
133
+
134
+ # (re-)initialize the optimizer
135
+ self.optimizer = self.optimizer.deepcopy_clean()
136
+
137
+ # construct the CostModel with any sample execution data we've gathered
138
+ cost_model = SampleBasedCostModel(sentinel_plan_stats, self.verbose)
139
+ self.optimizer.update_cost_model(cost_model)
140
+
141
+ # execute plan(s) according to the optimization strategy
142
+ records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)
143
+
144
+ # update the execution stats to account for the work to execute the final plan
145
+ execution_stats.add_plan_stats(plan_stats)
146
+ execution_stats.finish()
147
+
148
+ # construct and return the DataRecordCollection
149
+ result = DataRecordCollection(records, execution_stats=execution_stats)
150
+ logger.info(f"Done executing {self.__class__.__name__}")
151
+
152
+ return result