palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.20.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
5
5
 
6
6
  from palimpzest.policy import Policy
7
7
  from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
8
+ from palimpzest.query.optimizer.primitives import Group
8
9
 
9
10
  logger = logging.getLogger(__name__)
10
11
 
@@ -15,31 +16,6 @@ class OptimizationStrategy(ABC):
15
16
  """Strategy decides how to search through the groups for optimal plan(s)"""
16
17
  pass
17
18
 
18
- def normalize_final_plans(self, plans: list[PhysicalPlan]) -> list[PhysicalPlan]:
19
- """
20
- For each plan in `plans`, this function enforces that the input schema of every
21
- operator is the output schema of the previous operator in the plan.
22
-
23
- Args:
24
- plans list[PhysicalPlan]: list of physical plans to normalize
25
-
26
- Returns:
27
- list[PhysicalPlan]: list of normalized physical plans
28
- """
29
- normalized_plans = []
30
- for plan in plans:
31
- normalized_ops = []
32
- for idx, op in enumerate(plan.operators):
33
- op_copy = op.copy()
34
- if idx == 0:
35
- normalized_ops.append(op_copy)
36
- else:
37
- op_copy.input_schema = plan.operators[-1].output_schema
38
- normalized_ops.append(op_copy)
39
- normalized_plans.append(PhysicalPlan(operators=normalized_ops, plan_cost=plan.plan_cost))
40
-
41
- return normalized_plans
42
-
43
19
 
44
20
  class GreedyStrategy(OptimizationStrategy):
45
21
  def _get_greedy_physical_plan(self, groups: dict, group_id: int) -> PhysicalPlan:
@@ -49,17 +25,35 @@ class GreedyStrategy(OptimizationStrategy):
49
25
  # get the best physical expression for this group
50
26
  best_phys_expr = groups[group_id].best_physical_expression
51
27
 
52
- # if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
53
- # create and return the physical plan
28
+ # if this expression has no inputs (i.e. it is a BaseScan), create and return the physical plan
29
+ best_plan = None
54
30
  if len(best_phys_expr.input_group_ids) == 0:
55
- return PhysicalPlan(operators=[best_phys_expr.operator], plan_cost=best_phys_expr.plan_cost)
31
+ best_plan = PhysicalPlan(best_phys_expr.operator, subplans=None, plan_cost=best_phys_expr.plan_cost)
32
+
33
+ # otherwise, if this expression is not a join (i.e. it has one input)
34
+ elif len(best_phys_expr.input_group_ids) == 1:
35
+ # get the best physical plan for this group's input
36
+ input_group_id = best_phys_expr.input_group_ids[0]
37
+ input_best_phys_plan = self._get_greedy_physical_plan(groups, input_group_id)
38
+
39
+ # add this operator to best physical plan and return
40
+ best_plan = PhysicalPlan(best_phys_expr.operator, subplans=[input_best_phys_plan], plan_cost=best_phys_expr.plan_cost)
56
41
 
57
- # get the best physical plan(s) for this group's inputs
58
- input_group_id = best_phys_expr.input_group_ids[0] # TODO: need to handle joins
59
- input_best_phys_plan = self._get_greedy_physical_plan(groups, input_group_id)
42
+ # otherwise, this expression is a join (i.e. it has two inputs)
43
+ elif len(best_phys_expr.input_group_ids) == 2:
44
+ left_input_group_id, right_input_group_id = best_phys_expr.input_group_ids
45
+
46
+ # get the best physical plan for the left input
47
+ left_best_phys_plan = self._get_greedy_physical_plan(groups, left_input_group_id)
48
+
49
+ # get the best physical plan for the right input
50
+ right_best_phys_plan = self._get_greedy_physical_plan(groups, right_input_group_id)
51
+
52
+ # add this operator to best physical plan and return
53
+ best_plan = PhysicalPlan(best_phys_expr.operator, subplans=[left_best_phys_plan, right_best_phys_plan], plan_cost=best_phys_expr.plan_cost)
60
54
 
61
55
  # add this operator to best physical plan and return
62
- return PhysicalPlan.from_ops_and_sub_plan([best_phys_expr.operator], input_best_phys_plan, best_phys_expr.plan_cost)
56
+ return best_plan
63
57
 
64
58
  def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
65
59
  logger.info(f"Getting greedy optimal plans for final group id: {final_group_id}")
@@ -85,35 +79,42 @@ class ParetoStrategy(OptimizationStrategy):
85
79
  # construct list of pareto optimal plans
86
80
  pareto_optimal_plans = []
87
81
  for phys_expr in pareto_optimal_phys_exprs:
88
- # if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
89
- # create and return the physical plan
82
+ # if this expression has no inputs (i.e. it is a BaseScan), create and return the physical plan
90
83
  if len(phys_expr.input_group_ids) == 0:
91
84
  for plan_cost, _ in phys_expr.pareto_optimal_plan_costs:
92
- plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=plan_cost)
85
+ plan = PhysicalPlan(phys_expr.operator, subplans=None, plan_cost=plan_cost)
93
86
  pareto_optimal_plans.append(plan)
94
87
 
95
- # otherwise, get the pareto optimal physical plan(s) for this group's inputs
96
- else:
88
+ # otherwise, if this expression is not a join (i.e. it has one input)
89
+ elif len(phys_expr.input_group_ids) == 1:
97
90
  # get the pareto optimal physical plan(s) for this group's inputs
98
- input_group_id = phys_expr.input_group_ids[0] # TODO: need to handle joins
91
+ input_group_id = phys_expr.input_group_ids[0]
99
92
  pareto_optimal_phys_subplans = self._get_candidate_pareto_physical_plans(groups, input_group_id, policy)
100
93
 
101
94
  # iterate over the input subplans and find the one(s) which combine with this physical expression
102
95
  # to make a pareto-optimal plan
103
- for plan_cost, input_plan_cost in phys_expr.pareto_optimal_plan_costs:
96
+ for plan_cost, (input_plan_cost, _) in phys_expr.pareto_optimal_plan_costs:
104
97
  for subplan in pareto_optimal_phys_subplans:
105
- if (
106
- subplan.plan_cost.cost == input_plan_cost.cost
107
- and subplan.plan_cost.time == input_plan_cost.time
108
- and subplan.plan_cost.quality == input_plan_cost.quality
109
- ):
110
- # TODO: The plan_cost gets summed with subplan.plan_cost;
111
- # am I defining expression.best_plan_cost to be the cost of that operator,
112
- # and expression.pareto_optimal_plan_costs to be the cost(s) of the subplan including that operator?
113
- # i.e. are my definitions inconsistent?
114
- plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, plan_cost)
98
+ if subplan.plan_cost == input_plan_cost:
99
+ plan = PhysicalPlan(phys_expr.operator, subplans=[subplan], plan_cost=plan_cost)
115
100
  pareto_optimal_plans.append(plan)
116
101
 
102
+ # otherwise, this expression is a join (i.e. it has two inputs)
103
+ elif len(phys_expr.input_group_ids) == 2:
104
+ left_input_group_id, right_input_group_id = phys_expr.input_group_ids
105
+ pareto_optimal_left_subplans = self._get_candidate_pareto_physical_plans(groups, left_input_group_id, policy)
106
+ pareto_optimal_right_subplans = self._get_candidate_pareto_physical_plans(groups, right_input_group_id, policy)
107
+
108
+ # iterate over the input subplans and find the one(s) which combine with this physical expression
109
+ # to make a pareto-optimal plan
110
+ for plan_cost, (left_input_plan_cost, right_input_plan_cost) in phys_expr.pareto_optimal_plan_costs:
111
+ for left_subplan in pareto_optimal_left_subplans:
112
+ if left_subplan.plan_cost == left_input_plan_cost:
113
+ for right_subplan in pareto_optimal_right_subplans:
114
+ if right_subplan.plan_cost == right_input_plan_cost:
115
+ plan = PhysicalPlan(phys_expr.operator, subplans=[left_subplan, right_subplan], plan_cost=plan_cost)
116
+ pareto_optimal_plans.append(plan)
117
+
117
118
  return pareto_optimal_plans
118
119
 
119
120
  def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
@@ -142,29 +143,33 @@ class ParetoStrategy(OptimizationStrategy):
142
143
 
143
144
 
144
145
  class SentinelStrategy(OptimizationStrategy):
145
- def _get_sentinel_plan(self, groups: dict, group_id: int) -> SentinelPlan:
146
+ def _get_sentinel_plan(self, groups: dict[str, Group], group_id: int) -> SentinelPlan:
146
147
  """
147
148
  Create and return a SentinelPlan object.
149
+
150
+ NOTE: this strategy is only used to construct a SentinelPlan before performing optimization.
151
+ Currently, we do not perform any transformation rules when building the groups which
152
+ are fed into this function. Thus, every physical expression will correspond to the same
153
+ logical operator and share the same logical_op_id. Eventually we will want to consider
154
+ multiple logical re-orderings of operators in our SentinelPlan, but for now it is static.
148
155
  """
149
- # get all the physical expressions for this group
156
+ # get all the physical expressions for this group as well as their logical_op_id
150
157
  phys_exprs = groups[group_id].physical_expressions
151
158
  phys_op_set = [expr.operator for expr in phys_exprs]
152
159
 
153
- # if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
154
- # create and return the physical plan
160
+ # if this expression has no inputs (i.e. it is a scan operator), create and return the sentinel plan
155
161
  best_phys_expr = groups[group_id].best_physical_expression
156
162
  if len(best_phys_expr.input_group_ids) == 0:
157
- return SentinelPlan(operator_sets=[phys_op_set])
163
+ return SentinelPlan(operator_set=phys_op_set, subplans=None)
158
164
 
159
- # TODO: need to handle joins
160
- # get the best physical plan(s) for this group's inputs
161
- best_phys_subplan = SentinelPlan(operator_sets=[])
165
+ # get the subplans
166
+ subplans = []
162
167
  for input_group_id in best_phys_expr.input_group_ids:
163
- input_best_phys_plan = self._get_sentinel_plan(groups, input_group_id)
164
- best_phys_subplan = SentinelPlan.from_ops_and_sub_plan(best_phys_subplan.operator_sets, input_best_phys_plan)
168
+ subplan = self._get_sentinel_plan(groups, input_group_id)
169
+ subplans.append(subplan)
165
170
 
166
- # add this operator set to best physical plan and return
167
- return SentinelPlan.from_ops_and_sub_plan([phys_op_set], best_phys_subplan)
171
+ # compose the current physical operator set with its subplans
172
+ return SentinelPlan(operator_set=phys_op_set, subplans=subplans)
168
173
 
169
174
  def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[SentinelPlan]:
170
175
  logger.info(f"Getting sentinel optimal plans for final group id: {final_group_id}")