palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.20.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,12 @@ from __future__ import annotations
2
2
 
3
3
  from abc import ABC, abstractmethod
4
4
 
5
- from palimpzest.core.data.dataclasses import PlanCost
5
+ from palimpzest.core.models import PlanCost
6
+ from palimpzest.query.operators.aggregate import AggregateOp
7
+ from palimpzest.query.operators.join import JoinOp
8
+ from palimpzest.query.operators.limit import LimitScanOp
6
9
  from palimpzest.query.operators.physical import PhysicalOperator
7
- from palimpzest.query.operators.scan import ScanPhysicalOp
10
+ from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
8
11
  from palimpzest.utils.hash_helpers import hash_for_id
9
12
 
10
13
 
@@ -42,19 +45,197 @@ class Plan(ABC):
42
45
  pass
43
46
 
44
47
  class PhysicalPlan(Plan):
45
- def __init__(self, operators: list[PhysicalOperator], plan_cost: PlanCost | None = None):
46
- self.operators = operators
48
+ def __init__(self, operator: PhysicalOperator, subplans: list[PhysicalPlan] | None, plan_cost: PlanCost | None = None):
49
+ self.operator = operator
50
+ self.subplans = [] if subplans is None else subplans
47
51
  self.plan_cost = plan_cost if plan_cost is not None else PlanCost(cost=0.0, time=0.0, quality=1.0)
48
52
  self.plan_id = self.compute_plan_id()
49
53
 
54
+ # NOTE: unique full_op_id is constructed as "{topological_index}-{full_op_id}" to
55
+ # differentiate between multiple instances of the same physical operator e.g. in self-joins
56
+
57
+ # compute mapping from unique full_op_id to next unique full_op_id in the plan
58
+ self.unique_full_op_id_to_next_unique_full_op_and_id = {}
59
+ current_idx, _ = self._compute_next_unique_full_op_map(self.unique_full_op_id_to_next_unique_full_op_and_id)
60
+ self.unique_full_op_id_to_next_unique_full_op_and_id[f"{current_idx}-{self.operator.get_full_op_id()}"] = (None, None)
61
+
62
+ # compute mapping from unique full_op_id to upstream unique full_op_ids
63
+ self.unique_full_op_id_to_upstream_full_op_ids = {}
64
+ self._compute_upstream_unique_full_op_ids_map(self.unique_full_op_id_to_upstream_full_op_ids)
65
+
66
+ # compute mapping from unique full_op_id to source unique full_op_ids
67
+ self.unique_full_op_id_to_source_full_op_ids = {}
68
+ self._compute_source_unique_full_op_ids_map(self.unique_full_op_id_to_source_full_op_ids)
69
+
50
70
  def compute_plan_id(self) -> str:
51
71
  """
52
72
  NOTE: This is NOT a universal ID.
53
73
 
54
74
  Two different PhysicalPlan instances with the identical lists of operators will have equivalent plan_ids.
55
75
  """
56
- hash_str = str(tuple(op.get_full_op_id() for op in self.operators))
57
- return hash_for_id(hash_str)
76
+ full_op_id = self.operator.get_full_op_id()
77
+ subplan_ids = [subplan.compute_plan_id() for subplan in self.subplans]
78
+ return hash_for_id(str((full_op_id,) + tuple(subplan_ids)))
79
+
80
+ def get_est_total_outputs(self, num_samples: int | None = None, current_idx: int | None = None, source_unique_full_op_ids_map: dict | None = None) -> tuple[dict[str, int], int]:
81
+ """Return the estimated total number of output records to be processed by the given operator in this plan."""
82
+ # get the source map from the root of the entire plan; use this map throughout all recursive calls
83
+ # (if you call self.get_source_unique_full_op_ids() from a subplan, it's topo indexes will be different)
84
+ if source_unique_full_op_ids_map is None:
85
+ source_unique_full_op_ids_map = self.unique_full_op_id_to_source_full_op_ids
86
+
87
+ # get the estimated total outputs from all subplans
88
+ # NOTE: this will be an empty dictionary for scans
89
+ all_subplan_total_outputs = {}
90
+ for subplan in self.subplans:
91
+ subplan_total_outputs, current_idx = subplan.get_est_total_outputs(num_samples, current_idx, source_unique_full_op_ids_map)
92
+ current_idx += 1
93
+ all_subplan_total_outputs.update(subplan_total_outputs)
94
+
95
+ # if current_idx is None, this is the first call, so we initialize it to 0
96
+ if current_idx is None:
97
+ current_idx = 0
98
+
99
+ # get total outputs for this operator
100
+ this_op_total_outputs = {}
101
+ this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
102
+
103
+ # if this operator is a scan, return the length of its datasource
104
+ if isinstance(self.operator, MarshalAndScanDataOp):
105
+ total = min(len(self.operator.datasource), num_samples) if num_samples is not None else len(self.operator.datasource)
106
+ this_op_total_outputs = {this_unique_full_op_id: total}
107
+
108
+ # if this operator is a context scan, return 1
109
+ elif isinstance(self.operator, ContextScanOp): # noqa: SIM114
110
+ this_op_total_outputs = {this_unique_full_op_id: 1}
111
+
112
+ # if this operator is an aggregate, return 1
113
+ elif isinstance(self.operator, AggregateOp):
114
+ this_op_total_outputs = {this_unique_full_op_id: 1}
115
+
116
+ # if this operator is a limit scan, return its limit
117
+ elif isinstance(self.operator, LimitScanOp):
118
+ this_op_total_outputs = {this_unique_full_op_id: self.operator.limit}
119
+
120
+ # if this operator is a join, return the Cartesian product of the estimated outputs of its inputs
121
+ elif isinstance(self.operator, JoinOp):
122
+ # get estimated outputs for immediate left and right inputs
123
+ source_unique_full_op_ids = source_unique_full_op_ids_map[f"{current_idx}-{self.operator.get_full_op_id()}"]
124
+ left_unique_full_op_id, right_unique_full_op_id = source_unique_full_op_ids[0], source_unique_full_op_ids[1]
125
+ left_total_outputs = all_subplan_total_outputs[left_unique_full_op_id]
126
+ right_total_outputs = all_subplan_total_outputs[right_unique_full_op_id]
127
+ this_op_total_outputs = {this_unique_full_op_id: left_total_outputs * right_total_outputs}
128
+
129
+ # otherwise, return the number of outputs from the immediate input
130
+ else:
131
+ source_unique_full_op_ids = source_unique_full_op_ids_map[f"{current_idx}-{self.operator.get_full_op_id()}"]
132
+ source_unique_full_op_id = source_unique_full_op_ids[0]
133
+ this_op_total_outputs = {this_unique_full_op_id: all_subplan_total_outputs[source_unique_full_op_id]}
134
+
135
+ return {**this_op_total_outputs, **all_subplan_total_outputs}, current_idx
136
+
137
+ def _compute_next_unique_full_op_map(self, next_map: dict[str, str | None], current_idx: int | None = None) -> tuple[int, str]:
138
+ """Compute a mapping from each operator's unique full_op_id to the next operator in the plan and its unique full_op_id.
139
+
140
+ The unique full_op_id is constructed as "{topological_index}-{full_op_id}" to differentiate between
141
+ multiple instances of the same physical operator in the plan (e.g., in self-joins).
142
+
143
+ Args:
144
+ next_map: A dictionary to populate with the mapping from unique full_op_id to next (operator, unique_full_op_id) pair.
145
+ current_idx: The current topological index in the plan. If None, starts at 0.
146
+
147
+ Returns:
148
+ A tuple containing:
149
+ - The current topological index after processing this plan.
150
+ - The unique full_op_id of this plan's root operator.
151
+ """
152
+ # If there are subplans, compute their next maps first
153
+ subplan_topo_idx_op_id_pairs = []
154
+ for subplan in self.subplans:
155
+ current_idx, current_full_op_id = subplan._compute_next_unique_full_op_map(next_map, current_idx)
156
+ subplan_topo_idx_op_id_pairs.append((current_idx, current_full_op_id))
157
+ current_idx += 1 # increment after processing each subplan
158
+
159
+ # for each subplan's root operator, set its next to this plan's root operator
160
+ for topo_idx, full_op_id in subplan_topo_idx_op_id_pairs:
161
+ unique_op_id = f"{topo_idx}-{full_op_id}"
162
+ this_unique_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
163
+ next_map[unique_op_id] = (self.operator, this_unique_op_id)
164
+
165
+ # if this is the first call, initialize current_idx
166
+ if current_idx is None:
167
+ current_idx = 0
168
+
169
+ return current_idx, self.operator.get_full_op_id()
170
+
171
+ def get_next_unique_full_op_and_id(self, topo_idx: int, operator: PhysicalOperator) -> tuple[PhysicalOperator | None, str | None]:
172
+ """Return the next operator in the plan after the given operator, or None if it is the last operator."""
173
+ unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
174
+ return self.unique_full_op_id_to_next_unique_full_op_and_id[unique_full_op_id]
175
+
176
+ def get_next_unique_full_op_id(self, topo_idx: int, operator: PhysicalOperator) -> str | None:
177
+ """Return the full_op_id of the next operator in the plan after the given operator, or None if it is the last operator."""
178
+ unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
179
+ _, next_unique_full_op_id = self.unique_full_op_id_to_next_unique_full_op_and_id[unique_full_op_id]
180
+ return next_unique_full_op_id
181
+
182
+ def _compute_upstream_unique_full_op_ids_map(self, upstream_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str, list[str]]:
183
+ # set the upstream unique full_op_ids for this operator
184
+ subplan_topo_idx_upstream_unique_full_op_id_tuples = []
185
+ for subplan in self.subplans:
186
+ current_idx, full_op_id, subplan_upstream_unique_full_op_ids = subplan._compute_upstream_unique_full_op_ids_map(upstream_map, current_idx)
187
+ subplan_topo_idx_upstream_unique_full_op_id_tuples.append((current_idx, full_op_id, subplan_upstream_unique_full_op_ids))
188
+ current_idx += 1
189
+
190
+ # if current_idx is None, this is the first call, so we initialize it to 0
191
+ if current_idx is None:
192
+ current_idx = 0
193
+
194
+ # compute this operator's unique full_op_id
195
+ this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
196
+
197
+ # update the upstream_map for this operator
198
+ upstream_map[this_unique_full_op_id] = []
199
+ for topo_idx, full_op_id, upstream_unique_full_op_ids in subplan_topo_idx_upstream_unique_full_op_id_tuples:
200
+ subplan_upstream_unique_full_op_ids = [f"{topo_idx}-{full_op_id}"] + upstream_unique_full_op_ids
201
+ upstream_map[this_unique_full_op_id].extend(subplan_upstream_unique_full_op_ids)
202
+
203
+ # return the current index and the upstream unique full_op_ids for this operator
204
+ return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]
205
+
206
+ def get_upstream_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
207
+ """Return the list of unique full_op_ids for the upstream operators of this operator."""
208
+ unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
209
+ return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]
210
+
211
+ def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
212
+ # get the topological index and full_op_id pairs for all subplans' root operators
213
+ subplan_topo_idx_op_id_pairs = []
214
+ for subplan in self.subplans:
215
+ current_idx, current_full_op_id = subplan._compute_source_unique_full_op_ids_map(source_map, current_idx)
216
+ subplan_topo_idx_op_id_pairs.append((current_idx, current_full_op_id))
217
+ current_idx += 1
218
+
219
+ # if current_idx is None, this is the first call, so we initialize it to 0
220
+ if current_idx is None:
221
+ current_idx = 0
222
+
223
+ # compute this operator's unique full_op_id
224
+ this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
225
+
226
+ # update the source_map for this operator
227
+ source_map[this_unique_full_op_id] = []
228
+ for topo_idx, full_op_id in subplan_topo_idx_op_id_pairs:
229
+ unique_full_op_id = f"{topo_idx}-{full_op_id}"
230
+ source_map[this_unique_full_op_id].append(unique_full_op_id)
231
+
232
+ # return the current unique full_op_id for this operator
233
+ return current_idx, self.operator.get_full_op_id()
234
+
235
+ def get_source_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
236
+ """Return the list of unique full_op_ids for the input(s) to this operator."""
237
+ unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
238
+ return self.unique_full_op_id_to_source_full_op_ids[unique_full_op_id]
58
239
 
59
240
  def __eq__(self, other):
60
241
  return isinstance(other, PhysicalPlan) and self.plan_id == other.plan_id
@@ -65,60 +246,78 @@ class PhysicalPlan(Plan):
65
246
  def __repr__(self) -> str:
66
247
  return str(self)
67
248
 
68
- def __str__(self):
69
- start = self.operators[0]
70
- plan_str = f" 0. {type(start).__name__} -> {start.output_schema.__name__} \n\n"
71
-
72
- for idx, operator in enumerate(self.operators[1:]):
73
- plan_str += f" {idx+1}. {str(operator)}\n"
249
+ def _get_str(self, idx: int = 0, indent: int = 0) -> str:
250
+ indent_str = " " * (indent * 2)
251
+ plan_str = f"{indent_str}{idx}. {str(self.operator)}\n"
252
+ for subplan in self.subplans:
253
+ plan_str += subplan._get_str(idx=idx + 1, indent=indent + 1)
74
254
 
75
255
  return plan_str
76
256
 
257
+ def __str__(self):
258
+ return self._get_str()
259
+
77
260
  def __getitem__(self, slice):
78
- return self.operators[slice]
261
+ ops = [op for op in self]
262
+ return ops[slice]
79
263
 
80
264
  def __iter__(self):
81
- return iter(self.operators)
265
+ for subplan in self.subplans:
266
+ yield from subplan
267
+ yield self.operator
82
268
 
83
269
  def __len__(self):
84
- return len(self.operators)
270
+ return 1 + sum(len(subplan) for subplan in self.subplans)
85
271
 
86
- @staticmethod
87
- def from_ops_and_sub_plan(ops: list[PhysicalOperator], sub_plan: PhysicalPlan, plan_cost: PlanCost) -> PhysicalPlan:
88
- # create copies of all logical operators
89
- copy_sub_plan = [op.copy() for op in sub_plan.operators]
90
- copy_ops = [op.copy() for op in ops]
272
+ @classmethod
273
+ def _from_ops(cls, ops: list[PhysicalOperator], plan_cost: PlanCost | None = None) -> PhysicalPlan:
274
+ """
275
+ NOTE: Do not use this in production code. This is a convenience method for constructing PhysicalPlans in tests.
276
+ This method assumes a left-deep tree structure (i.e. pipeline), where each operator has at most one subplan.
277
+ The PlanCost is applied to all subplans, thus it is not a true representation of the cost of the plan.
278
+ """
279
+ assert len(ops) > 0, "ops must contain at least one PhysicalOperator"
91
280
 
92
- # construct full set of operators
93
- copy_sub_plan.extend(copy_ops)
281
+ # build the PhysicalPlan from the list of operators
282
+ if len(ops) == 1:
283
+ return cls(operator=ops[0], subplans=None, plan_cost=plan_cost)
94
284
 
95
- # return the PhysicalPlan
96
- return PhysicalPlan(operators=copy_sub_plan, plan_cost=plan_cost)
285
+ # recursively build subplans
286
+ subplan = cls._from_ops(ops[:-1], plan_cost=plan_cost)
287
+ return cls(operator=ops[-1], subplans=[subplan], plan_cost=plan_cost)
97
288
 
98
289
 
290
+ # TODO(?): take list[PhysicalOperator] as input, but then store OpFrontier
99
291
  class SentinelPlan(Plan):
100
- def __init__(self, operator_sets: list[list[PhysicalOperator]]):
101
- # enforce that first operator_set is a scan and that every operator_set has at least one operator
102
- if len(operator_sets) > 0:
103
- assert isinstance(operator_sets[0][0], ScanPhysicalOp), "first operator set must be a scan"
104
- assert all(len(op_set) > 0 for op_set in operator_sets), "every operator set must have at least one operator"
105
-
106
- # store operator_sets and logical_op_ids; sort operator_sets internally by full_op_id
107
- self.operator_sets = operator_sets
108
- self.operator_sets = [sorted(op_set, key=lambda op: op.get_full_op_id()) for op_set in self.operator_sets]
109
- self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
292
+ def __init__(self, operator_set: list[PhysicalOperator], subplans: list[SentinelPlan] | None):
293
+ # store operator_set and logical_op_id; sort operator_set internally by full_op_id
294
+ self.operator_set = sorted(operator_set, key=lambda op: op.get_full_op_id())
295
+ self.logical_op_id = self.operator_set[0].logical_op_id
296
+ self.subplans = [] if subplans is None else subplans
110
297
  self.plan_id = self.compute_plan_id()
111
298
 
299
+ # compute mapping from unique logical_op_id to next unique logical_op_id in the plan
300
+ self.unique_logical_op_id_to_next_unique_logical_op_id = {}
301
+ current_idx, _ = self._compute_next_unique_logical_op_id_map(self.unique_logical_op_id_to_next_unique_logical_op_id)
302
+ self.unique_logical_op_id_to_next_unique_logical_op_id[f"{current_idx}-{self.logical_op_id}"] = None
303
+
304
+ # compute mapping from unique logical_op_id to root dataset ids
305
+ self.unique_logical_op_id_to_root_dataset_ids = {}
306
+ self._compute_root_dataset_ids_map(self.unique_logical_op_id_to_root_dataset_ids)
307
+
308
+ # compute mapping from unique logical_op_id to source unique logical_op_ids
309
+ self.unique_logical_op_id_to_source_logical_op_ids = {}
310
+ self._compute_source_unique_logical_op_ids_map(self.unique_logical_op_id_to_source_logical_op_ids)
311
+
112
312
  def compute_plan_id(self) -> str:
113
313
  """
114
314
  NOTE: This is NOT a universal ID.
115
315
 
116
316
  Two different SentinelPlan instances with the identical operator_sets will have equivalent plan_ids.
117
317
  """
118
- hash_str = ""
119
- for logical_op_id, op_set in zip(self.logical_op_ids, self.operator_sets):
120
- hash_str += f"{logical_op_id} {tuple(op.get_full_op_id() for op in op_set)} "
121
- return hash_for_id(hash_str)
318
+ full_id = (self.logical_op_id,) + tuple([op.get_full_op_id() for op in self.operator_set])
319
+ subplan_ids = [subplan.compute_plan_id() for subplan in self.subplans]
320
+ return hash_for_id(str((full_id,) + tuple(subplan_ids)))
122
321
 
123
322
  def __eq__(self, other):
124
323
  return isinstance(other, SentinelPlan) and self.plan_id == other.plan_id
@@ -129,40 +328,126 @@ class SentinelPlan(Plan):
129
328
  def __repr__(self) -> str:
130
329
  return str(self)
131
330
 
132
- def __str__(self):
133
- # by assertion, first operator_set is guaranteed to be a scan
134
- start = self.operator_sets[0][0]
135
- plan_str = f" 0. {type(start).__name__} -> {start.output_schema.__name__} \n\n"
136
-
137
- # build string one operator set at a time
138
- for idx, operator_set in enumerate(self.operator_sets[1:]):
139
- if len(operator_set) == 1:
140
- operator = operator_set[0]
141
- plan_str += f" {idx+1}. {str(operator)}\n"
142
-
143
- else:
144
- for inner_idx, operator in enumerate(operator_set):
145
- plan_str += f" {idx+1}.{inner_idx+1}. {str(operator)}\n"
331
+ def _get_str(self, idx: int = 0, indent: int = 0) -> str:
332
+ indent_str = " " * (indent * 2)
333
+ plan_str = ""
334
+ for inner_idx, operator in enumerate(self.operator_set):
335
+ inner_idx_str = "" if len(self.operator_set) == 1 else f"{inner_idx + 1}."
336
+ plan_str += f"{indent_str}{idx}.{inner_idx_str} {str(operator)}\n"
337
+ for subplan in self.subplans:
338
+ plan_str += subplan._get_str(idx=idx + 1, indent=indent + 1)
146
339
 
147
340
  return plan_str
148
341
 
342
+ def __str__(self):
343
+ return self._get_str()
344
+
149
345
  def __getitem__(self, slice):
150
- return self.logical_op_ids[slice], self.operator_sets[slice]
346
+ op_set_tuples = [op_set_tuple for op_set_tuple in self]
347
+ return op_set_tuples[slice]
151
348
 
152
349
  def __iter__(self):
153
- yield from zip(self.logical_op_ids, self.operator_sets)
350
+ for subplan in self.subplans:
351
+ yield from subplan
352
+ yield self.logical_op_id, self.operator_set
154
353
 
155
354
  def __len__(self):
156
- return len(self.logical_op_ids)
157
-
158
- @staticmethod
159
- def from_ops_and_sub_plan(op_sets: list[list[PhysicalOperator]], sub_plan: SentinelPlan) -> SentinelPlan:
160
- # create copies of all logical operators
161
- copy_sub_plan = [[op.copy() for op in op_set] for op_set in sub_plan.operator_sets]
162
- copy_ops = [[op.copy() for op in op_set] for op_set in op_sets]
163
-
164
- # construct full set of operators
165
- copy_sub_plan.extend(copy_ops)
166
-
167
- # return the SentinelPlan
168
- return SentinelPlan(operator_sets=copy_sub_plan)
355
+ return 1 + sum(len(subplan) for subplan in self.subplans)
356
+
357
+ def _compute_next_unique_logical_op_id_map(self, next_map: dict[str, str | None], current_idx: int | None = None) -> tuple[int, str]:
358
+ """Compute a mapping from each operator's unique logical_op_id to the next operator's unique logical_op_id.
359
+
360
+ The unique logical_op_id is constructed as "{topological_index}-{logical_op_id}" to differentiate between
361
+ multiple instances of the same physical operator in the plan (e.g., in self-joins).
362
+
363
+ Args:
364
+ next_map: A dictionary to populate with the mapping from unique logical_op_id to next logical_op_id.
365
+ current_idx: The current topological index in the plan. If None, starts at 0.
366
+
367
+ Returns:
368
+ A tuple containing:
369
+ - The current topological index after processing this plan.
370
+ - The unique logical_op_id of this plan's root logical operator.
371
+ """
372
+ # If there are subplans, compute their next maps first
373
+ subplan_topo_idx_op_id_pairs = []
374
+ for subplan in self.subplans:
375
+ current_idx, current_logical_op_id = subplan._compute_next_unique_logical_op_id_map(next_map, current_idx)
376
+ subplan_topo_idx_op_id_pairs.append((current_idx, current_logical_op_id))
377
+ current_idx += 1 # increment after processing each subplan
378
+
379
+ # for each subplan's root operator, set its next to this plan's root operator
380
+ for topo_idx, logical_op_id in subplan_topo_idx_op_id_pairs:
381
+ unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
382
+ this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
383
+ next_map[unique_logical_op_id] = this_unique_logical_op_id
384
+
385
+ # if this is the first call, initialize current_idx
386
+ if current_idx is None:
387
+ current_idx = 0
388
+
389
+ return current_idx, self.logical_op_id
390
+
391
+ def get_next_unique_logical_op_id(self, unique_logical_op_id: str) -> str | None:
392
+ """Return the unique logical_op_id of the next operator in the plan after the given operator, or None if it is the last operator."""
393
+ return self.unique_logical_op_id_to_next_unique_logical_op_id[unique_logical_op_id]
394
+
395
+ def _compute_root_dataset_ids_map(self, root_dataset_ids_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, list[str]]:
396
+ # set the root dataset ids for this operator
397
+ all_subplan_root_dataset_ids = []
398
+ for subplan in self.subplans:
399
+ current_idx, subplan_root_dataset_ids = subplan._compute_root_dataset_ids_map(root_dataset_ids_map, current_idx)
400
+ all_subplan_root_dataset_ids.extend(subplan_root_dataset_ids)
401
+ current_idx += 1
402
+
403
+ # if current_idx is None, this is the first call, so we initialize it to 0
404
+ if current_idx is None:
405
+ current_idx = 0
406
+
407
+ # compute this operator's unique logical_op_id
408
+ this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
409
+
410
+ # if this operator is a root dataset scan, update root_dataset_ids
411
+ root_dataset_ids = []
412
+ if isinstance(self.operator_set[0], MarshalAndScanDataOp):
413
+ root_dataset_ids.append(self.operator_set[0].datasource.id)
414
+ elif isinstance(self.operator_set[0], ContextScanOp):
415
+ root_dataset_ids.append(self.operator_set[0].context.id)
416
+
417
+ # update the root_dataset_ids_map for this operator
418
+ root_dataset_ids_map[this_unique_logical_op_id] = root_dataset_ids + all_subplan_root_dataset_ids
419
+
420
+ # return the current index and the upstream unique logical_op_ids for this operator
421
+ return current_idx, root_dataset_ids_map[this_unique_logical_op_id]
422
+
423
+ def get_root_dataset_ids(self, unique_logical_op_id: str) -> list[str]:
424
+ """Return the list of root dataset ids which are upstream of this operator."""
425
+ return self.unique_logical_op_id_to_root_dataset_ids[unique_logical_op_id]
426
+
427
+ def _compute_source_unique_logical_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
428
+ # get the topological index and logical_op_id pairs for all subplans' root operators
429
+ subplan_topo_idx_op_id_pairs = []
430
+ for subplan in self.subplans:
431
+ current_idx, current_logical_op_id = subplan._compute_source_unique_logical_op_ids_map(source_map, current_idx)
432
+ subplan_topo_idx_op_id_pairs.append((current_idx, current_logical_op_id))
433
+ current_idx += 1
434
+
435
+ # if current_idx is None, this is the first call, so we initialize it to 0
436
+ if current_idx is None:
437
+ current_idx = 0
438
+
439
+ # compute this operator's unique logical_op_id
440
+ this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
441
+
442
+ # update the source_map for this operator
443
+ source_map[this_unique_logical_op_id] = []
444
+ for topo_idx, logical_op_id in subplan_topo_idx_op_id_pairs:
445
+ unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
446
+ source_map[this_unique_logical_op_id].append(unique_logical_op_id)
447
+
448
+ # return the current unique logical_op_id for this operator
449
+ return current_idx, self.logical_op_id
450
+
451
+ def get_source_unique_logical_op_ids(self, unique_logical_op_id: str) -> list[str]:
452
+ """Return the list of unique logical_op_ids for the input(s) to this operator."""
453
+ return self.unique_logical_op_id_to_source_logical_op_ids[unique_logical_op_id]
@@ -1,8 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
- from palimpzest.core.lib.fields import Field
3
+ from pydantic.fields import FieldInfo
4
+
4
5
  from palimpzest.query.operators.logical import LogicalOperator
5
6
  from palimpzest.query.operators.physical import PhysicalOperator
7
+ from palimpzest.query.optimizer import rules
6
8
  from palimpzest.query.optimizer.plan import PlanCost
7
9
  from palimpzest.utils.hash_helpers import hash_for_id
8
10
 
@@ -18,9 +20,9 @@ class Expression:
18
20
  self,
19
21
  operator: LogicalOperator | PhysicalOperator,
20
22
  input_group_ids: list[int],
21
- input_fields: dict[str, Field],
23
+ input_fields: dict[str, FieldInfo],
22
24
  depends_on_field_names: set[str],
23
- generated_fields: dict[str, Field],
25
+ generated_fields: dict[str, FieldInfo],
24
26
  group_id: int | None = None,
25
27
  ):
26
28
  self.operator = operator
@@ -36,37 +38,59 @@ class Expression:
36
38
  self.plan_cost: PlanCost | None = None
37
39
 
38
40
  # NOTE: this will be a list of tuples where each tuple has a (pareto-optimal) plan cost
39
- # and the input plan cost for which that pareto-optimal plan cost is attainable
40
- self.pareto_optimal_plan_costs: list[tuple[PlanCost, PlanCost]] | None = None
41
+ # and the tuple of input plan cost(s) for which that pareto-optimal plan cost is attainable;
42
+ # the tuple of input plan cost(s) is (input_plan_cost, None) for non-join operators and
43
+ # (left_input_plan_cost, right_input_plan_cost) for join operators
44
+ self.pareto_optimal_plan_costs: list[tuple[PlanCost, tuple[PlanCost, PlanCost]]] | None = None
45
+
46
+ # compute the expression id
47
+ self.expr_id = self._compute_expr_id()
41
48
 
42
49
  def __eq__(self, other):
43
- return self.operator == other.operator and self.input_group_ids == other.input_group_ids
50
+ return self.expr_id == other.expr_id
44
51
 
45
52
  def __str__(self):
46
- op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_full_op_id()
47
- return str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
53
+ expr_str = f"{self.__class__.__name__}(group_id={self.group_id}, expr_id={self.expr_id})"
54
+ expr_str += f"\n - input_group_ids: {self.input_group_ids}"
55
+ expr_str += f"\n - input_fields: {self.input_fields}"
56
+ expr_str += f"\n - depends_on_field_names: {self.depends_on_field_names}"
57
+ expr_str += f"\n - generated_fields: {self.generated_fields}"
58
+ expr_str += f"\n - operator:\n{str(self.operator)}"
59
+ return expr_str
48
60
 
49
61
def __hash__(self):
    """Hash built from the sorted input group ids, the operator's id, and the class name."""
    if isinstance(self.operator, LogicalOperator):
        op_id = self.operator.get_logical_op_id()
    else:
        op_id = self.operator.get_full_op_id()
    key = tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__))
    return int(hash_for_id(str(key)), 16)
53
66
 
54
- def add_applied_rule(self, rule):
67
def _compute_expr_id(self) -> int:
    # Delegates to __hash__ directly (not the builtin hash(), which would
    # truncate the arbitrary-precision int returned by __hash__).
    return self.__hash__()
69
+
70
def add_applied_rule(self, rule: type[rules.Rule]):
    """Record that the given optimizer rule has been applied to this expression."""
    rule_id = rule.get_rule_id()
    self.rules_applied.add(rule_id)
56
72
 
57
73
def set_group_id(self, group_id: int) -> None:
    """Assign this expression to the group identified by `group_id`."""
    self.group_id = group_id
59
75
 
60
- def get_expr_id(self) -> int:
61
- return self.__hash__()
62
-
63
76
 
64
77
class LogicalExpression(Expression):
    """Marker subclass of Expression for logical-operator expressions; adds no state."""
    pass
66
79
 
67
80
 
68
81
class PhysicalExpression(Expression):
    """Marker subclass of Expression for physical-operator expressions."""

    @classmethod
    def from_op_and_logical_expr(cls, op: PhysicalOperator, logical_expression: LogicalExpression) -> PhysicalExpression:
        """Build a PhysicalExpression that pairs `op` with the inputs, fields,
        dependencies, and group of the given logical expression."""
        le = logical_expression
        return cls(
            operator=op,
            input_group_ids=le.input_group_ids,
            input_fields=le.input_fields,
            depends_on_field_names=le.depends_on_field_names,
            generated_fields=le.generated_fields,
            group_id=le.group_id,
        )
70
94
 
71
95
 
72
96
  class Group:
@@ -76,9 +100,9 @@ class Group:
76
100
  Maintains a set of logical multi-expressions and physical multi-expressions.
77
101
  """
78
102
 
79
- def __init__(self, logical_expressions: list[Expression], fields: dict[str, Field], properties: dict[str, set[str]]):
80
- self.logical_expressions = set(logical_expressions)
81
- self.physical_expressions = set()
103
+ def __init__(self, logical_expressions: list[LogicalExpression], fields: dict[str, FieldInfo], properties: dict[str, set[str]]):
104
+ self.logical_expressions: set[LogicalExpression] = set(logical_expressions)
105
+ self.physical_expressions: set[PhysicalExpression] = set()
82
106
  self.fields = fields
83
107
  self.explored = False
84
108
  self.best_physical_expression: PhysicalExpression | None = None
@@ -90,12 +114,12 @@ class Group:
90
114
  self.properties = properties
91
115
 
92
116
  # compute the group id
93
- self.group_id = self.compute_group_id()
117
+ self.group_id = self._compute_group_id()
94
118
 
95
119
def set_explored(self):
    """Mark this group as explored."""
    self.explored = True
97
121
 
98
- def compute_group_id(self) -> int:
122
+ def _compute_group_id(self) -> int:
99
123
  # sort field names
100
124
  sorted_fields = sorted(self.fields.keys())
101
125