palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.21.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -2,9 +2,12 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
4
|
|
|
5
|
-
from palimpzest.core.
|
|
5
|
+
from palimpzest.core.models import PlanCost
|
|
6
|
+
from palimpzest.query.operators.aggregate import AggregateOp
|
|
7
|
+
from palimpzest.query.operators.join import JoinOp
|
|
8
|
+
from palimpzest.query.operators.limit import LimitScanOp
|
|
6
9
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
7
|
-
from palimpzest.query.operators.scan import
|
|
10
|
+
from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
|
|
8
11
|
from palimpzest.utils.hash_helpers import hash_for_id
|
|
9
12
|
|
|
10
13
|
|
|
@@ -42,19 +45,197 @@ class Plan(ABC):
|
|
|
42
45
|
pass
|
|
43
46
|
|
|
44
47
|
class PhysicalPlan(Plan):
|
|
45
|
-
def __init__(self,
|
|
46
|
-
self.
|
|
48
|
+
def __init__(self, operator: PhysicalOperator, subplans: list[PhysicalPlan] | None, plan_cost: PlanCost | None = None):
|
|
49
|
+
self.operator = operator
|
|
50
|
+
self.subplans = [] if subplans is None else subplans
|
|
47
51
|
self.plan_cost = plan_cost if plan_cost is not None else PlanCost(cost=0.0, time=0.0, quality=1.0)
|
|
48
52
|
self.plan_id = self.compute_plan_id()
|
|
49
53
|
|
|
54
|
+
# NOTE: unique full_op_id is constructed as "{topological_index}-{full_op_id}" to
|
|
55
|
+
# differentiate between multiple instances of the same physical operator e.g. in self-joins
|
|
56
|
+
|
|
57
|
+
# compute mapping from unique full_op_id to next unique full_op_id in the plan
|
|
58
|
+
self.unique_full_op_id_to_next_unique_full_op_and_id = {}
|
|
59
|
+
current_idx, _ = self._compute_next_unique_full_op_map(self.unique_full_op_id_to_next_unique_full_op_and_id)
|
|
60
|
+
self.unique_full_op_id_to_next_unique_full_op_and_id[f"{current_idx}-{self.operator.get_full_op_id()}"] = (None, None)
|
|
61
|
+
|
|
62
|
+
# compute mapping from unique full_op_id to upstream unique full_op_ids
|
|
63
|
+
self.unique_full_op_id_to_upstream_full_op_ids = {}
|
|
64
|
+
self._compute_upstream_unique_full_op_ids_map(self.unique_full_op_id_to_upstream_full_op_ids)
|
|
65
|
+
|
|
66
|
+
# compute mapping from unique full_op_id to source unique full_op_ids
|
|
67
|
+
self.unique_full_op_id_to_source_full_op_ids = {}
|
|
68
|
+
self._compute_source_unique_full_op_ids_map(self.unique_full_op_id_to_source_full_op_ids)
|
|
69
|
+
|
|
50
70
|
def compute_plan_id(self) -> str:
|
|
51
71
|
"""
|
|
52
72
|
NOTE: This is NOT a universal ID.
|
|
53
73
|
|
|
54
74
|
Two different PhysicalPlan instances with the identical lists of operators will have equivalent plan_ids.
|
|
55
75
|
"""
|
|
56
|
-
|
|
57
|
-
|
|
76
|
+
full_op_id = self.operator.get_full_op_id()
|
|
77
|
+
subplan_ids = [subplan.compute_plan_id() for subplan in self.subplans]
|
|
78
|
+
return hash_for_id(str((full_op_id,) + tuple(subplan_ids)))
|
|
79
|
+
|
|
80
|
+
def get_est_total_outputs(self, num_samples: int | None = None, current_idx: int | None = None, source_unique_full_op_ids_map: dict | None = None) -> tuple[dict[str, int], int]:
|
|
81
|
+
"""Return the estimated total number of output records to be processed by the given operator in this plan."""
|
|
82
|
+
# get the source map from the root of the entire plan; use this map throughout all recursive calls
|
|
83
|
+
# (if you call self.get_source_unique_full_op_ids() from a subplan, its topological indexes will be different)
|
|
84
|
+
if source_unique_full_op_ids_map is None:
|
|
85
|
+
source_unique_full_op_ids_map = self.unique_full_op_id_to_source_full_op_ids
|
|
86
|
+
|
|
87
|
+
# get the estimated total outputs from all subplans
|
|
88
|
+
# NOTE: this will be an empty dictionary for scans
|
|
89
|
+
all_subplan_total_outputs = {}
|
|
90
|
+
for subplan in self.subplans:
|
|
91
|
+
subplan_total_outputs, current_idx = subplan.get_est_total_outputs(num_samples, current_idx, source_unique_full_op_ids_map)
|
|
92
|
+
current_idx += 1
|
|
93
|
+
all_subplan_total_outputs.update(subplan_total_outputs)
|
|
94
|
+
|
|
95
|
+
# if current_idx is None, this is the first call, so we initialize it to 0
|
|
96
|
+
if current_idx is None:
|
|
97
|
+
current_idx = 0
|
|
98
|
+
|
|
99
|
+
# get total outputs for this operator
|
|
100
|
+
this_op_total_outputs = {}
|
|
101
|
+
this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
|
|
102
|
+
|
|
103
|
+
# if this operator is a scan, return the length of its datasource
|
|
104
|
+
if isinstance(self.operator, MarshalAndScanDataOp):
|
|
105
|
+
total = min(len(self.operator.datasource), num_samples) if num_samples is not None else len(self.operator.datasource)
|
|
106
|
+
this_op_total_outputs = {this_unique_full_op_id: total}
|
|
107
|
+
|
|
108
|
+
# if this operator is a context scan, return 1
|
|
109
|
+
elif isinstance(self.operator, ContextScanOp): # noqa: SIM114
|
|
110
|
+
this_op_total_outputs = {this_unique_full_op_id: 1}
|
|
111
|
+
|
|
112
|
+
# if this operator is an aggregate, return 1
|
|
113
|
+
elif isinstance(self.operator, AggregateOp):
|
|
114
|
+
this_op_total_outputs = {this_unique_full_op_id: 1}
|
|
115
|
+
|
|
116
|
+
# if this operator is a limit scan, return its limit
|
|
117
|
+
elif isinstance(self.operator, LimitScanOp):
|
|
118
|
+
this_op_total_outputs = {this_unique_full_op_id: self.operator.limit}
|
|
119
|
+
|
|
120
|
+
# if this operator is a join, return the Cartesian product of the estimated outputs of its inputs
|
|
121
|
+
elif isinstance(self.operator, JoinOp):
|
|
122
|
+
# get estimated outputs for immediate left and right inputs
|
|
123
|
+
source_unique_full_op_ids = source_unique_full_op_ids_map[f"{current_idx}-{self.operator.get_full_op_id()}"]
|
|
124
|
+
left_unique_full_op_id, right_unique_full_op_id = source_unique_full_op_ids[0], source_unique_full_op_ids[1]
|
|
125
|
+
left_total_outputs = all_subplan_total_outputs[left_unique_full_op_id]
|
|
126
|
+
right_total_outputs = all_subplan_total_outputs[right_unique_full_op_id]
|
|
127
|
+
this_op_total_outputs = {this_unique_full_op_id: left_total_outputs * right_total_outputs}
|
|
128
|
+
|
|
129
|
+
# otherwise, return the number of outputs from the immediate input
|
|
130
|
+
else:
|
|
131
|
+
source_unique_full_op_ids = source_unique_full_op_ids_map[f"{current_idx}-{self.operator.get_full_op_id()}"]
|
|
132
|
+
source_unique_full_op_id = source_unique_full_op_ids[0]
|
|
133
|
+
this_op_total_outputs = {this_unique_full_op_id: all_subplan_total_outputs[source_unique_full_op_id]}
|
|
134
|
+
|
|
135
|
+
return {**this_op_total_outputs, **all_subplan_total_outputs}, current_idx
|
|
136
|
+
|
|
137
|
+
def _compute_next_unique_full_op_map(self, next_map: dict[str, str | None], current_idx: int | None = None) -> tuple[int, str]:
|
|
138
|
+
"""Compute a mapping from each operator's unique full_op_id to the next operator in the plan and its unique full_op_id.
|
|
139
|
+
|
|
140
|
+
The unique full_op_id is constructed as "{topological_index}-{full_op_id}" to differentiate between
|
|
141
|
+
multiple instances of the same physical operator in the plan (e.g., in self-joins).
|
|
142
|
+
|
|
143
|
+
Args:
|
|
144
|
+
next_map: A dictionary to populate with the mapping from unique full_op_id to next (operator, unique_full_op_id) pair.
|
|
145
|
+
current_idx: The current topological index in the plan. If None, starts at 0.
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
A tuple containing:
|
|
149
|
+
- The current topological index after processing this plan.
|
|
150
|
+
- The full_op_id of this plan's root operator (without the topological index prefix; the caller combines it with the returned index).
|
|
151
|
+
"""
|
|
152
|
+
# If there are subplans, compute their next maps first
|
|
153
|
+
subplan_topo_idx_op_id_pairs = []
|
|
154
|
+
for subplan in self.subplans:
|
|
155
|
+
current_idx, current_full_op_id = subplan._compute_next_unique_full_op_map(next_map, current_idx)
|
|
156
|
+
subplan_topo_idx_op_id_pairs.append((current_idx, current_full_op_id))
|
|
157
|
+
current_idx += 1 # increment after processing each subplan
|
|
158
|
+
|
|
159
|
+
# for each subplan's root operator, set its next to this plan's root operator
|
|
160
|
+
for topo_idx, full_op_id in subplan_topo_idx_op_id_pairs:
|
|
161
|
+
unique_op_id = f"{topo_idx}-{full_op_id}"
|
|
162
|
+
this_unique_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
|
|
163
|
+
next_map[unique_op_id] = (self.operator, this_unique_op_id)
|
|
164
|
+
|
|
165
|
+
# if this is the first call, initialize current_idx
|
|
166
|
+
if current_idx is None:
|
|
167
|
+
current_idx = 0
|
|
168
|
+
|
|
169
|
+
return current_idx, self.operator.get_full_op_id()
|
|
170
|
+
|
|
171
|
+
def get_next_unique_full_op_and_id(self, topo_idx: int, operator: PhysicalOperator) -> tuple[PhysicalOperator | None, str | None]:
|
|
172
|
+
"""Return the next operator in the plan after the given operator, or None if it is the last operator."""
|
|
173
|
+
unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
|
|
174
|
+
return self.unique_full_op_id_to_next_unique_full_op_and_id[unique_full_op_id]
|
|
175
|
+
|
|
176
|
+
def get_next_unique_full_op_id(self, topo_idx: int, operator: PhysicalOperator) -> str | None:
|
|
177
|
+
"""Return the full_op_id of the next operator in the plan after the given operator, or None if it is the last operator."""
|
|
178
|
+
unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
|
|
179
|
+
_, next_unique_full_op_id = self.unique_full_op_id_to_next_unique_full_op_and_id[unique_full_op_id]
|
|
180
|
+
return next_unique_full_op_id
|
|
181
|
+
|
|
182
|
+
def _compute_upstream_unique_full_op_ids_map(self, upstream_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str, list[str]]:
|
|
183
|
+
# set the upstream unique full_op_ids for this operator
|
|
184
|
+
subplan_topo_idx_upstream_unique_full_op_id_tuples = []
|
|
185
|
+
for subplan in self.subplans:
|
|
186
|
+
current_idx, full_op_id, subplan_upstream_unique_full_op_ids = subplan._compute_upstream_unique_full_op_ids_map(upstream_map, current_idx)
|
|
187
|
+
subplan_topo_idx_upstream_unique_full_op_id_tuples.append((current_idx, full_op_id, subplan_upstream_unique_full_op_ids))
|
|
188
|
+
current_idx += 1
|
|
189
|
+
|
|
190
|
+
# if current_idx is None, this is the first call, so we initialize it to 0
|
|
191
|
+
if current_idx is None:
|
|
192
|
+
current_idx = 0
|
|
193
|
+
|
|
194
|
+
# compute this operator's unique full_op_id
|
|
195
|
+
this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
|
|
196
|
+
|
|
197
|
+
# update the upstream_map for this operator
|
|
198
|
+
upstream_map[this_unique_full_op_id] = []
|
|
199
|
+
for topo_idx, full_op_id, upstream_unique_full_op_ids in subplan_topo_idx_upstream_unique_full_op_id_tuples:
|
|
200
|
+
subplan_upstream_unique_full_op_ids = [f"{topo_idx}-{full_op_id}"] + upstream_unique_full_op_ids
|
|
201
|
+
upstream_map[this_unique_full_op_id].extend(subplan_upstream_unique_full_op_ids)
|
|
202
|
+
|
|
203
|
+
# return the current index and the upstream unique full_op_ids for this operator
|
|
204
|
+
return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]
|
|
205
|
+
|
|
206
|
+
def get_upstream_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
|
|
207
|
+
"""Return the list of unique full_op_ids for the upstream operators of this operator."""
|
|
208
|
+
unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
|
|
209
|
+
return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]
|
|
210
|
+
|
|
211
|
+
def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
|
|
212
|
+
# get the topological index and full_op_id pairs for all subplans' root operators
|
|
213
|
+
subplan_topo_idx_op_id_pairs = []
|
|
214
|
+
for subplan in self.subplans:
|
|
215
|
+
current_idx, current_full_op_id = subplan._compute_source_unique_full_op_ids_map(source_map, current_idx)
|
|
216
|
+
subplan_topo_idx_op_id_pairs.append((current_idx, current_full_op_id))
|
|
217
|
+
current_idx += 1
|
|
218
|
+
|
|
219
|
+
# if current_idx is None, this is the first call, so we initialize it to 0
|
|
220
|
+
if current_idx is None:
|
|
221
|
+
current_idx = 0
|
|
222
|
+
|
|
223
|
+
# compute this operator's unique full_op_id
|
|
224
|
+
this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
|
|
225
|
+
|
|
226
|
+
# update the source_map for this operator
|
|
227
|
+
source_map[this_unique_full_op_id] = []
|
|
228
|
+
for topo_idx, full_op_id in subplan_topo_idx_op_id_pairs:
|
|
229
|
+
unique_full_op_id = f"{topo_idx}-{full_op_id}"
|
|
230
|
+
source_map[this_unique_full_op_id].append(unique_full_op_id)
|
|
231
|
+
|
|
232
|
+
# return the current unique full_op_id for this operator
|
|
233
|
+
return current_idx, self.operator.get_full_op_id()
|
|
234
|
+
|
|
235
|
+
def get_source_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
|
|
236
|
+
"""Return the list of unique full_op_ids for the input(s) to this operator."""
|
|
237
|
+
unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
|
|
238
|
+
return self.unique_full_op_id_to_source_full_op_ids[unique_full_op_id]
|
|
58
239
|
|
|
59
240
|
def __eq__(self, other):
|
|
60
241
|
return isinstance(other, PhysicalPlan) and self.plan_id == other.plan_id
|
|
@@ -65,60 +246,78 @@ class PhysicalPlan(Plan):
|
|
|
65
246
|
def __repr__(self) -> str:
|
|
66
247
|
return str(self)
|
|
67
248
|
|
|
68
|
-
def
|
|
69
|
-
|
|
70
|
-
plan_str = f"
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
plan_str += f" {idx+1}. {str(operator)}\n"
|
|
249
|
+
def _get_str(self, idx: int = 0, indent: int = 0) -> str:
|
|
250
|
+
indent_str = " " * (indent * 2)
|
|
251
|
+
plan_str = f"{indent_str}{idx}. {str(self.operator)}\n"
|
|
252
|
+
for subplan in self.subplans:
|
|
253
|
+
plan_str += subplan._get_str(idx=idx + 1, indent=indent + 1)
|
|
74
254
|
|
|
75
255
|
return plan_str
|
|
76
256
|
|
|
257
|
+
def __str__(self):
|
|
258
|
+
return self._get_str()
|
|
259
|
+
|
|
77
260
|
def __getitem__(self, slice):
|
|
78
|
-
|
|
261
|
+
ops = [op for op in self]
|
|
262
|
+
return ops[slice]
|
|
79
263
|
|
|
80
264
|
def __iter__(self):
|
|
81
|
-
|
|
265
|
+
for subplan in self.subplans:
|
|
266
|
+
yield from subplan
|
|
267
|
+
yield self.operator
|
|
82
268
|
|
|
83
269
|
def __len__(self):
|
|
84
|
-
return len(self.
|
|
270
|
+
return 1 + sum(len(subplan) for subplan in self.subplans)
|
|
85
271
|
|
|
86
|
-
@
|
|
87
|
-
def
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
272
|
+
@classmethod
|
|
273
|
+
def _from_ops(cls, ops: list[PhysicalOperator], plan_cost: PlanCost | None = None) -> PhysicalPlan:
|
|
274
|
+
"""
|
|
275
|
+
NOTE: Do not use this in production code. This is a convenience method for constructing PhysicalPlans in tests.
|
|
276
|
+
This method assumes a left-deep tree structure (i.e. pipeline), where each operator has at most one subplan.
|
|
277
|
+
The PlanCost is applied to all subplans, thus it is not a true representation of the cost of the plan.
|
|
278
|
+
"""
|
|
279
|
+
assert len(ops) > 0, "ops must contain at least one PhysicalOperator"
|
|
91
280
|
|
|
92
|
-
#
|
|
93
|
-
|
|
281
|
+
# build the PhysicalPlan from the list of operators
|
|
282
|
+
if len(ops) == 1:
|
|
283
|
+
return cls(operator=ops[0], subplans=None, plan_cost=plan_cost)
|
|
94
284
|
|
|
95
|
-
#
|
|
96
|
-
|
|
285
|
+
# recursively build subplans
|
|
286
|
+
subplan = cls._from_ops(ops[:-1], plan_cost=plan_cost)
|
|
287
|
+
return cls(operator=ops[-1], subplans=[subplan], plan_cost=plan_cost)
|
|
97
288
|
|
|
98
289
|
|
|
290
|
+
# TODO(?): take list[PhysicalOperator] as input, but then store OpFrontier
|
|
99
291
|
class SentinelPlan(Plan):
|
|
100
|
-
def __init__(self,
|
|
101
|
-
#
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
# store operator_sets and logical_op_ids; sort operator_sets internally by full_op_id
|
|
107
|
-
self.operator_sets = operator_sets
|
|
108
|
-
self.operator_sets = [sorted(op_set, key=lambda op: op.get_full_op_id()) for op_set in self.operator_sets]
|
|
109
|
-
self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
|
|
292
|
+
def __init__(self, operator_set: list[PhysicalOperator], subplans: list[SentinelPlan] | None):
|
|
293
|
+
# store operator_set and logical_op_id; sort operator_set internally by full_op_id
|
|
294
|
+
self.operator_set = sorted(operator_set, key=lambda op: op.get_full_op_id())
|
|
295
|
+
self.logical_op_id = self.operator_set[0].logical_op_id
|
|
296
|
+
self.subplans = [] if subplans is None else subplans
|
|
110
297
|
self.plan_id = self.compute_plan_id()
|
|
111
298
|
|
|
299
|
+
# compute mapping from unique logical_op_id to next unique logical_op_id in the plan
|
|
300
|
+
self.unique_logical_op_id_to_next_unique_logical_op_id = {}
|
|
301
|
+
current_idx, _ = self._compute_next_unique_logical_op_id_map(self.unique_logical_op_id_to_next_unique_logical_op_id)
|
|
302
|
+
self.unique_logical_op_id_to_next_unique_logical_op_id[f"{current_idx}-{self.logical_op_id}"] = None
|
|
303
|
+
|
|
304
|
+
# compute mapping from unique logical_op_id to root dataset ids
|
|
305
|
+
self.unique_logical_op_id_to_root_dataset_ids = {}
|
|
306
|
+
self._compute_root_dataset_ids_map(self.unique_logical_op_id_to_root_dataset_ids)
|
|
307
|
+
|
|
308
|
+
# compute mapping from unique logical_op_id to source unique logical_op_ids
|
|
309
|
+
self.unique_logical_op_id_to_source_logical_op_ids = {}
|
|
310
|
+
self._compute_source_unique_logical_op_ids_map(self.unique_logical_op_id_to_source_logical_op_ids)
|
|
311
|
+
|
|
112
312
|
def compute_plan_id(self) -> str:
|
|
113
313
|
"""
|
|
114
314
|
NOTE: This is NOT a universal ID.
|
|
115
315
|
|
|
116
316
|
Two different SentinelPlan instances with the identical operator_sets will have equivalent plan_ids.
|
|
117
317
|
"""
|
|
118
|
-
|
|
119
|
-
for
|
|
120
|
-
|
|
121
|
-
return hash_for_id(hash_str)
|
|
318
|
+
full_id = (self.logical_op_id,) + tuple([op.get_full_op_id() for op in self.operator_set])
|
|
319
|
+
subplan_ids = [subplan.compute_plan_id() for subplan in self.subplans]
|
|
320
|
+
return hash_for_id(str((full_id,) + tuple(subplan_ids)))
|
|
122
321
|
|
|
123
322
|
def __eq__(self, other):
|
|
124
323
|
return isinstance(other, SentinelPlan) and self.plan_id == other.plan_id
|
|
@@ -129,40 +328,126 @@ class SentinelPlan(Plan):
|
|
|
129
328
|
def __repr__(self) -> str:
|
|
130
329
|
return str(self)
|
|
131
330
|
|
|
132
|
-
def
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
operator = operator_set[0]
|
|
141
|
-
plan_str += f" {idx+1}. {str(operator)}\n"
|
|
142
|
-
|
|
143
|
-
else:
|
|
144
|
-
for inner_idx, operator in enumerate(operator_set):
|
|
145
|
-
plan_str += f" {idx+1}.{inner_idx+1}. {str(operator)}\n"
|
|
331
|
+
def _get_str(self, idx: int = 0, indent: int = 0) -> str:
|
|
332
|
+
indent_str = " " * (indent * 2)
|
|
333
|
+
plan_str = ""
|
|
334
|
+
for inner_idx, operator in enumerate(self.operator_set):
|
|
335
|
+
inner_idx_str = "" if len(self.operator_set) == 1 else f"{inner_idx + 1}."
|
|
336
|
+
plan_str += f"{indent_str}{idx}.{inner_idx_str} {str(operator)}\n"
|
|
337
|
+
for subplan in self.subplans:
|
|
338
|
+
plan_str += subplan._get_str(idx=idx + 1, indent=indent + 1)
|
|
146
339
|
|
|
147
340
|
return plan_str
|
|
148
341
|
|
|
342
|
+
def __str__(self):
|
|
343
|
+
return self._get_str()
|
|
344
|
+
|
|
149
345
|
def __getitem__(self, slice):
|
|
150
|
-
|
|
346
|
+
op_set_tuples = [op_set_tuple for op_set_tuple in self]
|
|
347
|
+
return op_set_tuples[slice]
|
|
151
348
|
|
|
152
349
|
def __iter__(self):
|
|
153
|
-
|
|
350
|
+
for subplan in self.subplans:
|
|
351
|
+
yield from subplan
|
|
352
|
+
yield self.logical_op_id, self.operator_set
|
|
154
353
|
|
|
155
354
|
def __len__(self):
|
|
156
|
-
return len(self.
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
355
|
+
return 1 + sum(len(subplan) for subplan in self.subplans)
|
|
356
|
+
|
|
357
|
+
def _compute_next_unique_logical_op_id_map(self, next_map: dict[str, str | None], current_idx: int | None = None) -> tuple[int, str]:
|
|
358
|
+
"""Compute a mapping from each operator's unique logical_op_id to the next operator's unique logical_op_id.
|
|
359
|
+
|
|
360
|
+
The unique logical_op_id is constructed as "{topological_index}-{logical_op_id}" to differentiate between
|
|
361
|
+
multiple instances of the same logical operator in the plan (e.g., in self-joins).
|
|
362
|
+
|
|
363
|
+
Args:
|
|
364
|
+
next_map: A dictionary to populate with the mapping from unique logical_op_id to next logical_op_id.
|
|
365
|
+
current_idx: The current topological index in the plan. If None, starts at 0.
|
|
366
|
+
|
|
367
|
+
Returns:
|
|
368
|
+
A tuple containing:
|
|
369
|
+
- The current topological index after processing this plan.
|
|
370
|
+
- The logical_op_id of this plan's root logical operator (without the topological index prefix; the caller combines it with the returned index).
|
|
371
|
+
"""
|
|
372
|
+
# If there are subplans, compute their next maps first
|
|
373
|
+
subplan_topo_idx_op_id_pairs = []
|
|
374
|
+
for subplan in self.subplans:
|
|
375
|
+
current_idx, current_logical_op_id = subplan._compute_next_unique_logical_op_id_map(next_map, current_idx)
|
|
376
|
+
subplan_topo_idx_op_id_pairs.append((current_idx, current_logical_op_id))
|
|
377
|
+
current_idx += 1 # increment after processing each subplan
|
|
378
|
+
|
|
379
|
+
# for each subplan's root operator, set its next to this plan's root operator
|
|
380
|
+
for topo_idx, logical_op_id in subplan_topo_idx_op_id_pairs:
|
|
381
|
+
unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
|
|
382
|
+
this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
|
|
383
|
+
next_map[unique_logical_op_id] = this_unique_logical_op_id
|
|
384
|
+
|
|
385
|
+
# if this is the first call, initialize current_idx
|
|
386
|
+
if current_idx is None:
|
|
387
|
+
current_idx = 0
|
|
388
|
+
|
|
389
|
+
return current_idx, self.logical_op_id
|
|
390
|
+
|
|
391
|
+
def get_next_unique_logical_op_id(self, unique_logical_op_id: str) -> str | None:
|
|
392
|
+
"""Return the unique logical_op_id of the next operator in the plan after the given operator, or None if it is the last operator."""
|
|
393
|
+
return self.unique_logical_op_id_to_next_unique_logical_op_id[unique_logical_op_id]
|
|
394
|
+
|
|
395
|
+
def _compute_root_dataset_ids_map(self, root_dataset_ids_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, list[str]]:
|
|
396
|
+
# set the root dataset ids for this operator
|
|
397
|
+
all_subplan_root_dataset_ids = []
|
|
398
|
+
for subplan in self.subplans:
|
|
399
|
+
current_idx, subplan_root_dataset_ids = subplan._compute_root_dataset_ids_map(root_dataset_ids_map, current_idx)
|
|
400
|
+
all_subplan_root_dataset_ids.extend(subplan_root_dataset_ids)
|
|
401
|
+
current_idx += 1
|
|
402
|
+
|
|
403
|
+
# if current_idx is None, this is the first call, so we initialize it to 0
|
|
404
|
+
if current_idx is None:
|
|
405
|
+
current_idx = 0
|
|
406
|
+
|
|
407
|
+
# compute this operator's unique logical_op_id
|
|
408
|
+
this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
|
|
409
|
+
|
|
410
|
+
# if this operator is a root dataset scan, update root_dataset_ids
|
|
411
|
+
root_dataset_ids = []
|
|
412
|
+
if isinstance(self.operator_set[0], MarshalAndScanDataOp):
|
|
413
|
+
root_dataset_ids.append(self.operator_set[0].datasource.id)
|
|
414
|
+
elif isinstance(self.operator_set[0], ContextScanOp):
|
|
415
|
+
root_dataset_ids.append(self.operator_set[0].context.id)
|
|
416
|
+
|
|
417
|
+
# update the root_dataset_ids_map for this operator
|
|
418
|
+
root_dataset_ids_map[this_unique_logical_op_id] = root_dataset_ids + all_subplan_root_dataset_ids
|
|
419
|
+
|
|
420
|
+
# return the current index and the root dataset ids for this operator
|
|
421
|
+
return current_idx, root_dataset_ids_map[this_unique_logical_op_id]
|
|
422
|
+
|
|
423
|
+
def get_root_dataset_ids(self, unique_logical_op_id: str) -> list[str]:
|
|
424
|
+
"""Return the list of root dataset ids which are upstream of this operator."""
|
|
425
|
+
return self.unique_logical_op_id_to_root_dataset_ids[unique_logical_op_id]
|
|
426
|
+
|
|
427
|
+
def _compute_source_unique_logical_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
|
|
428
|
+
# get the topological index and logical_op_id pairs for all subplans' root operators
|
|
429
|
+
subplan_topo_idx_op_id_pairs = []
|
|
430
|
+
for subplan in self.subplans:
|
|
431
|
+
current_idx, current_logical_op_id = subplan._compute_source_unique_logical_op_ids_map(source_map, current_idx)
|
|
432
|
+
subplan_topo_idx_op_id_pairs.append((current_idx, current_logical_op_id))
|
|
433
|
+
current_idx += 1
|
|
434
|
+
|
|
435
|
+
# if current_idx is None, this is the first call, so we initialize it to 0
|
|
436
|
+
if current_idx is None:
|
|
437
|
+
current_idx = 0
|
|
438
|
+
|
|
439
|
+
# compute this operator's unique logical_op_id
|
|
440
|
+
this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
|
|
441
|
+
|
|
442
|
+
# update the source_map for this operator
|
|
443
|
+
source_map[this_unique_logical_op_id] = []
|
|
444
|
+
for topo_idx, logical_op_id in subplan_topo_idx_op_id_pairs:
|
|
445
|
+
unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
|
|
446
|
+
source_map[this_unique_logical_op_id].append(unique_logical_op_id)
|
|
447
|
+
|
|
448
|
+
# return the current unique logical_op_id for this operator
|
|
449
|
+
return current_idx, self.logical_op_id
|
|
450
|
+
|
|
451
|
+
def get_source_unique_logical_op_ids(self, unique_logical_op_id: str) -> list[str]:
|
|
452
|
+
"""Return the list of unique logical_op_ids for the input(s) to this operator."""
|
|
453
|
+
return self.unique_logical_op_id_to_source_logical_op_ids[unique_logical_op_id]
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from pydantic.fields import FieldInfo
|
|
4
|
+
|
|
4
5
|
from palimpzest.query.operators.logical import LogicalOperator
|
|
5
6
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
7
|
+
from palimpzest.query.optimizer import rules
|
|
6
8
|
from palimpzest.query.optimizer.plan import PlanCost
|
|
7
9
|
from palimpzest.utils.hash_helpers import hash_for_id
|
|
8
10
|
|
|
@@ -18,9 +20,9 @@ class Expression:
|
|
|
18
20
|
self,
|
|
19
21
|
operator: LogicalOperator | PhysicalOperator,
|
|
20
22
|
input_group_ids: list[int],
|
|
21
|
-
input_fields: dict[str,
|
|
23
|
+
input_fields: dict[str, FieldInfo],
|
|
22
24
|
depends_on_field_names: set[str],
|
|
23
|
-
generated_fields: dict[str,
|
|
25
|
+
generated_fields: dict[str, FieldInfo],
|
|
24
26
|
group_id: int | None = None,
|
|
25
27
|
):
|
|
26
28
|
self.operator = operator
|
|
@@ -36,37 +38,59 @@ class Expression:
|
|
|
36
38
|
self.plan_cost: PlanCost | None = None
|
|
37
39
|
|
|
38
40
|
# NOTE: this will be a list of tuples where each tuple has a (pareto-optimal) plan cost
|
|
39
|
-
# and the input plan cost for which that pareto-optimal plan cost is attainable
|
|
40
|
-
|
|
41
|
+
# and the tuple of input plan cost(s) for which that pareto-optimal plan cost is attainable;
|
|
42
|
+
# the tuple of input plan cost(s) is (input_plan_cost, None) for non-join operators and
|
|
43
|
+
# (left_input_plan_cost, right_input_plan_cost) for join operators
|
|
44
|
+
self.pareto_optimal_plan_costs: list[tuple[PlanCost, tuple[PlanCost, PlanCost]]] | None = None
|
|
45
|
+
|
|
46
|
+
# compute the expression id
|
|
47
|
+
self.expr_id = self._compute_expr_id()
|
|
41
48
|
|
|
42
49
|
def __eq__(self, other):
|
|
43
|
-
return self.
|
|
50
|
+
return self.expr_id == other.expr_id
|
|
44
51
|
|
|
45
52
|
def __str__(self):
|
|
46
|
-
|
|
47
|
-
|
|
53
|
+
expr_str = f"{self.__class__.__name__}(group_id={self.group_id}, expr_id={self.expr_id})"
|
|
54
|
+
expr_str += f"\n - input_group_ids: {self.input_group_ids}"
|
|
55
|
+
expr_str += f"\n - input_fields: {self.input_fields}"
|
|
56
|
+
expr_str += f"\n - depends_on_field_names: {self.depends_on_field_names}"
|
|
57
|
+
expr_str += f"\n - generated_fields: {self.generated_fields}"
|
|
58
|
+
expr_str += f"\n - operator:\n{str(self.operator)}"
|
|
59
|
+
return expr_str
|
|
48
60
|
|
|
49
61
|
def __hash__(self):
|
|
50
|
-
|
|
62
|
+
op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_full_op_id()
|
|
63
|
+
hash_str = str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
|
|
51
64
|
hash_id = int(hash_for_id(hash_str), 16)
|
|
52
65
|
return hash_id
|
|
53
66
|
|
|
54
|
-
def
|
|
67
|
+
def _compute_expr_id(self) -> int:
|
|
68
|
+
return self.__hash__()
|
|
69
|
+
|
|
70
|
+
def add_applied_rule(self, rule: type[rules.Rule]):
|
|
55
71
|
self.rules_applied.add(rule.get_rule_id())
|
|
56
72
|
|
|
57
73
|
def set_group_id(self, group_id: int) -> None:
|
|
58
74
|
self.group_id = group_id
|
|
59
75
|
|
|
60
|
-
def get_expr_id(self) -> int:
|
|
61
|
-
return self.__hash__()
|
|
62
|
-
|
|
63
76
|
|
|
64
77
|
class LogicalExpression(Expression):
|
|
65
78
|
pass
|
|
66
79
|
|
|
67
80
|
|
|
68
81
|
class PhysicalExpression(Expression):
|
|
69
|
-
|
|
82
|
+
|
|
83
|
+
@classmethod
|
|
84
|
+
def from_op_and_logical_expr(cls, op: PhysicalOperator, logical_expression: LogicalExpression) -> PhysicalExpression:
|
|
85
|
+
"""Construct a PhysicalExpression given a physical operator and a logical expression."""
|
|
86
|
+
return cls(
|
|
87
|
+
operator=op,
|
|
88
|
+
input_group_ids=logical_expression.input_group_ids,
|
|
89
|
+
input_fields=logical_expression.input_fields,
|
|
90
|
+
depends_on_field_names=logical_expression.depends_on_field_names,
|
|
91
|
+
generated_fields=logical_expression.generated_fields,
|
|
92
|
+
group_id=logical_expression.group_id,
|
|
93
|
+
)
|
|
70
94
|
|
|
71
95
|
|
|
72
96
|
class Group:
|
|
@@ -76,9 +100,9 @@ class Group:
|
|
|
76
100
|
Maintains a set of logical multi-expressions and physical multi-expressions.
|
|
77
101
|
"""
|
|
78
102
|
|
|
79
|
-
def __init__(self, logical_expressions: list[
|
|
80
|
-
self.logical_expressions = set(logical_expressions)
|
|
81
|
-
self.physical_expressions = set()
|
|
103
|
+
def __init__(self, logical_expressions: list[LogicalExpression], fields: dict[str, FieldInfo], properties: dict[str, set[str]]):
|
|
104
|
+
self.logical_expressions: set[LogicalExpression] = set(logical_expressions)
|
|
105
|
+
self.physical_expressions: set[PhysicalExpression] = set()
|
|
82
106
|
self.fields = fields
|
|
83
107
|
self.explored = False
|
|
84
108
|
self.best_physical_expression: PhysicalExpression | None = None
|
|
@@ -90,12 +114,12 @@ class Group:
|
|
|
90
114
|
self.properties = properties
|
|
91
115
|
|
|
92
116
|
# compute the group id
|
|
93
|
-
self.group_id = self.
|
|
117
|
+
self.group_id = self._compute_group_id()
|
|
94
118
|
|
|
95
119
|
def set_explored(self):
|
|
96
120
|
self.explored = True
|
|
97
121
|
|
|
98
|
-
def
|
|
122
|
+
def _compute_group_id(self) -> int:
|
|
99
123
|
# sort field names
|
|
100
124
|
sorted_fields = sorted(self.fields.keys())
|
|
101
125
|
|