palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
- palimpzest-0.7.1.dist-info/RECORD +96 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.4.dist-info/RECORD +0 -87
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from copy import deepcopy
|
|
4
5
|
|
|
5
6
|
from palimpzest.constants import Model
|
|
@@ -14,6 +15,7 @@ from palimpzest.query.operators.logical import (
|
|
|
14
15
|
GroupByAggregate,
|
|
15
16
|
LimitScan,
|
|
16
17
|
LogicalOperator,
|
|
18
|
+
MapScan,
|
|
17
19
|
Project,
|
|
18
20
|
RetrieveScan,
|
|
19
21
|
)
|
|
@@ -22,22 +24,17 @@ from palimpzest.query.optimizer import (
|
|
|
22
24
|
TRANSFORMATION_RULES,
|
|
23
25
|
)
|
|
24
26
|
from palimpzest.query.optimizer.cost_model import CostModel
|
|
25
|
-
from palimpzest.query.optimizer.
|
|
26
|
-
OptimizationStrategyType,
|
|
27
|
-
OptimizerStrategyRegistry,
|
|
28
|
-
)
|
|
27
|
+
from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
|
|
29
28
|
from palimpzest.query.optimizer.plan import PhysicalPlan
|
|
30
29
|
from palimpzest.query.optimizer.primitives import Group, LogicalExpression
|
|
31
30
|
from palimpzest.query.optimizer.rules import (
|
|
32
31
|
CodeSynthesisConvertRule,
|
|
33
32
|
CriticAndRefineConvertRule,
|
|
34
33
|
LLMConvertBondedRule,
|
|
35
|
-
LLMConvertConventionalRule,
|
|
36
34
|
MixtureOfAgentsConvertRule,
|
|
37
35
|
RAGConvertRule,
|
|
36
|
+
SplitConvertRule,
|
|
38
37
|
TokenReducedConvertBondedRule,
|
|
39
|
-
TokenReducedConvertConventionalRule,
|
|
40
|
-
TokenReducedConvertRule,
|
|
41
38
|
)
|
|
42
39
|
from palimpzest.query.optimizer.tasks import (
|
|
43
40
|
ApplyRule,
|
|
@@ -48,7 +45,9 @@ from palimpzest.query.optimizer.tasks import (
|
|
|
48
45
|
)
|
|
49
46
|
from palimpzest.sets import Dataset, Set
|
|
50
47
|
from palimpzest.utils.hash_helpers import hash_for_serialized_dict
|
|
51
|
-
from palimpzest.utils.model_helpers import get_champion_model, get_code_champion_model,
|
|
48
|
+
from palimpzest.utils.model_helpers import get_champion_model, get_code_champion_model, get_fallback_model
|
|
49
|
+
|
|
50
|
+
logger = logging.getLogger(__name__)
|
|
52
51
|
|
|
53
52
|
|
|
54
53
|
def get_node_uid(node: Dataset | DataReader) -> str:
|
|
@@ -86,22 +85,21 @@ class Optimizer:
|
|
|
86
85
|
self,
|
|
87
86
|
policy: Policy,
|
|
88
87
|
cost_model: CostModel,
|
|
89
|
-
|
|
88
|
+
available_models: list[Model],
|
|
89
|
+
cache: bool = False,
|
|
90
90
|
verbose: bool = False,
|
|
91
|
-
available_models: list[Model] | None = None,
|
|
92
91
|
allow_bonded_query: bool = True,
|
|
93
|
-
allow_conventional_query: bool = False,
|
|
94
92
|
allow_code_synth: bool = False,
|
|
95
93
|
allow_token_reduction: bool = False,
|
|
96
94
|
allow_rag_reduction: bool = False,
|
|
97
95
|
allow_mixtures: bool = True,
|
|
98
96
|
allow_critic: bool = False,
|
|
99
|
-
|
|
97
|
+
allow_split_merge: bool = False,
|
|
98
|
+
optimizer_strategy: OptimizationStrategyType = OptimizationStrategyType.PARETO,
|
|
100
99
|
use_final_op_quality: bool = False, # TODO: make this func(plan) -> final_quality
|
|
100
|
+
**kwargs,
|
|
101
101
|
):
|
|
102
102
|
# store the policy
|
|
103
|
-
if available_models is None or len(available_models) == 0:
|
|
104
|
-
available_models = []
|
|
105
103
|
self.policy = policy
|
|
106
104
|
|
|
107
105
|
# store the cost model
|
|
@@ -123,36 +121,38 @@ class Optimizer:
|
|
|
123
121
|
self.implementation_rules = IMPLEMENTATION_RULES
|
|
124
122
|
self.transformation_rules = TRANSFORMATION_RULES
|
|
125
123
|
|
|
126
|
-
|
|
124
|
+
# get the strategy class associated with the optimizer strategy
|
|
125
|
+
optimizer_strategy_cls = optimizer_strategy.value
|
|
126
|
+
self.strategy = optimizer_strategy_cls()
|
|
127
127
|
|
|
128
|
-
#
|
|
129
|
-
if
|
|
128
|
+
# remove transformation rules for optimization strategies which do not require them
|
|
129
|
+
if optimizer_strategy.no_transformation():
|
|
130
130
|
self.transformation_rules = []
|
|
131
131
|
|
|
132
132
|
# if we are not performing optimization, set available models to be single model
|
|
133
133
|
# and remove all optimizations (except for bonded queries)
|
|
134
|
-
if
|
|
134
|
+
if optimizer_strategy == OptimizationStrategyType.NONE:
|
|
135
135
|
self.allow_bonded_query = True
|
|
136
|
-
self.allow_conventional_query = False
|
|
137
136
|
self.allow_code_synth = False
|
|
138
137
|
self.allow_token_reduction = False
|
|
139
138
|
self.allow_rag_reduction = False
|
|
140
139
|
self.allow_mixtures = False
|
|
141
140
|
self.allow_critic = False
|
|
141
|
+
self.allow_split_merge = False
|
|
142
142
|
self.available_models = [available_models[0]]
|
|
143
143
|
|
|
144
144
|
# store optimization hyperparameters
|
|
145
|
-
self.
|
|
145
|
+
self.cache = cache
|
|
146
146
|
self.verbose = verbose
|
|
147
147
|
self.available_models = available_models
|
|
148
148
|
self.allow_bonded_query = allow_bonded_query
|
|
149
|
-
self.allow_conventional_query = allow_conventional_query
|
|
150
149
|
self.allow_code_synth = allow_code_synth
|
|
151
150
|
self.allow_token_reduction = allow_token_reduction
|
|
152
151
|
self.allow_rag_reduction = allow_rag_reduction
|
|
153
152
|
self.allow_mixtures = allow_mixtures
|
|
154
153
|
self.allow_critic = allow_critic
|
|
155
|
-
self.
|
|
154
|
+
self.allow_split_merge = allow_split_merge
|
|
155
|
+
self.optimizer_strategy = optimizer_strategy
|
|
156
156
|
self.use_final_op_quality = use_final_op_quality
|
|
157
157
|
|
|
158
158
|
# prune implementation rules based on boolean flags
|
|
@@ -163,13 +163,6 @@ class Optimizer:
|
|
|
163
163
|
if rule not in [LLMConvertBondedRule, TokenReducedConvertBondedRule]
|
|
164
164
|
]
|
|
165
165
|
|
|
166
|
-
if not self.allow_conventional_query:
|
|
167
|
-
self.implementation_rules = [
|
|
168
|
-
rule
|
|
169
|
-
for rule in self.implementation_rules
|
|
170
|
-
if rule not in [LLMConvertConventionalRule, TokenReducedConvertConventionalRule]
|
|
171
|
-
]
|
|
172
|
-
|
|
173
166
|
if not self.allow_code_synth:
|
|
174
167
|
self.implementation_rules = [
|
|
175
168
|
rule for rule in self.implementation_rules if not issubclass(rule, CodeSynthesisConvertRule)
|
|
@@ -177,7 +170,7 @@ class Optimizer:
|
|
|
177
170
|
|
|
178
171
|
if not self.allow_token_reduction:
|
|
179
172
|
self.implementation_rules = [
|
|
180
|
-
rule for rule in self.implementation_rules if not issubclass(rule,
|
|
173
|
+
rule for rule in self.implementation_rules if not issubclass(rule, TokenReducedConvertBondedRule)
|
|
181
174
|
]
|
|
182
175
|
|
|
183
176
|
if not self.allow_rag_reduction:
|
|
@@ -187,8 +180,7 @@ class Optimizer:
|
|
|
187
180
|
|
|
188
181
|
if not self.allow_mixtures:
|
|
189
182
|
self.implementation_rules = [
|
|
190
|
-
rule for rule in self.implementation_rules
|
|
191
|
-
if not issubclass(rule, MixtureOfAgentsConvertRule)
|
|
183
|
+
rule for rule in self.implementation_rules if not issubclass(rule, MixtureOfAgentsConvertRule)
|
|
192
184
|
]
|
|
193
185
|
|
|
194
186
|
if not self.allow_critic:
|
|
@@ -196,8 +188,17 @@ class Optimizer:
|
|
|
196
188
|
rule for rule in self.implementation_rules if not issubclass(rule, CriticAndRefineConvertRule)
|
|
197
189
|
]
|
|
198
190
|
|
|
191
|
+
if not self.allow_split_merge:
|
|
192
|
+
self.implementation_rules = [
|
|
193
|
+
rule for rule in self.implementation_rules if not issubclass(rule, SplitConvertRule)
|
|
194
|
+
]
|
|
195
|
+
|
|
196
|
+
logger.info(f"Initialized Optimizer with verbose={self.verbose}")
|
|
197
|
+
logger.debug(f"Initialized Optimizer with params: {self.__dict__}")
|
|
198
|
+
|
|
199
199
|
def update_cost_model(self, cost_model: CostModel):
|
|
200
200
|
self.cost_model = cost_model
|
|
201
|
+
self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()
|
|
201
202
|
|
|
202
203
|
def get_physical_op_params(self):
|
|
203
204
|
return {
|
|
@@ -205,38 +206,41 @@ class Optimizer:
|
|
|
205
206
|
"available_models": self.available_models,
|
|
206
207
|
"champion_model": get_champion_model(self.available_models),
|
|
207
208
|
"code_champion_model": get_code_champion_model(self.available_models),
|
|
208
|
-
"
|
|
209
|
+
"fallback_model": get_fallback_model(self.available_models),
|
|
209
210
|
}
|
|
210
211
|
|
|
211
212
|
def deepcopy_clean(self):
|
|
212
213
|
optimizer = Optimizer(
|
|
213
214
|
policy=self.policy,
|
|
214
215
|
cost_model=CostModel(),
|
|
215
|
-
|
|
216
|
+
cache=self.cache,
|
|
216
217
|
verbose=self.verbose,
|
|
217
218
|
available_models=self.available_models,
|
|
218
219
|
allow_bonded_query=self.allow_bonded_query,
|
|
219
|
-
allow_conventional_query=self.allow_conventional_query,
|
|
220
220
|
allow_code_synth=self.allow_code_synth,
|
|
221
221
|
allow_token_reduction=self.allow_token_reduction,
|
|
222
222
|
allow_rag_reduction=self.allow_rag_reduction,
|
|
223
223
|
allow_mixtures=self.allow_mixtures,
|
|
224
224
|
allow_critic=self.allow_critic,
|
|
225
|
-
|
|
225
|
+
allow_split_merge=self.allow_split_merge,
|
|
226
|
+
optimizer_strategy=self.optimizer_strategy,
|
|
226
227
|
use_final_op_quality=self.use_final_op_quality,
|
|
227
228
|
)
|
|
228
229
|
return optimizer
|
|
229
|
-
|
|
230
|
-
def update_strategy(self,
|
|
231
|
-
self.
|
|
232
|
-
|
|
233
|
-
|
|
230
|
+
|
|
231
|
+
def update_strategy(self, optimizer_strategy: OptimizationStrategyType):
|
|
232
|
+
self.optimizer_strategy = optimizer_strategy
|
|
233
|
+
optimizer_strategy_cls = optimizer_strategy.value
|
|
234
|
+
self.strategy = optimizer_strategy_cls()
|
|
235
|
+
|
|
234
236
|
def construct_group_tree(self, dataset_nodes: list[Set]) -> tuple[list[int], dict[str, Field], dict[str, set[str]]]:
|
|
235
237
|
# get node, output_schema, and input_schema (if applicable)
|
|
238
|
+
logger.debug(f"Constructing group tree for dataset_nodes: {dataset_nodes}")
|
|
239
|
+
|
|
236
240
|
node = dataset_nodes[-1]
|
|
237
241
|
output_schema = node.schema
|
|
238
242
|
input_schema = dataset_nodes[-2].schema if len(dataset_nodes) > 1 else None
|
|
239
|
-
|
|
243
|
+
|
|
240
244
|
### convert node --> Group ###
|
|
241
245
|
uid = get_node_uid(node)
|
|
242
246
|
|
|
@@ -244,7 +248,7 @@ class Optimizer:
|
|
|
244
248
|
op: LogicalOperator | None = None
|
|
245
249
|
|
|
246
250
|
# TODO: add cache scan when we add caching back to PZ
|
|
247
|
-
# if
|
|
251
|
+
# if self.cache:
|
|
248
252
|
# op = CacheScan(datareader=node, output_schema=output_schema)
|
|
249
253
|
if isinstance(node, DataReader):
|
|
250
254
|
op = BaseScan(datareader=node, output_schema=output_schema)
|
|
@@ -291,9 +295,9 @@ class Optimizer:
|
|
|
291
295
|
index=node._index,
|
|
292
296
|
search_func=node._search_func,
|
|
293
297
|
search_attr=node._search_attr,
|
|
294
|
-
|
|
298
|
+
output_attrs=node._output_attrs,
|
|
295
299
|
k=node._k,
|
|
296
|
-
target_cache_id=uid
|
|
300
|
+
target_cache_id=uid,
|
|
297
301
|
)
|
|
298
302
|
elif output_schema != input_schema:
|
|
299
303
|
op = ConvertScan(
|
|
@@ -304,6 +308,13 @@ class Optimizer:
|
|
|
304
308
|
depends_on=node._depends_on,
|
|
305
309
|
target_cache_id=uid,
|
|
306
310
|
)
|
|
311
|
+
elif output_schema == input_schema and node._udf is not None:
|
|
312
|
+
op = MapScan(
|
|
313
|
+
input_schema=input_schema,
|
|
314
|
+
output_schema=output_schema,
|
|
315
|
+
udf=node._udf,
|
|
316
|
+
target_cache_id=uid,
|
|
317
|
+
)
|
|
307
318
|
# some legacy plans may have a useless convert; for now we simply skip it
|
|
308
319
|
elif output_schema == input_schema:
|
|
309
320
|
return self.construct_group_tree(dataset_nodes[:-1]) if len(dataset_nodes) > 1 else ([], {}, {})
|
|
@@ -319,7 +330,9 @@ class Optimizer:
|
|
|
319
330
|
)
|
|
320
331
|
|
|
321
332
|
# compute the fields added by this operation and all fields
|
|
322
|
-
input_group_short_field_names = list(
|
|
333
|
+
input_group_short_field_names = list(
|
|
334
|
+
map(lambda full_field: full_field.split(".")[-1], input_group_fields.keys())
|
|
335
|
+
)
|
|
323
336
|
new_fields = {
|
|
324
337
|
field_name: field
|
|
325
338
|
for field_name, field in op.output_schema.field_map(unique=True, id=uid).items()
|
|
@@ -329,9 +342,7 @@ class Optimizer:
|
|
|
329
342
|
|
|
330
343
|
# compute the set of (short) field names this operation depends on
|
|
331
344
|
depends_on_field_names = (
|
|
332
|
-
{}
|
|
333
|
-
if isinstance(node, DataReader)
|
|
334
|
-
else {field_name.split(".")[-1] for field_name in node._depends_on}
|
|
345
|
+
{} if isinstance(node, DataReader) else {field_name.split(".")[-1] for field_name in node._depends_on}
|
|
335
346
|
)
|
|
336
347
|
|
|
337
348
|
# compute all properties including this operations'
|
|
@@ -351,7 +362,7 @@ class Optimizer:
|
|
|
351
362
|
all_properties["limits"].add(op_limit_str)
|
|
352
363
|
else:
|
|
353
364
|
all_properties["limits"] = set([op_limit_str])
|
|
354
|
-
|
|
365
|
+
|
|
355
366
|
elif isinstance(op, Project):
|
|
356
367
|
op_project_str = op.get_logical_op_id()
|
|
357
368
|
if "projects" in all_properties:
|
|
@@ -359,6 +370,13 @@ class Optimizer:
|
|
|
359
370
|
else:
|
|
360
371
|
all_properties["projects"] = set([op_project_str])
|
|
361
372
|
|
|
373
|
+
elif isinstance(op, MapScan):
|
|
374
|
+
op_udf_str = op.udf.__name__
|
|
375
|
+
if "udfs" in all_properties:
|
|
376
|
+
all_properties["udfs"].add(op_udf_str)
|
|
377
|
+
else:
|
|
378
|
+
all_properties["udfs"] = set([op_udf_str])
|
|
379
|
+
|
|
362
380
|
# construct the logical expression and group
|
|
363
381
|
logical_expression = LogicalExpression(
|
|
364
382
|
operator=op,
|
|
@@ -378,13 +396,16 @@ class Optimizer:
|
|
|
378
396
|
# add the expression and group to the optimizer's expressions and groups and return
|
|
379
397
|
self.expressions[logical_expression.get_expr_id()] = logical_expression
|
|
380
398
|
self.groups[group.group_id] = group
|
|
399
|
+
logger.debug(f"Constructed group tree for dataset_nodes: {dataset_nodes}")
|
|
400
|
+
logger.debug(f"Group: {group.group_id}, {all_fields}, {all_properties}")
|
|
381
401
|
|
|
382
402
|
return [group.group_id], all_fields, all_properties
|
|
383
403
|
|
|
384
404
|
def convert_query_plan_to_group_tree(self, query_plan: Dataset) -> str:
|
|
405
|
+
logger.debug(f"Converting query plan to group tree for query_plan: {query_plan}")
|
|
385
406
|
# Obtain ordered list of datasets
|
|
386
407
|
dataset_nodes: list[Dataset | DataReader] = []
|
|
387
|
-
node =
|
|
408
|
+
node = query_plan.copy()
|
|
388
409
|
|
|
389
410
|
# NOTE: the very first node will be a DataReader; the rest will be Dataset
|
|
390
411
|
while isinstance(node, Dataset):
|
|
@@ -427,7 +448,8 @@ class Optimizer:
|
|
|
427
448
|
# check that final_group_id is a singleton
|
|
428
449
|
assert len(final_group_id) == 1
|
|
429
450
|
final_group_id = final_group_id[0]
|
|
430
|
-
|
|
451
|
+
logger.debug(f"Converted query plan to group tree for query_plan: {query_plan}")
|
|
452
|
+
logger.debug(f"Final group id: {final_group_id}")
|
|
431
453
|
return final_group_id
|
|
432
454
|
|
|
433
455
|
def heuristic_optimization(self, group_id: int) -> None:
|
|
@@ -437,6 +459,8 @@ class Optimizer:
|
|
|
437
459
|
pass
|
|
438
460
|
|
|
439
461
|
def search_optimization_space(self, group_id: int) -> None:
|
|
462
|
+
logger.debug(f"Searching optimization space for group_id: {group_id}")
|
|
463
|
+
|
|
440
464
|
# begin the search for an optimal plan with a task to optimize the final group
|
|
441
465
|
initial_task = OptimizeGroup(group_id)
|
|
442
466
|
self.tasks_stack.append(initial_task)
|
|
@@ -451,18 +475,23 @@ class Optimizer:
|
|
|
451
475
|
new_tasks = task.perform(self.transformation_rules, self.implementation_rules)
|
|
452
476
|
elif isinstance(task, ApplyRule):
|
|
453
477
|
context = {"costed_phys_op_ids": self.costed_phys_op_ids}
|
|
454
|
-
new_tasks = task.perform(
|
|
478
|
+
new_tasks = task.perform(
|
|
479
|
+
self.groups, self.expressions, context=context, **self.get_physical_op_params()
|
|
480
|
+
)
|
|
455
481
|
elif isinstance(task, OptimizePhysicalExpression):
|
|
456
|
-
context = {"
|
|
482
|
+
context = {"optimizer_strategy": self.optimizer_strategy}
|
|
457
483
|
new_tasks = task.perform(self.cost_model, self.groups, self.policy, context=context)
|
|
458
484
|
|
|
459
485
|
self.tasks_stack.extend(new_tasks)
|
|
460
486
|
|
|
461
|
-
|
|
487
|
+
logger.debug(f"Done searching optimization space for group_id: {group_id}")
|
|
488
|
+
|
|
489
|
+
def optimize(self, query_plan: Dataset) -> list[PhysicalPlan]:
|
|
462
490
|
"""
|
|
463
491
|
The optimize function takes in an initial query plan and searches the space of
|
|
464
492
|
logical and physical plans in order to cost and produce a (near) optimal physical plan.
|
|
465
493
|
"""
|
|
494
|
+
logger.info(f"Optimizing query plan: {query_plan}")
|
|
466
495
|
# compute the initial group tree for the user plan
|
|
467
496
|
final_group_id = self.convert_query_plan_to_group_tree(query_plan)
|
|
468
497
|
|
|
@@ -472,6 +501,6 @@ class Optimizer:
|
|
|
472
501
|
|
|
473
502
|
# search the optimization space by applying logical and physical transformations to the initial group tree
|
|
474
503
|
self.search_optimization_space(final_group_id)
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
504
|
+
logger.info(f"Getting optimal plans for final group id: {final_group_id}")
|
|
505
|
+
|
|
506
|
+
return self.strategy.get_optimal_plans(self.groups, final_group_id, self.policy, self.use_final_op_quality)
|
|
@@ -1,24 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from abc import ABC, abstractmethod
|
|
4
|
-
from copy import deepcopy
|
|
5
|
-
from enum import Enum
|
|
6
5
|
|
|
7
6
|
from palimpzest.policy import Policy
|
|
8
7
|
from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
|
|
9
8
|
|
|
10
|
-
|
|
11
|
-
class OptimizationStrategyType(str, Enum):
|
|
12
|
-
"""
|
|
13
|
-
OptimizationStrategyType determines which (set of) plan(s) the Optimizer
|
|
14
|
-
will return to the Execution layer.
|
|
15
|
-
"""
|
|
16
|
-
GREEDY = "greedy"
|
|
17
|
-
CONFIDENCE_INTERVAL = "confidence-interval"
|
|
18
|
-
PARETO = "pareto"
|
|
19
|
-
SENTINEL = "sentinel"
|
|
20
|
-
NONE = "none"
|
|
21
|
-
AUTO = "auto"
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
22
10
|
|
|
23
11
|
|
|
24
12
|
class OptimizationStrategy(ABC):
|
|
@@ -27,11 +15,6 @@ class OptimizationStrategy(ABC):
|
|
|
27
15
|
"""Strategy decides how to search through the groups for optimal plan(s)"""
|
|
28
16
|
pass
|
|
29
17
|
|
|
30
|
-
@classmethod
|
|
31
|
-
def get_strategy(cls, strategy_type: str) -> OptimizationStrategy:
|
|
32
|
-
"""Factory method to create strategy instances"""
|
|
33
|
-
return OptimizerStrategyRegistry.get_strategy(strategy_type)
|
|
34
|
-
|
|
35
18
|
def normalize_final_plans(self, plans: list[PhysicalPlan]) -> list[PhysicalPlan]:
|
|
36
19
|
"""
|
|
37
20
|
For each plan in `plans`, this function enforces that the input schema of every
|
|
@@ -47,7 +30,7 @@ class OptimizationStrategy(ABC):
|
|
|
47
30
|
for plan in plans:
|
|
48
31
|
normalized_ops = []
|
|
49
32
|
for idx, op in enumerate(plan.operators):
|
|
50
|
-
op_copy =
|
|
33
|
+
op_copy = op.copy()
|
|
51
34
|
if idx == 0:
|
|
52
35
|
normalized_ops.append(op_copy)
|
|
53
36
|
else:
|
|
@@ -79,7 +62,12 @@ class GreedyStrategy(OptimizationStrategy):
|
|
|
79
62
|
return PhysicalPlan.from_ops_and_sub_plan([best_phys_expr.operator], input_best_phys_plan, best_phys_expr.plan_cost)
|
|
80
63
|
|
|
81
64
|
def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
|
|
82
|
-
|
|
65
|
+
logger.info(f"Getting greedy optimal plans for final group id: {final_group_id}")
|
|
66
|
+
plans = [self._get_greedy_physical_plan(groups, final_group_id)]
|
|
67
|
+
logger.info(f"Greedy optimal plans: {plans}")
|
|
68
|
+
logger.info(f"Done getting greedy optimal plans for final group id: {final_group_id}")
|
|
69
|
+
|
|
70
|
+
return plans
|
|
83
71
|
|
|
84
72
|
|
|
85
73
|
class ParetoStrategy(OptimizationStrategy):
|
|
@@ -127,8 +115,9 @@ class ParetoStrategy(OptimizationStrategy):
|
|
|
127
115
|
pareto_optimal_plans.append(plan)
|
|
128
116
|
|
|
129
117
|
return pareto_optimal_plans
|
|
130
|
-
|
|
118
|
+
|
|
131
119
|
def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
|
|
120
|
+
logger.info(f"Getting pareto optimal plans for final group id: {final_group_id}")
|
|
132
121
|
# compute all of the pareto optimal physical plans
|
|
133
122
|
plans = self._get_candidate_pareto_physical_plans(groups, final_group_id, policy)
|
|
134
123
|
|
|
@@ -138,7 +127,6 @@ class ParetoStrategy(OptimizationStrategy):
|
|
|
138
127
|
plan.plan_cost.quality = plan.plan_cost.op_estimates.quality
|
|
139
128
|
|
|
140
129
|
# filter pareto optimal plans for ones which satisfy policy constraint (if at least one of them does)
|
|
141
|
-
# import pdb; pdb.set_trace()
|
|
142
130
|
if any([policy.constraint(plan.plan_cost) for plan in plans]):
|
|
143
131
|
plans = [plan for plan in plans if policy.constraint(plan.plan_cost)]
|
|
144
132
|
|
|
@@ -148,6 +136,8 @@ class ParetoStrategy(OptimizationStrategy):
|
|
|
148
136
|
optimal_plan = optimal_plan if policy.choose(optimal_plan.plan_cost, plan.plan_cost) else plan
|
|
149
137
|
|
|
150
138
|
plans = [optimal_plan]
|
|
139
|
+
logger.info(f"Pareto optimal plans: {plans}")
|
|
140
|
+
logger.info(f"Done getting pareto optimal plans for final group id: {final_group_id}")
|
|
151
141
|
return plans
|
|
152
142
|
|
|
153
143
|
|
|
@@ -177,7 +167,11 @@ class SentinelStrategy(OptimizationStrategy):
|
|
|
177
167
|
return SentinelPlan.from_ops_and_sub_plan([phys_op_set], best_phys_subplan)
|
|
178
168
|
|
|
179
169
|
def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[SentinelPlan]:
|
|
180
|
-
|
|
170
|
+
logger.info(f"Getting sentinel optimal plans for final group id: {final_group_id}")
|
|
171
|
+
plans = [self._get_sentinel_plan(groups, final_group_id)]
|
|
172
|
+
logger.info(f"Sentinel optimal plans: {plans}")
|
|
173
|
+
logger.info(f"Done getting sentinel optimal plans for final group id: {final_group_id}")
|
|
174
|
+
return plans
|
|
181
175
|
|
|
182
176
|
|
|
183
177
|
class NoOptimizationStrategy(GreedyStrategy):
|
|
@@ -186,76 +180,3 @@ class NoOptimizationStrategy(GreedyStrategy):
|
|
|
186
180
|
logical transformations or optimizations. It uses the same get_optimal_plans logic as the
|
|
187
181
|
GreedyOptimizationStrategy.
|
|
188
182
|
"""
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
class ConfidenceIntervalStrategy(OptimizationStrategy):
|
|
192
|
-
def _get_confidence_interval_optimal_plans(self, groups: dict, group_id: int) -> list[PhysicalPlan]:
|
|
193
|
-
"""
|
|
194
|
-
Return all physical plans whose upper bound on the primary policy metric is greater than the
|
|
195
|
-
best plan's lower bound on the primary policy metric (subject to satisfying the policy constraint).
|
|
196
|
-
|
|
197
|
-
The OptimizePhysicalExpression task guarantees that each group's `ci_best_physical_expressions`
|
|
198
|
-
maintains a list of expressions with overlapping CI's on the primary policy metric (while also
|
|
199
|
-
satisfying the policy constraint).
|
|
200
|
-
|
|
201
|
-
This function computes the cross-product of all such expressions across all groups.
|
|
202
|
-
"""
|
|
203
|
-
# get all the physical expressions which could be the best for this group
|
|
204
|
-
best_phys_exprs = groups[group_id].ci_best_physical_expressions
|
|
205
|
-
|
|
206
|
-
best_plans = []
|
|
207
|
-
for phys_expr in best_phys_exprs:
|
|
208
|
-
# if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
|
|
209
|
-
# create the physical plan and append it to the best_plans for this group
|
|
210
|
-
if len(phys_expr.input_group_ids) == 0:
|
|
211
|
-
plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=phys_expr.plan_cost)
|
|
212
|
-
best_plans.append(plan)
|
|
213
|
-
|
|
214
|
-
# otherwise, get the best physical plan(s) for this group's inputs
|
|
215
|
-
else:
|
|
216
|
-
# TODO: need to handle joins
|
|
217
|
-
best_phys_subplans = [PhysicalPlan(operators=[])]
|
|
218
|
-
for input_group_id in phys_expr.input_group_ids:
|
|
219
|
-
input_best_phys_plans = self._get_confidence_interval_optimal_plans(groups, input_group_id)
|
|
220
|
-
best_phys_subplans = [
|
|
221
|
-
PhysicalPlan.from_ops_and_sub_plan(subplan.operators, input_subplan, subplan.plan_cost)
|
|
222
|
-
for subplan in best_phys_subplans
|
|
223
|
-
for input_subplan in input_best_phys_plans
|
|
224
|
-
]
|
|
225
|
-
|
|
226
|
-
# add this operator to best physical plan and return
|
|
227
|
-
for subplan in best_phys_subplans:
|
|
228
|
-
plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, phys_expr.plan_cost)
|
|
229
|
-
best_plans.append(plan)
|
|
230
|
-
|
|
231
|
-
return best_plans
|
|
232
|
-
|
|
233
|
-
def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
|
|
234
|
-
# TODO: fix this to properly handle multiple potential plans
|
|
235
|
-
raise Exception("NotImplementedError")
|
|
236
|
-
# plans = self._get_confidence_interval_optimal_plans(final_group_id)
|
|
237
|
-
|
|
238
|
-
class AutoOptimizationStrategy(OptimizationStrategy):
|
|
239
|
-
def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
|
|
240
|
-
raise NotImplementedError("Auto optimization strategy not implemented")
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
class OptimizerStrategyRegistry:
|
|
244
|
-
"""Registry to map strategy types to their implementations"""
|
|
245
|
-
|
|
246
|
-
_strategies: dict[str, type[OptimizationStrategy]] = {
|
|
247
|
-
OptimizationStrategyType.GREEDY.value: GreedyStrategy,
|
|
248
|
-
OptimizationStrategyType.CONFIDENCE_INTERVAL.value: ConfidenceIntervalStrategy,
|
|
249
|
-
OptimizationStrategyType.PARETO.value: ParetoStrategy,
|
|
250
|
-
OptimizationStrategyType.SENTINEL.value: SentinelStrategy,
|
|
251
|
-
OptimizationStrategyType.NONE.value: NoOptimizationStrategy,
|
|
252
|
-
OptimizationStrategyType.AUTO.value: AutoOptimizationStrategy,
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
@classmethod
|
|
256
|
-
def get_strategy(cls, strategy_type: str) -> OptimizationStrategy:
|
|
257
|
-
"""Get strategy instance by type"""
|
|
258
|
-
strategy_class = cls._strategies.get(strategy_type)
|
|
259
|
-
if not strategy_class:
|
|
260
|
-
raise ValueError(f"Unknown optimization strategy: {strategy_type}")
|
|
261
|
-
return strategy_class()
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
from palimpzest.query.optimizer.optimizer_strategy import (
|
|
4
|
+
GreedyStrategy,
|
|
5
|
+
NoOptimizationStrategy,
|
|
6
|
+
ParetoStrategy,
|
|
7
|
+
SentinelStrategy,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class OptimizationStrategyType(Enum):
|
|
12
|
+
"""
|
|
13
|
+
OptimizationStrategyType determines which (set of) plan(s) the Optimizer
|
|
14
|
+
will return to the Execution layer.
|
|
15
|
+
"""
|
|
16
|
+
GREEDY = GreedyStrategy
|
|
17
|
+
PARETO = ParetoStrategy
|
|
18
|
+
SENTINEL = SentinelStrategy
|
|
19
|
+
NONE = NoOptimizationStrategy
|
|
20
|
+
|
|
21
|
+
def no_transformation(self) -> bool:
|
|
22
|
+
"""
|
|
23
|
+
Return True if this optimization strategy does not transform the logical plan.
|
|
24
|
+
"""
|
|
25
|
+
return self in [OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE]
|
|
26
|
+
|
|
27
|
+
def is_pareto(self) -> bool:
|
|
28
|
+
"""
|
|
29
|
+
Return True if this optimization strategy uses Pareto optimization.
|
|
30
|
+
"""
|
|
31
|
+
return self == OptimizationStrategyType.PARETO
|
|
32
|
+
|
|
33
|
+
def is_not_pareto(self) -> bool:
|
|
34
|
+
"""
|
|
35
|
+
Return True if this optimization strategy does not use Pareto optimization.
|
|
36
|
+
"""
|
|
37
|
+
return not self.is_pareto()
|
|
@@ -106,8 +106,7 @@ class SentinelPlan(Plan):
|
|
|
106
106
|
# store operator_sets and logical_op_ids; sort operator_sets internally by op_id
|
|
107
107
|
self.operator_sets = operator_sets
|
|
108
108
|
self.operator_sets = [sorted(op_set, key=lambda op: op.get_op_id()) for op_set in self.operator_sets]
|
|
109
|
-
self.logical_op_ids = [op_set[0].logical_op_id for op_set in operator_sets]
|
|
110
|
-
self.logical_op_names = [op_set[0].logical_op_name for op_set in operator_sets]
|
|
109
|
+
self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
|
|
111
110
|
self.plan_id = self.compute_plan_id()
|
|
112
111
|
|
|
113
112
|
def compute_plan_id(self) -> str:
|
|
@@ -151,7 +150,7 @@ class SentinelPlan(Plan):
|
|
|
151
150
|
return self.logical_op_ids[slice], self.operator_sets[slice]
|
|
152
151
|
|
|
153
152
|
def __iter__(self):
|
|
154
|
-
yield from zip(self.logical_op_ids, self.
|
|
153
|
+
yield from zip(self.logical_op_ids, self.operator_sets)
|
|
155
154
|
|
|
156
155
|
def __len__(self):
|
|
157
156
|
return len(self.logical_op_ids)
|
|
@@ -42,9 +42,12 @@ class Expression:
|
|
|
42
42
|
def __eq__(self, other):
|
|
43
43
|
return self.operator == other.operator and self.input_group_ids == other.input_group_ids
|
|
44
44
|
|
|
45
|
-
def
|
|
45
|
+
def __str__(self):
|
|
46
46
|
op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_op_id()
|
|
47
|
-
|
|
47
|
+
return str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
|
|
48
|
+
|
|
49
|
+
def __hash__(self):
|
|
50
|
+
hash_str = self.__str__()
|
|
48
51
|
hash_id = int(hash_for_id(hash_str), 16)
|
|
49
52
|
return hash_id
|
|
50
53
|
|
|
@@ -80,7 +83,6 @@ class Group:
|
|
|
80
83
|
self.explored = False
|
|
81
84
|
self.best_physical_expression: PhysicalExpression | None = None
|
|
82
85
|
self.pareto_optimal_physical_expressions: list[PhysicalExpression] | None = None
|
|
83
|
-
self.ci_best_physical_expressions: list[PhysicalExpression] | None = None
|
|
84
86
|
self.optimized = False
|
|
85
87
|
|
|
86
88
|
# properties of the Group which distinguish it from groups w/identical fields,
|