palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.20.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -3,8 +3,10 @@ from __future__ import annotations
|
|
|
3
3
|
import logging
|
|
4
4
|
from typing import Any
|
|
5
5
|
|
|
6
|
-
from palimpzest.core.
|
|
6
|
+
from palimpzest.core.models import PlanCost
|
|
7
7
|
from palimpzest.policy import Policy
|
|
8
|
+
from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
|
|
9
|
+
from palimpzest.query.operators.join import JoinOp
|
|
8
10
|
from palimpzest.query.optimizer.cost_model import BaseCostModel
|
|
9
11
|
from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
|
|
10
12
|
from palimpzest.query.optimizer.primitives import Expression, Group
|
|
@@ -120,14 +122,15 @@ class OptimizeLogicalExpression(Task):
|
|
|
120
122
|
|
|
121
123
|
def perform(
|
|
122
124
|
self,
|
|
123
|
-
transformation_rules: list[TransformationRule],
|
|
124
|
-
implementation_rules: list[ImplementationRule],
|
|
125
|
+
transformation_rules: list[type[TransformationRule]],
|
|
126
|
+
implementation_rules: list[type[ImplementationRule]],
|
|
125
127
|
context: dict[str, Any] | None = None,
|
|
126
128
|
) -> list[Task]:
|
|
127
129
|
logger.debug(f"Optimizing logical expression {self.logical_expression}")
|
|
128
|
-
# if we're exploring, only apply transformation rules
|
|
129
130
|
if context is None:
|
|
130
131
|
context = {}
|
|
132
|
+
|
|
133
|
+
# if we're exploring, only apply transformation rules
|
|
131
134
|
rules = transformation_rules if self.exploring else transformation_rules + implementation_rules
|
|
132
135
|
|
|
133
136
|
# filter out rules that have already been applied to logical expression
|
|
@@ -170,7 +173,7 @@ class ApplyRule(Task):
|
|
|
170
173
|
- schedule OptimizePhysicalExpression tasks
|
|
171
174
|
"""
|
|
172
175
|
|
|
173
|
-
def __init__(self, rule: Rule, logical_expression: Expression, exploring: bool = False):
|
|
176
|
+
def __init__(self, rule: type[Rule], logical_expression: Expression, exploring: bool = False):
|
|
174
177
|
self.rule = rule
|
|
175
178
|
self.logical_expression = logical_expression
|
|
176
179
|
self.exploring = exploring
|
|
@@ -183,16 +186,13 @@ class ApplyRule(Task):
|
|
|
183
186
|
**physical_op_params,
|
|
184
187
|
) -> tuple[list[Task], int]:
|
|
185
188
|
logger.debug(f"Applying rule {self.rule} to logical expression {self.logical_expression}")
|
|
186
|
-
|
|
187
|
-
# check if rule has already been applied to this logical expression; return [] if so
|
|
188
189
|
if context is None:
|
|
189
190
|
context = {}
|
|
191
|
+
|
|
192
|
+
# check if rule has already been applied to this logical expression; return [] if so
|
|
190
193
|
if self.rule.get_rule_id() in self.logical_expression.rules_applied:
|
|
191
194
|
return []
|
|
192
195
|
|
|
193
|
-
# MAYBE ?TODO?: iterate over bindings for logical expression and rule?
|
|
194
|
-
# perhaps some rules can be applied more than once to an expression?
|
|
195
|
-
|
|
196
196
|
# get the group of the logical expression
|
|
197
197
|
group_id = self.logical_expression.group_id
|
|
198
198
|
group = groups[group_id]
|
|
@@ -206,8 +206,8 @@ class ApplyRule(Task):
|
|
|
206
206
|
)
|
|
207
207
|
|
|
208
208
|
# filter out any expressions which are duplicates (i.e. they've been previously computed)
|
|
209
|
-
new_expressions = [expr for expr in new_expressions if expr.
|
|
210
|
-
expressions.update({expr.
|
|
209
|
+
new_expressions = [expr for expr in new_expressions if expr.expr_id not in expressions]
|
|
210
|
+
expressions.update({expr.expr_id: expr for expr in new_expressions})
|
|
211
211
|
|
|
212
212
|
# add all new groups to the groups mapping
|
|
213
213
|
for group in new_groups:
|
|
@@ -234,11 +234,11 @@ class ApplyRule(Task):
|
|
|
234
234
|
else:
|
|
235
235
|
# apply implementation rule
|
|
236
236
|
new_expressions = self.rule.substitute(self.logical_expression, **physical_op_params)
|
|
237
|
-
new_expressions = [expr for expr in new_expressions if expr.
|
|
237
|
+
new_expressions = [expr for expr in new_expressions if expr.expr_id not in expressions]
|
|
238
238
|
costed_full_op_ids = context['costed_full_op_ids']
|
|
239
239
|
if costed_full_op_ids is not None:
|
|
240
240
|
new_expressions = [expr for expr in new_expressions if expr.operator.get_full_op_id() in costed_full_op_ids]
|
|
241
|
-
expressions.update({expr.
|
|
241
|
+
expressions.update({expr.expr_id: expr for expr in new_expressions})
|
|
242
242
|
group.physical_expressions.update(new_expressions)
|
|
243
243
|
|
|
244
244
|
# create new task
|
|
@@ -412,8 +412,9 @@ class OptimizePhysicalExpression(Task):
|
|
|
412
412
|
if context is None:
|
|
413
413
|
context = {}
|
|
414
414
|
|
|
415
|
-
# get the optimizer strategy (type) from the context
|
|
415
|
+
# get the optimizer strategy (type) and the execution strategy (type) from the context
|
|
416
416
|
optimizer_strategy: OptimizationStrategyType = context['optimizer_strategy']
|
|
417
|
+
execution_strategy: ExecutionStrategyType = context['execution_strategy']
|
|
417
418
|
|
|
418
419
|
# return if we've already computed the cost of this physical expression
|
|
419
420
|
if optimizer_strategy.is_pareto() and self.physical_expression.pareto_optimal_plan_costs is not None:
|
|
@@ -422,57 +423,90 @@ class OptimizePhysicalExpression(Task):
|
|
|
422
423
|
if optimizer_strategy.is_not_pareto() and self.physical_expression.plan_cost is not None:
|
|
423
424
|
return []
|
|
424
425
|
|
|
425
|
-
# for expressions with
|
|
426
|
-
|
|
427
|
-
|
|
426
|
+
# for expressions with input group(s), compute the input plan cost(s)
|
|
427
|
+
best_input_plan_costs = {}
|
|
428
|
+
pareto_optimal_input_plan_costs = {}
|
|
428
429
|
if len(self.physical_expression.input_group_ids) > 0:
|
|
429
|
-
# get the input group
|
|
430
|
-
input_group_id = self.physical_expression.input_group_ids[0] # TODO: need to handle joins
|
|
431
|
-
input_group = groups[input_group_id]
|
|
432
|
-
|
|
433
|
-
# compute the input plan cost or list of input plan costs
|
|
434
430
|
new_tasks = []
|
|
435
|
-
|
|
436
|
-
#
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
431
|
+
for input_group_id in self.physical_expression.input_group_ids:
|
|
432
|
+
# get the input group
|
|
433
|
+
input_group = groups[input_group_id]
|
|
434
|
+
|
|
435
|
+
# compute the input plan cost or list of input plan costs
|
|
436
|
+
if optimizer_strategy.is_not_pareto() and input_group.best_physical_expression is not None:
|
|
437
|
+
# TODO: apply policy constraint here
|
|
438
|
+
best_input_plan_costs[input_group_id] = input_group.best_physical_expression.plan_cost
|
|
439
|
+
|
|
440
|
+
elif optimizer_strategy.is_pareto() and input_group.pareto_optimal_physical_expressions is not None:
|
|
441
|
+
# TODO: apply policy constraint here
|
|
442
|
+
input_plan_costs = []
|
|
443
|
+
for pareto_physical_expression in input_group.pareto_optimal_physical_expressions:
|
|
444
|
+
plan_costs = list(map(lambda tup: tup[0], pareto_physical_expression.pareto_optimal_plan_costs))
|
|
445
|
+
input_plan_costs.extend(plan_costs)
|
|
446
|
+
|
|
447
|
+
# NOTE: this list will not necessarily be pareto-optimal, as a plan cost on the pareto frontier of
|
|
448
|
+
# one pareto_optimal_physical_expression might be dominated by the plan cost on another physical
|
|
449
|
+
# expression's pareto frontier; we handle this below by taking the pareto frontier of all_possible_plan_costs
|
|
450
|
+
# de-duplicate equivalent plan costs; we will still reconstruct plans with equivalent cost in optimizer.py
|
|
451
|
+
pareto_optimal_input_plan_costs[input_group_id] = list(set(input_plan_costs))
|
|
452
|
+
|
|
453
|
+
else:
|
|
454
|
+
task = OptimizeGroup(input_group_id)
|
|
455
|
+
new_tasks.append(task)
|
|
455
456
|
|
|
456
457
|
# if not all input groups have been costed, we need to compute these first and then retry this task
|
|
457
458
|
if len(new_tasks) > 0:
|
|
458
459
|
return [self] + new_tasks
|
|
459
460
|
|
|
461
|
+
# once all input groups have been costed, compute the cost of this physical expression
|
|
460
462
|
group = groups[self.physical_expression.group_id]
|
|
461
463
|
if optimizer_strategy.is_pareto():
|
|
462
464
|
# compute all possible plan costs for this physical expression given the pareto optimal input plan costs
|
|
463
465
|
all_possible_plan_costs = []
|
|
464
|
-
|
|
465
|
-
|
|
466
|
+
if isinstance(self.physical_expression.operator, JoinOp):
|
|
467
|
+
assert len(self.physical_expression.input_group_ids) == 2, "Join operator must have exactly two input groups."
|
|
468
|
+
|
|
469
|
+
# get the best input plan costs for both inputs
|
|
470
|
+
left_input_group_id, right_input_group_id = self.physical_expression.input_group_ids
|
|
471
|
+
left_best_input_plan_cost = pareto_optimal_input_plan_costs[left_input_group_id]
|
|
472
|
+
right_best_input_plan_cost = pareto_optimal_input_plan_costs[right_input_group_id]
|
|
473
|
+
for left_input_plan_cost in left_best_input_plan_cost:
|
|
474
|
+
for right_input_plan_cost in right_best_input_plan_cost:
|
|
475
|
+
# compute the cost of this operator given the input plan costs
|
|
476
|
+
op_plan_cost = cost_model(
|
|
477
|
+
self.physical_expression.operator,
|
|
478
|
+
left_input_plan_cost.op_estimates,
|
|
479
|
+
right_input_plan_cost.op_estimates,
|
|
480
|
+
)
|
|
481
|
+
|
|
482
|
+
# compute the total cost for this physical expression by summing its operator's PlanCost
|
|
483
|
+
# with the input groups' total PlanCost; also set the op_estimates for this expression's operator
|
|
484
|
+
execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
|
|
485
|
+
full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy)
|
|
486
|
+
full_plan_cost.op_estimates = op_plan_cost.op_estimates
|
|
487
|
+
all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))
|
|
466
488
|
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
489
|
+
else:
|
|
490
|
+
assert len(self.physical_expression.input_group_ids) < 2, "Non-join operator must have zero or one input groups."
|
|
491
|
+
|
|
492
|
+
input_plan_costs = [PlanCost(cost=0, time=0, quality=1)]
|
|
493
|
+
if len(self.physical_expression.input_group_ids) == 1:
|
|
494
|
+
input_group_id = self.physical_expression.input_group_ids[0]
|
|
495
|
+
input_plan_costs = pareto_optimal_input_plan_costs[input_group_id]
|
|
496
|
+
|
|
497
|
+
# get the pareto-optimal input plan costs for the single input
|
|
498
|
+
for input_plan_cost in input_plan_costs:
|
|
499
|
+
op_plan_cost = cost_model(self.physical_expression.operator, input_plan_cost.op_estimates)
|
|
500
|
+
|
|
501
|
+
# compute the total cost for this physical expression by summing its operator's PlanCost
|
|
502
|
+
# with the input groups' total PlanCost; also set the op_estimates for this expression's operator
|
|
503
|
+
full_plan_cost = op_plan_cost + input_plan_cost
|
|
504
|
+
full_plan_cost.op_estimates = op_plan_cost.op_estimates
|
|
505
|
+
all_possible_plan_costs.append((full_plan_cost, (input_plan_cost, None)))
|
|
472
506
|
|
|
473
507
|
# reduce the set of possible plan costs to the subset which are pareto-optimal
|
|
474
508
|
pareto_optimal_plan_costs = []
|
|
475
|
-
for idx, (plan_cost,
|
|
509
|
+
for idx, (plan_cost, input_plan_cost_tuple) in enumerate(all_possible_plan_costs):
|
|
476
510
|
pareto_optimal = True
|
|
477
511
|
|
|
478
512
|
# check if any other_expr dominates expr
|
|
@@ -487,7 +521,7 @@ class OptimizePhysicalExpression(Task):
|
|
|
487
521
|
|
|
488
522
|
# add expr to pareto frontier if it's not dominated
|
|
489
523
|
if pareto_optimal:
|
|
490
|
-
pareto_optimal_plan_costs.append((plan_cost,
|
|
524
|
+
pareto_optimal_plan_costs.append((plan_cost, input_plan_cost_tuple))
|
|
491
525
|
|
|
492
526
|
# set the pareto frontier of plan costs which can be obtained by this physical expression
|
|
493
527
|
self.physical_expression.pareto_optimal_plan_costs = pareto_optimal_plan_costs
|
|
@@ -496,13 +530,48 @@ class OptimizePhysicalExpression(Task):
|
|
|
496
530
|
group = self.update_pareto_optimal_physical_expressions(group, policy)
|
|
497
531
|
|
|
498
532
|
else:
|
|
499
|
-
# otherwise, compute the cost of this operator given the optimal input plan cost
|
|
500
|
-
op_plan_cost = cost_model(self.physical_expression.operator, best_input_plan_cost.op_estimates)
|
|
501
533
|
|
|
502
|
-
# compute the
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
534
|
+
# otherwise, compute the cost of this operator given the optimal input plan cost(s)
|
|
535
|
+
full_plan_cost = None
|
|
536
|
+
if isinstance(self.physical_expression.operator, JoinOp):
|
|
537
|
+
assert len(self.physical_expression.input_group_ids) == 2, "Join operator must have exactly two input groups."
|
|
538
|
+
|
|
539
|
+
# get the best input plan costs for both inputs
|
|
540
|
+
left_input_group_id, right_input_group_id = self.physical_expression.input_group_ids
|
|
541
|
+
left_best_input_plan_cost = best_input_plan_costs[left_input_group_id]
|
|
542
|
+
right_best_input_plan_cost = best_input_plan_costs[right_input_group_id]
|
|
543
|
+
|
|
544
|
+
# compute the cost of this operator given the best input plan costs
|
|
545
|
+
op_plan_cost = cost_model(
|
|
546
|
+
self.physical_expression.operator,
|
|
547
|
+
left_best_input_plan_cost.op_estimates,
|
|
548
|
+
right_best_input_plan_cost.op_estimates,
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
# compute the total cost for this physical expression by summing its operator's PlanCost
|
|
552
|
+
# with the input groups' total PlanCost; also set the op_estimates for this expression's operator
|
|
553
|
+
execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
|
|
554
|
+
full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy)
|
|
555
|
+
full_plan_cost.op_estimates = op_plan_cost.op_estimates
|
|
556
|
+
|
|
557
|
+
else:
|
|
558
|
+
assert len(self.physical_expression.input_group_ids) < 2, "Non-join operator must have zero or one input groups."
|
|
559
|
+
|
|
560
|
+
# get the best input plan cost for the single input
|
|
561
|
+
best_input_plan_cost = PlanCost(cost=0, time=0, quality=1)
|
|
562
|
+
if len(self.physical_expression.input_group_ids) == 1:
|
|
563
|
+
input_group_id = self.physical_expression.input_group_ids[0]
|
|
564
|
+
best_input_plan_cost = best_input_plan_costs[input_group_id]
|
|
565
|
+
|
|
566
|
+
# compute the cost of this operator given the best input plan cost
|
|
567
|
+
op_plan_cost = cost_model(self.physical_expression.operator, best_input_plan_cost.op_estimates)
|
|
568
|
+
|
|
569
|
+
# compute the total cost for this physical expression by summing its operator's PlanCost
|
|
570
|
+
# with the input groups' total PlanCost; also set the op_estimates for this expression's operator
|
|
571
|
+
full_plan_cost = op_plan_cost + best_input_plan_cost
|
|
572
|
+
full_plan_cost.op_estimates = op_plan_cost.op_estimates
|
|
573
|
+
|
|
574
|
+
# set the plan cost for this physical expression
|
|
506
575
|
self.physical_expression.plan_cost = full_plan_cost
|
|
507
576
|
|
|
508
577
|
# update the best physical expression for the group
|
|
@@ -1,86 +1,51 @@
|
|
|
1
|
-
import
|
|
2
|
-
from dataclasses import dataclass, field
|
|
1
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
3
2
|
|
|
4
3
|
from palimpzest.constants import Model
|
|
5
|
-
from palimpzest.core.data.datareaders import DataReader
|
|
6
4
|
from palimpzest.policy import MaxQuality, Policy
|
|
7
5
|
|
|
8
6
|
|
|
9
|
-
# TODO: Separate out the config for the Optimizer, ExecutionStrategy, and QueryProcessor
|
|
10
7
|
# TODO: Add description for each field.
|
|
11
|
-
|
|
12
|
-
class QueryProcessorConfig:
|
|
8
|
+
class QueryProcessorConfig(BaseModel):
|
|
13
9
|
"""Shared context for query processors"""
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
verbose: bool =
|
|
26
|
-
progress: bool =
|
|
27
|
-
available_models: list[Model] | None =
|
|
28
|
-
|
|
29
|
-
max_workers: int | None =
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
10
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
11
|
+
|
|
12
|
+
# execution and optimization flags
|
|
13
|
+
execution_strategy: str = Field(default="parallel") # substituted with ExecutionStrategyType
|
|
14
|
+
sentinel_execution_strategy: str | None = Field(default="auto") # substituted with SentinelExecutionStrategyType
|
|
15
|
+
optimizer_strategy: str = Field(default="pareto") # substituted with OptimizationStrategyType
|
|
16
|
+
|
|
17
|
+
# general execution flags
|
|
18
|
+
policy: Policy = Field(default_factory=MaxQuality)
|
|
19
|
+
scan_start_idx: int = Field(default=0)
|
|
20
|
+
num_samples: int = Field(default=None)
|
|
21
|
+
verbose: bool = Field(default=False)
|
|
22
|
+
progress: bool = Field(default=True)
|
|
23
|
+
available_models: list[Model] | None = Field(default=None)
|
|
24
|
+
remove_models: list[Model] | None = Field(default=None)
|
|
25
|
+
max_workers: int | None = Field(default=64)
|
|
26
|
+
join_parallelism: int = Field(default=64)
|
|
27
|
+
batch_size: int | None = Field(default=None)
|
|
28
|
+
reasoning_effort: str | None = Field(default=None) # Gemini: "disable", "low", "medium", "high"
|
|
29
|
+
gemini_credentials_path: str | None = Field(default=None) # Path to Gemini credentials file
|
|
30
|
+
api_base: str | None = Field(default=None) # API base URL for vLLM
|
|
31
|
+
|
|
32
|
+
# operator flags
|
|
33
|
+
allow_bonded_query: bool = Field(default=True)
|
|
34
|
+
allow_model_selection: bool = Field(default=True)
|
|
35
|
+
allow_rag_reduction: bool = Field(default=True)
|
|
36
|
+
allow_mixtures: bool = Field(default=True)
|
|
37
|
+
allow_critic: bool = Field(default=True)
|
|
38
|
+
allow_split_merge: bool = Field(default=False)
|
|
39
|
+
use_final_op_quality: bool = Field(default=False)
|
|
40
|
+
|
|
41
|
+
# sentinel optimization flags
|
|
42
|
+
k: int = Field(default=5)
|
|
43
|
+
j: int = Field(default=5)
|
|
44
|
+
sample_budget: int = Field(default=100)
|
|
45
|
+
seed: int = Field(default=42)
|
|
46
|
+
exp_name: str | None = Field(default=None)
|
|
47
|
+
priors: dict | None = Field(default=None)
|
|
41
48
|
|
|
42
49
|
def to_dict(self) -> dict:
|
|
43
50
|
"""Convert the config to a dict representation."""
|
|
44
|
-
return
|
|
45
|
-
"processing_strategy": self.processing_strategy,
|
|
46
|
-
"execution_strategy": self.execution_strategy,
|
|
47
|
-
"sentinel_execution_strategy": self.sentinel_execution_strategy,
|
|
48
|
-
"optimizer_strategy": self.optimizer_strategy,
|
|
49
|
-
"val_datasource": self.val_datasource,
|
|
50
|
-
"policy": self.policy,
|
|
51
|
-
"scan_start_idx": self.scan_start_idx,
|
|
52
|
-
"num_samples": self.num_samples,
|
|
53
|
-
"cache": self.cache,
|
|
54
|
-
"verbose": self.verbose,
|
|
55
|
-
"progress": self.progress,
|
|
56
|
-
"available_models": self.available_models,
|
|
57
|
-
"max_workers": self.max_workers,
|
|
58
|
-
"allow_bonded_query": self.allow_bonded_query,
|
|
59
|
-
"allow_model_selection": self.allow_model_selection,
|
|
60
|
-
"allow_code_synth": self.allow_code_synth,
|
|
61
|
-
"allow_rag_reduction": self.allow_rag_reduction,
|
|
62
|
-
"allow_mixtures": self.allow_mixtures,
|
|
63
|
-
"allow_critic": self.allow_critic,
|
|
64
|
-
"allow_split_merge": self.allow_split_merge,
|
|
65
|
-
"use_final_op_quality": self.use_final_op_quality,
|
|
66
|
-
**self.kwargs,
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
def to_json_str(self):
|
|
70
|
-
"""Convert the config to a JSON string representation."""
|
|
71
|
-
config_dict = self.to_dict()
|
|
72
|
-
config_dict["val_datasource"] = (
|
|
73
|
-
None if self.val_datasource is None else self.val_datasource.serialize()
|
|
74
|
-
)
|
|
75
|
-
config_dict["policy"] = self.policy.to_json_str()
|
|
76
|
-
for strategy in ["processing_strategy", "execution_strategy", "sentinel_execution_strategy", "optimizer_strategy"]:
|
|
77
|
-
config_dict[strategy] = str(config_dict[strategy])
|
|
78
|
-
|
|
79
|
-
return json.dumps(config_dict, indent=2)
|
|
80
|
-
|
|
81
|
-
def update(self, **kwargs) -> None:
|
|
82
|
-
for key, value in kwargs.items():
|
|
83
|
-
if hasattr(self, key):
|
|
84
|
-
setattr(self, key, value)
|
|
85
|
-
|
|
86
|
-
self.kwargs.update(kwargs)
|
|
51
|
+
return self.model_dump()
|
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from abc import abstractmethod
|
|
3
2
|
|
|
4
|
-
from palimpzest.core.data.
|
|
5
|
-
from palimpzest.core.data.datareaders import DataReader
|
|
3
|
+
from palimpzest.core.data.dataset import Dataset
|
|
6
4
|
from palimpzest.core.elements.records import DataRecord, DataRecordCollection
|
|
5
|
+
from palimpzest.core.models import ExecutionStats, PlanStats
|
|
7
6
|
from palimpzest.policy import Policy
|
|
8
7
|
from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
|
|
8
|
+
from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
|
|
9
9
|
from palimpzest.query.optimizer.optimizer import Optimizer
|
|
10
|
-
from palimpzest.
|
|
10
|
+
from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
|
|
11
|
+
from palimpzest.query.optimizer.plan import SentinelPlan
|
|
11
12
|
from palimpzest.utils.hash_helpers import hash_for_id
|
|
12
|
-
from palimpzest.
|
|
13
|
+
from palimpzest.validator.validator import Validator
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
15
16
|
|
|
@@ -27,15 +28,15 @@ class QueryProcessor:
|
|
|
27
28
|
execution_strategy: ExecutionStrategy,
|
|
28
29
|
sentinel_execution_strategy: SentinelExecutionStrategy | None,
|
|
29
30
|
num_samples: int | None = None,
|
|
30
|
-
|
|
31
|
+
train_dataset: dict[str, Dataset] | None = None,
|
|
32
|
+
validator: Validator | None = None,
|
|
31
33
|
scan_start_idx: int = 0,
|
|
32
|
-
cache: bool = False,
|
|
33
34
|
verbose: bool = False,
|
|
34
35
|
progress: bool = True,
|
|
35
36
|
max_workers: int | None = None,
|
|
36
37
|
policy: Policy | None = None,
|
|
37
38
|
available_models: list[str] | None = None,
|
|
38
|
-
**kwargs,
|
|
39
|
+
**kwargs, # needed in order to provide compatibility with QueryProcessorConfig
|
|
39
40
|
):
|
|
40
41
|
"""
|
|
41
42
|
Initialize QueryProcessor with optional custom components.
|
|
@@ -48,20 +49,15 @@ class QueryProcessor:
|
|
|
48
49
|
self.optimizer = optimizer
|
|
49
50
|
self.execution_strategy = execution_strategy
|
|
50
51
|
self.sentinel_execution_strategy = sentinel_execution_strategy
|
|
51
|
-
|
|
52
52
|
self.num_samples = num_samples
|
|
53
|
-
self.
|
|
53
|
+
self.train_dataset = train_dataset
|
|
54
|
+
self.validator = validator
|
|
54
55
|
self.scan_start_idx = scan_start_idx
|
|
55
|
-
self.cache = cache
|
|
56
56
|
self.verbose = verbose
|
|
57
57
|
self.progress = progress
|
|
58
58
|
self.max_workers = max_workers
|
|
59
|
-
|
|
60
59
|
self.policy = policy
|
|
61
|
-
|
|
62
60
|
self.available_models = available_models
|
|
63
|
-
if self.available_models is None or len(self.available_models) == 0:
|
|
64
|
-
self.available_models = get_models(include_vision=True)
|
|
65
61
|
|
|
66
62
|
if self.verbose:
|
|
67
63
|
print("Available models: ", self.available_models)
|
|
@@ -80,6 +76,26 @@ class QueryProcessor:
|
|
|
80
76
|
|
|
81
77
|
return hash_for_id(id_str)
|
|
82
78
|
|
|
79
|
+
def _create_sentinel_plan(self, train_dataset: dict[str, Dataset] | None) -> SentinelPlan:
|
|
80
|
+
"""
|
|
81
|
+
Generates and returns a SentinelPlan for the given dataset.
|
|
82
|
+
"""
|
|
83
|
+
# create a new optimizer and update its strategy to SENTINEL
|
|
84
|
+
optimizer = self.optimizer.deepcopy_clean()
|
|
85
|
+
optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
|
|
86
|
+
|
|
87
|
+
# create copy of dataset, but change its root Dataset(s) to the validation Dataset(s)
|
|
88
|
+
dataset = self.dataset.copy()
|
|
89
|
+
if train_dataset is not None:
|
|
90
|
+
dataset._set_root_datasets(train_dataset)
|
|
91
|
+
dataset._generate_unique_logical_op_ids()
|
|
92
|
+
|
|
93
|
+
# get the sentinel plan for the given dataset
|
|
94
|
+
sentinel_plans = optimizer.optimize(dataset)
|
|
95
|
+
sentinel_plan = sentinel_plans[0]
|
|
96
|
+
|
|
97
|
+
return sentinel_plan
|
|
98
|
+
|
|
83
99
|
def _execute_best_plan(self, dataset: Dataset, optimizer: Optimizer) -> tuple[list[DataRecord], list[PlanStats]]:
|
|
84
100
|
# get the optimal plan according to the optimizer
|
|
85
101
|
plans = optimizer.optimize(dataset)
|
|
@@ -91,7 +107,46 @@ class QueryProcessor:
|
|
|
91
107
|
# return the output records and plan stats
|
|
92
108
|
return records, [plan_stats]
|
|
93
109
|
|
|
94
|
-
# TODO: consider to support dry_run.
|
|
95
|
-
@abstractmethod
|
|
96
110
|
def execute(self) -> DataRecordCollection:
|
|
97
|
-
|
|
111
|
+
logger.info(f"Executing {self.__class__.__name__}")
|
|
112
|
+
|
|
113
|
+
# create execution stats
|
|
114
|
+
execution_stats = ExecutionStats(execution_id=self.execution_id())
|
|
115
|
+
execution_stats.start()
|
|
116
|
+
|
|
117
|
+
# if the user provides a train_dataset or validator, we perform optimization
|
|
118
|
+
if self.train_dataset is not None or self.validator is not None:
|
|
119
|
+
# create sentinel plan
|
|
120
|
+
sentinel_plan = self._create_sentinel_plan(self.train_dataset)
|
|
121
|
+
|
|
122
|
+
# generate sample execution data
|
|
123
|
+
if self.train_dataset is not None:
|
|
124
|
+
sentinel_plan_stats = self.sentinel_execution_strategy.execute_sentinel_plan(sentinel_plan, self.train_dataset, self.validator)
|
|
125
|
+
|
|
126
|
+
else:
|
|
127
|
+
train_dataset = self.dataset._get_root_datasets()
|
|
128
|
+
sentinel_plan_stats = self.sentinel_execution_strategy.execute_sentinel_plan(sentinel_plan, train_dataset, self.validator)
|
|
129
|
+
|
|
130
|
+
# update the execution stats to account for the work done in optimization
|
|
131
|
+
execution_stats.add_plan_stats(sentinel_plan_stats)
|
|
132
|
+
execution_stats.finish_optimization()
|
|
133
|
+
|
|
134
|
+
# (re-)initialize the optimizer
|
|
135
|
+
self.optimizer = self.optimizer.deepcopy_clean()
|
|
136
|
+
|
|
137
|
+
# construct the CostModel with any sample execution data we've gathered
|
|
138
|
+
cost_model = SampleBasedCostModel(sentinel_plan_stats, self.verbose)
|
|
139
|
+
self.optimizer.update_cost_model(cost_model)
|
|
140
|
+
|
|
141
|
+
# execute plan(s) according to the optimization strategy
|
|
142
|
+
records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)
|
|
143
|
+
|
|
144
|
+
# update the execution stats to account for the work to execute the final plan
|
|
145
|
+
execution_stats.add_plan_stats(plan_stats)
|
|
146
|
+
execution_stats.finish()
|
|
147
|
+
|
|
148
|
+
# construct and return the DataRecordCollection
|
|
149
|
+
result = DataRecordCollection(records, execution_stats=execution_stats)
|
|
150
|
+
logger.info(f"Done executing {self.__class__.__name__}")
|
|
151
|
+
|
|
152
|
+
return result
|