palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.20.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
|
|
|
5
5
|
|
|
6
6
|
from palimpzest.policy import Policy
|
|
7
7
|
from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
|
|
8
|
+
from palimpzest.query.optimizer.primitives import Group
|
|
8
9
|
|
|
9
10
|
logger = logging.getLogger(__name__)
|
|
10
11
|
|
|
@@ -15,31 +16,6 @@ class OptimizationStrategy(ABC):
|
|
|
15
16
|
"""Strategy decides how to search through the groups for optimal plan(s)"""
|
|
16
17
|
pass
|
|
17
18
|
|
|
18
|
-
def normalize_final_plans(self, plans: list[PhysicalPlan]) -> list[PhysicalPlan]:
|
|
19
|
-
"""
|
|
20
|
-
For each plan in `plans`, this function enforces that the input schema of every
|
|
21
|
-
operator is the output schema of the previous operator in the plan.
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
plans list[PhysicalPlan]: list of physical plans to normalize
|
|
25
|
-
|
|
26
|
-
Returns:
|
|
27
|
-
list[PhysicalPlan]: list of normalized physical plans
|
|
28
|
-
"""
|
|
29
|
-
normalized_plans = []
|
|
30
|
-
for plan in plans:
|
|
31
|
-
normalized_ops = []
|
|
32
|
-
for idx, op in enumerate(plan.operators):
|
|
33
|
-
op_copy = op.copy()
|
|
34
|
-
if idx == 0:
|
|
35
|
-
normalized_ops.append(op_copy)
|
|
36
|
-
else:
|
|
37
|
-
op_copy.input_schema = plan.operators[-1].output_schema
|
|
38
|
-
normalized_ops.append(op_copy)
|
|
39
|
-
normalized_plans.append(PhysicalPlan(operators=normalized_ops, plan_cost=plan.plan_cost))
|
|
40
|
-
|
|
41
|
-
return normalized_plans
|
|
42
|
-
|
|
43
19
|
|
|
44
20
|
class GreedyStrategy(OptimizationStrategy):
|
|
45
21
|
def _get_greedy_physical_plan(self, groups: dict, group_id: int) -> PhysicalPlan:
|
|
@@ -49,17 +25,35 @@ class GreedyStrategy(OptimizationStrategy):
|
|
|
49
25
|
# get the best physical expression for this group
|
|
50
26
|
best_phys_expr = groups[group_id].best_physical_expression
|
|
51
27
|
|
|
52
|
-
# if this expression has no inputs (i.e. it is a BaseScan
|
|
53
|
-
|
|
28
|
+
# if this expression has no inputs (i.e. it is a BaseScan), create and return the physical plan
|
|
29
|
+
best_plan = None
|
|
54
30
|
if len(best_phys_expr.input_group_ids) == 0:
|
|
55
|
-
|
|
31
|
+
best_plan = PhysicalPlan(best_phys_expr.operator, subplans=None, plan_cost=best_phys_expr.plan_cost)
|
|
32
|
+
|
|
33
|
+
# otherwise, if this expression is not a join (i.e. it has one input)
|
|
34
|
+
elif len(best_phys_expr.input_group_ids) == 1:
|
|
35
|
+
# get the best physical plan for this group's input
|
|
36
|
+
input_group_id = best_phys_expr.input_group_ids[0]
|
|
37
|
+
input_best_phys_plan = self._get_greedy_physical_plan(groups, input_group_id)
|
|
38
|
+
|
|
39
|
+
# add this operator to best physical plan and return
|
|
40
|
+
best_plan = PhysicalPlan(best_phys_expr.operator, subplans=[input_best_phys_plan], plan_cost=best_phys_expr.plan_cost)
|
|
56
41
|
|
|
57
|
-
#
|
|
58
|
-
|
|
59
|
-
|
|
42
|
+
# otherwise, this expression is a join (i.e. it has two inputs)
|
|
43
|
+
elif len(best_phys_expr.input_group_ids) == 2:
|
|
44
|
+
left_input_group_id, right_input_group_id = best_phys_expr.input_group_ids
|
|
45
|
+
|
|
46
|
+
# get the best physical plan for the left input
|
|
47
|
+
left_best_phys_plan = self._get_greedy_physical_plan(groups, left_input_group_id)
|
|
48
|
+
|
|
49
|
+
# get the best physical plan for the right input
|
|
50
|
+
right_best_phys_plan = self._get_greedy_physical_plan(groups, right_input_group_id)
|
|
51
|
+
|
|
52
|
+
# add this operator to best physical plan and return
|
|
53
|
+
best_plan = PhysicalPlan(best_phys_expr.operator, subplans=[left_best_phys_plan, right_best_phys_plan], plan_cost=best_phys_expr.plan_cost)
|
|
60
54
|
|
|
61
55
|
# add this operator to best physical plan and return
|
|
62
|
-
return
|
|
56
|
+
return best_plan
|
|
63
57
|
|
|
64
58
|
def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
|
|
65
59
|
logger.info(f"Getting greedy optimal plans for final group id: {final_group_id}")
|
|
@@ -85,35 +79,42 @@ class ParetoStrategy(OptimizationStrategy):
|
|
|
85
79
|
# construct list of pareto optimal plans
|
|
86
80
|
pareto_optimal_plans = []
|
|
87
81
|
for phys_expr in pareto_optimal_phys_exprs:
|
|
88
|
-
# if this expression has no inputs (i.e. it is a BaseScan
|
|
89
|
-
# create and return the physical plan
|
|
82
|
+
# if this expression has no inputs (i.e. it is a BaseScan), create and return the physical plan
|
|
90
83
|
if len(phys_expr.input_group_ids) == 0:
|
|
91
84
|
for plan_cost, _ in phys_expr.pareto_optimal_plan_costs:
|
|
92
|
-
plan = PhysicalPlan(
|
|
85
|
+
plan = PhysicalPlan(phys_expr.operator, subplans=None, plan_cost=plan_cost)
|
|
93
86
|
pareto_optimal_plans.append(plan)
|
|
94
87
|
|
|
95
|
-
# otherwise,
|
|
96
|
-
|
|
88
|
+
# otherwise, if this expression is not a join (i.e. it has one input)
|
|
89
|
+
elif len(phys_expr.input_group_ids) == 1:
|
|
97
90
|
# get the pareto optimal physical plan(s) for this group's inputs
|
|
98
|
-
input_group_id = phys_expr.input_group_ids[0]
|
|
91
|
+
input_group_id = phys_expr.input_group_ids[0]
|
|
99
92
|
pareto_optimal_phys_subplans = self._get_candidate_pareto_physical_plans(groups, input_group_id, policy)
|
|
100
93
|
|
|
101
94
|
# iterate over the input subplans and find the one(s) which combine with this physical expression
|
|
102
95
|
# to make a pareto-optimal plan
|
|
103
|
-
for plan_cost, input_plan_cost in phys_expr.pareto_optimal_plan_costs:
|
|
96
|
+
for plan_cost, (input_plan_cost, _) in phys_expr.pareto_optimal_plan_costs:
|
|
104
97
|
for subplan in pareto_optimal_phys_subplans:
|
|
105
|
-
if
|
|
106
|
-
|
|
107
|
-
and subplan.plan_cost.time == input_plan_cost.time
|
|
108
|
-
and subplan.plan_cost.quality == input_plan_cost.quality
|
|
109
|
-
):
|
|
110
|
-
# TODO: The plan_cost gets summed with subplan.plan_cost;
|
|
111
|
-
# am I defining expression.best_plan_cost to be the cost of that operator,
|
|
112
|
-
# and expression.pareto_optimal_plan_costs to be the cost(s) of the subplan including that operator?
|
|
113
|
-
# i.e. are my definitions inconsistent?
|
|
114
|
-
plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, plan_cost)
|
|
98
|
+
if subplan.plan_cost == input_plan_cost:
|
|
99
|
+
plan = PhysicalPlan(phys_expr.operator, subplans=[subplan], plan_cost=plan_cost)
|
|
115
100
|
pareto_optimal_plans.append(plan)
|
|
116
101
|
|
|
102
|
+
# otherwise, this expression is a join (i.e. it has two inputs)
|
|
103
|
+
elif len(phys_expr.input_group_ids) == 2:
|
|
104
|
+
left_input_group_id, right_input_group_id = phys_expr.input_group_ids
|
|
105
|
+
pareto_optimal_left_subplans = self._get_candidate_pareto_physical_plans(groups, left_input_group_id, policy)
|
|
106
|
+
pareto_optimal_right_subplans = self._get_candidate_pareto_physical_plans(groups, right_input_group_id, policy)
|
|
107
|
+
|
|
108
|
+
# iterate over the input subplans and find the one(s) which combine with this physical expression
|
|
109
|
+
# to make a pareto-optimal plan
|
|
110
|
+
for plan_cost, (left_input_plan_cost, right_input_plan_cost) in phys_expr.pareto_optimal_plan_costs:
|
|
111
|
+
for left_subplan in pareto_optimal_left_subplans:
|
|
112
|
+
if left_subplan.plan_cost == left_input_plan_cost:
|
|
113
|
+
for right_subplan in pareto_optimal_right_subplans:
|
|
114
|
+
if right_subplan.plan_cost == right_input_plan_cost:
|
|
115
|
+
plan = PhysicalPlan(phys_expr.operator, subplans=[left_subplan, right_subplan], plan_cost=plan_cost)
|
|
116
|
+
pareto_optimal_plans.append(plan)
|
|
117
|
+
|
|
117
118
|
return pareto_optimal_plans
|
|
118
119
|
|
|
119
120
|
def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
|
|
@@ -142,29 +143,33 @@ class ParetoStrategy(OptimizationStrategy):
|
|
|
142
143
|
|
|
143
144
|
|
|
144
145
|
class SentinelStrategy(OptimizationStrategy):
|
|
145
|
-
def _get_sentinel_plan(self, groups: dict, group_id: int) -> SentinelPlan:
|
|
146
|
+
def _get_sentinel_plan(self, groups: dict[str, Group], group_id: int) -> SentinelPlan:
|
|
146
147
|
"""
|
|
147
148
|
Create and return a SentinelPlan object.
|
|
149
|
+
|
|
150
|
+
NOTE: this strategy is only used to construct a SentinelPlan before performing optimization.
|
|
151
|
+
Currently, we do not perform any transformation rules when building the groups which
|
|
152
|
+
are fed into this function. Thus, every physical expression will correspond to the same
|
|
153
|
+
logical operator and share the same logical_op_id. Eventually we will want to consider
|
|
154
|
+
multiple logical re-orderings of operators in our SentinelPlan, but for now it is static.
|
|
148
155
|
"""
|
|
149
|
-
# get all the physical expressions for this group
|
|
156
|
+
# get all the physical expressions for this group as well as their logical_op_id
|
|
150
157
|
phys_exprs = groups[group_id].physical_expressions
|
|
151
158
|
phys_op_set = [expr.operator for expr in phys_exprs]
|
|
152
159
|
|
|
153
|
-
# if this expression has no inputs (i.e. it is a
|
|
154
|
-
# create and return the physical plan
|
|
160
|
+
# if this expression has no inputs (i.e. it is a scan operator), create and return the sentinel plan
|
|
155
161
|
best_phys_expr = groups[group_id].best_physical_expression
|
|
156
162
|
if len(best_phys_expr.input_group_ids) == 0:
|
|
157
|
-
return SentinelPlan(
|
|
163
|
+
return SentinelPlan(operator_set=phys_op_set, subplans=None)
|
|
158
164
|
|
|
159
|
-
#
|
|
160
|
-
|
|
161
|
-
best_phys_subplan = SentinelPlan(operator_sets=[])
|
|
165
|
+
# get the subplans
|
|
166
|
+
subplans = []
|
|
162
167
|
for input_group_id in best_phys_expr.input_group_ids:
|
|
163
|
-
|
|
164
|
-
|
|
168
|
+
subplan = self._get_sentinel_plan(groups, input_group_id)
|
|
169
|
+
subplans.append(subplan)
|
|
165
170
|
|
|
166
|
-
#
|
|
167
|
-
return SentinelPlan
|
|
171
|
+
# compose the current physical operator set with its subplans
|
|
172
|
+
return SentinelPlan(operator_set=phys_op_set, subplans=subplans)
|
|
168
173
|
|
|
169
174
|
def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[SentinelPlan]:
|
|
170
175
|
logger.info(f"Getting sentinel optimal plans for final group id: {final_group_id}")
|