PyPI - palimpzest - Versions diffs - 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

palimpzest/__init__.py +37 -6
palimpzest/agents/__init__.py +0 -0
palimpzest/agents/compute_agents.py +0 -0
palimpzest/agents/search_agents.py +637 -0
palimpzest/constants.py +259 -197
palimpzest/core/data/context.py +393 -0
palimpzest/core/data/context_manager.py +163 -0
palimpzest/core/data/dataset.py +634 -0
palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
palimpzest/core/elements/groupbysig.py +16 -13
palimpzest/core/elements/records.py +166 -75
palimpzest/core/lib/schemas.py +152 -390
palimpzest/core/{data/dataclasses.py → models.py} +306 -170
palimpzest/policy.py +2 -27
palimpzest/prompts/__init__.py +35 -5
palimpzest/prompts/agent_prompts.py +357 -0
palimpzest/prompts/context_search.py +9 -0
palimpzest/prompts/convert_prompts.py +61 -5
palimpzest/prompts/filter_prompts.py +50 -5
palimpzest/prompts/join_prompts.py +163 -0
palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
palimpzest/prompts/prompt_factory.py +358 -46
palimpzest/prompts/validator.py +239 -0
palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
palimpzest/query/execution/execution_strategy.py +210 -317
palimpzest/query/execution/execution_strategy_type.py +5 -7
palimpzest/query/execution/mab_execution_strategy.py +249 -136
palimpzest/query/execution/parallel_execution_strategy.py +153 -244
palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
palimpzest/query/generators/generators.py +157 -330
palimpzest/query/operators/__init__.py +15 -5
palimpzest/query/operators/aggregate.py +50 -33
palimpzest/query/operators/compute.py +201 -0
palimpzest/query/operators/convert.py +27 -21
palimpzest/query/operators/critique_and_refine_convert.py +7 -5
palimpzest/query/operators/distinct.py +62 -0
palimpzest/query/operators/filter.py +22 -13
palimpzest/query/operators/join.py +402 -0
palimpzest/query/operators/limit.py +3 -3
palimpzest/query/operators/logical.py +198 -80
palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
palimpzest/query/operators/physical.py +27 -21
palimpzest/query/operators/project.py +3 -3
palimpzest/query/operators/rag_convert.py +7 -7
palimpzest/query/operators/retrieve.py +9 -9
palimpzest/query/operators/scan.py +81 -42
palimpzest/query/operators/search.py +524 -0
palimpzest/query/operators/split_convert.py +10 -8
palimpzest/query/optimizer/__init__.py +7 -9
palimpzest/query/optimizer/cost_model.py +108 -441
palimpzest/query/optimizer/optimizer.py +123 -181
palimpzest/query/optimizer/optimizer_strategy.py +66 -61
palimpzest/query/optimizer/plan.py +352 -67
palimpzest/query/optimizer/primitives.py +43 -19
palimpzest/query/optimizer/rules.py +484 -646
palimpzest/query/optimizer/tasks.py +127 -58
palimpzest/query/processor/config.py +41 -76
palimpzest/query/processor/query_processor.py +73 -18
palimpzest/query/processor/query_processor_factory.py +46 -38
palimpzest/schemabuilder/schema_builder.py +15 -28
palimpzest/utils/model_helpers.py +27 -77
palimpzest/utils/progress.py +114 -102
palimpzest/validator/__init__.py +0 -0
palimpzest/validator/validator.py +306 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
palimpzest-0.8.0.dist-info/RECORD +95 -0
palimpzest/core/lib/fields.py +0 -141
palimpzest/prompts/code_synthesis_prompts.py +0 -28
palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
palimpzest/query/generators/api_client_factory.py +0 -30
palimpzest/query/operators/code_synthesis_convert.py +0 -488
palimpzest/query/operators/map.py +0 -130
palimpzest/query/processor/nosentinel_processor.py +0 -33
palimpzest/query/processor/processing_strategy_type.py +0 -28
palimpzest/query/processor/sentinel_processor.py +0 -88
palimpzest/query/processor/streaming_processor.py +0 -149
palimpzest/sets.py +0 -405
palimpzest/utils/datareader_helpers.py +0 -61
palimpzest/utils/demo_helpers.py +0 -75
palimpzest/utils/field_helpers.py +0 -69
palimpzest/utils/generation_helpers.py +0 -69
palimpzest/utils/sandbox.py +0 -183
palimpzest-0.7.20.dist-info/RECORD +0 -95
palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0

palimpzest/query/optimizer/optimizer_strategy.py CHANGED Viewed

@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
 from palimpzest.policy import Policy
 from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
+from palimpzest.query.optimizer.primitives import Group
 logger = logging.getLogger(__name__)
@@ -15,31 +16,6 @@ class OptimizationStrategy(ABC):
         """Strategy decides how to search through the groups for optimal plan(s)"""
         pass
-    def normalize_final_plans(self, plans: list[PhysicalPlan]) -> list[PhysicalPlan]:
-        """
-        For each plan in `plans`, this function enforces that the input schema of every
-        operator is the output schema of the previous operator in the plan.
-        Args:
-            plans list[PhysicalPlan]: list of physical plans to normalize
-        Returns:
-            list[PhysicalPlan]: list of normalized physical plans
-        """
-        normalized_plans = []
-        for plan in plans:
-            normalized_ops = []
-            for idx, op in enumerate(plan.operators):
-                op_copy = op.copy()
-                if idx == 0:
-                    normalized_ops.append(op_copy)
-                else:
-                    op_copy.input_schema = plan.operators[-1].output_schema
-                    normalized_ops.append(op_copy)
-            normalized_plans.append(PhysicalPlan(operators=normalized_ops, plan_cost=plan.plan_cost))
-        return normalized_plans
 class GreedyStrategy(OptimizationStrategy):
     def _get_greedy_physical_plan(self, groups: dict, group_id: int) -> PhysicalPlan:
@@ -49,17 +25,35 @@ class GreedyStrategy(OptimizationStrategy):
         # get the best physical expression for this group
         best_phys_expr = groups[group_id].best_physical_expression
-        # if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
-        # create and return the physical plan
+        # if this expression has no inputs (i.e. it is a BaseScan), create and return the physical plan
+        best_plan = None
         if len(best_phys_expr.input_group_ids) == 0:
-            return PhysicalPlan(operators=[best_phys_expr.operator], plan_cost=best_phys_expr.plan_cost)
+            best_plan = PhysicalPlan(best_phys_expr.operator, subplans=None, plan_cost=best_phys_expr.plan_cost)
+        # otherwise, if this expression is not a join (i.e. it has one input)
+        elif len(best_phys_expr.input_group_ids) == 1:
+            # get the best physical plan for this group's input
+            input_group_id = best_phys_expr.input_group_ids[0]
+            input_best_phys_plan = self._get_greedy_physical_plan(groups, input_group_id)
+            # add this operator to best physical plan and return
+            best_plan = PhysicalPlan(best_phys_expr.operator, subplans=[input_best_phys_plan], plan_cost=best_phys_expr.plan_cost)
-        # get the best physical plan(s) for this group's inputs
-        input_group_id = best_phys_expr.input_group_ids[0] # TODO: need to handle joins
-        input_best_phys_plan = self._get_greedy_physical_plan(groups, input_group_id)
+        # otherwise, this expression is a join (i.e. it has two inputs)
+        elif len(best_phys_expr.input_group_ids) == 2:
+            left_input_group_id, right_input_group_id = best_phys_expr.input_group_ids
+            # get the best physical plan for the left input
+            left_best_phys_plan = self._get_greedy_physical_plan(groups, left_input_group_id)
+            # get the best physical plan for the right input
+            right_best_phys_plan = self._get_greedy_physical_plan(groups, right_input_group_id)
+            # add this operator to best physical plan and return
+            best_plan = PhysicalPlan(best_phys_expr.operator, subplans=[left_best_phys_plan, right_best_phys_plan], plan_cost=best_phys_expr.plan_cost)
         # add this operator to best physical plan and return
-        return PhysicalPlan.from_ops_and_sub_plan([best_phys_expr.operator], input_best_phys_plan, best_phys_expr.plan_cost)
+        return best_plan
     def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
         logger.info(f"Getting greedy optimal plans for final group id: {final_group_id}")
@@ -85,35 +79,42 @@ class ParetoStrategy(OptimizationStrategy):
         # construct list of pareto optimal plans
         pareto_optimal_plans = []
         for phys_expr in pareto_optimal_phys_exprs:
-            # if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
-            # create and return the physical plan
+            # if this expression has no inputs (i.e. it is a BaseScan), create and return the physical plan
             if len(phys_expr.input_group_ids) == 0:
                 for plan_cost, _ in phys_expr.pareto_optimal_plan_costs:
-                    plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=plan_cost)
+                    plan = PhysicalPlan(phys_expr.operator, subplans=None, plan_cost=plan_cost)
                     pareto_optimal_plans.append(plan)
-            # otherwise, get the pareto optimal physical plan(s) for this group's inputs
-            else:
+            # otherwise, if this expression is not a join (i.e. it has one input)
+            elif len(phys_expr.input_group_ids) == 1:
                 # get the pareto optimal physical plan(s) for this group's inputs
-                input_group_id = phys_expr.input_group_ids[0] # TODO: need to handle joins
+                input_group_id = phys_expr.input_group_ids[0]
                 pareto_optimal_phys_subplans = self._get_candidate_pareto_physical_plans(groups, input_group_id, policy)
                 # iterate over the input subplans and find the one(s) which combine with this physical expression
                 # to make a pareto-optimal plan
-                for plan_cost, input_plan_cost in phys_expr.pareto_optimal_plan_costs:
+                for plan_cost, (input_plan_cost, _) in phys_expr.pareto_optimal_plan_costs:
                     for subplan in pareto_optimal_phys_subplans:
-                        if (
-                            subplan.plan_cost.cost == input_plan_cost.cost
-                            and subplan.plan_cost.time == input_plan_cost.time
-                            and subplan.plan_cost.quality == input_plan_cost.quality
-                        ):
-                            # TODO: The plan_cost gets summed with subplan.plan_cost;
-                            #       am I defining expression.best_plan_cost to be the cost of that operator,
-                            #       and expression.pareto_optimal_plan_costs to be the cost(s) of the subplan including that operator?
-                            #       i.e. are my definitions inconsistent?
-                            plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, plan_cost)
+                        if subplan.plan_cost == input_plan_cost:
+                            plan = PhysicalPlan(phys_expr.operator, subplans=[subplan], plan_cost=plan_cost)
                             pareto_optimal_plans.append(plan)
+            # otherwise, this expression is a join (i.e. it has two inputs)
+            elif len(phys_expr.input_group_ids) == 2:
+                left_input_group_id, right_input_group_id = phys_expr.input_group_ids
+                pareto_optimal_left_subplans = self._get_candidate_pareto_physical_plans(groups, left_input_group_id, policy)
+                pareto_optimal_right_subplans = self._get_candidate_pareto_physical_plans(groups, right_input_group_id, policy)
+                # iterate over the input subplans and find the one(s) which combine with this physical expression
+                # to make a pareto-optimal plan
+                for plan_cost, (left_input_plan_cost, right_input_plan_cost) in phys_expr.pareto_optimal_plan_costs:
+                    for left_subplan in pareto_optimal_left_subplans:
+                        if left_subplan.plan_cost == left_input_plan_cost:
+                            for right_subplan in pareto_optimal_right_subplans:
+                                if right_subplan.plan_cost == right_input_plan_cost:
+                                    plan = PhysicalPlan(phys_expr.operator, subplans=[left_subplan, right_subplan], plan_cost=plan_cost)
+                                    pareto_optimal_plans.append(plan)
         return pareto_optimal_plans
     def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
@@ -142,29 +143,33 @@ class ParetoStrategy(OptimizationStrategy):
 class SentinelStrategy(OptimizationStrategy):
-    def _get_sentinel_plan(self, groups: dict, group_id: int) -> SentinelPlan:
+    def _get_sentinel_plan(self, groups: dict[str, Group], group_id: int) -> SentinelPlan:
         """
         Create and return a SentinelPlan object.
+        NOTE: this strategy is only used to construct a SentinelPlan before performing optimization.
+              Currently, we do not perform any transformation rules when building the groups which
+              are fed into this function. Thus, every physical expression will correspond to the same
+              logical operator and share the same logical_op_id. Eventually we will want to consider
+              multiple logical re-orderings of operators in our SentinelPlan, but for now it is static.
         """
-        # get all the physical expressions for this group
+        # get all the physical expressions for this group as well as their logical_op_id
         phys_exprs = groups[group_id].physical_expressions
         phys_op_set = [expr.operator for expr in phys_exprs]
-        # if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
-        # create and return the physical plan
+        # if this expression has no inputs (i.e. it is a scan operator), create and return the sentinel plan
         best_phys_expr = groups[group_id].best_physical_expression
         if len(best_phys_expr.input_group_ids) == 0:
-            return SentinelPlan(operator_sets=[phys_op_set])
+            return SentinelPlan(operator_set=phys_op_set, subplans=None)
-        # TODO: need to handle joins
-        # get the best physical plan(s) for this group's inputs
-        best_phys_subplan = SentinelPlan(operator_sets=[])
+        # get the subplans
+        subplans = []
         for input_group_id in best_phys_expr.input_group_ids:
-            input_best_phys_plan = self._get_sentinel_plan(groups, input_group_id)
-            best_phys_subplan = SentinelPlan.from_ops_and_sub_plan(best_phys_subplan.operator_sets, input_best_phys_plan)
+            subplan = self._get_sentinel_plan(groups, input_group_id)
+            subplans.append(subplan)
-        # add this operator set to best physical plan and return
-        return SentinelPlan.from_ops_and_sub_plan([phys_op_set], best_phys_subplan)
+        # compose the current physical operator set with its subplans
+        return SentinelPlan(operator_set=phys_op_set, subplans=subplans)
     def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[SentinelPlan]:
         logger.info(f"Getting sentinel optimal plans for final group id: {final_group_id}")

palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl