PyPI - palimpzest - Versions diffs - 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl - Mend

palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

palimpzest/__init__.py +5 -0
palimpzest/constants.py +110 -43
palimpzest/core/__init__.py +0 -78
palimpzest/core/data/dataclasses.py +382 -44
palimpzest/core/elements/filters.py +7 -3
palimpzest/core/elements/index.py +70 -0
palimpzest/core/elements/records.py +33 -11
palimpzest/core/lib/fields.py +1 -0
palimpzest/core/lib/schemas.py +4 -3
palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
palimpzest/prompts/prompt_factory.py +44 -7
palimpzest/prompts/split_merge_prompts.py +56 -0
palimpzest/prompts/split_proposer_prompts.py +55 -0
palimpzest/query/execution/execution_strategy.py +435 -53
palimpzest/query/execution/execution_strategy_type.py +20 -0
palimpzest/query/execution/mab_execution_strategy.py +532 -0
palimpzest/query/execution/parallel_execution_strategy.py +143 -172
palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
palimpzest/query/generators/api_client_factory.py +31 -0
palimpzest/query/generators/generators.py +256 -76
palimpzest/query/operators/__init__.py +1 -2
palimpzest/query/operators/code_synthesis_convert.py +33 -18
palimpzest/query/operators/convert.py +30 -97
palimpzest/query/operators/critique_and_refine_convert.py +5 -6
palimpzest/query/operators/filter.py +7 -10
palimpzest/query/operators/logical.py +54 -10
palimpzest/query/operators/map.py +130 -0
palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
palimpzest/query/operators/physical.py +3 -12
palimpzest/query/operators/rag_convert.py +66 -18
palimpzest/query/operators/retrieve.py +230 -34
palimpzest/query/operators/scan.py +5 -2
palimpzest/query/operators/split_convert.py +169 -0
palimpzest/query/operators/token_reduction_convert.py +8 -14
palimpzest/query/optimizer/__init__.py +4 -16
palimpzest/query/optimizer/cost_model.py +73 -266
palimpzest/query/optimizer/optimizer.py +87 -58
palimpzest/query/optimizer/optimizer_strategy.py +18 -97
palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/primitives.py +5 -3
palimpzest/query/optimizer/rules.py +336 -172
palimpzest/query/optimizer/tasks.py +30 -100
palimpzest/query/processor/config.py +38 -22
palimpzest/query/processor/nosentinel_processor.py +16 -520
palimpzest/query/processor/processing_strategy_type.py +28 -0
palimpzest/query/processor/query_processor.py +38 -206
palimpzest/query/processor/query_processor_factory.py +117 -130
palimpzest/query/processor/sentinel_processor.py +90 -0
palimpzest/query/processor/streaming_processor.py +25 -32
palimpzest/sets.py +88 -41
palimpzest/utils/model_helpers.py +8 -7
palimpzest/utils/progress.py +368 -152
palimpzest/utils/token_reduction_helpers.py +1 -3
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
palimpzest-0.7.1.dist-info/RECORD +96 -0
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
palimpzest/query/processor/mab_sentinel_processor.py +0 -884
palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
palimpzest/utils/index_helpers.py +0 -6
palimpzest-0.6.4.dist-info/RECORD +0 -87
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0

palimpzest/query/optimizer/tasks.py CHANGED Viewed

@@ -1,14 +1,16 @@
 from __future__ import annotations
+import logging
 from typing import Any
 from palimpzest.core.data.dataclasses import PlanCost
 from palimpzest.policy import Policy
 from palimpzest.query.optimizer.cost_model import BaseCostModel
-from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
+from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
 from palimpzest.query.optimizer.primitives import Expression, Group
 from palimpzest.query.optimizer.rules import ImplementationRule, Rule, TransformationRule
+logger = logging.getLogger(__name__)
 class Task:
     """
@@ -41,6 +43,7 @@ class OptimizeGroup(Task):
         self.group_id = group_id
     def perform(self, groups: dict[int, Group], context: dict[str, Any] | None = None) -> list[Task]:
+        logger.debug(f"Optimizing group {self.group_id}")
         # get updated instance of the group to be optimized
         if context is None:
             context = {}
@@ -61,6 +64,8 @@ class OptimizeGroup(Task):
             task = OptimizePhysicalExpression(physical_expr)
             new_tasks.append(task)
+        logger.debug(f"Done optimizing group {self.group_id}")
+        logger.debug(f"New tasks: {len(new_tasks)}")
         return new_tasks
@@ -76,6 +81,8 @@ class ExpandGroup(Task):
         self.group_id = group_id
     def perform(self, groups: dict[int, Group], context: dict[str, Any] | None = None) -> list[Task]:
+        logger.debug(f"Expanding group {self.group_id}")
         # fetch group
         if context is None:
             context = {}
@@ -94,6 +101,8 @@ class ExpandGroup(Task):
         # mark the group as explored and return tasks
         group.set_explored()
+        logger.debug(f"Done expanding group {self.group_id}")
+        logger.debug(f"New tasks: {len(new_tasks)}")
         return new_tasks
@@ -115,6 +124,7 @@ class OptimizeLogicalExpression(Task):
         implementation_rules: list[ImplementationRule],
         context: dict[str, Any] | None = None,
     ) -> list[Task]:
+        logger.debug(f"Optimizing logical expression {self.logical_expression}")
         # if we're exploring, only apply transformation rules
         if context is None:
             context = {}
@@ -135,6 +145,8 @@ class OptimizeLogicalExpression(Task):
             apply_rule_task = ApplyRule(rule, self.logical_expression, self.exploring)
             new_tasks.append(apply_rule_task)
+        logger.debug(f"Done optimizing logical expression {self.logical_expression}")
+        logger.debug(f"New tasks: {len(new_tasks)}")
         return new_tasks
@@ -170,6 +182,8 @@ class ApplyRule(Task):
         context: dict[str, Any] | None = None,
         **physical_op_params,
     ) -> tuple[list[Task], int]:
+        logger.debug(f"Applying rule {self.rule} to logical expression {self.logical_expression}")
         # check if rule has already been applied to this logical expression; return [] if so
         if context is None:
             context = {}
@@ -235,6 +249,8 @@ class ApplyRule(Task):
         # mark that the rule has been applied to the logical expression
         self.logical_expression.add_applied_rule(self.rule)
+        logger.debug(f"Done applying rule {self.rule} to logical expression {self.logical_expression}")
+        logger.debug(f"New tasks: {len(new_tasks)}")
         return new_tasks
@@ -244,8 +260,8 @@ class OptimizePhysicalExpression(Task):
     This task computes the cost of input groups for the given physical expression (scheduling
     OptimizeGroup tasks if needed), computes the cost of the given expression, and then updates
-    the expression's group depending on whether this expression is its best_physical_expression
-    or in its ci_best_physical_expressions.
+    the expression's group depending on whether this expression is its `best_physical_expression`
+    or in its `pareto_optimal_physical_expressions`.
     """
     def __init__(self, physical_expression: Expression, exploring: bool = False):
@@ -384,74 +400,6 @@ class OptimizePhysicalExpression(Task):
         return group
-    def update_ci_best_physical_expressions(self, group: Group, policy: Policy) -> Group:
-        """
-        Update the CI best physical expressions for the given group and policy (if necessary).
-        """
-        # get the primary metric for the policy
-        policy_metric = policy.get_primary_metric()
-        # get the PlanCost for this physical expression
-        expr_plan_cost = self.physical_expression.plan_cost
-        # pre-compute whether or not this physical expression satisfies the policy constraint
-        expr_satisfies_constraint = policy.constraint(expr_plan_cost)
-        # attribute names for lower and upper bounds
-        lower_bound = f"{policy_metric}_lower_bound"
-        upper_bound = f"{policy_metric}_upper_bound"
-        # get the expression and plan's upper and lower bounds on the metric of interest
-        expr_lower_bound = getattr(expr_plan_cost, lower_bound)
-        expr_upper_bound = getattr(expr_plan_cost, upper_bound)
-        group_lower_bound = getattr(group, lower_bound)
-        # if either of the following is true:
-        # 1) the CI best physical expressions are empty
-        # 2) the group does not satisfy the constrant but this physical expression does
-        # set the CI best physical expressions to be this expression
-        if (
-            group.ci_best_physical_expressions == []
-            or (not group.satisfies_constraint and expr_satisfies_constraint)
-        ):
-            group.ci_best_physical_expressions = [self.physical_expression]
-            group.satisfies_constraint = expr_satisfies_constraint
-            setattr(group, lower_bound, expr_lower_bound)
-            setattr(group, upper_bound, expr_upper_bound)
-        # otherwise, if this expression and the group both satisfy the constraint (or both do not satisfy the constraint),
-        # then update the CI best physical expressions if this expression also has an upper bound on the policy metric
-        # above the group's lower bound on the policy metric
-        elif (
-            (group.satisfies_constraint == expr_satisfies_constraint)
-            and expr_upper_bound > group_lower_bound
-        ):
-            # filter out any current best expressions whose upper bound is below the lower bound of this expression
-            group.ci_best_physical_expressions = [
-                curr_expr
-                for curr_expr in group.ci_best_physical_expressions
-                if not getattr(curr_expr, upper_bound) < expr_lower_bound
-            ]
-            # add this expression to the CI best physical expressions
-            group.ci_best_physical_expressions.append(self.physical_expression)
-            # compute the upper and lower bounds for the group
-            new_group_upper_bound = max(
-                map(lambda expr: getattr(expr, upper_bound), group.ci_best_physical_expressions)
-            )
-            new_group_lower_bound = max(
-                map(lambda expr: getattr(expr, lower_bound), group.ci_best_physical_expressions)
-            )
-            # set the new upper and lower bounds for the group
-            setattr(group, lower_bound, new_group_lower_bound)
-            setattr(group, upper_bound, new_group_upper_bound)
-        return group
     def perform(
         self,
         cost_model: BaseCostModel,
@@ -459,20 +407,19 @@ class OptimizePhysicalExpression(Task):
         policy: Policy,
         context: dict[str, Any] | None = None,
     ) -> list[Task]:
+        logger.debug(f"Optimizing physical expression {self.physical_expression}")
         if context is None:
             context = {}
+        # get the optimizer strategy (type) from the context
+        optimizer_strategy: OptimizationStrategyType = context['optimizer_strategy']
         # return if we've already computed the cost of this physical expression
-        if (  # noqa: SIM114
-            context['optimization_strategy_type'] in [OptimizationStrategyType.GREEDY, OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE]
-            and self.physical_expression.plan_cost is not None
-        ):
+        if optimizer_strategy.is_pareto() and self.physical_expression.pareto_optimal_plan_costs is not None:
             return []
-        elif (
-            context['optimization_strategy_type'] == OptimizationStrategyType.PARETO
-            and self.physical_expression.pareto_optimal_plan_costs is not None
-        ):
+        if optimizer_strategy.is_not_pareto() and self.physical_expression.plan_cost is not None:
             return []
         # for expressions with an input group, compute the input plan cost(s)
@@ -485,24 +432,11 @@ class OptimizePhysicalExpression(Task):
             # compute the input plan cost or list of input plan costs
             new_tasks = []
-            if (
-                context['optimization_strategy_type'] in [OptimizationStrategyType.GREEDY, OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE]
-                and input_group.best_physical_expression is not None
-            ):
+            if optimizer_strategy.is_not_pareto() and input_group.best_physical_expression is not None:
                 # TODO: apply policy constraint here
                 best_input_plan_cost = input_group.best_physical_expression.plan_cost
-            elif (
-                context['optimization_strategy_type'] == OptimizationStrategyType.CONFIDENCE_INTERVAL
-                and input_group.ci_best_physical_expressions is not None
-            ):
-                # TODO: fix this to properly compute set of potential input plan costs
-                raise Exception("NotImplementedError")
-            elif (
-                context['optimization_strategy_type'] == OptimizationStrategyType.PARETO
-                and input_group.pareto_optimal_physical_expressions is not None
-            ):
+            elif optimizer_strategy.is_pareto() and input_group.pareto_optimal_physical_expressions is not None:
                 # TODO: apply policy constraint here
                 input_plan_costs = []
                 for pareto_physical_expression in input_group.pareto_optimal_physical_expressions:
@@ -524,12 +458,7 @@ class OptimizePhysicalExpression(Task):
                 return [self] + new_tasks
         group = groups[self.physical_expression.group_id]
-        if context['optimization_strategy_type'] == OptimizationStrategyType.CONFIDENCE_INTERVAL:
-            # TODO: fix this to properly compute and update set of possible plan costs
-            raise Exception("NotImplementedError")
-            group = self.update_ci_best_physical_expressions(group, policy)
-        elif context['optimization_strategy_type'] == OptimizationStrategyType.PARETO:
+        if optimizer_strategy.is_pareto():
             # compute all possible plan costs for this physical expression given the pareto optimal input plan costs
             all_possible_plan_costs = []
             for input_plan_cost in input_plan_costs:
@@ -583,4 +512,5 @@ class OptimizePhysicalExpression(Task):
         group.optimized = True
         groups[self.physical_expression.group_id] = group
+        logger.debug(f"Done optimizing physical expression {self.physical_expression}")
         return []

palimpzest/query/processor/config.py CHANGED Viewed

@@ -11,62 +11,78 @@ from palimpzest.policy import MaxQuality, Policy
 @dataclass
 class QueryProcessorConfig:
     """Shared context for query processors"""
-    processing_strategy: str = field(default="no_sentinel")
-    execution_strategy: str = field(default="sequential")
-    optimizer_strategy: str = field(default="pareto")
+    processing_strategy: str = field(default="auto")                 # substituted with ProcessingStrategyType
+    execution_strategy: str = field(default="sequential")            # substituted with ExecutionStrategyType
+    sentinel_execution_strategy: str | None = field(default="auto")  # substituted with SentinelExecutionStrategyType
+    optimizer_strategy: str = field(default="pareto")                # substituted with OptimizationStrategyType
     val_datasource: DataReader | None = field(default=None)
     policy: Policy = field(default_factory=MaxQuality)
     scan_start_idx: int = field(default=0)
-    num_samples: int = field(default=float("inf"))
-    nocache: bool = field(default=True)  # NOTE: until we properly implement caching, let's set the default to True
-    include_baselines: bool = field(default=False)
-    min_plans: int | None = field(default=None)
+    num_samples: int = field(default=None)
+    cache: bool = field(default=False)  # NOTE: until we properly implement caching, let's set the default to False
     verbose: bool = field(default=False)
+    progress: bool = field(default=True)
     available_models: list[Model] | None = field(default=None)
     max_workers: int | None = field(default=None)
-    num_workers_per_plan: int = field(default=1)
     allow_bonded_query: bool = field(default=True)
-    allow_conventional_query: bool = field(default=False)
     allow_model_selection: bool = field(default=True)
     allow_code_synth: bool = field(default=False)
     allow_token_reduction: bool = field(default=False)
-    allow_rag_reduction: bool = field(default=False)
+    allow_rag_reduction: bool = field(default=True)
     allow_mixtures: bool = field(default=True)
-    allow_critic: bool = field(default=False)
+    allow_critic: bool = field(default=True)
+    allow_split_merge: bool = field(default=False)
     use_final_op_quality: bool = field(default=False)
-    def to_json_str(self):
-        return json.dumps({
+    kwargs: dict = field(default_factory=dict)
+    def to_dict(self) -> dict:
+        """Convert the config to a dict representation."""
+        return {
             "processing_strategy": self.processing_strategy,
             "execution_strategy": self.execution_strategy,
+            "sentinel_execution_strategy": self.sentinel_execution_strategy,
             "optimizer_strategy": self.optimizer_strategy,
-            "val_datasource": None if self.val_datasource is None else self.val_datasource.serialize(),
-            "policy": self.policy.to_json_str(),
+            "val_datasource": self.val_datasource,
+            "policy": self.policy,
             "scan_start_idx": self.scan_start_idx,
             "num_samples": self.num_samples,
-            "nocache": self.nocache,
-            "include_baselines": self.include_baselines,
-            "min_plans": self.min_plans,
+            "cache": self.cache,
             "verbose": self.verbose,
+            "progress": self.progress,
             "available_models": self.available_models,
             "max_workers": self.max_workers,
-            "num_workers_per_plan": self.num_workers_per_plan,
             "allow_bonded_query": self.allow_bonded_query,
-            "allow_conventional_query": self.allow_conventional_query,
             "allow_model_selection": self.allow_model_selection,
             "allow_code_synth": self.allow_code_synth,
             "allow_token_reduction": self.allow_token_reduction,
             "allow_rag_reduction": self.allow_rag_reduction,
             "allow_mixtures": self.allow_mixtures,
             "allow_critic": self.allow_critic,
+            "allow_split_merge": self.allow_split_merge,
             "use_final_op_quality": self.use_final_op_quality,
-        }, indent=2)
+            **self.kwargs,
+        }
+    def to_json_str(self):
+        """Convert the config to a JSON string representation."""
+        config_dict = self.to_dict()
+        config_dict["val_datasource"] = (
+            None if self.val_datasource is None else self.val_datasource.serialize()
+        )
+        config_dict["policy"] = self.policy.to_json_str()
+        for strategy in ["processing_strategy", "execution_strategy", "sentinel_execution_strategy", "optimizer_strategy"]:
+            config_dict[strategy] = str(config_dict[strategy])
+        return json.dumps(config_dict, indent=2)
     def update(self, **kwargs) -> None:
         for key, value in kwargs.items():
             if hasattr(self, key):
                 setattr(self, key, value)
+        self.kwargs.update(kwargs)

palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl