PyPI - palimpzest - Versions diffs - 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl - Mend

palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

palimpzest/__init__.py +5 -0
palimpzest/constants.py +110 -43
palimpzest/core/__init__.py +0 -78
palimpzest/core/data/dataclasses.py +382 -44
palimpzest/core/elements/filters.py +7 -3
palimpzest/core/elements/index.py +70 -0
palimpzest/core/elements/records.py +33 -11
palimpzest/core/lib/fields.py +1 -0
palimpzest/core/lib/schemas.py +4 -3
palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
palimpzest/prompts/prompt_factory.py +44 -7
palimpzest/prompts/split_merge_prompts.py +56 -0
palimpzest/prompts/split_proposer_prompts.py +55 -0
palimpzest/query/execution/execution_strategy.py +435 -53
palimpzest/query/execution/execution_strategy_type.py +20 -0
palimpzest/query/execution/mab_execution_strategy.py +532 -0
palimpzest/query/execution/parallel_execution_strategy.py +143 -172
palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
palimpzest/query/generators/api_client_factory.py +31 -0
palimpzest/query/generators/generators.py +256 -76
palimpzest/query/operators/__init__.py +1 -2
palimpzest/query/operators/code_synthesis_convert.py +33 -18
palimpzest/query/operators/convert.py +30 -97
palimpzest/query/operators/critique_and_refine_convert.py +5 -6
palimpzest/query/operators/filter.py +7 -10
palimpzest/query/operators/logical.py +54 -10
palimpzest/query/operators/map.py +130 -0
palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
palimpzest/query/operators/physical.py +3 -12
palimpzest/query/operators/rag_convert.py +66 -18
palimpzest/query/operators/retrieve.py +230 -34
palimpzest/query/operators/scan.py +5 -2
palimpzest/query/operators/split_convert.py +169 -0
palimpzest/query/operators/token_reduction_convert.py +8 -14
palimpzest/query/optimizer/__init__.py +4 -16
palimpzest/query/optimizer/cost_model.py +73 -266
palimpzest/query/optimizer/optimizer.py +87 -58
palimpzest/query/optimizer/optimizer_strategy.py +18 -97
palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/primitives.py +5 -3
palimpzest/query/optimizer/rules.py +336 -172
palimpzest/query/optimizer/tasks.py +30 -100
palimpzest/query/processor/config.py +38 -22
palimpzest/query/processor/nosentinel_processor.py +16 -520
palimpzest/query/processor/processing_strategy_type.py +28 -0
palimpzest/query/processor/query_processor.py +38 -206
palimpzest/query/processor/query_processor_factory.py +117 -130
palimpzest/query/processor/sentinel_processor.py +90 -0
palimpzest/query/processor/streaming_processor.py +25 -32
palimpzest/sets.py +88 -41
palimpzest/utils/model_helpers.py +8 -7
palimpzest/utils/progress.py +368 -152
palimpzest/utils/token_reduction_helpers.py +1 -3
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
palimpzest-0.7.1.dist-info/RECORD +96 -0
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
palimpzest/query/processor/mab_sentinel_processor.py +0 -884
palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
palimpzest/utils/index_helpers.py +0 -6
palimpzest-0.6.4.dist-info/RECORD +0 -87
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
{palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0

palimpzest/query/optimizer/optimizer.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import logging
 from copy import deepcopy
 from palimpzest.constants import Model
@@ -14,6 +15,7 @@ from palimpzest.query.operators.logical import (
     GroupByAggregate,
     LimitScan,
     LogicalOperator,
+    MapScan,
     Project,
     RetrieveScan,
 )
@@ -22,22 +24,17 @@ from palimpzest.query.optimizer import (
     TRANSFORMATION_RULES,
 )
 from palimpzest.query.optimizer.cost_model import CostModel
-from palimpzest.query.optimizer.optimizer_strategy import (
-    OptimizationStrategyType,
-    OptimizerStrategyRegistry,
-)
+from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.query.optimizer.primitives import Group, LogicalExpression
 from palimpzest.query.optimizer.rules import (
     CodeSynthesisConvertRule,
     CriticAndRefineConvertRule,
     LLMConvertBondedRule,
-    LLMConvertConventionalRule,
     MixtureOfAgentsConvertRule,
     RAGConvertRule,
+    SplitConvertRule,
     TokenReducedConvertBondedRule,
-    TokenReducedConvertConventionalRule,
-    TokenReducedConvertRule,
 )
 from palimpzest.query.optimizer.tasks import (
     ApplyRule,
@@ -48,7 +45,9 @@ from palimpzest.query.optimizer.tasks import (
 )
 from palimpzest.sets import Dataset, Set
 from palimpzest.utils.hash_helpers import hash_for_serialized_dict
-from palimpzest.utils.model_helpers import get_champion_model, get_code_champion_model, get_conventional_fallback_model
+from palimpzest.utils.model_helpers import get_champion_model, get_code_champion_model, get_fallback_model
+logger = logging.getLogger(__name__)
 def get_node_uid(node: Dataset | DataReader) -> str:
@@ -86,22 +85,21 @@ class Optimizer:
         self,
         policy: Policy,
         cost_model: CostModel,
-        no_cache: bool = False,
+        available_models: list[Model],
+        cache: bool = False,
         verbose: bool = False,
-        available_models: list[Model] | None = None,
         allow_bonded_query: bool = True,
-        allow_conventional_query: bool = False,
         allow_code_synth: bool = False,
         allow_token_reduction: bool = False,
         allow_rag_reduction: bool = False,
         allow_mixtures: bool = True,
         allow_critic: bool = False,
-        optimization_strategy_type: OptimizationStrategyType = OptimizationStrategyType.PARETO,
+        allow_split_merge: bool = False,
+        optimizer_strategy: OptimizationStrategyType = OptimizationStrategyType.PARETO,
         use_final_op_quality: bool = False, # TODO: make this func(plan) -> final_quality
+        **kwargs,
     ):
         # store the policy
-        if available_models is None or len(available_models) == 0:
-            available_models = []
         self.policy = policy
         # store the cost model
@@ -123,36 +121,38 @@ class Optimizer:
         self.implementation_rules = IMPLEMENTATION_RULES
         self.transformation_rules = TRANSFORMATION_RULES
-        self.strategy = OptimizerStrategyRegistry.get_strategy(optimization_strategy_type.value)
+        # get the strategy class associated with the optimizer strategy
+        optimizer_strategy_cls = optimizer_strategy.value
+        self.strategy = optimizer_strategy_cls()
-        # if we are doing SENTINEL / NONE optimization; remove transformation rules
-        if optimization_strategy_type in [OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE]:
+        # remove transformation rules for optimization strategies which do not require them
+        if optimizer_strategy.no_transformation():
             self.transformation_rules = []
         # if we are not performing optimization, set available models to be single model
         # and remove all optimizations (except for bonded queries)
-        if optimization_strategy_type == OptimizationStrategyType.NONE:
+        if optimizer_strategy == OptimizationStrategyType.NONE:
             self.allow_bonded_query = True
-            self.allow_conventional_query = False
             self.allow_code_synth = False
             self.allow_token_reduction = False
             self.allow_rag_reduction = False
             self.allow_mixtures = False
             self.allow_critic = False
+            self.allow_split_merge = False
             self.available_models = [available_models[0]]
         # store optimization hyperparameters
-        self.no_cache = no_cache
+        self.cache = cache
         self.verbose = verbose
         self.available_models = available_models
         self.allow_bonded_query = allow_bonded_query
-        self.allow_conventional_query = allow_conventional_query
         self.allow_code_synth = allow_code_synth
         self.allow_token_reduction = allow_token_reduction
         self.allow_rag_reduction = allow_rag_reduction
         self.allow_mixtures = allow_mixtures
         self.allow_critic = allow_critic
-        self.optimization_strategy_type = optimization_strategy_type
+        self.allow_split_merge = allow_split_merge
+        self.optimizer_strategy = optimizer_strategy
         self.use_final_op_quality = use_final_op_quality
         # prune implementation rules based on boolean flags
@@ -163,13 +163,6 @@ class Optimizer:
                 if rule not in [LLMConvertBondedRule, TokenReducedConvertBondedRule]
             ]
-        if not self.allow_conventional_query:
-            self.implementation_rules = [
-                rule
-                for rule in self.implementation_rules
-                if rule not in [LLMConvertConventionalRule, TokenReducedConvertConventionalRule]
-            ]
         if not self.allow_code_synth:
             self.implementation_rules = [
                 rule for rule in self.implementation_rules if not issubclass(rule, CodeSynthesisConvertRule)
@@ -177,7 +170,7 @@ class Optimizer:
         if not self.allow_token_reduction:
             self.implementation_rules = [
-                rule for rule in self.implementation_rules if not issubclass(rule, TokenReducedConvertRule)
+                rule for rule in self.implementation_rules if not issubclass(rule, TokenReducedConvertBondedRule)
             ]
         if not self.allow_rag_reduction:
@@ -187,8 +180,7 @@ class Optimizer:
         if not self.allow_mixtures:
             self.implementation_rules = [
-                rule for rule in self.implementation_rules
-                if not issubclass(rule, MixtureOfAgentsConvertRule)
+                rule for rule in self.implementation_rules if not issubclass(rule, MixtureOfAgentsConvertRule)
             ]
         if not self.allow_critic:
@@ -196,8 +188,17 @@ class Optimizer:
                 rule for rule in self.implementation_rules if not issubclass(rule, CriticAndRefineConvertRule)
             ]
+        if not self.allow_split_merge:
+            self.implementation_rules = [
+                rule for rule in self.implementation_rules if not issubclass(rule, SplitConvertRule)
+            ]
+        logger.info(f"Initialized Optimizer with verbose={self.verbose}")
+        logger.debug(f"Initialized Optimizer with params: {self.__dict__}")
     def update_cost_model(self, cost_model: CostModel):
         self.cost_model = cost_model
+        self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()
     def get_physical_op_params(self):
         return {
@@ -205,38 +206,41 @@ class Optimizer:
             "available_models": self.available_models,
             "champion_model": get_champion_model(self.available_models),
             "code_champion_model": get_code_champion_model(self.available_models),
-            "conventional_fallback_model": get_conventional_fallback_model(self.available_models),
+            "fallback_model": get_fallback_model(self.available_models),
         }
     def deepcopy_clean(self):
         optimizer = Optimizer(
             policy=self.policy,
             cost_model=CostModel(),
-            no_cache=self.no_cache,
+            cache=self.cache,
             verbose=self.verbose,
             available_models=self.available_models,
             allow_bonded_query=self.allow_bonded_query,
-            allow_conventional_query=self.allow_conventional_query,
             allow_code_synth=self.allow_code_synth,
             allow_token_reduction=self.allow_token_reduction,
             allow_rag_reduction=self.allow_rag_reduction,
             allow_mixtures=self.allow_mixtures,
             allow_critic=self.allow_critic,
-            optimization_strategy_type=self.optimization_strategy_type,
+            allow_split_merge=self.allow_split_merge,
+            optimizer_strategy=self.optimizer_strategy,
             use_final_op_quality=self.use_final_op_quality,
         )
         return optimizer
-    def update_strategy(self, optimizer_strategy_type: OptimizationStrategyType):
-        self.optimization_strategy_type = optimizer_strategy_type
-        self.strategy = OptimizerStrategyRegistry.get_strategy(optimizer_strategy_type.value)
+    def update_strategy(self, optimizer_strategy: OptimizationStrategyType):
+        self.optimizer_strategy = optimizer_strategy
+        optimizer_strategy_cls = optimizer_strategy.value
+        self.strategy = optimizer_strategy_cls()
     def construct_group_tree(self, dataset_nodes: list[Set]) -> tuple[list[int], dict[str, Field], dict[str, set[str]]]:
         # get node, output_schema, and input_schema (if applicable)
+        logger.debug(f"Constructing group tree for dataset_nodes: {dataset_nodes}")
         node = dataset_nodes[-1]
         output_schema = node.schema
         input_schema = dataset_nodes[-2].schema if len(dataset_nodes) > 1 else None
         ### convert node --> Group ###
         uid = get_node_uid(node)
@@ -244,7 +248,7 @@ class Optimizer:
         op: LogicalOperator | None = None
         # TODO: add cache scan when we add caching back to PZ
-        # if not self.no_cache:
+        # if self.cache:
         #     op = CacheScan(datareader=node, output_schema=output_schema)
         if isinstance(node, DataReader):
             op = BaseScan(datareader=node, output_schema=output_schema)
@@ -291,9 +295,9 @@ class Optimizer:
                 index=node._index,
                 search_func=node._search_func,
                 search_attr=node._search_attr,
-                output_attr=node._output_attr,
+                output_attrs=node._output_attrs,
                 k=node._k,
-                target_cache_id=uid
+                target_cache_id=uid,
             )
         elif output_schema != input_schema:
             op = ConvertScan(
@@ -304,6 +308,13 @@ class Optimizer:
                 depends_on=node._depends_on,
                 target_cache_id=uid,
             )
+        elif output_schema == input_schema and node._udf is not None:
+            op = MapScan(
+                input_schema=input_schema,
+                output_schema=output_schema,
+                udf=node._udf,
+                target_cache_id=uid,
+            )
         # some legacy plans may have a useless convert; for now we simply skip it
         elif output_schema == input_schema:
             return self.construct_group_tree(dataset_nodes[:-1]) if len(dataset_nodes) > 1 else ([], {}, {})
@@ -319,7 +330,9 @@ class Optimizer:
         )
         # compute the fields added by this operation and all fields
-        input_group_short_field_names = list(map(lambda full_field: full_field.split(".")[-1], input_group_fields.keys()))
+        input_group_short_field_names = list(
+            map(lambda full_field: full_field.split(".")[-1], input_group_fields.keys())
+        )
         new_fields = {
             field_name: field
             for field_name, field in op.output_schema.field_map(unique=True, id=uid).items()
@@ -329,9 +342,7 @@ class Optimizer:
         # compute the set of (short) field names this operation depends on
         depends_on_field_names = (
-            {}
-            if isinstance(node, DataReader)
-            else {field_name.split(".")[-1] for field_name in node._depends_on}
+            {} if isinstance(node, DataReader) else {field_name.split(".")[-1] for field_name in node._depends_on}
         )
         # compute all properties including this operations'
@@ -351,7 +362,7 @@ class Optimizer:
                 all_properties["limits"].add(op_limit_str)
             else:
                 all_properties["limits"] = set([op_limit_str])
         elif isinstance(op, Project):
             op_project_str = op.get_logical_op_id()
             if "projects" in all_properties:
@@ -359,6 +370,13 @@ class Optimizer:
             else:
                 all_properties["projects"] = set([op_project_str])
+        elif isinstance(op, MapScan):
+            op_udf_str = op.udf.__name__
+            if "udfs" in all_properties:
+                all_properties["udfs"].add(op_udf_str)
+            else:
+                all_properties["udfs"] = set([op_udf_str])
         # construct the logical expression and group
         logical_expression = LogicalExpression(
             operator=op,
@@ -378,13 +396,16 @@ class Optimizer:
         # add the expression and group to the optimizer's expressions and groups and return
         self.expressions[logical_expression.get_expr_id()] = logical_expression
         self.groups[group.group_id] = group
+        logger.debug(f"Constructed group tree for dataset_nodes: {dataset_nodes}")
+        logger.debug(f"Group: {group.group_id}, {all_fields}, {all_properties}")
         return [group.group_id], all_fields, all_properties
     def convert_query_plan_to_group_tree(self, query_plan: Dataset) -> str:
+        logger.debug(f"Converting query plan to group tree for query_plan: {query_plan}")
         # Obtain ordered list of datasets
         dataset_nodes: list[Dataset | DataReader] = []
-        node = deepcopy(query_plan)
+        node = query_plan.copy()
         # NOTE: the very first node will be a DataReader; the rest will be Dataset
         while isinstance(node, Dataset):
@@ -427,7 +448,8 @@ class Optimizer:
         # check that final_group_id is a singleton
         assert len(final_group_id) == 1
         final_group_id = final_group_id[0]
+        logger.debug(f"Converted query plan to group tree for query_plan: {query_plan}")
+        logger.debug(f"Final group id: {final_group_id}")
         return final_group_id
     def heuristic_optimization(self, group_id: int) -> None:
@@ -437,6 +459,8 @@ class Optimizer:
         pass
     def search_optimization_space(self, group_id: int) -> None:
+        logger.debug(f"Searching optimization space for group_id: {group_id}")
         # begin the search for an optimal plan with a task to optimize the final group
         initial_task = OptimizeGroup(group_id)
         self.tasks_stack.append(initial_task)
@@ -451,18 +475,23 @@ class Optimizer:
                 new_tasks = task.perform(self.transformation_rules, self.implementation_rules)
             elif isinstance(task, ApplyRule):
                 context = {"costed_phys_op_ids": self.costed_phys_op_ids}
-                new_tasks = task.perform(self.groups, self.expressions, context=context, **self.get_physical_op_params())
+                new_tasks = task.perform(
+                    self.groups, self.expressions, context=context, **self.get_physical_op_params()
+                )
             elif isinstance(task, OptimizePhysicalExpression):
-                context = {"optimization_strategy_type": self.optimization_strategy_type}
+                context = {"optimizer_strategy": self.optimizer_strategy}
                 new_tasks = task.perform(self.cost_model, self.groups, self.policy, context=context)
             self.tasks_stack.extend(new_tasks)
-    def optimize(self, query_plan: Dataset, policy: Policy | None = None) -> list[PhysicalPlan]:
+        logger.debug(f"Done searching optimization space for group_id: {group_id}")
+    def optimize(self, query_plan: Dataset) -> list[PhysicalPlan]:
         """
         The optimize function takes in an initial query plan and searches the space of
         logical and physical plans in order to cost and produce a (near) optimal physical plan.
         """
+        logger.info(f"Optimizing query plan: {query_plan}")
         # compute the initial group tree for the user plan
         final_group_id = self.convert_query_plan_to_group_tree(query_plan)
@@ -472,6 +501,6 @@ class Optimizer:
         # search the optimization space by applying logical and physical transformations to the initial group tree
         self.search_optimization_space(final_group_id)
-        return self.strategy.get_optimal_plans(self.groups, final_group_id, policy, self.use_final_op_quality)
+        logger.info(f"Getting optimal plans for final group id: {final_group_id}")
+        return self.strategy.get_optimal_plans(self.groups, final_group_id, self.policy, self.use_final_op_quality)

palimpzest/query/optimizer/optimizer_strategy.py CHANGED Viewed

@@ -1,24 +1,12 @@
 from __future__ import annotations
+import logging
 from abc import ABC, abstractmethod
-from copy import deepcopy
-from enum import Enum
 from palimpzest.policy import Policy
 from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
-class OptimizationStrategyType(str, Enum):
-    """
-    OptimizationStrategyType determines which (set of) plan(s) the Optimizer
-    will return to the Execution layer.
-    """
-    GREEDY = "greedy"
-    CONFIDENCE_INTERVAL = "confidence-interval"
-    PARETO = "pareto"
-    SENTINEL = "sentinel"
-    NONE = "none"
-    AUTO = "auto"
+logger = logging.getLogger(__name__)
 class OptimizationStrategy(ABC):
@@ -27,11 +15,6 @@ class OptimizationStrategy(ABC):
         """Strategy decides how to search through the groups for optimal plan(s)"""
         pass
-    @classmethod
-    def get_strategy(cls, strategy_type: str) -> OptimizationStrategy:
-        """Factory method to create strategy instances"""
-        return OptimizerStrategyRegistry.get_strategy(strategy_type)
     def normalize_final_plans(self, plans: list[PhysicalPlan]) -> list[PhysicalPlan]:
         """
         For each plan in `plans`, this function enforces that the input schema of every
@@ -47,7 +30,7 @@ class OptimizationStrategy(ABC):
         for plan in plans:
             normalized_ops = []
             for idx, op in enumerate(plan.operators):
-                op_copy = deepcopy(op)
+                op_copy = op.copy()
                 if idx == 0:
                     normalized_ops.append(op_copy)
                 else:
@@ -79,7 +62,12 @@ class GreedyStrategy(OptimizationStrategy):
         return PhysicalPlan.from_ops_and_sub_plan([best_phys_expr.operator], input_best_phys_plan, best_phys_expr.plan_cost)
     def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
-        return [self._get_greedy_physical_plan(groups, final_group_id)]
+        logger.info(f"Getting greedy optimal plans for final group id: {final_group_id}")
+        plans = [self._get_greedy_physical_plan(groups, final_group_id)]
+        logger.info(f"Greedy optimal plans: {plans}")
+        logger.info(f"Done getting greedy optimal plans for final group id: {final_group_id}")
+        return plans
 class ParetoStrategy(OptimizationStrategy):
@@ -127,8 +115,9 @@ class ParetoStrategy(OptimizationStrategy):
                             pareto_optimal_plans.append(plan)
         return pareto_optimal_plans
     def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
+        logger.info(f"Getting pareto optimal plans for final group id: {final_group_id}")
         # compute all of the pareto optimal physical plans
         plans = self._get_candidate_pareto_physical_plans(groups, final_group_id, policy)
@@ -138,7 +127,6 @@ class ParetoStrategy(OptimizationStrategy):
                 plan.plan_cost.quality = plan.plan_cost.op_estimates.quality
         # filter pareto optimal plans for ones which satisfy policy constraint (if at least one of them does)
-        # import pdb; pdb.set_trace()
         if any([policy.constraint(plan.plan_cost) for plan in plans]):
             plans = [plan for plan in plans if policy.constraint(plan.plan_cost)]
@@ -148,6 +136,8 @@ class ParetoStrategy(OptimizationStrategy):
             optimal_plan = optimal_plan if policy.choose(optimal_plan.plan_cost, plan.plan_cost) else plan
         plans = [optimal_plan]
+        logger.info(f"Pareto optimal plans: {plans}")
+        logger.info(f"Done getting pareto optimal plans for final group id: {final_group_id}")
         return plans
@@ -177,7 +167,11 @@ class SentinelStrategy(OptimizationStrategy):
         return SentinelPlan.from_ops_and_sub_plan([phys_op_set], best_phys_subplan)
     def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[SentinelPlan]:
-        return [self._get_sentinel_plan(groups, final_group_id)]
+        logger.info(f"Getting sentinel optimal plans for final group id: {final_group_id}")
+        plans = [self._get_sentinel_plan(groups, final_group_id)]
+        logger.info(f"Sentinel optimal plans: {plans}")
+        logger.info(f"Done getting sentinel optimal plans for final group id: {final_group_id}")
+        return plans
 class NoOptimizationStrategy(GreedyStrategy):
@@ -186,76 +180,3 @@ class NoOptimizationStrategy(GreedyStrategy):
     logical transformations or optimizations. It uses the same get_optimal_plans logic as the
     GreedyOptimizationStrategy.
     """
-class ConfidenceIntervalStrategy(OptimizationStrategy):
-    def _get_confidence_interval_optimal_plans(self, groups: dict, group_id: int) -> list[PhysicalPlan]:
-        """
-        Return all physical plans whose upper bound on the primary policy metric is greater than the
-        best plan's lower bound on the primary policy metric (subject to satisfying the policy constraint).
-        The OptimizePhysicalExpression task guarantees that each group's `ci_best_physical_expressions`
-        maintains a list of expressions with overlapping CI's on the primary policy metric (while also
-        satisfying the policy constraint).
-        This function computes the cross-product of all such expressions across all groups.
-        """
-        # get all the physical expressions which could be the best for this group
-        best_phys_exprs = groups[group_id].ci_best_physical_expressions
-        best_plans = []
-        for phys_expr in best_phys_exprs:
-            # if this expression has no inputs (i.e. it is a BaseScan or CacheScan),
-            # create the physical plan and append it to the best_plans for this group
-            if len(phys_expr.input_group_ids) == 0:
-                plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=phys_expr.plan_cost)
-                best_plans.append(plan)
-            # otherwise, get the best physical plan(s) for this group's inputs
-            else:
-                # TODO: need to handle joins
-                best_phys_subplans = [PhysicalPlan(operators=[])]
-                for input_group_id in phys_expr.input_group_ids:
-                    input_best_phys_plans = self._get_confidence_interval_optimal_plans(groups, input_group_id)
-                    best_phys_subplans = [
-                        PhysicalPlan.from_ops_and_sub_plan(subplan.operators, input_subplan, subplan.plan_cost)
-                        for subplan in best_phys_subplans
-                        for input_subplan in input_best_phys_plans
-                    ]
-                # add this operator to best physical plan and return
-                for subplan in best_phys_subplans:
-                    plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, phys_expr.plan_cost)
-                    best_plans.append(plan)
-        return best_plans
-    def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
-        # TODO: fix this to properly handle multiple potential plans
-        raise Exception("NotImplementedError")
-        # plans = self._get_confidence_interval_optimal_plans(final_group_id)
-class AutoOptimizationStrategy(OptimizationStrategy):
-    def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy, use_final_op_quality: bool) -> list[PhysicalPlan]:
-        raise NotImplementedError("Auto optimization strategy not implemented")
-class OptimizerStrategyRegistry:
-    """Registry to map strategy types to their implementations"""
-    _strategies: dict[str, type[OptimizationStrategy]] = {
-        OptimizationStrategyType.GREEDY.value: GreedyStrategy,
-        OptimizationStrategyType.CONFIDENCE_INTERVAL.value: ConfidenceIntervalStrategy,
-        OptimizationStrategyType.PARETO.value: ParetoStrategy,
-        OptimizationStrategyType.SENTINEL.value: SentinelStrategy,
-        OptimizationStrategyType.NONE.value: NoOptimizationStrategy,
-        OptimizationStrategyType.AUTO.value: AutoOptimizationStrategy,
-    }
-    @classmethod
-    def get_strategy(cls, strategy_type: str) -> OptimizationStrategy:
-        """Get strategy instance by type"""
-        strategy_class = cls._strategies.get(strategy_type)
-        if not strategy_class:
-            raise ValueError(f"Unknown optimization strategy: {strategy_type}")
-        return strategy_class()

palimpzest/query/optimizer/optimizer_strategy_type.py ADDED Viewed

@@ -0,0 +1,37 @@
+from enum import Enum
+from palimpzest.query.optimizer.optimizer_strategy import (
+    GreedyStrategy,
+    NoOptimizationStrategy,
+    ParetoStrategy,
+    SentinelStrategy,
+)
+class OptimizationStrategyType(Enum):
+    """
+    OptimizationStrategyType determines which (set of) plan(s) the Optimizer
+    will return to the Execution layer.
+    """
+    GREEDY = GreedyStrategy
+    PARETO = ParetoStrategy
+    SENTINEL = SentinelStrategy
+    NONE = NoOptimizationStrategy
+    def no_transformation(self) -> bool:
+        """
+        Return True if this optimization strategy does not transform the logical plan.
+        """
+        return self in [OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE]
+    def is_pareto(self) -> bool:
+        """
+        Return True if this optimization strategy uses Pareto optimization.
+        """
+        return self == OptimizationStrategyType.PARETO
+    def is_not_pareto(self) -> bool:
+        """
+        Return True if this optimization strategy does not use Pareto optimization.
+        """
+        return not self.is_pareto()

palimpzest/query/optimizer/plan.py CHANGED Viewed

@@ -106,8 +106,7 @@ class SentinelPlan(Plan):
         # store operator_sets and logical_op_ids; sort operator_sets internally by op_id
         self.operator_sets = operator_sets
         self.operator_sets = [sorted(op_set, key=lambda op: op.get_op_id()) for op_set in self.operator_sets]
-        self.logical_op_ids = [op_set[0].logical_op_id for op_set in operator_sets]
-        self.logical_op_names = [op_set[0].logical_op_name for op_set in operator_sets]
+        self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
         self.plan_id = self.compute_plan_id()
     def compute_plan_id(self) -> str:
@@ -151,7 +150,7 @@ class SentinelPlan(Plan):
         return self.logical_op_ids[slice], self.operator_sets[slice]
     def __iter__(self):
-        yield from zip(self.logical_op_ids, self.logical_op_names, self.operator_sets)
+        yield from zip(self.logical_op_ids, self.operator_sets)
     def __len__(self):
         return len(self.logical_op_ids)

palimpzest/query/optimizer/primitives.py CHANGED Viewed

@@ -42,9 +42,12 @@ class Expression:
     def __eq__(self, other):
         return self.operator == other.operator and self.input_group_ids == other.input_group_ids
-    def __hash__(self):
+    def __str__(self):
         op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_op_id()
-        hash_str = str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
+        return str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
+    def __hash__(self):
+        hash_str = self.__str__()
         hash_id = int(hash_for_id(hash_str), 16)
         return hash_id
@@ -80,7 +83,6 @@ class Group:
         self.explored = False
         self.best_physical_expression: PhysicalExpression | None = None
         self.pareto_optimal_physical_expressions: list[PhysicalExpression] | None = None
-        self.ci_best_physical_expressions: list[PhysicalExpression] | None = None
         self.optimized = False
         # properties of the Group which distinguish it from groups w/identical fields,

palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl