PyPI - palimpzest - Versions diffs - 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

palimpzest/__init__.py +37 -6
palimpzest/agents/__init__.py +0 -0
palimpzest/agents/compute_agents.py +0 -0
palimpzest/agents/search_agents.py +637 -0
palimpzest/constants.py +259 -197
palimpzest/core/data/context.py +393 -0
palimpzest/core/data/context_manager.py +163 -0
palimpzest/core/data/dataset.py +634 -0
palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
palimpzest/core/elements/groupbysig.py +16 -13
palimpzest/core/elements/records.py +166 -75
palimpzest/core/lib/schemas.py +152 -390
palimpzest/core/{data/dataclasses.py → models.py} +306 -170
palimpzest/policy.py +2 -27
palimpzest/prompts/__init__.py +35 -5
palimpzest/prompts/agent_prompts.py +357 -0
palimpzest/prompts/context_search.py +9 -0
palimpzest/prompts/convert_prompts.py +61 -5
palimpzest/prompts/filter_prompts.py +50 -5
palimpzest/prompts/join_prompts.py +163 -0
palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
palimpzest/prompts/prompt_factory.py +358 -46
palimpzest/prompts/validator.py +239 -0
palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
palimpzest/query/execution/execution_strategy.py +210 -317
palimpzest/query/execution/execution_strategy_type.py +5 -7
palimpzest/query/execution/mab_execution_strategy.py +249 -136
palimpzest/query/execution/parallel_execution_strategy.py +153 -244
palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
palimpzest/query/generators/generators.py +157 -330
palimpzest/query/operators/__init__.py +15 -5
palimpzest/query/operators/aggregate.py +50 -33
palimpzest/query/operators/compute.py +201 -0
palimpzest/query/operators/convert.py +27 -21
palimpzest/query/operators/critique_and_refine_convert.py +7 -5
palimpzest/query/operators/distinct.py +62 -0
palimpzest/query/operators/filter.py +22 -13
palimpzest/query/operators/join.py +402 -0
palimpzest/query/operators/limit.py +3 -3
palimpzest/query/operators/logical.py +198 -80
palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
palimpzest/query/operators/physical.py +27 -21
palimpzest/query/operators/project.py +3 -3
palimpzest/query/operators/rag_convert.py +7 -7
palimpzest/query/operators/retrieve.py +9 -9
palimpzest/query/operators/scan.py +81 -42
palimpzest/query/operators/search.py +524 -0
palimpzest/query/operators/split_convert.py +10 -8
palimpzest/query/optimizer/__init__.py +7 -9
palimpzest/query/optimizer/cost_model.py +108 -441
palimpzest/query/optimizer/optimizer.py +123 -181
palimpzest/query/optimizer/optimizer_strategy.py +66 -61
palimpzest/query/optimizer/plan.py +352 -67
palimpzest/query/optimizer/primitives.py +43 -19
palimpzest/query/optimizer/rules.py +484 -646
palimpzest/query/optimizer/tasks.py +127 -58
palimpzest/query/processor/config.py +41 -76
palimpzest/query/processor/query_processor.py +73 -18
palimpzest/query/processor/query_processor_factory.py +46 -38
palimpzest/schemabuilder/schema_builder.py +15 -28
palimpzest/utils/model_helpers.py +27 -77
palimpzest/utils/progress.py +114 -102
palimpzest/validator/__init__.py +0 -0
palimpzest/validator/validator.py +306 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
palimpzest-0.8.0.dist-info/RECORD +95 -0
palimpzest/core/lib/fields.py +0 -141
palimpzest/prompts/code_synthesis_prompts.py +0 -28
palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
palimpzest/query/generators/api_client_factory.py +0 -30
palimpzest/query/operators/code_synthesis_convert.py +0 -488
palimpzest/query/operators/map.py +0 -130
palimpzest/query/processor/nosentinel_processor.py +0 -33
palimpzest/query/processor/processing_strategy_type.py +0 -28
palimpzest/query/processor/sentinel_processor.py +0 -88
palimpzest/query/processor/streaming_processor.py +0 -149
palimpzest/sets.py +0 -405
palimpzest/utils/datareader_helpers.py +0 -61
palimpzest/utils/demo_helpers.py +0 -75
palimpzest/utils/field_helpers.py +0 -69
palimpzest/utils/generation_helpers.py +0 -69
palimpzest/utils/sandbox.py +0 -183
palimpzest-0.7.20.dist-info/RECORD +0 -95
palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0

palimpzest/query/optimizer/tasks.py CHANGED Viewed

@@ -3,8 +3,10 @@ from __future__ import annotations
 import logging
 from typing import Any
-from palimpzest.core.data.dataclasses import PlanCost
+from palimpzest.core.models import PlanCost
 from palimpzest.policy import Policy
+from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
+from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.optimizer.cost_model import BaseCostModel
 from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
 from palimpzest.query.optimizer.primitives import Expression, Group
@@ -120,14 +122,15 @@ class OptimizeLogicalExpression(Task):
     def perform(
         self,
-        transformation_rules: list[TransformationRule],
-        implementation_rules: list[ImplementationRule],
+        transformation_rules: list[type[TransformationRule]],
+        implementation_rules: list[type[ImplementationRule]],
         context: dict[str, Any] | None = None,
     ) -> list[Task]:
         logger.debug(f"Optimizing logical expression {self.logical_expression}")
-        # if we're exploring, only apply transformation rules
         if context is None:
             context = {}
+        # if we're exploring, only apply transformation rules
         rules = transformation_rules if self.exploring else transformation_rules + implementation_rules
         # filter out rules that have already been applied to logical expression
@@ -170,7 +173,7 @@ class ApplyRule(Task):
     - schedule OptimizePhysicalExpression tasks
     """
-    def __init__(self, rule: Rule, logical_expression: Expression, exploring: bool = False):
+    def __init__(self, rule: type[Rule], logical_expression: Expression, exploring: bool = False):
         self.rule = rule
         self.logical_expression = logical_expression
         self.exploring = exploring
@@ -183,16 +186,13 @@ class ApplyRule(Task):
         **physical_op_params,
     ) -> tuple[list[Task], int]:
         logger.debug(f"Applying rule {self.rule} to logical expression {self.logical_expression}")
-        # check if rule has already been applied to this logical expression; return [] if so
         if context is None:
             context = {}
+        # check if rule has already been applied to this logical expression; return [] if so
         if self.rule.get_rule_id() in self.logical_expression.rules_applied:
             return []
-        # MAYBE ?TODO?: iterate over bindings for logical expression and rule?
-        #               perhaps some rules can be applied more than once to an expression?
         # get the group of the logical expression
         group_id = self.logical_expression.group_id
         group = groups[group_id]
@@ -206,8 +206,8 @@ class ApplyRule(Task):
             )
             # filter out any expressions which are duplicates (i.e. they've been previously computed)
-            new_expressions = [expr for expr in new_expressions if expr.get_expr_id() not in expressions]
-            expressions.update({expr.get_expr_id(): expr for expr in new_expressions})
+            new_expressions = [expr for expr in new_expressions if expr.expr_id not in expressions]
+            expressions.update({expr.expr_id: expr for expr in new_expressions})
             # add all new groups to the groups mapping
             for group in new_groups:
@@ -234,11 +234,11 @@ class ApplyRule(Task):
         else:
             # apply implementation rule
             new_expressions = self.rule.substitute(self.logical_expression, **physical_op_params)
-            new_expressions = [expr for expr in new_expressions if expr.get_expr_id() not in expressions]
+            new_expressions = [expr for expr in new_expressions if expr.expr_id not in expressions]
             costed_full_op_ids = context['costed_full_op_ids']
             if costed_full_op_ids is not None:
                 new_expressions = [expr for expr in new_expressions if expr.operator.get_full_op_id() in costed_full_op_ids]
-            expressions.update({expr.get_expr_id(): expr for expr in new_expressions})
+            expressions.update({expr.expr_id: expr for expr in new_expressions})
             group.physical_expressions.update(new_expressions)
             # create new task
@@ -412,8 +412,9 @@ class OptimizePhysicalExpression(Task):
         if context is None:
             context = {}
-        # get the optimizer strategy (type) from the context
+        # get the optimizer strategy (type) and the execution strategy (type) from the context
         optimizer_strategy: OptimizationStrategyType = context['optimizer_strategy']
+        execution_strategy: ExecutionStrategyType = context['execution_strategy']
         # return if we've already computed the cost of this physical expression
         if optimizer_strategy.is_pareto() and self.physical_expression.pareto_optimal_plan_costs is not None:
@@ -422,57 +423,90 @@ class OptimizePhysicalExpression(Task):
         if optimizer_strategy.is_not_pareto() and self.physical_expression.plan_cost is not None:
             return []
-        # for expressions with an input group, compute the input plan cost(s)
-        best_input_plan_cost = PlanCost(cost=0, time=0, quality=1)
-        input_plan_costs = [PlanCost(cost=0, time=0, quality=1)]
+        # for expressions with input group(s), compute the input plan cost(s)
+        best_input_plan_costs = {}
+        pareto_optimal_input_plan_costs = {}
         if len(self.physical_expression.input_group_ids) > 0:
-            # get the input group
-            input_group_id = self.physical_expression.input_group_ids[0]  # TODO: need to handle joins
-            input_group = groups[input_group_id]
-            # compute the input plan cost or list of input plan costs
             new_tasks = []
-            if optimizer_strategy.is_not_pareto() and input_group.best_physical_expression is not None:
-                # TODO: apply policy constraint here
-                best_input_plan_cost = input_group.best_physical_expression.plan_cost
-            elif optimizer_strategy.is_pareto() and input_group.pareto_optimal_physical_expressions is not None:
-                # TODO: apply policy constraint here
-                input_plan_costs = []
-                for pareto_physical_expression in input_group.pareto_optimal_physical_expressions:
-                    plan_costs = list(map(lambda tup: tup[0], pareto_physical_expression.pareto_optimal_plan_costs))
-                    input_plan_costs.extend(plan_costs)
-                # NOTE: this list will not necessarily be pareto-optimal, as a plan cost on the pareto frontier of
-                # one pareto_optimal_physical_expression might be dominated by the plan cost on another physical
-                # expression's pareto frontier; we handle this below by taking the pareto frontier of all_possible_plan_costs
-                # de-duplicate equivalent plan costs; we will still reconstruct plans with equivalent cost in optimizer.py
-                input_plan_costs = list(set(input_plan_costs))
-            else:
-                task = OptimizeGroup(input_group_id)
-                new_tasks.append(task)
+            for input_group_id in self.physical_expression.input_group_ids:
+                # get the input group
+                input_group = groups[input_group_id]
+                # compute the input plan cost or list of input plan costs
+                if optimizer_strategy.is_not_pareto() and input_group.best_physical_expression is not None:
+                    # TODO: apply policy constraint here
+                    best_input_plan_costs[input_group_id] = input_group.best_physical_expression.plan_cost
+                elif optimizer_strategy.is_pareto() and input_group.pareto_optimal_physical_expressions is not None:
+                    # TODO: apply policy constraint here
+                    input_plan_costs = []
+                    for pareto_physical_expression in input_group.pareto_optimal_physical_expressions:
+                        plan_costs = list(map(lambda tup: tup[0], pareto_physical_expression.pareto_optimal_plan_costs))
+                        input_plan_costs.extend(plan_costs)
+                    # NOTE: this list will not necessarily be pareto-optimal, as a plan cost on the pareto frontier of
+                    # one pareto_optimal_physical_expression might be dominated by the plan cost on another physical
+                    # expression's pareto frontier; we handle this below by taking the pareto frontier of all_possible_plan_costs
+                    # de-duplicate equivalent plan costs; we will still reconstruct plans with equivalent cost in optimizer.py
+                    pareto_optimal_input_plan_costs[input_group_id] = list(set(input_plan_costs))
+                else:
+                    task = OptimizeGroup(input_group_id)
+                    new_tasks.append(task)
             # if not all input groups have been costed, we need to compute these first and then retry this task
             if len(new_tasks) > 0:
                 return [self] + new_tasks
+        # once all input groups have been costed, compute the cost of this physical expression
         group = groups[self.physical_expression.group_id]
         if optimizer_strategy.is_pareto():
             # compute all possible plan costs for this physical expression given the pareto optimal input plan costs
             all_possible_plan_costs = []
-            for input_plan_cost in input_plan_costs:
-                op_plan_cost = cost_model(self.physical_expression.operator, input_plan_cost.op_estimates)
+            if isinstance(self.physical_expression.operator, JoinOp):
+                assert len(self.physical_expression.input_group_ids) == 2, "Join operator must have exactly two input groups."
+                # get the best input plan costs for both inputs
+                left_input_group_id, right_input_group_id = self.physical_expression.input_group_ids
+                left_best_input_plan_cost = pareto_optimal_input_plan_costs[left_input_group_id]
+                right_best_input_plan_cost = pareto_optimal_input_plan_costs[right_input_group_id]
+                for left_input_plan_cost in left_best_input_plan_cost:
+                    for right_input_plan_cost in right_best_input_plan_cost:
+                        # compute the cost of this operator given the input plan costs
+                        op_plan_cost = cost_model(
+                            self.physical_expression.operator,
+                            left_input_plan_cost.op_estimates,
+                            right_input_plan_cost.op_estimates,
+                        )
+                        # compute the total cost for this physical expression by summing its operator's PlanCost
+                        # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
+                        execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
+                        full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy)
+                        full_plan_cost.op_estimates = op_plan_cost.op_estimates
+                        all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))
-                # compute the total cost for this physical expression by summing its operator's PlanCost
-                # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
-                full_plan_cost = op_plan_cost + input_plan_cost
-                full_plan_cost.op_estimates = op_plan_cost.op_estimates
-                all_possible_plan_costs.append((full_plan_cost, input_plan_cost))
+            else:
+                assert len(self.physical_expression.input_group_ids) < 2, "Non-join operator must have zero or one input groups."
+                input_plan_costs = [PlanCost(cost=0, time=0, quality=1)]
+                if len(self.physical_expression.input_group_ids) == 1:
+                    input_group_id = self.physical_expression.input_group_ids[0]
+                    input_plan_costs = pareto_optimal_input_plan_costs[input_group_id]
+                # get the pareto-optimal input plan costs for the single input
+                for input_plan_cost in input_plan_costs:
+                    op_plan_cost = cost_model(self.physical_expression.operator, input_plan_cost.op_estimates)
+                    # compute the total cost for this physical expression by summing its operator's PlanCost
+                    # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
+                    full_plan_cost = op_plan_cost + input_plan_cost
+                    full_plan_cost.op_estimates = op_plan_cost.op_estimates
+                    all_possible_plan_costs.append((full_plan_cost, (input_plan_cost, None)))
             # reduce the set of possible plan costs to the subset which are pareto-optimal
             pareto_optimal_plan_costs = []
-            for idx, (plan_cost, input_plan_cost) in enumerate(all_possible_plan_costs):
+            for idx, (plan_cost, input_plan_cost_tuple) in enumerate(all_possible_plan_costs):
                 pareto_optimal = True
                 # check if any other_expr dominates expr
@@ -487,7 +521,7 @@ class OptimizePhysicalExpression(Task):
                 # add expr to pareto frontier if it's not dominated
                 if pareto_optimal:
-                    pareto_optimal_plan_costs.append((plan_cost, input_plan_cost))
+                    pareto_optimal_plan_costs.append((plan_cost, input_plan_cost_tuple))
             # set the pareto frontier of plan costs which can be obtained by this physical expression
             self.physical_expression.pareto_optimal_plan_costs = pareto_optimal_plan_costs
@@ -496,13 +530,48 @@ class OptimizePhysicalExpression(Task):
             group = self.update_pareto_optimal_physical_expressions(group, policy)
         else:
-            # otherwise, compute the cost of this operator given the optimal input plan cost
-            op_plan_cost = cost_model(self.physical_expression.operator, best_input_plan_cost.op_estimates)
-            # compute the total cost for this physical expression by summing its operator's PlanCost
-            # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
-            full_plan_cost = op_plan_cost + best_input_plan_cost
-            full_plan_cost.op_estimates = op_plan_cost.op_estimates
+            # otherwise, compute the cost of this operator given the optimal input plan cost(s)
+            full_plan_cost = None
+            if isinstance(self.physical_expression.operator, JoinOp):
+                assert len(self.physical_expression.input_group_ids) == 2, "Join operator must have exactly two input groups."
+                # get the best input plan costs for both inputs
+                left_input_group_id, right_input_group_id = self.physical_expression.input_group_ids
+                left_best_input_plan_cost = best_input_plan_costs[left_input_group_id]
+                right_best_input_plan_cost = best_input_plan_costs[right_input_group_id]
+                # compute the cost of this operator given the best input plan costs
+                op_plan_cost = cost_model(
+                    self.physical_expression.operator,
+                    left_best_input_plan_cost.op_estimates,
+                    right_best_input_plan_cost.op_estimates,
+                )
+                # compute the total cost for this physical expression by summing its operator's PlanCost
+                # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
+                execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
+                full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy)
+                full_plan_cost.op_estimates = op_plan_cost.op_estimates
+            else:
+                assert len(self.physical_expression.input_group_ids) < 2, "Non-join operator must have zero or one input groups."
+                # get the best input plan cost for the single input
+                best_input_plan_cost = PlanCost(cost=0, time=0, quality=1)
+                if len(self.physical_expression.input_group_ids) == 1:
+                    input_group_id = self.physical_expression.input_group_ids[0]
+                    best_input_plan_cost = best_input_plan_costs[input_group_id]
+                # compute the cost of this operator given the best input plan cost
+                op_plan_cost = cost_model(self.physical_expression.operator, best_input_plan_cost.op_estimates)
+                # compute the total cost for this physical expression by summing its operator's PlanCost
+                # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
+                full_plan_cost = op_plan_cost + best_input_plan_cost
+                full_plan_cost.op_estimates = op_plan_cost.op_estimates
+            # set the plan cost for this physical expression
             self.physical_expression.plan_cost = full_plan_cost
             # update the best physical expression for the group

palimpzest/query/processor/config.py CHANGED Viewed

@@ -1,86 +1,51 @@
-import json
-from dataclasses import dataclass, field
+from pydantic import BaseModel, ConfigDict, Field
 from palimpzest.constants import Model
-from palimpzest.core.data.datareaders import DataReader
 from palimpzest.policy import MaxQuality, Policy
-# TODO: Separate out the config for the Optimizer, ExecutionStrategy, and QueryProcessor
 # TODO: Add description for each field.
-@dataclass
-class QueryProcessorConfig:
+class QueryProcessorConfig(BaseModel):
     """Shared context for query processors"""
-    processing_strategy: str = field(default="auto")                 # substituted with ProcessingStrategyType
-    execution_strategy: str = field(default="sequential")            # substituted with ExecutionStrategyType
-    sentinel_execution_strategy: str | None = field(default="auto")  # substituted with SentinelExecutionStrategyType
-    optimizer_strategy: str = field(default="pareto")                # substituted with OptimizationStrategyType
-    val_datasource: DataReader | None = field(default=None)
-    policy: Policy = field(default_factory=MaxQuality)
-    scan_start_idx: int = field(default=0)
-    num_samples: int = field(default=None)
-    cache: bool = field(default=False)  # NOTE: until we properly implement caching, let's set the default to False
-    verbose: bool = field(default=False)
-    progress: bool = field(default=True)
-    available_models: list[Model] | None = field(default=None)
-    max_workers: int | None = field(default=None)
-    allow_bonded_query: bool = field(default=True)
-    allow_model_selection: bool = field(default=True)
-    allow_code_synth: bool = field(default=False)
-    allow_rag_reduction: bool = field(default=True)
-    allow_mixtures: bool = field(default=True)
-    allow_critic: bool = field(default=True)
-    allow_split_merge: bool = field(default=False)
-    use_final_op_quality: bool = field(default=False)
-    kwargs: dict = field(default_factory=dict)
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    # execution and optimization flags
+    execution_strategy: str = Field(default="parallel")              # substituted with ExecutionStrategyType
+    sentinel_execution_strategy: str | None = Field(default="auto")  # substituted with SentinelExecutionStrategyType
+    optimizer_strategy: str = Field(default="pareto")                # substituted with OptimizationStrategyType
+    # general execution flags
+    policy: Policy = Field(default_factory=MaxQuality)
+    scan_start_idx: int = Field(default=0)
+    num_samples: int = Field(default=None)
+    verbose: bool = Field(default=False)
+    progress: bool = Field(default=True)
+    available_models: list[Model] | None = Field(default=None)
+    remove_models: list[Model] | None = Field(default=None)
+    max_workers: int | None = Field(default=64)
+    join_parallelism: int = Field(default=64)
+    batch_size: int | None = Field(default=None)
+    reasoning_effort: str | None = Field(default=None)  # Gemini: "disable", "low", "medium", "high"
+    gemini_credentials_path: str | None = Field(default=None)  # Path to Gemini credentials file
+    api_base: str | None = Field(default=None)  # API base URL for vLLM
+    # operator flags
+    allow_bonded_query: bool = Field(default=True)
+    allow_model_selection: bool = Field(default=True)
+    allow_rag_reduction: bool = Field(default=True)
+    allow_mixtures: bool = Field(default=True)
+    allow_critic: bool = Field(default=True)
+    allow_split_merge: bool = Field(default=False)
+    use_final_op_quality: bool = Field(default=False)
+    # sentinel optimization flags
+    k: int = Field(default=5)
+    j: int = Field(default=5)
+    sample_budget: int = Field(default=100)
+    seed: int = Field(default=42)
+    exp_name: str | None = Field(default=None)
+    priors: dict | None = Field(default=None)
     def to_dict(self) -> dict:
         """Convert the config to a dict representation."""
-        return {
-            "processing_strategy": self.processing_strategy,
-            "execution_strategy": self.execution_strategy,
-            "sentinel_execution_strategy": self.sentinel_execution_strategy,
-            "optimizer_strategy": self.optimizer_strategy,
-            "val_datasource": self.val_datasource,
-            "policy": self.policy,
-            "scan_start_idx": self.scan_start_idx,
-            "num_samples": self.num_samples,
-            "cache": self.cache,
-            "verbose": self.verbose,
-            "progress": self.progress,
-            "available_models": self.available_models,
-            "max_workers": self.max_workers,
-            "allow_bonded_query": self.allow_bonded_query,
-            "allow_model_selection": self.allow_model_selection,
-            "allow_code_synth": self.allow_code_synth,
-            "allow_rag_reduction": self.allow_rag_reduction,
-            "allow_mixtures": self.allow_mixtures,
-            "allow_critic": self.allow_critic,
-            "allow_split_merge": self.allow_split_merge,
-            "use_final_op_quality": self.use_final_op_quality,
-            **self.kwargs,
-        }
-    def to_json_str(self):
-        """Convert the config to a JSON string representation."""
-        config_dict = self.to_dict()
-        config_dict["val_datasource"] = (
-            None if self.val_datasource is None else self.val_datasource.serialize()
-        )
-        config_dict["policy"] = self.policy.to_json_str()
-        for strategy in ["processing_strategy", "execution_strategy", "sentinel_execution_strategy", "optimizer_strategy"]:
-            config_dict[strategy] = str(config_dict[strategy])
-        return json.dumps(config_dict, indent=2)
-    def update(self, **kwargs) -> None:
-        for key, value in kwargs.items():
-            if hasattr(self, key):
-                setattr(self, key, value)
-        self.kwargs.update(kwargs)
+        return self.model_dump()

palimpzest/query/processor/query_processor.py CHANGED Viewed

@@ -1,15 +1,16 @@
 import logging
-from abc import abstractmethod
-from palimpzest.core.data.dataclasses import PlanStats
-from palimpzest.core.data.datareaders import DataReader
+from palimpzest.core.data.dataset import Dataset
 from palimpzest.core.elements.records import DataRecord, DataRecordCollection
+from palimpzest.core.models import ExecutionStats, PlanStats
 from palimpzest.policy import Policy
 from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
+from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
 from palimpzest.query.optimizer.optimizer import Optimizer
-from palimpzest.sets import Dataset
+from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
+from palimpzest.query.optimizer.plan import SentinelPlan
 from palimpzest.utils.hash_helpers import hash_for_id
-from palimpzest.utils.model_helpers import get_models
+from palimpzest.validator.validator import Validator
 logger = logging.getLogger(__name__)
@@ -27,15 +28,15 @@ class QueryProcessor:
         execution_strategy: ExecutionStrategy,
         sentinel_execution_strategy: SentinelExecutionStrategy | None,
         num_samples: int | None = None,
-        val_datasource: DataReader | None = None,
+        train_dataset: dict[str, Dataset] | None = None,
+        validator: Validator | None = None,
         scan_start_idx: int = 0,
-        cache: bool = False,
         verbose: bool = False,
         progress: bool = True,
         max_workers: int | None = None,
         policy: Policy | None = None,
         available_models: list[str] | None = None,
-        **kwargs,
+        **kwargs,  # needed in order to provide compatibility with QueryProcessorConfig
     ):
         """
         Initialize QueryProcessor with optional custom components.
@@ -48,20 +49,15 @@ class QueryProcessor:
         self.optimizer = optimizer
         self.execution_strategy = execution_strategy
         self.sentinel_execution_strategy = sentinel_execution_strategy
         self.num_samples = num_samples
-        self.val_datasource = val_datasource
+        self.train_dataset = train_dataset
+        self.validator = validator
         self.scan_start_idx = scan_start_idx
-        self.cache = cache
         self.verbose = verbose
         self.progress = progress
         self.max_workers = max_workers
         self.policy = policy
         self.available_models = available_models
-        if self.available_models is None or len(self.available_models) == 0:
-            self.available_models = get_models(include_vision=True)
         if self.verbose:
             print("Available models: ", self.available_models)
@@ -80,6 +76,26 @@ class QueryProcessor:
         return hash_for_id(id_str)
+    def _create_sentinel_plan(self, train_dataset: dict[str, Dataset] | None) -> SentinelPlan:
+        """
+        Generates and returns a SentinelPlan for the given dataset.
+        """
+        # create a new optimizer and update its strategy to SENTINEL
+        optimizer = self.optimizer.deepcopy_clean()
+        optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
+        # create copy of dataset, but change its root Dataset(s) to the validation Dataset(s)
+        dataset = self.dataset.copy()
+        if train_dataset is not None:
+            dataset._set_root_datasets(train_dataset)
+            dataset._generate_unique_logical_op_ids()
+        # get the sentinel plan for the given dataset
+        sentinel_plans = optimizer.optimize(dataset)
+        sentinel_plan = sentinel_plans[0]
+        return sentinel_plan
     def _execute_best_plan(self, dataset: Dataset, optimizer: Optimizer) -> tuple[list[DataRecord], list[PlanStats]]:
         # get the optimal plan according to the optimizer
         plans = optimizer.optimize(dataset)
@@ -91,7 +107,46 @@ class QueryProcessor:
         # return the output records and plan stats
         return records, [plan_stats]
-    # TODO: consider to support dry_run.
-    @abstractmethod
     def execute(self) -> DataRecordCollection:
-        raise NotImplementedError("Abstract method to be overwritten by sub-classes")
+        logger.info(f"Executing {self.__class__.__name__}")
+        # create execution stats
+        execution_stats = ExecutionStats(execution_id=self.execution_id())
+        execution_stats.start()
+        # if the user provides a train_dataset or validator, we perform optimization
+        if self.train_dataset is not None or self.validator is not None:
+            # create sentinel plan
+            sentinel_plan = self._create_sentinel_plan(self.train_dataset)
+            # generate sample execution data
+            if self.train_dataset is not None:
+                sentinel_plan_stats = self.sentinel_execution_strategy.execute_sentinel_plan(sentinel_plan, self.train_dataset, self.validator)
+            else:
+                train_dataset = self.dataset._get_root_datasets()
+                sentinel_plan_stats = self.sentinel_execution_strategy.execute_sentinel_plan(sentinel_plan, train_dataset, self.validator)
+            # update the execution stats to account for the work done in optimization
+            execution_stats.add_plan_stats(sentinel_plan_stats)
+            execution_stats.finish_optimization()
+            # (re-)initialize the optimizer
+            self.optimizer = self.optimizer.deepcopy_clean()
+            # construct the CostModel with any sample execution data we've gathered
+            cost_model = SampleBasedCostModel(sentinel_plan_stats, self.verbose)
+            self.optimizer.update_cost_model(cost_model)
+        # execute plan(s) according to the optimization strategy
+        records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)
+        # update the execution stats to account for the work to execute the final plan
+        execution_stats.add_plan_stats(plan_stats)
+        execution_stats.finish()
+        # construct and return the DataRecordCollection
+        result = DataRecordCollection(records, execution_stats=execution_stats)
+        logger.info(f"Done executing {self.__class__.__name__}")
+        return result

palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl