palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (89)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +343 -209
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +639 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +62 -6
  19. palimpzest/prompts/filter_prompts.py +51 -6
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
  22. palimpzest/prompts/prompt_factory.py +375 -47
  23. palimpzest/prompts/split_proposer_prompts.py +1 -1
  24. palimpzest/prompts/util_phrases.py +5 -0
  25. palimpzest/prompts/validator.py +239 -0
  26. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  27. palimpzest/query/execution/execution_strategy.py +210 -317
  28. palimpzest/query/execution/execution_strategy_type.py +5 -7
  29. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  30. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  31. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  32. palimpzest/query/generators/generators.py +160 -331
  33. palimpzest/query/operators/__init__.py +15 -5
  34. palimpzest/query/operators/aggregate.py +50 -33
  35. palimpzest/query/operators/compute.py +201 -0
  36. palimpzest/query/operators/convert.py +33 -19
  37. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  38. palimpzest/query/operators/distinct.py +62 -0
  39. palimpzest/query/operators/filter.py +26 -16
  40. palimpzest/query/operators/join.py +403 -0
  41. palimpzest/query/operators/limit.py +3 -3
  42. palimpzest/query/operators/logical.py +205 -77
  43. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  44. palimpzest/query/operators/physical.py +27 -21
  45. palimpzest/query/operators/project.py +3 -3
  46. palimpzest/query/operators/rag_convert.py +7 -7
  47. palimpzest/query/operators/retrieve.py +9 -9
  48. palimpzest/query/operators/scan.py +81 -42
  49. palimpzest/query/operators/search.py +524 -0
  50. palimpzest/query/operators/split_convert.py +10 -8
  51. palimpzest/query/optimizer/__init__.py +7 -9
  52. palimpzest/query/optimizer/cost_model.py +108 -441
  53. palimpzest/query/optimizer/optimizer.py +123 -181
  54. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  55. palimpzest/query/optimizer/plan.py +352 -67
  56. palimpzest/query/optimizer/primitives.py +43 -19
  57. palimpzest/query/optimizer/rules.py +484 -646
  58. palimpzest/query/optimizer/tasks.py +127 -58
  59. palimpzest/query/processor/config.py +42 -76
  60. palimpzest/query/processor/query_processor.py +73 -18
  61. palimpzest/query/processor/query_processor_factory.py +46 -38
  62. palimpzest/schemabuilder/schema_builder.py +15 -28
  63. palimpzest/utils/model_helpers.py +32 -77
  64. palimpzest/utils/progress.py +114 -102
  65. palimpzest/validator/__init__.py +0 -0
  66. palimpzest/validator/validator.py +306 -0
  67. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
  68. palimpzest-0.8.1.dist-info/RECORD +95 -0
  69. palimpzest/core/lib/fields.py +0 -141
  70. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  71. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  72. palimpzest/query/generators/api_client_factory.py +0 -30
  73. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  74. palimpzest/query/operators/map.py +0 -130
  75. palimpzest/query/processor/nosentinel_processor.py +0 -33
  76. palimpzest/query/processor/processing_strategy_type.py +0 -28
  77. palimpzest/query/processor/sentinel_processor.py +0 -88
  78. palimpzest/query/processor/streaming_processor.py +0 -149
  79. palimpzest/sets.py +0 -405
  80. palimpzest/utils/datareader_helpers.py +0 -61
  81. palimpzest/utils/demo_helpers.py +0 -75
  82. palimpzest/utils/field_helpers.py +0 -69
  83. palimpzest/utils/generation_helpers.py +0 -69
  84. palimpzest/utils/sandbox.py +0 -183
  85. palimpzest-0.7.21.dist-info/RECORD +0 -95
  86. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
  88. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
  89. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
@@ -3,32 +3,32 @@ from __future__ import annotations
 import logging
 from copy import deepcopy
 
+from pydantic.fields import FieldInfo
+
 from palimpzest.constants import Model
-from palimpzest.core.data.datareaders import DataReader
-from palimpzest.core.lib.fields import Field
+from palimpzest.core.data.dataset import Dataset
+from palimpzest.core.lib.schemas import get_schema_field_names
 from palimpzest.policy import Policy
+from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
 from palimpzest.query.operators.logical import (
-    Aggregate,
-    BaseScan,
+    ComputeOperator,
     ConvertScan,
+    Distinct,
     FilteredScan,
-    GroupByAggregate,
+    JoinOp,
     LimitScan,
-    LogicalOperator,
-    MapScan,
     Project,
-    RetrieveScan,
+    SearchOperator,
 )
 from palimpzest.query.optimizer import (
     IMPLEMENTATION_RULES,
     TRANSFORMATION_RULES,
 )
-from palimpzest.query.optimizer.cost_model import CostModel
+from palimpzest.query.optimizer.cost_model import BaseCostModel, SampleBasedCostModel
 from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.query.optimizer.primitives import Group, LogicalExpression
 from palimpzest.query.optimizer.rules import (
-    CodeSynthesisConvertRule,
     CriticAndRefineConvertRule,
     LLMConvertBondedRule,
     MixtureOfAgentsConvertRule,
@@ -42,21 +42,10 @@ from palimpzest.query.optimizer.tasks import (
     OptimizeLogicalExpression,
     OptimizePhysicalExpression,
 )
-from palimpzest.sets import Dataset, Set
-from palimpzest.utils.hash_helpers import hash_for_serialized_dict
-from palimpzest.utils.model_helpers import get_champion_model, get_code_champion_model, get_fallback_model
 
 logger = logging.getLogger(__name__)
 
 
-def get_node_uid(node: Dataset | DataReader) -> str:
-    """Helper function to compute the universal identifier for a node in the query plan."""
-    # NOTE: technically, hash_for_serialized_dict(node.serialize()) would be valid for both DataReader and Dataset;
-    # for the moment, I want to be explicit in Dataset about what constitutes a unique Dataset object, but
-    # in ther future we may be able to remove universal_identifier() from Dataset and just use this function
-    return node.universal_identifier() if isinstance(node, Dataset) else hash_for_serialized_dict(node.serialize())
-
-
 class Optimizer:
     """
     The optimizer is responsible for searching the space of possible physical plans
@@ -83,17 +72,19 @@ class Optimizer:
     def __init__(
         self,
         policy: Policy,
-        cost_model: CostModel,
+        cost_model: BaseCostModel,
         available_models: list[Model],
-        cache: bool = False,
+        join_parallelism: int = 64,
+        reasoning_effort: str | None = None,
+        api_base: str | None = None,
         verbose: bool = False,
         allow_bonded_query: bool = True,
-        allow_code_synth: bool = False,
         allow_rag_reduction: bool = False,
         allow_mixtures: bool = True,
         allow_critic: bool = False,
         allow_split_merge: bool = False,
         optimizer_strategy: OptimizationStrategyType = OptimizationStrategyType.PARETO,
+        execution_strategy: ExecutionStrategyType = ExecutionStrategyType.PARALLEL,
         use_final_op_quality: bool = False,  # TODO: make this func(plan) -> final_quality
         **kwargs,
     ):
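
For reference, a minimal sketch of how the new constructor surface might be exercised directly in 0.8.1 (in practice the query processor builds the Optimizer; the MaxQuality policy and Model.GPT_4o choices below are illustrative placeholders, not values taken from this diff):

    from palimpzest.constants import Model
    from palimpzest.policy import MaxQuality
    from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
    from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
    from palimpzest.query.optimizer.optimizer import Optimizer
    from palimpzest.query.optimizer.optimizer_strategy_type import OptimizationStrategyType

    # cache= and allow_code_synth= are gone in 0.8.1; join_parallelism, reasoning_effort,
    # api_base, and execution_strategy are new.
    optimizer = Optimizer(
        policy=MaxQuality(),                                  # placeholder policy
        cost_model=SampleBasedCostModel(),
        available_models=[Model.GPT_4o],                      # placeholder model
        join_parallelism=64,
        reasoning_effort=None,
        api_base=None,
        optimizer_strategy=OptimizationStrategyType.PARETO,
        execution_strategy=ExecutionStrategyType.PARALLEL,
    )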
@@ -128,7 +119,6 @@ class Optimizer:
         # and remove all optimizations (except for bonded queries)
         if optimizer_strategy == OptimizationStrategyType.NONE:
             self.allow_bonded_query = True
-            self.allow_code_synth = False
             self.allow_rag_reduction = False
             self.allow_mixtures = False
             self.allow_critic = False
@@ -136,16 +126,18 @@ class Optimizer:
             self.available_models = [available_models[0]]
 
         # store optimization hyperparameters
-        self.cache = cache
         self.verbose = verbose
         self.available_models = available_models
+        self.join_parallelism = join_parallelism
+        self.reasoning_effort = reasoning_effort
+        self.api_base = api_base
         self.allow_bonded_query = allow_bonded_query
-        self.allow_code_synth = allow_code_synth
         self.allow_rag_reduction = allow_rag_reduction
         self.allow_mixtures = allow_mixtures
         self.allow_critic = allow_critic
         self.allow_split_merge = allow_split_merge
         self.optimizer_strategy = optimizer_strategy
+        self.execution_strategy = execution_strategy
         self.use_final_op_quality = use_final_op_quality
 
         # prune implementation rules based on boolean flags
@@ -156,11 +148,6 @@ class Optimizer:
                 if rule not in [LLMConvertBondedRule]
             ]
 
-        if not self.allow_code_synth:
-            self.implementation_rules = [
-                rule for rule in self.implementation_rules if not issubclass(rule, CodeSynthesisConvertRule)
-            ]
-
         if not self.allow_rag_reduction:
             self.implementation_rules = [
                 rule for rule in self.implementation_rules if not issubclass(rule, RAGConvertRule)
@@ -184,32 +171,34 @@ class Optimizer:
         logger.info(f"Initialized Optimizer with verbose={self.verbose}")
         logger.debug(f"Initialized Optimizer with params: {self.__dict__}")
 
-    def update_cost_model(self, cost_model: CostModel):
+    def update_cost_model(self, cost_model: BaseCostModel):
         self.cost_model = cost_model
 
     def get_physical_op_params(self):
         return {
             "verbose": self.verbose,
             "available_models": self.available_models,
-            "champion_model": get_champion_model(self.available_models),
-            "code_champion_model": get_code_champion_model(self.available_models),
-            "fallback_model": get_fallback_model(self.available_models),
+            "join_parallelism": self.join_parallelism,
+            "reasoning_effort": self.reasoning_effort,
+            "api_base": self.api_base,
         }
 
     def deepcopy_clean(self):
         optimizer = Optimizer(
             policy=self.policy,
-            cost_model=CostModel(),
-            cache=self.cache,
+            cost_model=SampleBasedCostModel(),
             verbose=self.verbose,
             available_models=self.available_models,
+            join_parallelism=self.join_parallelism,
+            reasoning_effort=self.reasoning_effort,
+            api_base=self.api_base,
             allow_bonded_query=self.allow_bonded_query,
-            allow_code_synth=self.allow_code_synth,
             allow_rag_reduction=self.allow_rag_reduction,
             allow_mixtures=self.allow_mixtures,
             allow_critic=self.allow_critic,
             allow_split_merge=self.allow_split_merge,
             optimizer_strategy=self.optimizer_strategy,
+            execution_strategy=self.execution_strategy,
             use_final_op_quality=self.use_final_op_quality,
         )
         return optimizer
@@ -219,121 +208,65 @@ class Optimizer:
         optimizer_strategy_cls = optimizer_strategy.value
         self.strategy = optimizer_strategy_cls()
 
-    def construct_group_tree(self, dataset_nodes: list[Set]) -> tuple[list[int], dict[str, Field], dict[str, set[str]]]:
-        # get node, output_schema, and input_schema (if applicable)
-        logger.debug(f"Constructing group tree for dataset_nodes: {dataset_nodes}")
-
-        node = dataset_nodes[-1]
-        output_schema = node.schema
-        input_schema = dataset_nodes[-2].schema if len(dataset_nodes) > 1 else None
-
+    def construct_group_tree(self, dataset: Dataset) -> tuple[int, dict[str, FieldInfo], dict[str, set[str]]]:
+        logger.debug(f"Constructing group tree for dataset: {dataset}")
         ### convert node --> Group ###
-        uid = get_node_uid(node)
-
         # create the op for the given node
-        op: LogicalOperator | None = None
-
-        # TODO: add cache scan when we add caching back to PZ
-        # if self.cache:
-        #     op = CacheScan(datareader=node, output_schema=output_schema)
-        if isinstance(node, DataReader):
-            op = BaseScan(datareader=node, output_schema=output_schema)
-        elif node._filter is not None:
-            op = FilteredScan(
-                input_schema=input_schema,
-                output_schema=output_schema,
-                filter=node._filter,
-                depends_on=node._depends_on,
-                target_cache_id=uid,
-            )
-        elif node._group_by is not None:
-            op = GroupByAggregate(
-                input_schema=input_schema,
-                output_schema=output_schema,
-                group_by_sig=node._group_by,
-                target_cache_id=uid,
-            )
-        elif node._agg_func is not None:
-            op = Aggregate(
-                input_schema=input_schema,
-                output_schema=output_schema,
-                agg_func=node._agg_func,
-                target_cache_id=uid,
-            )
-        elif node._limit is not None:
-            op = LimitScan(
-                input_schema=input_schema,
-                output_schema=output_schema,
-                limit=node._limit,
-                target_cache_id=uid,
-            )
-        elif node._project_cols is not None:
-            op = Project(
-                input_schema=input_schema,
-                output_schema=output_schema,
-                project_cols=node._project_cols,
-                target_cache_id=uid,
-            )
-        elif node._index is not None:
-            op = RetrieveScan(
-                input_schema=input_schema,
-                output_schema=output_schema,
-                index=node._index,
-                search_func=node._search_func,
-                search_attr=node._search_attr,
-                output_attrs=node._output_attrs,
-                k=node._k,
-                target_cache_id=uid,
-            )
-        elif output_schema != input_schema:
-            op = ConvertScan(
-                input_schema=input_schema,
-                output_schema=output_schema,
-                cardinality=node._cardinality,
-                udf=node._udf,
-                depends_on=node._depends_on,
-                target_cache_id=uid,
-            )
-        elif output_schema == input_schema and node._udf is not None:
-            op = MapScan(
-                input_schema=input_schema,
-                output_schema=output_schema,
-                udf=node._udf,
-                target_cache_id=uid,
-            )
-        # some legacy plans may have a useless convert; for now we simply skip it
-        elif output_schema == input_schema:
-            return self.construct_group_tree(dataset_nodes[:-1]) if len(dataset_nodes) > 1 else ([], {}, {})
+        op = dataset._operator
+
+        # compute the input group id(s) and field(s) for this node
+        if len(dataset._sources) == 0:
+            input_group_ids, input_group_fields, input_group_properties = ([], {}, {})
+        elif len(dataset._sources) == 1:
+            input_group_id, input_group_fields, input_group_properties = self.construct_group_tree(dataset._sources[0])
+            input_group_ids = [input_group_id]
+        elif len(dataset._sources) == 2:
+            left_input_group_id, left_input_group_fields, left_input_group_properties = self.construct_group_tree(dataset._sources[0])
+            right_input_group_id, right_input_group_fields, right_input_group_properties = self.construct_group_tree(dataset._sources[1])
+            input_group_ids = [left_input_group_id, right_input_group_id]
+            input_group_fields = {**left_input_group_fields, **right_input_group_fields}
+            input_group_properties = deepcopy(left_input_group_properties)
+            for k, v in right_input_group_properties.items():
+                if k in input_group_properties:
+                    input_group_properties[k].update(v)
+                else:
+                    input_group_properties[k] = deepcopy(v)
         else:
-            raise NotImplementedError(
-                f"""No logical operator exists for the specified dataset construction.
-                {input_schema}->{output_schema} {"with filter:'" + node._filter + "'" if node._filter is not None else ""}"""
-            )
-
-        # compute the input group ids and fields for this node
-        input_group_ids, input_group_fields, input_group_properties = (
-            self.construct_group_tree(dataset_nodes[:-1]) if len(dataset_nodes) > 1 else ([], {}, {})
-        )
+            raise NotImplementedError("Constructing group trees for datasets with more than 2 sources is not supported.")
 
         # compute the fields added by this operation and all fields
         input_group_short_field_names = list(
             map(lambda full_field: full_field.split(".")[-1], input_group_fields.keys())
         )
         new_fields = {
-            field_name: field
-            for field_name, field in op.output_schema.field_map(unique=True, id=uid).items()
-            if (field_name.split(".")[-1] not in input_group_short_field_names) or (node._udf is not None)
+            field_name: op.output_schema.model_fields[field_name.split(".")[-1]]
+            for field_name in get_schema_field_names(op.output_schema, id=dataset.id)
+            if (field_name not in input_group_short_field_names) or (hasattr(op, "udf") and op.udf is not None)
         }
         all_fields = {**input_group_fields, **new_fields}
 
         # compute the set of (short) field names this operation depends on
         depends_on_field_names = (
-            {} if isinstance(node, DataReader) else {field_name.split(".")[-1] for field_name in node._depends_on}
+            {} if dataset.is_root else {field_name.split(".")[-1] for field_name in op.depends_on}
        )
 
+        # NOTE: group_id is computed as the unique (sorted) set of fields and properties;
+        # If an operation does not modify the fields (or modifies them in a way that
+        # can create an idential field set to an earlier group) then we must add an
+        # id from the operator to disambiguate the two groups.
         # compute all properties including this operations'
         all_properties = deepcopy(input_group_properties)
-        if isinstance(op, FilteredScan):
+        if isinstance(op, ConvertScan) and sorted(op.input_schema.model_fields.keys()) == sorted(op.output_schema.model_fields.keys()):
+            model_fields_dict = {
+                k: {"annotation": v.annotation, "default": v.default, "description": v.description}
+                for k, v in op.output_schema.model_fields.items()
+            }
+            if "maps" in all_properties:
+                all_properties["maps"].add(model_fields_dict)
+            else:
+                all_properties["maps"] = set([model_fields_dict])
+
+        elif isinstance(op, FilteredScan):
             # NOTE: we could use op.get_full_op_id() here, but storing filter strings makes
             # debugging a bit easier as you can read which filters are in the Group
             op_filter_str = op.filter.get_filter_str()
@@ -342,6 +275,12 @@ class Optimizer:
             else:
                 all_properties["filters"] = set([op_filter_str])
 
+        elif isinstance(op, JoinOp):
+            if "joins" in all_properties:
+                all_properties["joins"].add(op.condition)
+            else:
+                all_properties["joins"] = set([op.condition])
+
         elif isinstance(op, LimitScan):
             op_limit_str = op.get_logical_op_id()
             if "limits" in all_properties:
@@ -356,12 +295,27 @@ class Optimizer:
             else:
                 all_properties["projects"] = set([op_project_str])
 
-        elif isinstance(op, MapScan):
-            op_udf_str = op.udf.__name__
-            if "udfs" in all_properties:
-                all_properties["udfs"].add(op_udf_str)
+        elif isinstance(op, Distinct):
+            op_distinct_str = op.get_logical_op_id()
+            if "distincts" in all_properties:
+                all_properties["distincts"].add(op_distinct_str)
+            else:
+                all_properties["distincts"] = set([op_distinct_str])
+
+        # TODO: temporary fix; perhaps use op_ids to identify group?
+        elif isinstance(op, ComputeOperator):
+            op_instruction = op.instruction
+            if "instructions" in all_properties:
+                all_properties["instructions"].add(op_instruction)
             else:
-                all_properties["udfs"] = set([op_udf_str])
+                all_properties["instructions"] = set([op_instruction])
+
+        elif isinstance(op, SearchOperator):
+            op_search_query = op.search_query
+            if "search_queries" in all_properties:
+                all_properties["search_queries"].add(op_search_query)
+            else:
+                all_properties["search_queries"] = set([op_search_query])
 
         # construct the logical expression and group
         logical_expression = LogicalExpression(
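
The properties bookkeeping above is easiest to see with a toy example outside of Palimpzest: group properties are a plain dict of sets, so two groups with identical fields are still kept distinct by the filters, joins, limits, instructions, and search queries recorded along the way, and combining the properties of two join inputs is a key-wise set union (a standalone illustration with made-up keys and values, not package code):

    # Standalone illustration of the dict-of-sets "properties" merge performed when a
    # group has two input groups (e.g., under a JoinOp). All strings here are invented.
    left = {"filters": {"mentions a clinical trial"}}
    right = {"filters": {"published after 2020"}, "joins": {"paper.id == review.paper_id"}}

    merged = {k: set(v) for k, v in left.items()}
    for key, values in right.items():
        merged.setdefault(key, set()).update(values)

    print(merged["filters"])  # both filter strings survive as one set
    print(merged["joins"])    # {'paper.id == review.paper_id'}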
@@ -380,62 +334,50 @@ class Optimizer:
         logical_expression.set_group_id(group.group_id)
 
         # add the expression and group to the optimizer's expressions and groups and return
-        self.expressions[logical_expression.get_expr_id()] = logical_expression
+        self.expressions[logical_expression.expr_id] = logical_expression
         self.groups[group.group_id] = group
-        logger.debug(f"Constructed group tree for dataset_nodes: {dataset_nodes}")
+        logger.debug(f"Constructed group tree for dataset: {dataset}")
         logger.debug(f"Group: {group.group_id}, {all_fields}, {all_properties}")
 
-        return [group.group_id], all_fields, all_properties
-
-    def convert_query_plan_to_group_tree(self, query_plan: Dataset) -> str:
-        logger.debug(f"Converting query plan to group tree for query_plan: {query_plan}")
-        # Obtain ordered list of datasets
-        dataset_nodes: list[Dataset | DataReader] = []
-        node = query_plan.copy()
+        return group.group_id, all_fields, all_properties
 
-        # NOTE: the very first node will be a DataReader; the rest will be Dataset
-        while isinstance(node, Dataset):
-            dataset_nodes.append(node)
-            node = node._source
-        dataset_nodes.append(node)
-        dataset_nodes = list(reversed(dataset_nodes))
+    def convert_query_plan_to_group_tree(self, dataset: Dataset) -> str:
+        logger.debug(f"Converting query plan to group tree for dataset: {dataset}")
 
         # compute depends_on field for every node
         short_to_full_field_name = {}
-        for node_idx, node in enumerate(dataset_nodes):
+        for node in dataset:
             # update mapping from short to full field names
-            short_field_names = node.schema.field_names()
-            full_field_names = node.schema.field_names(unique=True, id=get_node_uid(node))
+            short_field_names = get_schema_field_names(node.schema)
+            full_field_names = get_schema_field_names(node.schema, id=node.id)
             for short_field_name, full_field_name in zip(short_field_names, full_field_names):
                 # set mapping automatically if this is a new field
-                if short_field_name not in short_to_full_field_name or (
-                    node_idx > 0 and dataset_nodes[node_idx - 1].schema != node.schema and node._udf is not None
-                ):
+                if short_field_name not in short_to_full_field_name or (hasattr(node._operator, "udf") and node._operator.udf is not None):
                     short_to_full_field_name[short_field_name] = full_field_name
 
-            # if the node is a data source, then skip
-            if isinstance(node, DataReader):
+            # if the node is a root Dataset, then skip
+            if node.is_root:
                 continue
 
             # If the node already has depends_on specified, then resolve each field name to a full (unique) field name
-            if len(node._depends_on) > 0:
-                node._depends_on = list(map(lambda field: short_to_full_field_name[field], node._depends_on))
+            if len(node._operator.depends_on) > 0:
+                node._operator.depends_on = list(map(lambda field: short_to_full_field_name[field], node._operator.depends_on))
                 continue
 
             # otherwise, make the node depend on all upstream nodes
-            node._depends_on = set()
-            for upstream_node in dataset_nodes[:node_idx]:
-                node._depends_on.update(upstream_node.schema.field_names(unique=True, id=get_node_uid(upstream_node)))
-            node._depends_on = list(node._depends_on)
+            node._operator.depends_on = set()
+            upstream_nodes = node.get_upstream_datasets()
+            for upstream_node in upstream_nodes:
+                upstream_field_names = get_schema_field_names(upstream_node.schema, id=upstream_node.id)
+                node._operator.depends_on.update(upstream_field_names)
+            node._operator.depends_on = list(node._operator.depends_on)
 
         # construct tree of groups
-        final_group_id, _, _ = self.construct_group_tree(dataset_nodes)
+        final_group_id, _, _ = self.construct_group_tree(dataset)
 
-        # check that final_group_id is a singleton
-        assert len(final_group_id) == 1
-        final_group_id = final_group_id[0]
-        logger.debug(f"Converted query plan to group tree for query_plan: {query_plan}")
+        logger.debug(f"Converted query plan to group tree for dataset: {dataset}")
         logger.debug(f"Final group id: {final_group_id}")
+
         return final_group_id
 
     def heuristic_optimization(self, group_id: int) -> None:
@@ -462,24 +404,24 @@ class Optimizer:
             elif isinstance(task, ApplyRule):
                 context = {"costed_full_op_ids": self.cost_model.get_costed_full_op_ids()}
                 new_tasks = task.perform(
-                    self.groups, self.expressions, context=context, **self.get_physical_op_params()
+                    self.groups, self.expressions, context=context, **self.get_physical_op_params(),
                 )
             elif isinstance(task, OptimizePhysicalExpression):
-                context = {"optimizer_strategy": self.optimizer_strategy}
+                context = {"optimizer_strategy": self.optimizer_strategy, "execution_strategy": self.execution_strategy}
                 new_tasks = task.perform(self.cost_model, self.groups, self.policy, context=context)
-
             self.tasks_stack.extend(new_tasks)
 
         logger.debug(f"Done searching optimization space for group_id: {group_id}")
 
-    def optimize(self, query_plan: Dataset) -> list[PhysicalPlan]:
+    def optimize(self, dataset: Dataset) -> list[PhysicalPlan]:
         """
         The optimize function takes in an initial query plan and searches the space of
         logical and physical plans in order to cost and produce a (near) optimal physical plan.
         """
-        logger.info(f"Optimizing query plan: {query_plan}")
+        logger.info(f"Optimizing query plan: {dataset}")
         # compute the initial group tree for the user plan
-        final_group_id = self.convert_query_plan_to_group_tree(query_plan)
+        dataset_copy = dataset.copy()
+        final_group_id = self.convert_query_plan_to_group_tree(dataset_copy)
 
         # TODO
         # # do heuristic based pre-optimization