PyPI - palimpzest - Versions diffs - 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

palimpzest-0.7.21-py3-none-any.whl → palimpzest-0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

palimpzest/__init__.py +37 -6
palimpzest/agents/__init__.py +0 -0
palimpzest/agents/compute_agents.py +0 -0
palimpzest/agents/search_agents.py +637 -0
palimpzest/constants.py +259 -197
palimpzest/core/data/context.py +393 -0
palimpzest/core/data/context_manager.py +163 -0
palimpzest/core/data/dataset.py +634 -0
palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
palimpzest/core/elements/groupbysig.py +16 -13
palimpzest/core/elements/records.py +166 -75
palimpzest/core/lib/schemas.py +152 -390
palimpzest/core/{data/dataclasses.py → models.py} +306 -170
palimpzest/policy.py +2 -27
palimpzest/prompts/__init__.py +35 -5
palimpzest/prompts/agent_prompts.py +357 -0
palimpzest/prompts/context_search.py +9 -0
palimpzest/prompts/convert_prompts.py +61 -5
palimpzest/prompts/filter_prompts.py +50 -5
palimpzest/prompts/join_prompts.py +163 -0
palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
palimpzest/prompts/prompt_factory.py +358 -46
palimpzest/prompts/validator.py +239 -0
palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
palimpzest/query/execution/execution_strategy.py +210 -317
palimpzest/query/execution/execution_strategy_type.py +5 -7
palimpzest/query/execution/mab_execution_strategy.py +249 -136
palimpzest/query/execution/parallel_execution_strategy.py +153 -244
palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
palimpzest/query/generators/generators.py +157 -330
palimpzest/query/operators/__init__.py +15 -5
palimpzest/query/operators/aggregate.py +50 -33
palimpzest/query/operators/compute.py +201 -0
palimpzest/query/operators/convert.py +27 -21
palimpzest/query/operators/critique_and_refine_convert.py +7 -5
palimpzest/query/operators/distinct.py +62 -0
palimpzest/query/operators/filter.py +22 -13
palimpzest/query/operators/join.py +402 -0
palimpzest/query/operators/limit.py +3 -3
palimpzest/query/operators/logical.py +198 -80
palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
palimpzest/query/operators/physical.py +27 -21
palimpzest/query/operators/project.py +3 -3
palimpzest/query/operators/rag_convert.py +7 -7
palimpzest/query/operators/retrieve.py +9 -9
palimpzest/query/operators/scan.py +81 -42
palimpzest/query/operators/search.py +524 -0
palimpzest/query/operators/split_convert.py +10 -8
palimpzest/query/optimizer/__init__.py +7 -9
palimpzest/query/optimizer/cost_model.py +108 -441
palimpzest/query/optimizer/optimizer.py +123 -181
palimpzest/query/optimizer/optimizer_strategy.py +66 -61
palimpzest/query/optimizer/plan.py +352 -67
palimpzest/query/optimizer/primitives.py +43 -19
palimpzest/query/optimizer/rules.py +484 -646
palimpzest/query/optimizer/tasks.py +127 -58
palimpzest/query/processor/config.py +41 -76
palimpzest/query/processor/query_processor.py +73 -18
palimpzest/query/processor/query_processor_factory.py +46 -38
palimpzest/schemabuilder/schema_builder.py +15 -28
palimpzest/utils/model_helpers.py +27 -77
palimpzest/utils/progress.py +114 -102
palimpzest/validator/__init__.py +0 -0
palimpzest/validator/validator.py +306 -0
{palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
palimpzest-0.8.0.dist-info/RECORD +95 -0
palimpzest/core/lib/fields.py +0 -141
palimpzest/prompts/code_synthesis_prompts.py +0 -28
palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
palimpzest/query/generators/api_client_factory.py +0 -30
palimpzest/query/operators/code_synthesis_convert.py +0 -488
palimpzest/query/operators/map.py +0 -130
palimpzest/query/processor/nosentinel_processor.py +0 -33
palimpzest/query/processor/processing_strategy_type.py +0 -28
palimpzest/query/processor/sentinel_processor.py +0 -88
palimpzest/query/processor/streaming_processor.py +0 -149
palimpzest/sets.py +0 -405
palimpzest/utils/datareader_helpers.py +0 -61
palimpzest/utils/demo_helpers.py +0 -75
palimpzest/utils/field_helpers.py +0 -69
palimpzest/utils/generation_helpers.py +0 -69
palimpzest/utils/sandbox.py +0 -183
palimpzest-0.7.21.dist-info/RECORD +0 -95
palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
{palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
{palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0

palimpzest/query/optimizer/plan.py CHANGED

@@ -2,9 +2,12 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
-from palimpzest.core.data.dataclasses import PlanCost
+from palimpzest.core.models import PlanCost
+from palimpzest.query.operators.aggregate import AggregateOp
+from palimpzest.query.operators.join import JoinOp
+from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.scan import ScanPhysicalOp
+from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
 from palimpzest.utils.hash_helpers import hash_for_id
@@ -42,19 +45,197 @@ class Plan(ABC):
         pass
 class PhysicalPlan(Plan):
-    def __init__(self, operators: list[PhysicalOperator], plan_cost: PlanCost | None = None):
-        self.operators = operators
+    def __init__(self, operator: PhysicalOperator, subplans: list[PhysicalPlan] | None, plan_cost: PlanCost | None = None):
+        self.operator = operator
+        self.subplans = [] if subplans is None else subplans
         self.plan_cost = plan_cost if plan_cost is not None else PlanCost(cost=0.0, time=0.0, quality=1.0)
         self.plan_id = self.compute_plan_id()
+        # NOTE: unique full_op_id is constructed as "{topological_index}-{full_op_id}" to
+        # differentiate between multiple instances of the same physical operator e.g. in self-joins
+        # compute mapping from unique full_op_id to next unique full_op_id in the plan
+        self.unique_full_op_id_to_next_unique_full_op_and_id = {}
+        current_idx, _ = self._compute_next_unique_full_op_map(self.unique_full_op_id_to_next_unique_full_op_and_id)
+        self.unique_full_op_id_to_next_unique_full_op_and_id[f"{current_idx}-{self.operator.get_full_op_id()}"] = (None, None)
+        # compute mapping from unique full_op_id to upstream unique full_op_ids
+        self.unique_full_op_id_to_upstream_full_op_ids = {}
+        self._compute_upstream_unique_full_op_ids_map(self.unique_full_op_id_to_upstream_full_op_ids)
+        # compute mapping from unique full_op_id to source unique full_op_ids
+        self.unique_full_op_id_to_source_full_op_ids = {}
+        self._compute_source_unique_full_op_ids_map(self.unique_full_op_id_to_source_full_op_ids)
     def compute_plan_id(self) -> str:
         """
         NOTE: This is NOT a universal ID.
         Two different PhysicalPlan instances with the identical lists of operators will have equivalent plan_ids.
         """
-        hash_str = str(tuple(op.get_full_op_id() for op in self.operators))
-        return hash_for_id(hash_str)
+        full_op_id = self.operator.get_full_op_id()
+        subplan_ids = [subplan.compute_plan_id() for subplan in self.subplans]
+        return hash_for_id(str((full_op_id,) + tuple(subplan_ids)))
+    def get_est_total_outputs(self, num_samples: int | None = None, current_idx: int | None = None, source_unique_full_op_ids_map: dict | None = None) -> tuple[dict[str, int], int]:
+        """Return the estimated total number of output records to be processed by the given operator in this plan."""
+        # get the source map from the root of the entire plan; use this map throughout all recursive calls
+        # (if you call self.get_source_unique_full_op_ids() from a subplan, it's topo indexes will be different)
+        if source_unique_full_op_ids_map is None:
+            source_unique_full_op_ids_map = self.unique_full_op_id_to_source_full_op_ids
+        # get the estimated total outputs from all subplans
+        # NOTE: this will be an empty dictionary for scans
+        all_subplan_total_outputs = {}
+        for subplan in self.subplans:
+            subplan_total_outputs, current_idx = subplan.get_est_total_outputs(num_samples, current_idx, source_unique_full_op_ids_map)
+            current_idx += 1
+            all_subplan_total_outputs.update(subplan_total_outputs)
+        # if current_idx is None, this is the first call, so we initialize it to 0
+        if current_idx is None:
+            current_idx = 0
+        # get total outputs for this operator
+        this_op_total_outputs = {}
+        this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
+        # if this operator is a scan, return the length of its datasource
+        if isinstance(self.operator, MarshalAndScanDataOp):
+            total = min(len(self.operator.datasource), num_samples) if num_samples is not None else len(self.operator.datasource)
+            this_op_total_outputs = {this_unique_full_op_id: total}
+        # if this operator is a context scan, return 1
+        elif isinstance(self.operator, ContextScanOp):  # noqa: SIM114
+            this_op_total_outputs = {this_unique_full_op_id: 1}
+        # if this operator is an aggregate, return 1
+        elif isinstance(self.operator, AggregateOp):
+            this_op_total_outputs = {this_unique_full_op_id: 1}
+        # if this operator is a limit scan, return its limit
+        elif isinstance(self.operator, LimitScanOp):
+            this_op_total_outputs = {this_unique_full_op_id: self.operator.limit}
+        # if this operator is a join, return the Cartesian product of the estimated outputs of its inputs
+        elif isinstance(self.operator, JoinOp):
+            # get estimated outputs for immediate left and right inputs
+            source_unique_full_op_ids = source_unique_full_op_ids_map[f"{current_idx}-{self.operator.get_full_op_id()}"]
+            left_unique_full_op_id, right_unique_full_op_id = source_unique_full_op_ids[0], source_unique_full_op_ids[1]
+            left_total_outputs = all_subplan_total_outputs[left_unique_full_op_id]
+            right_total_outputs = all_subplan_total_outputs[right_unique_full_op_id]
+            this_op_total_outputs = {this_unique_full_op_id: left_total_outputs * right_total_outputs}
+        # otherwise, return the number of outputs from the immediate input
+        else:
+            source_unique_full_op_ids = source_unique_full_op_ids_map[f"{current_idx}-{self.operator.get_full_op_id()}"]
+            source_unique_full_op_id = source_unique_full_op_ids[0]
+            this_op_total_outputs = {this_unique_full_op_id: all_subplan_total_outputs[source_unique_full_op_id]}
+        return {**this_op_total_outputs, **all_subplan_total_outputs}, current_idx
+    def _compute_next_unique_full_op_map(self, next_map: dict[str, str | None], current_idx: int | None = None) -> tuple[int, str]:
+        """Compute a mapping from each operator's unique full_op_id to the next operator in the plan and its unique full_op_id.
+        The unique full_op_id is constructed as "{topological_index}-{full_op_id}" to differentiate between
+        multiple instances of the same physical operator in the plan (e.g., in self-joins).
+        Args:
+            next_map: A dictionary to populate with the mapping from unique full_op_id to next (operator, unique_full_op_id) pair.
+            current_idx: The current topological index in the plan. If None, starts at 0.
+        Returns:
+            A tuple containing:
+                - The current topological index after processing this plan.
+                - The unique full_op_id of this plan's root operator.
+        """
+        # If there are subplans, compute their next maps first
+        subplan_topo_idx_op_id_pairs = []
+        for subplan in self.subplans:
+            current_idx, current_full_op_id = subplan._compute_next_unique_full_op_map(next_map, current_idx)
+            subplan_topo_idx_op_id_pairs.append((current_idx, current_full_op_id))
+            current_idx += 1  # increment after processing each subplan
+        # for each subplan's root operator, set its next to this plan's root operator
+        for topo_idx, full_op_id in subplan_topo_idx_op_id_pairs:
+            unique_op_id = f"{topo_idx}-{full_op_id}"
+            this_unique_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
+            next_map[unique_op_id] = (self.operator, this_unique_op_id)
+        # if this is the first call, initialize current_idx
+        if current_idx is None:
+            current_idx = 0
+        return current_idx, self.operator.get_full_op_id()
+    def get_next_unique_full_op_and_id(self, topo_idx: int, operator: PhysicalOperator) -> tuple[PhysicalOperator | None, str | None]:
+        """Return the next operator in the plan after the given operator, or None if it is the last operator."""
+        unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+        return self.unique_full_op_id_to_next_unique_full_op_and_id[unique_full_op_id]
+    def get_next_unique_full_op_id(self, topo_idx: int, operator: PhysicalOperator) -> str | None:
+        """Return the full_op_id of the next operator in the plan after the given operator, or None if it is the last operator."""
+        unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+        _, next_unique_full_op_id = self.unique_full_op_id_to_next_unique_full_op_and_id[unique_full_op_id]
+        return next_unique_full_op_id
+    def _compute_upstream_unique_full_op_ids_map(self, upstream_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str, list[str]]:
+        # set the upstream unique full_op_ids for this operator
+        subplan_topo_idx_upstream_unique_full_op_id_tuples = []
+        for subplan in self.subplans:
+            current_idx, full_op_id, subplan_upstream_unique_full_op_ids = subplan._compute_upstream_unique_full_op_ids_map(upstream_map, current_idx)
+            subplan_topo_idx_upstream_unique_full_op_id_tuples.append((current_idx, full_op_id, subplan_upstream_unique_full_op_ids))
+            current_idx += 1
+        # if current_idx is None, this is the first call, so we initialize it to 0
+        if current_idx is None:
+            current_idx = 0
+        # compute this operator's unique full_op_id
+        this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
+        # update the upstream_map for this operator
+        upstream_map[this_unique_full_op_id] = []
+        for topo_idx, full_op_id, upstream_unique_full_op_ids in subplan_topo_idx_upstream_unique_full_op_id_tuples:
+            subplan_upstream_unique_full_op_ids = [f"{topo_idx}-{full_op_id}"] + upstream_unique_full_op_ids
+            upstream_map[this_unique_full_op_id].extend(subplan_upstream_unique_full_op_ids)
+        # return the current index and the upstream unique full_op_ids for this operator
+        return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]
+    def get_upstream_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
+        """Return the list of unique full_op_ids for the upstream operators of this operator."""
+        unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+        return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]
+    def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
+        # get the topological index and full_op_id pairs for all subplans' root operators
+        subplan_topo_idx_op_id_pairs = []
+        for subplan in self.subplans:
+            current_idx, current_full_op_id = subplan._compute_source_unique_full_op_ids_map(source_map, current_idx)
+            subplan_topo_idx_op_id_pairs.append((current_idx, current_full_op_id))
+            current_idx += 1
+        # if current_idx is None, this is the first call, so we initialize it to 0
+        if current_idx is None:
+            current_idx = 0
+        # compute this operator's unique full_op_id
+        this_unique_full_op_id = f"{current_idx}-{self.operator.get_full_op_id()}"
+        # update the source_map for this operator
+        source_map[this_unique_full_op_id] = []
+        for topo_idx, full_op_id in subplan_topo_idx_op_id_pairs:
+            unique_full_op_id = f"{topo_idx}-{full_op_id}"
+            source_map[this_unique_full_op_id].append(unique_full_op_id)
+        # return the current unique full_op_id for this operator
+        return current_idx, self.operator.get_full_op_id()
+    def get_source_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
+        """Return the list of unique full_op_ids for the input(s) to this operator."""
+        unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+        return self.unique_full_op_id_to_source_full_op_ids[unique_full_op_id]
     def __eq__(self, other):
         return isinstance(other, PhysicalPlan) and self.plan_id == other.plan_id
@@ -65,60 +246,78 @@ class PhysicalPlan(Plan):
     def __repr__(self) -> str:
         return str(self)
-    def __str__(self):
-        start = self.operators[0]
-        plan_str = f" 0. {type(start).__name__} -> {start.output_schema.__name__} \n\n"
-        for idx, operator in enumerate(self.operators[1:]):
-            plan_str += f" {idx+1}. {str(operator)}\n"
+    def _get_str(self, idx: int = 0, indent: int = 0) -> str:
+        indent_str = " " * (indent * 2)
+        plan_str = f"{indent_str}{idx}. {str(self.operator)}\n"
+        for subplan in self.subplans:
+            plan_str += subplan._get_str(idx=idx + 1, indent=indent + 1)
         return plan_str
+    def __str__(self):
+        return self._get_str()
     def __getitem__(self, slice):
-        return self.operators[slice]
+        ops = [op for op in self]
+        return ops[slice]
     def __iter__(self):
-        return iter(self.operators)
+        for subplan in self.subplans:
+            yield from subplan
+        yield self.operator
     def __len__(self):
-        return len(self.operators)
+        return 1 + sum(len(subplan) for subplan in self.subplans)
-    @staticmethod
-    def from_ops_and_sub_plan(ops: list[PhysicalOperator], sub_plan: PhysicalPlan, plan_cost: PlanCost) -> PhysicalPlan:
-        # create copies of all logical operators
-        copy_sub_plan = [op.copy() for op in sub_plan.operators]
-        copy_ops = [op.copy() for op in ops]
+    @classmethod
+    def _from_ops(cls, ops: list[PhysicalOperator], plan_cost: PlanCost | None = None) -> PhysicalPlan:
+        """
+        NOTE: Do not use this in production code. This is a convenience method for constructing PhysicalPlans in tests.
+        This method assumes a left-deep tree structure (i.e. pipeline), where each operator has at most one subplan.
+        The PlanCost is applied to all subplans, thus it is not a true representation of the cost of the plan.
+        """
+        assert len(ops) > 0, "ops must contain at least one PhysicalOperator"
-        # construct full set of operators
-        copy_sub_plan.extend(copy_ops)
+        # build the PhysicalPlan from the list of operators
+        if len(ops) == 1:
+            return cls(operator=ops[0], subplans=None, plan_cost=plan_cost)
-        # return the PhysicalPlan
-        return PhysicalPlan(operators=copy_sub_plan, plan_cost=plan_cost)
+        # recursively build subplans
+        subplan = cls._from_ops(ops[:-1], plan_cost=plan_cost)
+        return cls(operator=ops[-1], subplans=[subplan], plan_cost=plan_cost)
+# TODO(?): take list[PhysicalOperator] as input, but then store OpFrontier
 class SentinelPlan(Plan):
-    def __init__(self, operator_sets: list[list[PhysicalOperator]]):
-        # enforce that first operator_set is a scan and that every operator_set has at least one operator
-        if len(operator_sets) > 0:
-            assert isinstance(operator_sets[0][0], ScanPhysicalOp), "first operator set must be a scan"
-            assert all(len(op_set) > 0 for op_set in operator_sets), "every operator set must have at least one operator"
-        # store operator_sets and logical_op_ids; sort operator_sets internally by full_op_id
-        self.operator_sets = operator_sets
-        self.operator_sets = [sorted(op_set, key=lambda op: op.get_full_op_id()) for op_set in self.operator_sets]
-        self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
+    def __init__(self, operator_set: list[PhysicalOperator], subplans: list[SentinelPlan] | None):
+        # store operator_set and logical_op_id; sort operator_set internally by full_op_id
+        self.operator_set = sorted(operator_set, key=lambda op: op.get_full_op_id())
+        self.logical_op_id = self.operator_set[0].logical_op_id
+        self.subplans = [] if subplans is None else subplans
         self.plan_id = self.compute_plan_id()
+        # compute mapping from unique logical_op_id to next unique logical_op_id in the plan
+        self.unique_logical_op_id_to_next_unique_logical_op_id = {}
+        current_idx, _ = self._compute_next_unique_logical_op_id_map(self.unique_logical_op_id_to_next_unique_logical_op_id)
+        self.unique_logical_op_id_to_next_unique_logical_op_id[f"{current_idx}-{self.logical_op_id}"] = None
+        # compute mapping from unique logical_op_id to root dataset ids
+        self.unique_logical_op_id_to_root_dataset_ids = {}
+        self._compute_root_dataset_ids_map(self.unique_logical_op_id_to_root_dataset_ids)
+        # compute mapping from unique logical_op_id to source unique logical_op_ids
+        self.unique_logical_op_id_to_source_logical_op_ids = {}
+        self._compute_source_unique_logical_op_ids_map(self.unique_logical_op_id_to_source_logical_op_ids)
     def compute_plan_id(self) -> str:
         """
         NOTE: This is NOT a universal ID.
         Two different SentinelPlan instances with the identical operator_sets will have equivalent plan_ids.
         """
-        hash_str = ""
-        for logical_op_id, op_set in zip(self.logical_op_ids, self.operator_sets):
-            hash_str += f"{logical_op_id} {tuple(op.get_full_op_id() for op in op_set)} "
-        return hash_for_id(hash_str)
+        full_id = (self.logical_op_id,) + tuple([op.get_full_op_id() for op in self.operator_set])
+        subplan_ids = [subplan.compute_plan_id() for subplan in self.subplans]
+        return hash_for_id(str((full_id,) + tuple(subplan_ids)))
     def __eq__(self, other):
         return isinstance(other, SentinelPlan) and self.plan_id == other.plan_id
@@ -129,40 +328,126 @@ class SentinelPlan(Plan):
     def __repr__(self) -> str:
         return str(self)
-    def __str__(self):
-        # by assertion, first operator_set is guaranteed to be a scan
-        start = self.operator_sets[0][0]
-        plan_str = f" 0. {type(start).__name__} -> {start.output_schema.__name__} \n\n"
-        # build string one operator set at a time
-        for idx, operator_set in enumerate(self.operator_sets[1:]):
-            if len(operator_set) == 1:
-                operator = operator_set[0]
-                plan_str += f" {idx+1}. {str(operator)}\n"
-            else:
-                for inner_idx, operator in enumerate(operator_set):
-                    plan_str += f" {idx+1}.{inner_idx+1}. {str(operator)}\n"
+    def _get_str(self, idx: int = 0, indent: int = 0) -> str:
+        indent_str = " " * (indent * 2)
+        plan_str = ""
+        for inner_idx, operator in enumerate(self.operator_set):
+            inner_idx_str = "" if len(self.operator_set) == 1 else f"{inner_idx + 1}."
+            plan_str += f"{indent_str}{idx}.{inner_idx_str} {str(operator)}\n"
+            for subplan in self.subplans:
+                plan_str += subplan._get_str(idx=idx + 1, indent=indent + 1)
         return plan_str
+    def __str__(self):
+        return self._get_str()
     def __getitem__(self, slice):
-        return self.logical_op_ids[slice], self.operator_sets[slice]
+        op_set_tuples = [op_set_tuple for op_set_tuple in self]
+        return op_set_tuples[slice]
     def __iter__(self):
-        yield from zip(self.logical_op_ids, self.operator_sets)
+        for subplan in self.subplans:
+            yield from subplan
+        yield self.logical_op_id, self.operator_set
     def __len__(self):
-        return len(self.logical_op_ids)
-    @staticmethod
-    def from_ops_and_sub_plan(op_sets: list[list[PhysicalOperator]], sub_plan: SentinelPlan) -> SentinelPlan:
-        # create copies of all logical operators
-        copy_sub_plan = [[op.copy() for op in op_set] for op_set in sub_plan.operator_sets]
-        copy_ops = [[op.copy() for op in op_set] for op_set in op_sets]
-        # construct full set of operators
-        copy_sub_plan.extend(copy_ops)
-        # return the SentinelPlan
-        return SentinelPlan(operator_sets=copy_sub_plan)
+        return 1 + sum(len(subplan) for subplan in self.subplans)
+    def _compute_next_unique_logical_op_id_map(self, next_map: dict[str, str | None], current_idx: int | None = None) -> tuple[int, str]:
+        """Compute a mapping from each operator's unique logical_op_id to the next operator's unique logical_op_id.
+        The unique logical_op_id is constructed as "{topological_index}-{logical_op_id}" to differentiate between
+        multiple instances of the same physical operator in the plan (e.g., in self-joins).
+        Args:
+            next_map: A dictionary to populate with the mapping from unique logical_op_id to next logical_op_id.
+            current_idx: The current topological index in the plan. If None, starts at 0.
+        Returns:
+            A tuple containing:
+                - The current topological index after processing this plan.
+                - The unique logical_op_id of this plan's root logical operator.
+        """
+        # If there are subplans, compute their next maps first
+        subplan_topo_idx_op_id_pairs = []
+        for subplan in self.subplans:
+            current_idx, current_logical_op_id = subplan._compute_next_unique_logical_op_id_map(next_map, current_idx)
+            subplan_topo_idx_op_id_pairs.append((current_idx, current_logical_op_id))
+            current_idx += 1  # increment after processing each subplan
+        # for each subplan's root operator, set its next to this plan's root operator
+        for topo_idx, logical_op_id in subplan_topo_idx_op_id_pairs:
+            unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
+            this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
+            next_map[unique_logical_op_id] = this_unique_logical_op_id
+        # if this is the first call, initialize current_idx
+        if current_idx is None:
+            current_idx = 0
+        return current_idx, self.logical_op_id
+    def get_next_unique_logical_op_id(self, unique_logical_op_id: str) -> str | None:
+        """Return the unique logical_op_id of the next operator in the plan after the given operator, or None if it is the last operator."""
+        return self.unique_logical_op_id_to_next_unique_logical_op_id[unique_logical_op_id]
+    def _compute_root_dataset_ids_map(self, root_dataset_ids_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, list[str]]:
+        # set the root dataset ids for this operator
+        all_subplan_root_dataset_ids = []
+        for subplan in self.subplans:
+            current_idx, subplan_root_dataset_ids = subplan._compute_root_dataset_ids_map(root_dataset_ids_map, current_idx)
+            all_subplan_root_dataset_ids.extend(subplan_root_dataset_ids)
+            current_idx += 1
+        # if current_idx is None, this is the first call, so we initialize it to 0
+        if current_idx is None:
+            current_idx = 0
+        # compute this operator's unique logical_op_id
+        this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
+        # if this operator is a root dataset scan, update root_dataset_ids
+        root_dataset_ids = []
+        if isinstance(self.operator_set[0], MarshalAndScanDataOp):
+            root_dataset_ids.append(self.operator_set[0].datasource.id)
+        elif isinstance(self.operator_set[0], ContextScanOp):
+            root_dataset_ids.append(self.operator_set[0].context.id)
+        # update the root_dataset_ids_map for this operator
+        root_dataset_ids_map[this_unique_logical_op_id] = root_dataset_ids + all_subplan_root_dataset_ids
+        # return the current index and the upstream unique logical_op_ids for this operator
+        return current_idx, root_dataset_ids_map[this_unique_logical_op_id]
+    def get_root_dataset_ids(self, unique_logical_op_id: str) -> list[str]:
+        """Return the list of root dataset ids which are upstream of this operator."""
+        return self.unique_logical_op_id_to_root_dataset_ids[unique_logical_op_id]
+    def _compute_source_unique_logical_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
+        # get the topological index and logical_op_id pairs for all subplans' root operators
+        subplan_topo_idx_op_id_pairs = []
+        for subplan in self.subplans:
+            current_idx, current_logical_op_id = subplan._compute_source_unique_logical_op_ids_map(source_map, current_idx)
+            subplan_topo_idx_op_id_pairs.append((current_idx, current_logical_op_id))
+            current_idx += 1
+        # if current_idx is None, this is the first call, so we initialize it to 0
+        if current_idx is None:
+            current_idx = 0
+        # compute this operator's unique logical_op_id
+        this_unique_logical_op_id = f"{current_idx}-{self.logical_op_id}"
+        # update the source_map for this operator
+        source_map[this_unique_logical_op_id] = []
+        for topo_idx, logical_op_id in subplan_topo_idx_op_id_pairs:
+            unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
+            source_map[this_unique_logical_op_id].append(unique_logical_op_id)
+        # return the current unique logical_op_id for this operator
+        return current_idx, self.logical_op_id
+    def get_source_unique_logical_op_ids(self, unique_logical_op_id: str) -> list[str]:
+        """Return the list of unique logical_op_ids for the input(s) to this operator."""
+        return self.unique_logical_op_id_to_source_logical_op_ids[unique_logical_op_id]

palimpzest/query/optimizer/primitives.py CHANGED

@@ -1,8 +1,10 @@
 from __future__ import annotations
-from palimpzest.core.lib.fields import Field
+from pydantic.fields import FieldInfo
 from palimpzest.query.operators.logical import LogicalOperator
 from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.optimizer import rules
 from palimpzest.query.optimizer.plan import PlanCost
 from palimpzest.utils.hash_helpers import hash_for_id
@@ -18,9 +20,9 @@ class Expression:
         self,
         operator: LogicalOperator | PhysicalOperator,
         input_group_ids: list[int],
-        input_fields: dict[str, Field],
+        input_fields: dict[str, FieldInfo],
         depends_on_field_names: set[str],
-        generated_fields: dict[str, Field],
+        generated_fields: dict[str, FieldInfo],
         group_id: int | None = None,
     ):
         self.operator = operator
@@ -36,37 +38,59 @@ class Expression:
         self.plan_cost: PlanCost | None = None
         # NOTE: this will be a list of tuples where each tuple has a (pareto-optimal) plan cost
-        # and the input plan cost for which that pareto-optimal plan cost is attainable
-        self.pareto_optimal_plan_costs: list[tuple[PlanCost, PlanCost]] | None = None
+        # and the tuple of input plan cost(s) for which that pareto-optimal plan cost is attainable;
+        # the tuple of input plan cost(s) is (input_plan_cost, None) for non-join operators and
+        # (left_input_plan_cost, right_input_plan_cost) for join operators
+        self.pareto_optimal_plan_costs: list[tuple[PlanCost, tuple[PlanCost, PlanCost]]] | None = None
+        # compute the expression id
+        self.expr_id = self._compute_expr_id()
     def __eq__(self, other):
-        return self.operator == other.operator and self.input_group_ids == other.input_group_ids
+        return self.expr_id == other.expr_id
     def __str__(self):
-        op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_full_op_id()
-        return str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
+        expr_str = f"{self.__class__.__name__}(group_id={self.group_id}, expr_id={self.expr_id})"
+        expr_str += f"\n  - input_group_ids: {self.input_group_ids}"
+        expr_str += f"\n  - input_fields: {self.input_fields}"
+        expr_str += f"\n  - depends_on_field_names: {self.depends_on_field_names}"
+        expr_str += f"\n  - generated_fields: {self.generated_fields}"
+        expr_str += f"\n  - operator:\n{str(self.operator)}"
+        return expr_str
     def __hash__(self):
-        hash_str = self.__str__()
+        op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_full_op_id()
+        hash_str = str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
         hash_id = int(hash_for_id(hash_str), 16)
         return hash_id
-    def add_applied_rule(self, rule):
+    def _compute_expr_id(self) -> int:
+        return self.__hash__()
+    def add_applied_rule(self, rule: type[rules.Rule]):
         self.rules_applied.add(rule.get_rule_id())
     def set_group_id(self, group_id: int) -> None:
         self.group_id = group_id
-    def get_expr_id(self) -> int:
-        return self.__hash__()
 class LogicalExpression(Expression):
     pass
 class PhysicalExpression(Expression):
-    pass
+    @classmethod
+    def from_op_and_logical_expr(cls, op: PhysicalOperator, logical_expression: LogicalExpression) -> PhysicalExpression:
+        """Construct a PhysicalExpression given a physical operator and a logical expression."""
+        return cls(
+            operator=op,
+            input_group_ids=logical_expression.input_group_ids,
+            input_fields=logical_expression.input_fields,
+            depends_on_field_names=logical_expression.depends_on_field_names,
+            generated_fields=logical_expression.generated_fields,
+            group_id=logical_expression.group_id,
+        )
 class Group:
@@ -76,9 +100,9 @@ class Group:
     Maintains a set of logical multi-expressions and physical multi-expressions.
     """
-    def __init__(self, logical_expressions: list[Expression], fields: dict[str, Field], properties: dict[str, set[str]]):
-        self.logical_expressions = set(logical_expressions)
-        self.physical_expressions = set()
+    def __init__(self, logical_expressions: list[LogicalExpression], fields: dict[str, FieldInfo], properties: dict[str, set[str]]):
+        self.logical_expressions: set[LogicalExpression] = set(logical_expressions)
+        self.physical_expressions: set[PhysicalExpression] = set()
         self.fields = fields
         self.explored = False
         self.best_physical_expression: PhysicalExpression | None = None
@@ -90,12 +114,12 @@ class Group:
         self.properties = properties
         # compute the group id
-        self.group_id = self.compute_group_id()
+        self.group_id = self._compute_group_id()
     def set_explored(self):
         self.explored = True
-    def compute_group_id(self) -> int:
+    def _compute_group_id(self) -> int:
         # sort field names
         sorted_fields = sorted(self.fields.keys())

palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl

palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl