palimpzest 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (35)
  1. palimpzest/constants.py +1 -0
  2. palimpzest/core/data/dataset.py +33 -5
  3. palimpzest/core/elements/groupbysig.py +10 -1
  4. palimpzest/core/elements/records.py +16 -7
  5. palimpzest/core/lib/schemas.py +20 -3
  6. palimpzest/core/models.py +10 -4
  7. palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  8. palimpzest/query/execution/execution_strategy.py +13 -11
  9. palimpzest/query/execution/mab_execution_strategy.py +40 -14
  10. palimpzest/query/execution/parallel_execution_strategy.py +31 -7
  11. palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
  12. palimpzest/query/generators/generators.py +1 -1
  13. palimpzest/query/operators/__init__.py +7 -6
  14. palimpzest/query/operators/aggregate.py +110 -5
  15. palimpzest/query/operators/convert.py +1 -1
  16. palimpzest/query/operators/join.py +279 -23
  17. palimpzest/query/operators/logical.py +20 -8
  18. palimpzest/query/operators/mixture_of_agents.py +3 -1
  19. palimpzest/query/operators/physical.py +5 -2
  20. palimpzest/query/operators/rag.py +5 -4
  21. palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
  22. palimpzest/query/optimizer/__init__.py +7 -3
  23. palimpzest/query/optimizer/cost_model.py +5 -5
  24. palimpzest/query/optimizer/optimizer.py +3 -2
  25. palimpzest/query/optimizer/plan.py +2 -3
  26. palimpzest/query/optimizer/rules.py +31 -11
  27. palimpzest/query/optimizer/tasks.py +4 -4
  28. palimpzest/query/processor/config.py +1 -0
  29. palimpzest/utils/progress.py +51 -23
  30. palimpzest/validator/validator.py +7 -7
  31. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/METADATA +26 -66
  32. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/RECORD +35 -35
  33. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/WHEEL +0 -0
  34. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/licenses/LICENSE +0 -0
  35. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/top_level.txt +0 -0
palimpzest/constants.py CHANGED
@@ -207,6 +207,7 @@ class Modality(str, Enum):
 class AggFunc(str, Enum):
     COUNT = "count"
     AVERAGE = "average"
+    SUM = "sum"
     MIN = "min"
     MAX = "max"
 
palimpzest/core/data/dataset.py CHANGED
@@ -22,7 +22,7 @@ from palimpzest.query.operators.logical import (
     LimitScan,
     LogicalOperator,
     Project,
-    RetrieveScan,
+    TopKScan,
 )
 from palimpzest.query.processor.config import QueryProcessorConfig
 from palimpzest.utils.hash_helpers import hash_for_serialized_dict
@@ -243,7 +243,30 @@ class Dataset:
             id=self.id,
         )
 
-    def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
+    def join(self, other: Dataset, on: str | list[str], how: str = "inner") -> Dataset:
+        """
+        Perform the specified join on the specified (list of) column(s)
+        """
+        # enforce type for on
+        if isinstance(on, str):
+            on = [on]
+
+        # construct new output schema
+        combined_schema = union_schemas([self.schema, other.schema], join=True, on=on)
+
+        # construct logical operator
+        operator = JoinOp(
+            input_schema=combined_schema,
+            output_schema=combined_schema,
+            condition="",
+            on=on,
+            how=how,
+            depends_on=on,
+        )
+
+        return Dataset(sources=[self, other], operator=operator, schema=combined_schema)
+
+    def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None, how: str = "inner") -> Dataset:
         """
         Perform a semantic (inner) join on the specified join predicate
         """
@@ -259,6 +282,7 @@ class Dataset:
             input_schema=combined_schema,
             output_schema=combined_schema,
             condition=condition,
+            how=how,
             desc=desc,
             depends_on=depends_on,
         )
@@ -346,7 +370,6 @@ class Dataset:
 
         return Dataset(sources=[self], operator=operator, schema=new_output_schema)
 
-
     def sem_add_columns(self, cols: list[dict] | type[BaseModel],
                         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
                         desc: str | None = None,
@@ -534,6 +557,11 @@ class Dataset:
         operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.AVERAGE)
         return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
 
+    def sum(self) -> Dataset:
+        """Apply a summation to this set"""
+        operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.SUM)
+        return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
+
     def min(self) -> Dataset:
         """Apply an min operator to this set"""
         operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MIN)
@@ -581,7 +609,7 @@ class Dataset:
 
         return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
 
-    def retrieve(
+    def sem_topk(
         self,
         index: Collection,
         search_attr: str,
@@ -608,7 +636,7 @@ class Dataset:
         # index = index_factory(index)
 
         # construct logical operator
-        operator = RetrieveScan(
+        operator = TopKScan(
            input_schema=self.schema,
            output_schema=new_output_schema,
            index=index,
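
The renamed and added Dataset methods above can be exercised as follows. This is a minimal usage sketch: the datasets and column names are hypothetical, and only join(), the new how argument on sem_join(), sum(), and the retrieve → sem_topk rename come from this diff.

orders = ...   # hypothetical pz.Dataset with an "order_id" column
returns = ...  # hypothetical pz.Dataset with an "order_id" column

# new equality join on a shared column; how defaults to "inner"
joined = orders.join(returns, on="order_id", how="left")

# sem_join now accepts a `how` argument as well
sem_joined = orders.sem_join(returns, condition="the return refers to this order", how="inner")

# new SUM aggregate, alongside the existing count/average/min/max
total = joined.sum()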
palimpzest/core/elements/groupbysig.py CHANGED
@@ -6,8 +6,16 @@ from pydantic import BaseModel
 
 from palimpzest.core.lib.schemas import create_schema_from_fields
 
+# TODO:
+# - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator
+# - construct the correct output schema using the input schema and the group by and aggregation fields
+# - remove/update all other references to GroupBySig in the codebase
+
+# TODO:
+# - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator
+# - construct the correct output schema using the input schema and the group by and aggregation fields
+# - remove/update all other references to GroupBySig in the codebase
 
-# TODO: need to rethink how group bys work
 # signature for a group by aggregate that applies
 # group and aggregation to an input tuple
 class GroupBySig:
@@ -50,6 +58,7 @@ class GroupBySig:
             ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")")
         return ops
 
+    # TODO: output schema needs to account for input schema types and create new output schema types
     def output_schema(self) -> type[BaseModel]:
         # the output class varies depending on the group by, so here
         # we dynamically construct this output
palimpzest/core/elements/records.py CHANGED
@@ -140,7 +140,7 @@ class DataRecord:
     def schema(self) -> type[BaseModel]:
         return type(self._data_item)
 
-    def copy(self):
+    def copy(self) -> DataRecord:
         # get the set of fields to copy from the parent record
         copy_field_names = [field.split(".")[-1] for field in self.get_field_names()]
 
@@ -228,18 +228,18 @@ class DataRecord:
     @staticmethod
     def from_join_parents(
         schema: type[BaseModel],
-        left_parent_record: DataRecord,
-        right_parent_record: DataRecord,
+        left_parent_record: DataRecord | None,
+        right_parent_record: DataRecord | None,
         project_cols: list[str] | None = None,
         cardinality_idx: int = None,
     ) -> DataRecord:
         # get the set of fields and field descriptions to copy from the parent record(s)
-        left_copy_field_names = (
+        left_copy_field_names = [] if left_parent_record is None else (
            left_parent_record.get_field_names()
            if project_cols is None
            else [col for col in project_cols if col in left_parent_record.get_field_names()]
        )
-        right_copy_field_names = (
+        right_copy_field_names = [] if right_parent_record is None else (
            right_parent_record.get_field_names()
            if project_cols is None
            else [col for col in project_cols if col in right_parent_record.get_field_names()]
@@ -255,11 +255,20 @@ class DataRecord:
                 new_field_name = f"{field_name}_right"
                 data_item[new_field_name] = right_parent_record[field_name]
 
+        # for any missing fields in the schema, set them to None
+        for field_name in schema.model_fields:
+            if field_name not in data_item:
+                data_item[field_name] = None
+
         # make new record which has left and right parent record as its parents
+        left_parent_source_indices = [] if left_parent_record is None else list(left_parent_record._source_indices)
+        right_parent_source_indices = [] if right_parent_record is None else list(right_parent_record._source_indices)
+        left_parent_record_id = [] if left_parent_record is None else [left_parent_record._id]
+        right_parent_record_id = [] if right_parent_record is None else [right_parent_record._id]
         new_dr = DataRecord(
             schema(**data_item),
-            source_indices=list(left_parent_record._source_indices) + list(right_parent_record._source_indices),
-            parent_ids=[left_parent_record._id, right_parent_record._id],
+            source_indices=left_parent_source_indices + right_parent_source_indices,
+            parent_ids=left_parent_record_id + right_parent_record_id,
             cardinality_idx=cardinality_idx,
         )
 
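
The None-handling above is what makes outer joins representable at the record level: from_join_parents now accepts None for either parent, backfills the absent side's schema fields with None, and derives lineage only from the parent that exists. A hypothetical call for an unmatched left record in a left join (CombinedSchema and left_record are illustrative names):

record = DataRecord.from_join_parents(
    schema=CombinedSchema,           # union of the left and right schemas
    left_parent_record=left_record,  # the unmatched left input
    right_parent_record=None,        # no matching right record
)
# every right-side field of CombinedSchema is set to None, and the record's
# source_indices / parent_ids come from the left parent only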
palimpzest/core/lib/schemas.py CHANGED
@@ -142,16 +142,30 @@ def create_schema_from_df(df: pd.DataFrame) -> type[BaseModel]:
     return _create_pickleable_model(fields)
 
 
-def union_schemas(models: list[type[BaseModel]], join: bool = False) -> type[BaseModel]:
+def union_schemas(models: list[type[BaseModel]], join: bool = False, on: list[str] | None = None) -> type[BaseModel]:
     """Union multiple Pydantic models into a single model."""
+    # convert on to empty list if None
+    if on is None:
+        on = []
+
+    # build up the fields for the new schema
     fields = {}
     for model in models:
         for field_name, field in model.model_fields.items():
-            if field_name in fields and not join:
+            # for non-join unions, make sure duplicate fields have the same type
+            if not join and field_name in fields:
                 assert fields[field_name][0] == field.annotation, f"Field {field_name} has different types in different models"
-            elif field_name in fields and join:
+
+            # for joins with "on" specified, no need to rename fields in "on"
+            elif join and field_name in on and field_name in fields:
+                continue
+
+            # otherwise, rename duplicate fields by appending _right
+            elif join and field_name in fields:
                 while field_name in fields:
                     field_name = f"{field_name}_right"
+
+            # add the field to the new schema
             fields[field_name] = (field.annotation, field)
 
     # create and return the new schema
@@ -194,6 +208,9 @@ class Average(BaseModel):
 class Count(BaseModel):
     count: int = Field(description="The count of items in the dataset")
 
+class Sum(BaseModel):
+    sum: int = Field(description="The summation of items in the dataset")
+
 class Min(BaseModel):
     min: int | float = Field(description="The minimum value of some items in the dataset")
 
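To make the renaming rule concrete, here is a hypothetical sketch (the model names are illustrative; union_schemas is the function changed above): duplicate fields named in on are kept once, while any other duplicate gets a _right suffix.

from pydantic import create_model

from palimpzest.core.lib.schemas import union_schemas

# two schemas sharing "id" (the join key) and "name" (an ordinary duplicate)
Left = create_model("Left", id=(int, ...), name=(str, ...))
Right = create_model("Right", id=(int, ...), name=(str, ...))

Joined = union_schemas([Left, Right], join=True, on=["id"])
# Joined has fields: id, name, name_right -- the join key appears once,
# while the right schema's duplicate "name" is renamed to "name_right"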
palimpzest/core/models.py CHANGED
@@ -51,10 +51,10 @@ class GenerationStats(BaseModel):
     fn_call_duration_secs: float = 0.0
 
     # (if applicable) the total number of LLM calls made by this operator
-    total_llm_calls: int = 0
+    total_llm_calls: float = 0
 
     # (if applicable) the total number of embedding LLM calls made by this operator
-    total_embedding_llm_calls: int = 0
+    total_embedding_llm_calls: float = 0
 
     def __iadd__(self, other: GenerationStats) -> GenerationStats:
         # self.raw_answers.extend(other.raw_answers)
@@ -243,10 +243,10 @@ class RecordOpStats(BaseModel):
     fn_call_duration_secs: float = 0.0
 
     # (if applicable) the total number of LLM calls made by this operator
-    total_llm_calls: int = 0
+    total_llm_calls: float = 0
 
     # (if applicable) the total number of embedding LLM calls made by this operator
-    total_embedding_llm_calls: int = 0
+    total_embedding_llm_calls: float = 0
 
     # (if applicable) a boolean indicating whether this is the statistics captured from a failed convert operation
     failed_convert: bool | None = None
@@ -454,6 +454,12 @@ class BasePlanStats(BaseModel):
         """
         return sum([gen_stats.total_output_tokens for _, gen_stats in self.validation_gen_stats.items()])
 
+    def get_total_cost_so_far(self) -> float:
+        """
+        Get the total cost incurred so far in this plan execution.
+        """
+        return self.sum_op_costs() + self.sum_validation_costs()
+
 
 class PlanStats(BasePlanStats):
     """
palimpzest/query/execution/all_sample_execution_strategy.py CHANGED
@@ -225,7 +225,7 @@ class AllSamplingExecutionStrategy(SentinelExecutionStrategy):
         dataset_id_to_source_indices = {}
         for dataset_id, dataset in train_dataset.items():
             total_num_samples = len(dataset)
-            source_indices = [f"{dataset_id}-{int(idx)}" for idx in np.arange(total_num_samples)]
+            source_indices = [f"{dataset_id}---{int(idx)}" for idx in np.arange(total_num_samples)]
             dataset_id_to_source_indices[dataset_id] = source_indices
 
         # initialize set of physical operators for each logical operator
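
The switch from "-" to "---" as the dataset-id/index separator looks cosmetic, but it matters for parsing: mab_execution_strategy.py (below) recovers the dataset id with source_indices.split("---")[0], and a single hyphen is ambiguous whenever the dataset id itself contains one. A small illustration with a hypothetical id:

dataset_id = "movie-reviews"        # hypothetical id containing a hyphen
source_index = f"{dataset_id}---3"  # "movie-reviews---3"

# splitting on "-" truncates the id; splitting on "---" recovers it intact
assert source_index.split("-")[0] == "movie"
assert source_index.split("---")[0] == "movie-reviews"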
palimpzest/query/execution/execution_strategy.py CHANGED
@@ -14,8 +14,8 @@ from palimpzest.query.operators.convert import LLMConvert
 from palimpzest.query.operators.filter import LLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
 from palimpzest.utils.progress import PZSentinelProgressManager
 from palimpzest.validator.validator import Validator
@@ -82,10 +82,11 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
     """
     def __init__(
         self,
-        k: int,
-        j: int,
-        sample_budget: int,
         policy: Policy,
+        k: int = 6,
+        j: int = 4,
+        sample_budget: int = 100,
+        sample_cost_budget: float | None = None,
         priors: dict | None = None,
         use_final_op_quality: bool = False,
         seed: int = 42,
@@ -97,6 +98,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         self.k = k
         self.j = j
         self.sample_budget = sample_budget
+        self.sample_cost_budget = sample_cost_budget
         self.policy = policy
         self.priors = priors
         self.use_final_op_quality = use_final_op_quality
@@ -123,7 +125,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         return (
             not isinstance(op, LLMConvert)
             and not isinstance(op, LLMFilter)
-            and not isinstance(op, RetrieveOp)
+            and not isinstance(op, TopKOp)
             and not isinstance(op, JoinOp)
         )
 
@@ -167,8 +169,8 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
                     full_hashes.add(full_hash)
                     futures.append(executor.submit(validator._score_flat_map, op, fields, input_record, output, full_hash))
 
-            # create future for retrieve
-            elif isinstance(op, RetrieveOp):
+            # create future for top-k
+            elif isinstance(op, TopKOp):
                 fields = op.generated_fields
                 input_record: DataRecord = record_set.input
                 output = record_set.data_records[0].to_dict(project_cols=fields)
@@ -176,7 +178,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
                 full_hash = f"{hash(input_record)}{hash(output_str)}"
                 if full_hash not in full_hashes:
                     full_hashes.add(full_hash)
-                    futures.append(executor.submit(validator._score_retrieve, op, fields, input_record, output, full_hash))
+                    futures.append(executor.submit(validator._score_topk, op, fields, input_record, output, full_hash))
 
             # create future for filter
             elif isinstance(op, LLMFilter):
@@ -235,7 +237,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
 
         # TODO: this scoring function will (likely) bias towards small values of k since it
         # measures precision and not recall / F1; will need to revisit this in the future
-        elif isinstance(op, RetrieveOp):
+        elif isinstance(op, TopKOp):
            fields = op.generated_fields
            input_record: DataRecord = record_set.input
            output_str = record_set.data_records[0].to_json_str(project_cols=fields, bytes_to_str=True, sorted=True)
@@ -341,9 +343,9 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
     def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
         is_llm_convert = isinstance(physical_op, LLMConvert)
         is_llm_filter = isinstance(physical_op, LLMFilter)
-        is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
+        is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
         is_llm_join = isinstance(physical_op, JoinOp)
-        return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join
+        return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join
 
     @abstractmethod
     def execute_sentinel_plan(self, sentinel_plan: SentinelPlan, train_dataset: dict[str, Dataset], validator: Validator) -> SentinelPlanStats:
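
With the reordering above, policy becomes the only required constructor argument, and the sampling phase can be capped either by sample count (sample_budget, default 100) or by dollar cost (sample_cost_budget). A hedged sketch of how a concrete strategy might now be constructed; MABExecutionStrategy subclasses SentinelExecutionStrategy in this diff, but the keyword pass-through shown here is assumed:

strategy = MABExecutionStrategy(
    policy=policy,            # the only required argument after this change
    sample_cost_budget=2.50,  # assumed usage: stop sampling once ~$2.50 is spent;
)                             # when None, the sample_budget of 100 applies instead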
palimpzest/query/execution/mab_execution_strategy.py CHANGED
@@ -14,8 +14,8 @@ from palimpzest.query.operators.convert import LLMConvert
 from palimpzest.query.operators.filter import FilterOp, LLMFilter, NonLLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.plan import SentinelPlan
 from palimpzest.utils.progress import create_progress_manager
 from palimpzest.validator.validator import Validator
@@ -66,8 +66,8 @@ class OpFrontier:
         self.is_llm_join = isinstance(sample_op, JoinOp)
         is_llm_convert = isinstance(sample_op, LLMConvert)
         is_llm_filter = isinstance(sample_op, LLMFilter)
-        is_llm_retrieve = isinstance(sample_op, RetrieveOp) and isinstance(sample_op.index, Collection)
-        self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_retrieve or self.is_llm_join
+        is_llm_topk = isinstance(sample_op, TopKOp) and isinstance(sample_op.index, Collection)
+        self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_topk or self.is_llm_join
 
         # get order in which we will sample physical operators for this logical operator
         sample_op_indices = self._get_op_index_order(op_set, seed)
@@ -96,6 +96,12 @@ class OpFrontier:
         """
         return self.frontier_ops
 
+    def get_off_frontier_ops(self) -> list[PhysicalOperator]:
+        """
+        Returns the set of off-frontier operators for this OpFrontier.
+        """
+        return self.off_frontier_ops
+
     def _compute_op_id_to_pareto_distance(self, priors: dict[str, dict[str, float]]) -> dict[str, float]:
         """
         Return l2-distance for each operator from the pareto frontier.
@@ -298,7 +304,7 @@ class OpFrontier:
         def remove_unavailable_root_datasets(source_indices: str | tuple) -> str | tuple | None:
             # base case: source_indices is a string
             if isinstance(source_indices, str):
-                return source_indices if source_indices.split("-")[0] in self.root_dataset_ids else None
+                return source_indices if source_indices.split("---")[0] in self.root_dataset_ids else None
 
             # recursive case: source_indices is a tuple
             left_indices = source_indices[0]
@@ -383,6 +389,12 @@ class OpFrontier:
             # compute final list of record op stats
             full_op_id_to_record_op_stats[full_op_id] = list(record_id_to_max_quality_record_op_stats.values())
 
+        # NOTE: it is possible for the full_op_id_to_record_op_stats to be empty if there is a duplicate operator
+        # (e.g. a scan of the same dataset) which has all of its results cached and no new_record_op_stats;
+        # in this case, we do not update the frontier
+        if full_op_id_to_record_op_stats == {}:
+            return
+
         # update the set of source indices processed by each physical operator
         for full_op_id, source_indices_processed in full_op_id_to_source_indices_processed.items():
             # update the set of source indices processed
@@ -641,8 +653,8 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         """
         Returns the operator in the frontier with the highest (estimated) quality.
         """
-        # get the operators in the frontier set for this logical_op_id
-        frontier_ops = op_frontiers[unique_logical_op_id].get_frontier_ops()
+        # get the (off) frontier operators for this logical_op_id
+        frontier_ops = op_frontiers[unique_logical_op_id].get_frontier_ops() + op_frontiers[unique_logical_op_id].get_off_frontier_ops()
 
         # get a mapping from full_op_id --> list[RecordOpStats]
         full_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(unique_logical_op_id, {})
@@ -668,6 +680,9 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
 
         return max_quality_op
 
+    def _compute_termination_condition(self, samples_drawn: int, sampling_cost: float) -> bool:
+        return (samples_drawn >= self.sample_budget) if self.sample_cost_budget is None else (sampling_cost >= self.sample_cost_budget)
+
     def _execute_sentinel_plan(
         self,
         plan: SentinelPlan,
@@ -676,8 +691,8 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         plan_stats: SentinelPlanStats,
     ) -> SentinelPlanStats:
         # sample records and operators and update the frontiers
-        samples_drawn = 0
-        while samples_drawn < self.sample_budget:
+        samples_drawn, sampling_cost = 0, 0.0
+        while not self._compute_termination_condition(samples_drawn, sampling_cost):
             # pre-compute the set of source indices which will need to be sampled
             source_indices_to_sample = set()
             for op_frontier in op_frontiers.values():
@@ -693,14 +708,21 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
                 max_quality_op = self._get_max_quality_op(unique_logical_op_id, op_frontiers, plan_stats)
 
                 # get frontier ops and their next input
-                def is_filtered_out(tup: tuple) -> bool:
-                    return tup[-1] is None or isinstance(tup[-1], list) and all([record is None for record in tup[-1]])
+                def filter_and_clean_inputs(frontier_op_inputs: list[tuple]) -> bool:
+                    cleaned_inputs = []
+                    for tup in frontier_op_inputs:
+                        input = tup[-1]
+                        if isinstance(input, list):
+                            input = [record for record in input if record is not None]
+                        if input is not None and input != []:
+                            cleaned_inputs.append((tup[0], tup[1], input))
+                    return cleaned_inputs
                 frontier_op_inputs = op_frontiers[unique_logical_op_id].get_frontier_op_inputs(source_indices_to_sample, max_quality_op)
-                frontier_op_inputs = list(filter(lambda tup: not is_filtered_out(tup), frontier_op_inputs))
+                frontier_op_inputs = filter_and_clean_inputs(frontier_op_inputs)
 
                 # break out of the loop if frontier_op_inputs is empty, as this means all records have been filtered out
                 if len(frontier_op_inputs) == 0:
-                    break
+                    continue
 
                 # run sampled operators on sampled inputs and update the number of samples drawn
                 source_indices_to_record_set_tuples, num_llm_ops = self._execute_op_set(unique_logical_op_id, frontier_op_inputs)
@@ -713,6 +735,9 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
                 }
                 source_indices_to_all_record_sets, val_gen_stats = self._score_quality(validator, source_indices_to_all_record_sets)
 
+                # update the progress manager with validation cost
+                self.progress_manager.incr_overall_progress_cost(val_gen_stats.cost_per_record)
+
                 # remove records that were read from the execution cache before adding to record op stats
                 new_record_op_stats = []
                 for _, record_set_tuples in source_indices_to_record_set_tuples.items():
@@ -723,6 +748,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
                 # update plan stats
                 plan_stats.add_record_op_stats(unique_logical_op_id, new_record_op_stats)
                 plan_stats.add_validation_gen_stats(unique_logical_op_id, val_gen_stats)
+                sampling_cost = plan_stats.get_total_cost_so_far()
 
                 # provide the best record sets as inputs to the next logical operator
                 next_unique_logical_op_id = plan.get_next_unique_logical_op_id(unique_logical_op_id)
@@ -764,7 +790,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         dataset_id_to_shuffled_source_indices = {}
         for dataset_id, dataset in train_dataset.items():
             total_num_samples = len(dataset)
-            shuffled_source_indices = [f"{dataset_id}-{int(idx)}" for idx in np.arange(total_num_samples)]
+            shuffled_source_indices = [f"{dataset_id}---{int(idx)}" for idx in np.arange(total_num_samples)]
             self.rng.shuffle(shuffled_source_indices)
             dataset_id_to_shuffled_source_indices[dataset_id] = shuffled_source_indices
 
@@ -794,7 +820,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
             op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors)
 
         # initialize and start the progress manager
-        self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, progress=self.progress)
+        self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, sample_cost_budget=self.sample_cost_budget, progress=self.progress)
        self.progress_manager.start()
 
        # NOTE: we must handle progress manager outside of _execute_sentinel_plan to ensure that it is shut down correctly;
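
Tying the cost-budget pieces together: each iteration of the sampling loop adds record-op and validation stats to plan_stats, refreshes sampling_cost via get_total_cost_so_far() (operator costs plus validation costs, per models.py above), and re-checks _compute_termination_condition. A condensed sketch of the control flow, not the full implementation:

samples_drawn, sampling_cost = 0, 0.0
while not self._compute_termination_condition(samples_drawn, sampling_cost):
    ...  # sample frontier operators, execute them, score quality with the validator
    sampling_cost = plan_stats.get_total_cost_so_far()  # op costs + validation costs
    samples_drawn += num_llm_ops  # assumed bookkeeping, per the comment in the loop

# with sample_cost_budget=None the loop stops after sample_budget samples;
# otherwise it stops once sampling_cost reaches sample_cost_budget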
palimpzest/query/execution/parallel_execution_strategy.py CHANGED
@@ -9,7 +9,6 @@ from palimpzest.query.operators.aggregate import AggregateOp
 from palimpzest.query.operators.distinct import DistinctOp
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
-from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.utils.progress import create_progress_manager
@@ -35,14 +34,27 @@ class ParallelExecutionStrategy(ExecutionStrategy):
                 return True
         return False
 
-    def _upstream_ops_finished(self, plan: PhysicalPlan, topo_idx: int, operator: PhysicalOperator, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> bool:
+    def _upstream_ops_finished(self, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> bool:
         """Helper function to check if agg / join operator is ready to process its inputs."""
-        # for agg / join operator, we can only process it when all upstream operators have finished processing their inputs
-        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(unique_full_op_id)
         upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
         upstream_future_queues = {upstream_unique_full_op_id: future_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
         return not (self._any_queue_not_empty(upstream_input_queues) or self._any_queue_not_empty(upstream_future_queues))
 
+    def _finish_outer_join(self, executor: ThreadPoolExecutor, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> None:
+        join_op_upstream_finished = self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
+        join_input_queues_empty = all(len(inputs) == 0 for inputs in input_queues[unique_full_op_id].values())
+        join_future_queue_empty = len(future_queues[unique_full_op_id]) == 0
+        if join_op_upstream_finished and join_input_queues_empty and join_future_queue_empty:
+            # process the join one last time with final=True to handle any left/right/outer join logic
+            operator = self.unique_full_op_id_to_operator[unique_full_op_id]
+            if not operator.finished:
+                def finalize_op(operator):
+                    return operator([], [], final=True)
+                future = executor.submit(finalize_op, operator)
+                future_queues[unique_full_op_id].append(future)
+                operator.set_finished()
+
     def _process_future_results(self, unique_full_op_id: str, future_queues: dict[str, list], plan_stats: PlanStats) -> list[DataRecord]:
         """
         Helper function which takes a full operator id, the future queues, and plan stats, and performs
@@ -117,15 +129,23 @@ class ParallelExecutionStrategy(ExecutionStrategy):
                     records = self._process_future_results(source_unique_full_op_id, future_queues, plan_stats)
                     input_queues[unique_full_op_id][source_unique_full_op_id].extend(records)
 
+                    # if the source is a left/right/outer join operator with no more inputs to process, then finish it
+                    if self.is_outer_join_op[source_unique_full_op_id]:
+                        self._finish_outer_join(executor, plan, source_unique_full_op_id, input_queues, future_queues)
+
                # for the final operator, add any finished futures to the output_records
                if unique_full_op_id == f"{topo_idx}-{final_op.get_full_op_id()}":
                    records = self._process_future_results(unique_full_op_id, future_queues, plan_stats)
                    output_records.extend(records)
 
+                    # if this is a left/right/outer join operator with no more inputs to process, then finish it
+                    if self.is_outer_join_op[unique_full_op_id]:
+                        self._finish_outer_join(executor, plan, unique_full_op_id, input_queues, future_queues)
+
                # if this operator does not have enough inputs to execute, then skip it
                num_inputs = sum(len(inputs) for inputs in input_queues[unique_full_op_id].values())
-                agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues, future_queues)
-                join_op_not_ready = isinstance(operator, JoinOp) and not self.join_has_downstream_limit_op[unique_full_op_id] and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues, future_queues)
+                agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
+                join_op_not_ready = isinstance(operator, JoinOp) and not self.join_has_downstream_limit_op[unique_full_op_id] and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
                if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
                    continue
 
@@ -225,8 +245,9 @@ class ParallelExecutionStrategy(ExecutionStrategy):
         input_queues = self._create_input_queues(plan)
         future_queues = {f"{topo_idx}-{op.get_full_op_id()}": [] for topo_idx, op in enumerate(plan)}
 
-        # precompute which operators are joins and which joins have downstream limit ops
+        # precompute which operators are (outer) joins and which joins have downstream limit ops
         self.is_join_op = {f"{topo_idx}-{op.get_full_op_id()}": isinstance(op, JoinOp) for topo_idx, op in enumerate(plan)}
+        self.is_outer_join_op = {f"{topo_idx}-{op.get_full_op_id()}": isinstance(op, JoinOp) and op.how in ("left", "right", "outer") for topo_idx, op in enumerate(plan)}
         self.join_has_downstream_limit_op = {}
         for topo_idx, op in enumerate(plan):
             if isinstance(op, JoinOp):
@@ -240,6 +261,9 @@ class ParallelExecutionStrategy(ExecutionStrategy):
                     break
             self.join_has_downstream_limit_op[unique_full_op_id] = has_downstream_limit_op
 
+        # precompute mapping from unique_full_op_id to operator instance
+        self.unique_full_op_id_to_operator = {f"{topo_idx}-{op.get_full_op_id()}": op for topo_idx, op in enumerate(plan)}
+
         # initialize and start the progress manager
         self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
         self.progress_manager.start()
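
Both the parallel and single-threaded strategies now share one convention for outer joins: once a join's upstream operators are fully drained, the operator is invoked a final time with empty inputs and final=True so it can emit the unmatched rows that left/right/outer semantics require, and set_finished() guards against flushing twice. Schematically (a sketch of the shared convention, not new code):

if operator.how in ("left", "right", "outer") and not operator.finished:
    record_set, _ = operator([], [], final=True)  # flush unmatched rows
    operator.set_finished()                       # never finalize twice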
palimpzest/query/execution/single_threaded_execution_strategy.py CHANGED
@@ -6,7 +6,6 @@ from palimpzest.query.execution.execution_strategy import ExecutionStrategy
 from palimpzest.query.operators.aggregate import AggregateOp
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
-from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.utils.progress import create_progress_manager
@@ -70,6 +69,13 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
             record_set, num_inputs_processed = operator(left_input_records, right_input_records)
             records = record_set.data_records
             record_op_stats = record_set.record_op_stats
+
+            # process the join one last time with final=True to handle any left/right/outer join logic
+            if operator.how in ("left", "right", "outer"):
+                record_set, num_inputs_processed = operator([], [], final=True)
+                records.extend(record_set.data_records)
+                record_op_stats.extend(record_set.record_op_stats)
+
             num_outputs = sum(record._passed_operator for record in records)
 
             # update the progress manager
@@ -168,10 +174,9 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
                 return True
         return False
 
-    def _upstream_ops_finished(self, plan: PhysicalPlan, topo_idx: int, operator: PhysicalOperator, input_queues: dict[str, dict[str, list]]) -> bool:
+    def _upstream_ops_finished(self, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]]) -> bool:
         """Helper function to check if agg / join operator is ready to process its inputs."""
-        # for agg / join operator, we can only process it when all upstream operators have finished processing their inputs
-        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(unique_full_op_id)
         upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
         return not self._any_queue_not_empty(upstream_input_queues)
 
@@ -192,8 +197,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
                 unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
 
                 num_inputs = sum(len(input_queues[unique_full_op_id][source_unique_full_op_id]) for source_unique_full_op_id in source_unique_full_op_ids)
-                agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues)
-                join_op_not_ready = isinstance(operator, JoinOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues)
+                agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
+                join_op_not_ready = isinstance(operator, JoinOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
                 if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
                     continue
 
@@ -242,6 +247,18 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
                 # update the progress manager
                 self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
+                # if this is a join operator with no more inputs to process, then finish it
+                if isinstance(operator, JoinOp) and operator.how in ("left", "right", "outer"):
+                    join_op_upstream_finished = self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
+                    join_input_queues_empty = all(len(inputs) == 0 for inputs in input_queues[unique_full_op_id].values())
+                    if join_op_upstream_finished and join_input_queues_empty and not operator.finished:
+                        # process the join one last time with final=True to handle any left/right/outer join logic
+                        record_set, num_inputs_processed = operator([], [], final=True)
+                        records.extend(record_set.data_records)
+                        record_op_stats.extend(record_set.record_op_stats)
+                        num_outputs += sum(record._passed_operator for record in record_set.data_records)
+                        operator.set_finished()
+
                 # update plan stats
                 plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)