palimpzest 0.7.7__py3-none-any.whl → 0.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/constants.py +113 -75
- palimpzest/core/data/dataclasses.py +55 -38
- palimpzest/core/elements/index.py +5 -15
- palimpzest/core/elements/records.py +1 -1
- palimpzest/prompts/prompt_factory.py +1 -1
- palimpzest/query/execution/all_sample_execution_strategy.py +216 -0
- palimpzest/query/execution/execution_strategy.py +4 -4
- palimpzest/query/execution/execution_strategy_type.py +7 -1
- palimpzest/query/execution/mab_execution_strategy.py +184 -72
- palimpzest/query/execution/parallel_execution_strategy.py +182 -15
- palimpzest/query/execution/single_threaded_execution_strategy.py +21 -21
- palimpzest/query/generators/api_client_factory.py +6 -7
- palimpzest/query/generators/generators.py +5 -8
- palimpzest/query/operators/aggregate.py +4 -3
- palimpzest/query/operators/convert.py +1 -1
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/map.py +1 -1
- palimpzest/query/operators/physical.py +8 -4
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/retrieve.py +7 -23
- palimpzest/query/operators/scan.py +1 -1
- palimpzest/query/optimizer/cost_model.py +54 -62
- palimpzest/query/optimizer/optimizer.py +2 -6
- palimpzest/query/optimizer/plan.py +4 -4
- palimpzest/query/optimizer/primitives.py +1 -1
- palimpzest/query/optimizer/rules.py +8 -26
- palimpzest/query/optimizer/tasks.py +3 -3
- palimpzest/query/processor/processing_strategy_type.py +2 -2
- palimpzest/query/processor/sentinel_processor.py +0 -2
- palimpzest/sets.py +2 -3
- palimpzest/utils/generation_helpers.py +1 -1
- palimpzest/utils/model_helpers.py +27 -9
- palimpzest/utils/progress.py +81 -72
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/METADATA +4 -2
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/RECORD +39 -38
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/WHEEL +1 -1
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/top_level.txt +0 -0
palimpzest/query/execution/all_sample_execution_strategy.py
@@ -0,0 +1,216 @@
+import logging
+
+import numpy as np
+
+from palimpzest.core.data.dataclasses import SentinelPlanStats
+from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.query.execution.execution_strategy import SentinelExecutionStrategy
+from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.operators.scan import ScanPhysicalOp
+from palimpzest.query.optimizer.plan import SentinelPlan
+from palimpzest.utils.progress import create_progress_manager
+
+logger = logging.getLogger(__name__)
+
+class OpSet:
+    """
+    This class represents the set of operators which are currently in the frontier for a given logical operator.
+    Each operator in the frontier is an instance of a PhysicalOperator which either:
+
+    1. lies on the Pareto frontier of the set of sampled operators, or
+    2. has been sampled fewer than j times
+    """
+
+    def __init__(self, op_set: list[PhysicalOperator], source_indices: list[int]):
+        # construct the set of operators
+        self.ops = op_set
+
+        # store the order in which we will sample the source records
+        self.source_indices = source_indices
+
+        # set the initial inputs for this logical operator
+        is_scan_op = isinstance(op_set[0], ScanPhysicalOp)
+        self.source_idx_to_input = {source_idx: [source_idx] for source_idx in self.source_indices} if is_scan_op else {}
+
+    def get_op_input_pairs(self) -> list[PhysicalOperator, DataRecord | int | None]:
+        """
+        Returns the list of frontier operators and their next input to process. If there are
+        any indices in `source_indices_to_sample` which this operator does not sample on its own, then
+        we also have this frontier process that source_idx's input with its max quality operator.
+        """
+        # get the list of (op, source_idx) pairs which this operator needs to execute
+        op_source_idx_pairs = []
+        for op in self.ops:
+            # construct list of inputs by looking up the input for the given source_idx
+            for source_idx in self.source_indices:
+                op_source_idx_pairs.append((op, source_idx))
+
+        # fetch the corresponding (op, input) pairs
+        op_input_pairs = []
+        for op, source_idx in op_source_idx_pairs:
+            op_input_pairs.extend([(op, input_record) for input_record in self.source_idx_to_input[source_idx]])
+
+        return op_input_pairs
+
+    def pick_highest_quality_output(self, record_sets: list[DataRecordSet]) -> DataRecordSet:
+        # if there's only one operator in the set, we return its record_set
+        if len(record_sets) == 1:
+            return record_sets[0]
+
+        # NOTE: I don't like that this assumes the models are consistent in
+        # how they order their record outputs for one-to-many converts;
+        # eventually we can try out more robust schemes to account for
+        # differences in ordering
+        # aggregate records at each index in the response
+        idx_to_records = {}
+        for record_set in record_sets:
+            for idx in range(len(record_set)):
+                record, record_op_stats = record_set[idx], record_set.record_op_stats[idx]
+                if idx not in idx_to_records:
+                    idx_to_records[idx] = [(record, record_op_stats)]
+                else:
+                    idx_to_records[idx].append((record, record_op_stats))
+
+        # compute highest quality answer at each index
+        out_records = []
+        out_record_op_stats = []
+        for idx in range(len(idx_to_records)):
+            records_lst, record_op_stats_lst = zip(*idx_to_records[idx])
+            max_quality_record, max_quality = records_lst[0], record_op_stats_lst[0].quality
+            max_quality_stats = record_op_stats_lst[0]
+            for record, record_op_stats in zip(records_lst[1:], record_op_stats_lst[1:]):
+                record_quality = record_op_stats.quality
+                if record_quality > max_quality:
+                    max_quality_record = record
+                    max_quality = record_quality
+                    max_quality_stats = record_op_stats
+            out_records.append(max_quality_record)
+            out_record_op_stats.append(max_quality_stats)
+
+        # create and return final DataRecordSet
+        return DataRecordSet(out_records, out_record_op_stats)
+
+    def update_inputs(self, source_idx_to_record_sets: dict[int, DataRecordSet]):
+        """
+        Update the inputs for this logical operator based on the outputs of the previous logical operator.
+        """
+        for source_idx, record_sets in source_idx_to_record_sets.items():
+            input = []
+            max_quality_record_set = self.pick_highest_quality_output(record_sets)
+            for record in max_quality_record_set:
+                input.append(record if record.passed_operator else None)
+
+            self.source_idx_to_input[source_idx] = input
+
+
+class AllSamplingExecutionStrategy(SentinelExecutionStrategy):
+
+    def _get_source_indices(self):
+        """Get the list of source indices which the sentinel plan should execute over."""
+        # create list of all source indices and shuffle it
+        total_num_samples = len(self.val_datasource)
+        source_indices = list(np.arange(total_num_samples))
+
+        return source_indices
+
+    def _execute_sentinel_plan(self,
+            plan: SentinelPlan,
+            op_sets: dict[str, OpSet],
+            expected_outputs: dict[int, dict] | None,
+            plan_stats: SentinelPlanStats,
+        ) -> SentinelPlanStats:
+        # execute operator sets in sequence
+        for op_idx, (logical_op_id, op_set) in enumerate(plan):
+            # get frontier ops and their next input
+            op_input_pairs = op_sets[logical_op_id].get_op_input_pairs()
+
+            # break out of the loop if op_input_pairs is empty, as this means all records have been filtered out
+            if len(op_input_pairs) == 0:
+                break
+
+            # run sampled operators on sampled inputs
+            source_idx_to_record_sets_and_ops, _ = self._execute_op_set(op_input_pairs)
+
+            # FUTURE TODO: have this return the highest quality record set simply based on our posterior (or prior) belief on operator quality
+            # get the target record set for each source_idx
+            source_idx_to_target_record_set = self._get_target_record_sets(logical_op_id, source_idx_to_record_sets_and_ops, expected_outputs)
+
+            # TODO: make consistent across here and RandomSampling
+            # FUTURE TODO: move this outside of the loop (i.e. assume we only get quality label(s) after executing full program)
+            # score the quality of each generated output
+            physical_op_cls = op_set[0].__class__
+            source_idx_to_record_sets = {
+                source_idx: list(map(lambda tup: tup[0], record_sets_and_ops))
+                for source_idx, record_sets_and_ops in source_idx_to_record_sets_and_ops.items()
+            }
+            source_idx_to_record_sets = self._score_quality(physical_op_cls, source_idx_to_record_sets, source_idx_to_target_record_set)
+
+            # flatten the lists of records and record_op_stats
+            all_records, all_record_op_stats = self._flatten_record_sets(source_idx_to_record_sets)
+
+            # update plan stats
+            plan_stats.add_record_op_stats(all_record_op_stats)
+
+            # add records (which are not filtered) to the cache, if allowed
+            self._add_records_to_cache(logical_op_id, all_records)
+
+            # FUTURE TODO: simply set input based on source_idx_to_target_record_set (b/c we won't have scores computed)
+            # provide the champion record sets as inputs to the next logical operator
+            if op_idx + 1 < len(plan):
+                next_logical_op_id = plan.logical_op_ids[op_idx + 1]
+                op_sets[next_logical_op_id].update_inputs(source_idx_to_record_sets)
+
+        # close the cache
+        self._close_cache(plan.logical_op_ids)
+
+        # finalize plan stats
+        plan_stats.finish()
+
+        return plan_stats
+
+    def execute_sentinel_plan(self, plan: SentinelPlan, expected_outputs: dict[int, dict] | None):
+        """
+        NOTE: this function currently requires us to set k and j properly in order to make
+        comparison in our research against the corresponding sample budget in MAB.
+
+        NOTE: the number of samples will slightly exceed the sample_budget if the number of operator
+        calls does not perfectly match the sample_budget. This may cause some minor discrepancies with
+        the progress manager as a result.
+        """
+        # for now, assert that the first operator in the plan is a ScanPhysicalOp
+        assert all(isinstance(op, ScanPhysicalOp) for op in plan.operator_sets[0]), "First operator in physical plan must be a ScanPhysicalOp"
+        logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
+        logger.info(f"Plan Details: {plan}")
+
+        # initialize plan stats
+        plan_stats = SentinelPlanStats.from_plan(plan)
+        plan_stats.start()
+
+        # get list of source indices which can be sampled from
+        source_indices = self._get_source_indices()
+
+        # initialize set of physical operators for each logical operator
+        op_sets = {
+            logical_op_id: OpSet(op_set, source_indices)
+            for logical_op_id, op_set in plan
+        }
+
+        # initialize and start the progress manager
+        self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, progress=self.progress)
+        self.progress_manager.start()
+
+        # NOTE: we must handle progress manager outside of _exeecute_sentinel_plan to ensure that it is shut down correctly;
+        # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail because
+        # the progress manager cannot get a handle to the console
+        try:
+            # execute sentinel plan by sampling records and operators
+            plan_stats = self._execute_sentinel_plan(plan, op_sets, expected_outputs, plan_stats)
+
+        finally:
+            # finish progress tracking
+            self.progress_manager.finish()
+
+        logger.info(f"Done executing sentinel plan: {plan.plan_id}")
+        logger.debug(f"Plan stats: (plan_cost={plan_stats.total_plan_cost}, plan_time={plan_stats.total_plan_time})")
+
+        return plan_stats
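The OpSet.pick_highest_quality_output method added above keeps, for each output index, the record whose operator reported the highest quality score. Below is a minimal, standalone sketch of that per-index selection, using hypothetical stub classes (StubStats, StubRecordSet) in place of palimpzest's DataRecordSet and its op-stats objects:

from dataclasses import dataclass

@dataclass
class StubStats:        # stand-in for the per-record op stats, which carry a quality score
    quality: float

@dataclass
class StubRecordSet:    # stand-in for DataRecordSet
    records: list[str]
    record_op_stats: list[StubStats]

def pick_highest_quality_output(record_sets: list[StubRecordSet]) -> StubRecordSet:
    # with a single sampled operator there is nothing to compare
    if len(record_sets) == 1:
        return record_sets[0]

    out_records, out_stats = [], []
    num_outputs = max(len(rs.records) for rs in record_sets)
    for idx in range(num_outputs):
        # gather the idx-th output from every operator that produced one
        candidates = [
            (rs.records[idx], rs.record_op_stats[idx])
            for rs in record_sets
            if idx < len(rs.records)
        ]
        # keep the candidate with the highest quality score at this index
        best_record, best_stats = max(candidates, key=lambda pair: pair[1].quality)
        out_records.append(best_record)
        out_stats.append(best_stats)
    return StubRecordSet(out_records, out_stats)

# a one-to-many convert sampled with two operators; the higher-quality answer wins per index
a = StubRecordSet(["x1", "x2"], [StubStats(0.4), StubStats(0.9)])
b = StubRecordSet(["y1", "y2"], [StubStats(0.7), StubStats(0.3)])
print(pick_highest_quality_output([a, b]).records)  # ['y1', 'x2']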
palimpzest/query/execution/execution_strategy.py
@@ -78,7 +78,7 @@ class ExecutionStrategy(BaseExecutionStrategy, ABC):
             else min(self.scan_start_idx + self.num_samples, len(op.datareader))
         )
         inputs = [idx for idx in range(self.scan_start_idx, scan_end_idx)]
-        input_queues[op.
+        input_queues[op.get_full_op_id()] = inputs
 
         return input_queues
 
@@ -95,6 +95,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         j: int,
         sample_budget: int,
         policy: Policy,
+        priors: dict | None = None,
         use_final_op_quality: bool = False,
         seed: int = 42,
         exp_name: str | None = None,
@@ -107,6 +108,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         self.j = j
         self.sample_budget = sample_budget
         self.policy = policy
+        self.priors = priors
         self.use_final_op_quality = use_final_op_quality
         self.seed = seed
         self.rng = np.random.default_rng(seed=seed)
@@ -378,9 +380,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
             return input.source_idx if isinstance(input, DataRecord) else input
 
         def get_hash(operator, input):
-
-            phys_op_id = operator.get_op_id()
-            return hash(f"{logical_op_id}{phys_op_id}{hash(input)}")
+            return hash(f"{operator.get_full_op_id()}{hash(input)}")
 
         # initialize mapping from source indices to output record sets
         source_idx_to_record_sets_and_ops = {get_source_idx(input): [] for _, input in op_input_pairs}
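The revised get_hash above keys de-duplication on the physical operator's full op id combined with the input's hash, rather than on a logical-op-id/physical-op-id pair. A hedged sketch of the same keying pattern, using a hypothetical stand-in class instead of palimpzest's operators:

class StubOp:
    # hypothetical stand-in exposing only get_full_op_id(), like the operators in this diff
    def __init__(self, full_op_id: str):
        self._full_op_id = full_op_id

    def get_full_op_id(self) -> str:
        return self._full_op_id

def get_hash(operator: StubOp, input) -> int:
    # same keying as the diff: full physical op id combined with the input's hash
    return hash(f"{operator.get_full_op_id()}{hash(input)}")

# two requests for the same (operator, input) pair collapse to one unit of work
pairs = [(StubOp("op-a"), 7), (StubOp("op-a"), 7), (StubOp("op-b"), 7)]
seen, unique_pairs = set(), []
for op, inp in pairs:
    key = get_hash(op, inp)
    if key not in seen:
        seen.add(key)
        unique_pairs.append((op, inp))

print(len(unique_pairs))  # 2 -- the duplicate (op-a, 7) pair is dropped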
palimpzest/query/execution/execution_strategy_type.py
@@ -1,7 +1,11 @@
 from enum import Enum
 
+from palimpzest.query.execution.all_sample_execution_strategy import AllSamplingExecutionStrategy
 from palimpzest.query.execution.mab_execution_strategy import MABExecutionStrategy
-from palimpzest.query.execution.parallel_execution_strategy import
+from palimpzest.query.execution.parallel_execution_strategy import (
+    ParallelExecutionStrategy,
+    SequentialParallelExecutionStrategy,
+)
 from palimpzest.query.execution.random_sampling_execution_strategy import RandomSamplingExecutionStrategy
 from palimpzest.query.execution.single_threaded_execution_strategy import (
     PipelinedSingleThreadExecutionStrategy,
@@ -14,7 +18,9 @@ class ExecutionStrategyType(Enum):
     SEQUENTIAL = SequentialSingleThreadExecutionStrategy
     PIPELINED = PipelinedSingleThreadExecutionStrategy
     PARALLEL = ParallelExecutionStrategy
+    SEQUENTIAL_PARALLEL = SequentialParallelExecutionStrategy
 
 class SentinelExecutionStrategyType(Enum):
     MAB = MABExecutionStrategy
     RANDOM = RandomSamplingExecutionStrategy
+    ALL = AllSamplingExecutionStrategy