palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
  57. palimpzest-0.7.0.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.3.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
palimpzest/query/execution/single_threaded_execution_strategy.py

@@ -1,13 +1,15 @@
-import time
+import logging

-from palimpzest.core.data.dataclasses import OperatorStats, PlanStats
+from palimpzest.core.data.dataclasses import PlanStats
+from palimpzest.core.elements.records import DataRecord
 from palimpzest.query.execution.execution_strategy import ExecutionStrategy
 from palimpzest.query.operators.aggregate import AggregateOp
-from palimpzest.query.operators.filter import FilterOp
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.scan import ScanPhysicalOp
 from palimpzest.query.optimizer.plan import PhysicalPlan
+from palimpzest.utils.progress import create_progress_manager

+logger = logging.getLogger(__name__)

 class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
     """
@@ -21,113 +23,100 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
     """
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.max_workers = 1

-    def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1):
-        """Initialize the stats and the execute the plan."""
-        if self.verbose:
-            print("----------------------")
-            print(f"PLAN[{plan.plan_id}] (n={num_samples}):")
-            print(plan)
-            print("---")
-
-        plan_start_time = time.time()
-
-        # initialize plan stats and operator stats
-        plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
-        for op in plan.operators:
-            op_id = op.get_op_id()
-            op_name = op.op_name()
-            op_details = {k: str(v) for k, v in op.get_id_params().items()}
-            plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)
-
-        # initialize list of output records and intermediate variables
-        output_records = []
-        current_scan_idx = self.scan_start_idx
-
-        # get handle to scan operator and pre-compute its size
-        source_operator = plan.operators[0]
-        assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
-        datareader_len = len(source_operator.datareader)
-
-        # initialize processing queues for each operation
-        processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
-
+    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, list], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
         # execute the plan one operator at a time
+        output_records = []
         for op_idx, operator in enumerate(plan.operators):
+            # if we've filtered out all records, terminate early
             op_id = operator.get_op_id()
-            prev_op_id = plan.operators[op_idx - 1].get_op_id() if op_idx > 1 else None
-            next_op_id = plan.operators[op_idx + 1].get_op_id() if op_idx + 1 < len(plan.operators) else None
+            num_inputs = len(input_queues[op_id])
+            if num_inputs == 0:
+                break

-            # initialize output records and record_op_stats for this operator
+            # begin to process this operator
             records, record_op_stats = [], []
+            logger.info(f"Processing operator {operator.op_name()} ({op_id})")

-            # invoke scan operator(s) until we run out of source records or hit the num_samples limit
-            if isinstance(operator, ScanPhysicalOp):
-                keep_scanning_source_records = True
-                while keep_scanning_source_records:
-                    # run ScanPhysicalOp on current scan index
-                    record_set = operator(current_scan_idx)
-                    records.extend(record_set.data_records)
-                    record_op_stats.extend(record_set.record_op_stats)
-
-                    # update the current scan index
-                    current_scan_idx += 1
-
-                    # update whether to keep scanning source records
-                    keep_scanning_source_records = current_scan_idx < datareader_len and len(records) < num_samples
-
-            # aggregate operators accept all input records at once
-            elif isinstance(operator, AggregateOp):
-                record_set = operator(candidates=processing_queues[op_id])
+            # if this operator is an aggregate, process all the records in the input_queue
+            if isinstance(operator, AggregateOp):
+                record_set = operator(candidates=input_queues[op_id])
                 records = record_set.data_records
                 record_op_stats = record_set.record_op_stats
+                num_outputs = sum(record.passed_operator for record in records)
+
+                # update the progress manager
+                self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

-            # otherwise, process the records in the processing queue for this operator one at a time
-            elif len(processing_queues[op_id]) > 0:
-                for input_record in processing_queues[op_id]:
+            # otherwise, process the records in the input queue for this operator one at a time
+            else:
+                for input_record in input_queues[op_id]:
                     record_set = operator(input_record)
                     records.extend(record_set.data_records)
                     record_op_stats.extend(record_set.record_op_stats)
+                    num_outputs = sum(record.passed_operator for record in record_set.data_records)
+
+                    # update the progress manager
+                    self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

+                    # finish early if this is a limit
                     if isinstance(operator, LimitScanOp) and len(records) == operator.limit:
                         break

             # update plan stats
-            plan_stats.operator_stats[op_id].add_record_op_stats(
-                record_op_stats,
-                source_op_id=prev_op_id,
-                plan_id=plan.plan_id,
-            )
-
-            # add records (which are not filtered) to the cache, if allowed
-            if not self.nocache:
-                for record in records:
-                    if getattr(record, "passed_operator", True):
-                        # self.datadir.append_cache(operator.target_cache_id, record)
-                        pass
-
-            # update processing_queues or output_records
-            for record in records:
-                if isinstance(operator, FilterOp) and not record.passed_operator:
-                    continue
-                if next_op_id is not None:
-                    processing_queues[next_op_id].append(record)
-                else:
-                    output_records.append(record)
+            plan_stats.add_record_op_stats(record_op_stats)

-            # if we've filtered out all records, terminate early
-            if next_op_id is not None and processing_queues[next_op_id] == []:
-                break
+            # add records to the cache
+            self._add_records_to_cache(operator.target_cache_id, records)

-        # if caching was allowed, close the cache
-        if not self.nocache:
-            for _ in plan.operators:
-                # self.datadir.close_cache(operator.target_cache_id)
-                pass
+            # update next input_queue (if it exists)
+            output_records = [record for record in records if record.passed_operator]
+            if op_idx + 1 < len(plan.operators):
+                next_op_id = plan.operators[op_idx + 1].get_op_id()
+                input_queues[next_op_id] = output_records
+
+            logger.info(f"Finished processing operator {operator.op_name()} ({operator.get_op_id()}), and generated {len(records)} records")
+
+        # close the cache
+        self._close_cache([op.target_cache_id for op in plan.operators])

         # finalize plan stats
-        total_plan_time = time.time() - plan_start_time
-        plan_stats.finalize(total_plan_time)
+        plan_stats.finish()
+
+        return output_records, plan_stats
+
+    def execute_plan(self, plan: PhysicalPlan) -> tuple[list[DataRecord], PlanStats]:
+        """Initialize the stats and execute the plan."""
+        # for now, assert that the first operator in the plan is a ScanPhysicalOp
+        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
+        logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
+        logger.info(f"Plan Details: {plan}")
+
+        # initialize plan stats
+        plan_stats = PlanStats.from_plan(plan)
+        plan_stats.start()
+
+        # initialize input queues for each operation
+        input_queues = self._create_input_queues(plan)
+
+        # initialize and start the progress manager
+        self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
+        self.progress_manager.start()
+
+        # NOTE: we must handle progress manager outside of _execute_plan to ensure that it is shut down correctly;
+        # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
+        # because the progress manager cannot get a handle to the console
+        try:
+            # execute plan
+            output_records, plan_stats = self._execute_plan(plan, input_queues, plan_stats)
+
+        finally:
+            # finish progress tracking
+            self.progress_manager.finish()
+
+        logger.info(f"Done executing plan: {plan.plan_id}")
+        logger.debug(f"Plan stats: (plan_cost={plan_stats.total_plan_cost}, plan_time={plan_stats.total_plan_time})")

         return output_records, plan_stats
@@ -148,137 +137,118 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.max_workers = 1 if self.max_workers is None else self.max_workers
-
-    def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1):
-        """Initialize the stats and the execute the plan."""
-        if self.verbose:
-            print("----------------------")
-            print(f"PLAN[{plan.plan_id}] (n={num_samples}):")
-            print(plan)
-            print("---")
-
-        plan_start_time = time.time()
-
-        # initialize plan stats and operator stats
-        plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
-        for op in plan.operators:
-            op_id = op.get_op_id()
-            op_name = op.op_name()
-            op_details = {k: str(v) for k, v in op.get_id_params().items()}
-            plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)
-
-        # initialize list of output records and intermediate variables
-        output_records = []
-        source_records_scanned = 0
-        current_scan_idx = self.scan_start_idx
+        self.max_workers = 1
+
+    def _any_queue_not_empty(self, queues: dict[str, list]) -> bool:
+        """Helper function to check if any queue is not empty."""
+        return any(len(queue) > 0 for queue in queues.values())

-        # get handle to scan operator and pre-compute its size
-        source_operator = plan.operators[0]
-        assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
-        datareader_len = len(source_operator.datareader)
+    def _upstream_ops_finished(self, plan: PhysicalPlan, op_idx: int, input_queues: dict[str, list]) -> bool:
+        """Helper function to check if all upstream operators have finished processing their inputs."""
+        for upstream_op_idx in range(op_idx):
+            upstream_op_id = plan.operators[upstream_op_idx].get_op_id()
+            if len(input_queues[upstream_op_id]) > 0:
+                return False

-        # initialize processing queues for each operation
-        processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
+        return True

+    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, list], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
         # execute the plan until either:
         # 1. all records have been processed, or
-        # 2. the final limit operation has completed
-        finished_executing, keep_scanning_source_records = False, True
-        while not finished_executing:
+        # 2. the final limit operation has completed (we break out of the loop if this happens)
+        final_output_records = []
+        while self._any_queue_not_empty(input_queues):
             for op_idx, operator in enumerate(plan.operators):
+                # if this operator does not have enough inputs to execute, then skip it
                 op_id = operator.get_op_id()
-
-                prev_op_id = plan.operators[op_idx - 1].get_op_id() if op_idx > 1 else None
-                next_op_id = plan.operators[op_idx + 1].get_op_id() if op_idx + 1 < len(plan.operators) else None
+                num_inputs = len(input_queues[op_id])
+                agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, op_idx, input_queues)
+                if num_inputs == 0 or agg_op_not_ready:
+                    continue

                 # create empty lists for records and execution stats generated by executing this operator on its next input(s)
                 records, record_op_stats = [], []

-                # invoke scan operator(s) until we run out of source records or hit the num_samples limit
-                if isinstance(operator, ScanPhysicalOp):
-                    if keep_scanning_source_records:
-                        # run ScanPhysicalOp on current scan index
-                        record_set = operator(current_scan_idx)
-                        records = record_set.data_records
-                        record_op_stats = record_set.record_op_stats
-
-                        # update number of source records scanned and the current index
-                        source_records_scanned += len(records)
-                        current_scan_idx += 1
-                    else:
-                        continue
-
-                # only invoke aggregate operator(s) once there are no more source records and all
-                # upstream operators' processing queues are empty
-                elif isinstance(operator, AggregateOp):
-                    upstream_ops_are_finished = True
-                    for upstream_op_idx in range(op_idx):
-                        # scan operators do not have processing queues
-                        if isinstance(plan.operators[upstream_op_idx], ScanPhysicalOp):
-                            continue
-
-                        # check upstream ops which do have a processing queue
-                        upstream_op_id = plan.operators[upstream_op_idx].get_op_id()
-                        upstream_ops_are_finished = (
-                            upstream_ops_are_finished and len(processing_queues[upstream_op_id]) == 0
-                        )
-
-                    if not keep_scanning_source_records and upstream_ops_are_finished:
-                        record_set = operator(candidates=processing_queues[op_id])
-                        records = record_set.data_records
-                        record_op_stats = record_set.record_op_stats
-                        processing_queues[op_id] = []
-
-                # otherwise, process the next record in the processing queue for this operator
-                elif len(processing_queues[op_id]) > 0:
-                    input_record = processing_queues[op_id].pop(0)
+                # if the next operator is an aggregate, process all the records in the input_queue
+                if isinstance(operator, AggregateOp):
+                    input_records = [input_queues[op_id].pop(0) for _ in range(num_inputs)]
+                    record_set = operator(candidates=input_records)
+                    records = record_set.data_records
+                    record_op_stats = record_set.record_op_stats
+                    num_outputs = sum(record.passed_operator for record in records)
+
+                    # update the progress manager
+                    self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+
+                # otherwise, process the next record in the input queue for this operator
+                else:
+                    input_record = input_queues[op_id].pop(0)
                     record_set = operator(input_record)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
+                    num_outputs = sum(record.passed_operator for record in records)

-                # if records were generated by this operator, process them
-                if len(records) > 0:
-                    # update plan stats
-                    plan_stats.operator_stats[op_id].add_record_op_stats(
-                        record_op_stats,
-                        source_op_id=prev_op_id,
-                        plan_id=plan.plan_id,
-                    )
-
-                    # add records (which are not filtered) to the cache, if allowed
-                    if not self.nocache:
-                        for record in records:
-                            if getattr(record, "passed_operator", True):
-                                # self.datadir.append_cache(operator.target_cache_id, record)
-                                pass
-
-                    # update processing_queues or output_records
-                    for record in records:
-                        if isinstance(operator, FilterOp) and not record.passed_operator:
-                            continue
-                        if next_op_id is not None:
-                            processing_queues[next_op_id].append(record)
-                        else:
-                            output_records.append(record)
-
-            # update finished_executing based on whether all records have been processed
-            still_processing = any([len(queue) > 0 for queue in processing_queues.values()])
-            keep_scanning_source_records = current_scan_idx < datareader_len and source_records_scanned < num_samples
-            finished_executing = not keep_scanning_source_records and not still_processing
-
-            # update finished_executing based on limit
-            if isinstance(operator, LimitScanOp):
-                finished_executing = len(output_records) == operator.limit
-
-        # if caching was allowed, close the cache
-        if not self.nocache:
-            for _ in plan.operators:
-                # self.datadir.close_cache(operator.target_cache_id)
-                pass
+                    # update the progress manager
+                    self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+
+                # update plan stats
+                plan_stats.add_record_op_stats(record_op_stats)
+
+                # add records to the cache
+                self._add_records_to_cache(operator.target_cache_id, records)
+
+                # update next input_queue or final_output_records
+                output_records = [record for record in records if record.passed_operator]
+                if op_idx + 1 < len(plan.operators):
+                    next_op_id = plan.operators[op_idx + 1].get_op_id()
+                    input_queues[next_op_id].extend(output_records)
+                else:
+                    final_output_records.extend(output_records)
+
+                logger.info(f"Finished processing operator {operator.op_name()} ({operator.get_op_id()}) on {num_inputs} records")
+
+            # break out of loop if the final operator is a LimitScanOp and we've reached its limit
+            if isinstance(plan.operators[-1], LimitScanOp) and len(final_output_records) == plan.operators[-1].limit:
+                break
+
+        # close the cache
+        self._close_cache([op.target_cache_id for op in plan.operators])

         # finalize plan stats
-        total_plan_time = time.time() - plan_start_time
-        plan_stats.finalize(total_plan_time)
+        plan_stats.finish()
+
+        return final_output_records, plan_stats
+
+    def execute_plan(self, plan: PhysicalPlan):
+        """Initialize the stats and execute the plan."""
+        # for now, assert that the first operator in the plan is a ScanPhysicalOp
+        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
+        logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
+        logger.info(f"Plan Details: {plan}")
+
+        # initialize plan stats
+        plan_stats = PlanStats.from_plan(plan)
+        plan_stats.start()
+
+        # initialize input queues for each operation
+        input_queues = self._create_input_queues(plan)
+
+        # initialize and start the progress manager
+        self.progress_manager = create_progress_manager(plan, self.num_samples, self.progress)
+        self.progress_manager.start()
+
+        # NOTE: we must handle progress manager outside of _execute_plan to ensure that it is shut down correctly;
+        # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
+        # because the progress manager cannot get a handle to the console
+        try:
+            # execute plan
+            output_records, plan_stats = self._execute_plan(plan, input_queues, plan_stats)
+
+        finally:
+            # finish progress tracking
+            self.progress_manager.finish()
+
+        logger.info(f"Done executing plan: {plan.plan_id}")
+        logger.debug(f"Plan stats: (plan_cost={plan_stats.total_plan_cost}, plan_time={plan_stats.total_plan_time})")

         return output_records, plan_stats
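The pipelined strategy's scheduling rule differs from the sequential one: operators are visited round-robin, each consuming from its input queue as items arrive, and an aggregate only fires once every upstream queue has drained. A toy simulation (illustrative only, with hypothetical stage names; no palimpzest types involved) of that rule:

    ops = ["scan", "convert", "aggregate"]
    queues = {"scan": [0, 1, 2], "convert": [], "aggregate": []}
    outputs = []

    def upstream_empty(op):
        # mirrors _upstream_ops_finished: all queues before this op are drained
        return all(len(queues[o]) == 0 for o in ops[:ops.index(op)])

    while any(queues.values()):
        for op in ops:
            if not queues[op] or (op == "aggregate" and not upstream_empty(op)):
                continue
            if op == "aggregate":
                outputs.append(sum(queues[op]))  # aggregates consume their whole queue at once
                queues[op].clear()
            else:
                item = queues[op].pop(0)
                queues[ops[ops.index(op) + 1]].append(item + 1 if op == "convert" else item)

    print(outputs)  # [6]: the aggregate fired exactly once, after all upstream work drained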
palimpzest/query/generators/api_client_factory.py (new file)

@@ -0,0 +1,31 @@
+from threading import Lock
+
+from openai import OpenAI
+from together import Together
+
+from palimpzest.constants import APIClient
+
+
+class APIClientFactory:
+    _instances = {}
+    _lock = Lock()
+
+    @classmethod
+    def get_client(cls, api_client: APIClient, api_key: str):
+        """Get a singleton instance of the requested API client."""
+        if api_client not in cls._instances:
+            with cls._lock:  # Ensure thread safety
+                if api_client not in cls._instances:  # Double-check inside the lock
+                    cls._instances[api_client] = cls._create_client(api_client, api_key)
+        return cls._instances[api_client]
+
+    @staticmethod
+    def _create_client(api_client: APIClient, api_key: str):
+        """Create a new client instance based on the api_client name."""
+        match api_client:
+            case APIClient.OPENAI:
+                return OpenAI(api_key=api_key)
+            case APIClient.TOGETHER:
+                return Together(api_key=api_key)
+            case _:
+                raise ValueError(f"Unknown api_client: {api_client}")
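This new factory uses double-checked locking: the unlocked membership test makes the common path cheap, and the second test inside the lock prevents two threads from racing to construct the same client. A usage sketch (assumes an OPENAI_API_KEY environment variable; APIClient.OPENAI comes from palimpzest/constants.py per this diff):

    import os

    from palimpzest.constants import APIClient
    from palimpzest.query.generators.api_client_factory import APIClientFactory

    client_a = APIClientFactory.get_client(APIClient.OPENAI, api_key=os.environ["OPENAI_API_KEY"])
    client_b = APIClientFactory.get_client(APIClient.OPENAI, api_key="ignored-after-first-call")
    assert client_a is client_b  # one cached client per APIClient value

Note that because `_instances` is keyed on `api_client` alone, the `api_key` from the first call for a given client type wins for the life of the process; later calls with a different key return the originally constructed client.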