palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.21.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
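Many of the renames above are module moves rather than behavior changes. As a quick orientation, here is a minimal sketch of the import-path changes implied by the file list; the PlanStats move is confirmed by the first hunk below, while the other entries are module renames only and their exported symbols are not verified here:

```python
# palimpzest 0.7.21:
#   from palimpzest.core.data.dataclasses import PlanStats
# palimpzest 0.8.0 (core/data/dataclasses.py was renamed to core/models.py):
from palimpzest.core.models import PlanStats

# other module renames from the file list (paths only; symbols unverified):
#   palimpzest.core.data.datareaders  -> palimpzest.core.data.iter_dataset
#   palimpzest.core.elements.index    -> palimpzest.core.data.index_dataset
```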
@@ -1,20 +1,22 @@
 import logging
-import multiprocessing
 from concurrent.futures import ThreadPoolExecutor, wait
 
 from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
-from palimpzest.core.data.dataclasses import PlanStats
-from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.core.elements.records import DataRecord
+from palimpzest.core.models import PlanStats
 from palimpzest.query.execution.execution_strategy import ExecutionStrategy
 from palimpzest.query.operators.aggregate import AggregateOp
+from palimpzest.query.operators.distinct import DistinctOp
+from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.scan import ScanPhysicalOp
+from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.utils.progress import create_progress_manager
 
 logger = logging.getLogger(__name__)
 
+
 class ParallelExecutionStrategy(ExecutionStrategy):
     """
     A parallel execution strategy that processes data through a pipeline of operators using thread-based parallelism.
@@ -22,76 +24,72 @@ class ParallelExecutionStrategy(ExecutionStrategy):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.max_workers = (
-            self._get_parallel_max_workers()
-            if self.max_workers is None
-            else self.max_workers
-        )
-
-    def _get_parallel_max_workers(self):
-        # for now, return the number of system CPUs;
-        # in the future, we may want to consider the models the user has access to
-        # and whether or not they will encounter rate-limits. If they will, we should
-        # set the max workers in a manner that is designed to avoid hitting them.
-        # Doing this "right" may require considering their logical, physical plan,
-        # and tier status with LLM providers. It may also be worth dynamically
-        # changing the max_workers in response to 429 errors.
-        return max(int(0.8 * multiprocessing.cpu_count()), 1)
-
-    def _any_queue_not_empty(self, queues: dict[str, list]) -> bool:
-        """Helper function to check if any queue is not empty."""
-        return any(len(queue) > 0 for queue in queues.values())
-
-    def _upstream_ops_finished(self, plan: PhysicalPlan, op_idx: int, input_queues: dict[str, list], future_queues: dict[str, list]) -> bool:
-        """Helper function to check if all upstream operators have finished processing their inputs."""
-        for upstream_op_idx in range(op_idx):
-            upstream_full_op_id = plan.operators[upstream_op_idx].get_full_op_id()
-            if len(input_queues[upstream_full_op_id]) > 0 or len(future_queues[upstream_full_op_id]) > 0:
-                return False
-
-        return True
 
-    def _process_future_results(self, operator: PhysicalOperator, future_queues: dict[str, list], plan_stats: PlanStats) -> list[DataRecord]:
+    def _any_queue_not_empty(self, queues: dict[str, list] | dict[str, dict[str, list]]) -> bool:
+        """Helper function to check if any queue is not empty."""
+        for _, value in queues.items():
+            if isinstance(value, dict):
+                if any(len(subqueue) > 0 for subqueue in value.values()):
+                    return True
+            elif len(value) > 0:
+                return True
+        return False
+
+    def _upstream_ops_finished(self, plan: PhysicalPlan, topo_idx: int, operator: PhysicalOperator, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> bool:
+        """Helper function to check if agg / join operator is ready to process its inputs."""
+        # for agg / join operator, we can only process it when all upstream operators have finished processing their inputs
+        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+        upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
+        upstream_future_queues = {upstream_unique_full_op_id: future_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
+        return not (self._any_queue_not_empty(upstream_input_queues) or self._any_queue_not_empty(upstream_future_queues))
+
+    def _process_future_results(self, unique_full_op_id: str, future_queues: dict[str, list], plan_stats: PlanStats) -> list[DataRecord]:
         """
-        Helper function which takes an operator, the future queues, and plan stats, and performs
+        Helper function which takes a full operator id, the future queues, and plan stats, and performs
         the updates to plan stats and progress manager before returning the results from the finished futures.
         """
-        # get the op_id for the operator
-        full_op_id = operator.get_full_op_id()
-
         # this function is called when the future queue is not empty
         # and the executor is not busy processing other futures
-        done_futures, not_done_futures = wait(future_queues[full_op_id], timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
+        done_futures, not_done_futures = wait(future_queues[unique_full_op_id], timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
 
         # add the unfinished futures back to the previous op's future queue
-        future_queues[full_op_id] = list(not_done_futures)
+        future_queues[unique_full_op_id] = list(not_done_futures)
 
         # add the finished futures to the input queue for this operator
-        output_records = []
+        output_records, total_inputs_processed, total_cost = [], 0, 0.0
         for future in done_futures:
-            record_set: DataRecordSet = future.result()
+            output = future.result()
+            record_set, num_inputs_processed = output if self.is_join_op[unique_full_op_id] else (output, 1)
+
+            # record set can be None if one side of join has no input records yet
+            if record_set is None:
+                continue
+
+            # otherwise, process records and their stats
             records = record_set.data_records
             record_op_stats = record_set.record_op_stats
-            num_outputs = sum(record.passed_operator for record in records)
 
-            # update the progress manager
-            self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+            # update the inputs processed and total cost
+            total_inputs_processed += num_inputs_processed
+            total_cost += record_set.get_total_cost()
 
             # update plan stats
-            plan_stats.add_record_op_stats(record_op_stats)
-
-            # add records to the cache
-            self._add_records_to_cache(operator.target_cache_id, records)
+            plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
 
             # add records which aren't filtered to the output records
             output_records.extend([record for record in records if record.passed_operator])
-
+
+        # update the progress manager
+        if total_inputs_processed > 0:
+            num_outputs = len(output_records)
+            self.progress_manager.incr(unique_full_op_id, num_inputs=total_inputs_processed, num_outputs=num_outputs, total_cost=total_cost)
+
         return output_records
 
     def _execute_plan(
         self,
         plan: PhysicalPlan,
-        input_queues: dict[str, list],
+        input_queues: dict[str, dict[str, list]],
         future_queues: dict[str, list],
         plan_stats: PlanStats,
     ) -> tuple[list[DataRecord], PlanStats]:
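The hunk above replaces the flat per-operator queues with input queues keyed first by operator and then by source operator, so the emptiness check must handle both shapes. A standalone sketch of that check, mirroring the `_any_queue_not_empty` helper added above (the queue keys in the usage example are illustrative, not real operator ids):

```python
def any_queue_not_empty(queues: dict[str, list] | dict[str, dict[str, list]]) -> bool:
    """Return True if any queue (flat or nested one level) still holds items."""
    for value in queues.values():
        if isinstance(value, dict):
            # nested input queues: {op_id: {source_op_id: [records, ...]}}
            if any(len(subqueue) > 0 for subqueue in value.values()):
                return True
        elif len(value) > 0:
            # flat future queues: {op_id: [futures, ...]}
            return True
    return False

# flat future queues vs. nested input queues
future_queues = {"0-scan": [], "1-filter": ["pending-future"]}
input_queues = {"1-filter": {"0-scan": []}}
assert any_queue_not_empty(future_queues) is True
assert any_queue_not_empty(input_queues) is False
```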
@@ -103,56 +101,119 @@ class ParallelExecutionStrategy(ExecutionStrategy):
             # execute the plan until either:
             # 1. all records have been processed, or
             # 2. the final limit operation has completed (we break out of the loop if this happens)
-            final_op = plan.operators[-1]
+            final_op = plan.operator
             while self._any_queue_not_empty(input_queues) or self._any_queue_not_empty(future_queues):
-                for op_idx, operator in enumerate(plan.operators):
-                    full_op_id = operator.get_full_op_id()
+                for topo_idx, operator in enumerate(plan):
+                    source_unique_full_op_ids = (
+                        [f"source_{operator.get_full_op_id()}"]
+                        if isinstance(operator, (ContextScanOp, ScanPhysicalOp))
+                        else plan.get_source_unique_full_op_ids(topo_idx, operator)
+                    )
+                    unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
 
                     # get any finished futures from the previous operator and add them to the input queue for this operator
-                    if not isinstance(operator, ScanPhysicalOp):
-                        prev_operator = plan.operators[op_idx - 1]
-                        records = self._process_future_results(prev_operator, future_queues, plan_stats)
-                        input_queues[full_op_id].extend(records)
+                    if not isinstance(operator, (ContextScanOp, ScanPhysicalOp)):
+                        for source_unique_full_op_id in source_unique_full_op_ids:
+                            records = self._process_future_results(source_unique_full_op_id, future_queues, plan_stats)
+                            input_queues[unique_full_op_id][source_unique_full_op_id].extend(records)
 
                     # for the final operator, add any finished futures to the output_records
-                    if full_op_id == final_op.get_full_op_id():
-                        records = self._process_future_results(operator, future_queues, plan_stats)
+                    if unique_full_op_id == f"{topo_idx}-{final_op.get_full_op_id()}":
+                        records = self._process_future_results(unique_full_op_id, future_queues, plan_stats)
                         output_records.extend(records)
 
                     # if this operator does not have enough inputs to execute, then skip it
-                    num_inputs = len(input_queues[full_op_id])
-                    agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, op_idx, input_queues, future_queues)
-                    if num_inputs == 0 or agg_op_not_ready:
+                    num_inputs = sum(len(inputs) for inputs in input_queues[unique_full_op_id].values())
+                    agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues, future_queues)
+                    join_op_not_ready = isinstance(operator, JoinOp) and not self.join_has_downstream_limit_op[unique_full_op_id] and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues, future_queues)
+                    if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
                         continue
 
                     # if this operator is an aggregate, process all the records in the input queue
                     if isinstance(operator, AggregateOp):
-                        input_records = [input_queues[full_op_id].pop(0) for _ in range(num_inputs)]
+                        source_unique_full_op_id = source_unique_full_op_ids[0]
+                        input_records = [input_queues[unique_full_op_id][source_unique_full_op_id].pop(0) for _ in range(num_inputs)]
                         future = executor.submit(operator, input_records)
-                        future_queues[full_op_id].append(future)
-
+                        future_queues[unique_full_op_id].append(future)
+
+                    # if this operator is a join, process all pairs of records from the two input queues
+                    elif isinstance(operator, JoinOp):
+                        left_unique_full_source_op_id = source_unique_full_op_ids[0]
+                        left_num_inputs = len(input_queues[unique_full_op_id][left_unique_full_source_op_id])
+                        left_input_records = [input_queues[unique_full_op_id][left_unique_full_source_op_id].pop(0) for _ in range(left_num_inputs)]
+
+                        right_unique_full_source_op_id = source_unique_full_op_ids[1]
+                        right_num_inputs = len(input_queues[unique_full_op_id][right_unique_full_source_op_id])
+                        right_input_records = [input_queues[unique_full_op_id][right_unique_full_source_op_id].pop(0) for _ in range(right_num_inputs)]
+
+                        # NOTE: it would be nice to use executor for join inputs here; but for now synchronizing may be necessary
+                        # future = executor.submit(operator, left_input_records, right_input_records)
+                        # future_queues[unique_full_op_id].append(future)
+                        record_set, num_inputs_processed = operator(left_input_records, right_input_records)
+                        def no_op(rset, num_inputs_processed):
+                            return rset, num_inputs_processed
+                        future = executor.submit(no_op, record_set, num_inputs_processed)
+                        future_queues[unique_full_op_id].append(future)
+
+                    # if this operator is a limit, process one record at a time
+                    elif isinstance(operator, LimitScanOp):
+                        source_unique_full_op_id = source_unique_full_op_ids[0]
+                        num_records_to_process = min(len(input_queues[unique_full_op_id][source_unique_full_op_id]), operator.limit - len(output_records))
+                        for _ in range(num_records_to_process):
+                            input_record = input_queues[unique_full_op_id][source_unique_full_op_id].pop(0)
+                            future = executor.submit(operator, input_record)
+                            future_queues[unique_full_op_id].append(future)
+
+                        # if this is the final operator, add any finished futures to the output_records
+                        # immediately so that we can break out of the loop if we've reached the limit
+                        if unique_full_op_id == f"{topo_idx}-{final_op.get_full_op_id()}":
+                            records = self._process_future_results(unique_full_op_id, future_queues, plan_stats)
+                            output_records.extend(records)
+
+                    # if this operator is a distinct, process records sequentially
+                    # (distinct is not parallelized because it requires maintaining a set of seen records)
+                    elif isinstance(operator, DistinctOp):
+                        source_unique_full_op_id = source_unique_full_op_ids[0]
+                        input_records = input_queues[unique_full_op_id][source_unique_full_op_id]
+                        for record in input_records:
+                            record_set = operator(record)
+                            def no_op(rset):
+                                return rset
+                            future = executor.submit(no_op, record_set)
+                            future_queues[unique_full_op_id].append(future)
+
+                        # clear the input queue for this operator since we processed all records
+                        input_queues[unique_full_op_id][source_unique_full_op_id].clear()
+
+                    # otherwise, process records according to batch size
                     else:
-                        input_record = input_queues[full_op_id].pop(0)
-                        future = executor.submit(operator, input_record)
-                        future_queues[full_op_id].append(future)
-
+                        source_unique_full_op_id = source_unique_full_op_ids[0]
+                        input_records = input_queues[unique_full_op_id][source_unique_full_op_id]
+                        if self.batch_size is None:
+                            for input_record in input_records:
+                                future = executor.submit(operator, input_record)
+                                future_queues[unique_full_op_id].append(future)
+                            input_queues[unique_full_op_id][source_unique_full_op_id].clear()
+                        else:
+                            batch_size = min(self.batch_size, len(input_records))
+                            batch_input_records = input_records[:batch_size]
+                            for input_record in batch_input_records:
+                                future = executor.submit(operator, input_record)
+                                future_queues[unique_full_op_id].append(future)
+                            input_queues[unique_full_op_id][source_unique_full_op_id] = input_records[batch_size:]
+
+                # TODO: change logic to stop upstream operators once a limit is reached
                 # break out of loop if the final operator is a LimitScanOp and we've reached its limit
                 if isinstance(final_op, LimitScanOp) and len(output_records) == final_op.limit:
                     break
 
-        # close the cache
-        self._close_cache([op.target_cache_id for op in plan.operators])
-
         # finalize plan stats
         plan_stats.finish()
 
         return output_records, plan_stats
 
-
     def execute_plan(self, plan: PhysicalPlan):
         """Initialize the stats and execute the plan."""
-        # for now, assert that the first operator in the plan is a ScanPhysicalOp
-        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
         logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
         logger.info(f"Plan Details: {plan}")
 
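The dispatch loop above submits most operators record-by-record, but when `self.batch_size` is set it drains only one batch per pass and leaves the remaining records in the input queue for a later iteration. A minimal sketch of that batching pattern, using a stand-in `submit_batch` helper and a toy operator rather than palimpzest's own APIs:

```python
from concurrent.futures import ThreadPoolExecutor

def submit_batch(executor, operator, input_queue, future_queue, batch_size=None):
    """Submit the whole queue (batch_size=None) or just one batch, returning the leftover records."""
    if batch_size is None:
        for record in input_queue:
            future_queue.append(executor.submit(operator, record))
        input_queue.clear()
        return input_queue
    size = min(batch_size, len(input_queue))
    for record in input_queue[:size]:
        future_queue.append(executor.submit(operator, record))
    # caller reassigns the input queue to the leftover slice, as the diff does
    return input_queue[size:]

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = []
    remaining = submit_batch(executor, lambda r: r * 2, [1, 2, 3, 4, 5], futures, batch_size=2)
    print([f.result() for f in futures], remaining)  # [2, 4] [3, 4, 5]
```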
@@ -162,180 +223,28 @@ class ParallelExecutionStrategy(ExecutionStrategy):
 
         # initialize input queues and future queues for each operation
         input_queues = self._create_input_queues(plan)
-        future_queues = {op.get_full_op_id(): [] for op in plan.operators}
-
-        # initialize and start the progress manager
-        self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
-        self.progress_manager.start()
-
-        # NOTE: we must handle progress manager outside of _exeecute_plan to ensure that it is shut down correctly;
-        # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
-        # because the progress manager cannot get a handle to the console
-        try:
-            # execute plan
-            output_records, plan_stats = self._execute_plan(plan, input_queues, future_queues, plan_stats)
-
-        finally:
-            # finish progress tracking
-            self.progress_manager.finish()
-
-        logger.info(f"Done executing plan: {plan.plan_id}")
-        logger.debug(f"Plan stats: (plan_cost={plan_stats.total_plan_cost}, plan_time={plan_stats.total_plan_time})")
-
-        return output_records, plan_stats
-
-
-class SequentialParallelExecutionStrategy(ExecutionStrategy):
-    """
-    A parallel execution strategy that processes operators sequentially.
-    """
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.max_workers = (
-            self._get_parallel_max_workers()
-            if self.max_workers is None
-            else self.max_workers
-        )
-
-    def _get_parallel_max_workers(self):
-        # for now, return the number of system CPUs;
-        # in the future, we may want to consider the models the user has access to
-        # and whether or not they will encounter rate-limits. If they will, we should
-        # set the max workers in a manner that is designed to avoid hitting them.
-        # Doing this "right" may require considering their logical, physical plan,
-        # and tier status with LLM providers. It may also be worth dynamically
-        # changing the max_workers in response to 429 errors.
-        return max(int(0.8 * multiprocessing.cpu_count()), 1)
-
-    def _any_queue_not_empty(self, queues: dict[str, list]) -> bool:
-        """Helper function to check if any queue is not empty."""
-        return any(len(queue) > 0 for queue in queues.values())
-
-    def _upstream_ops_finished(self, plan: PhysicalPlan, op_idx: int, input_queues: dict[str, list], future_queues: dict[str, list]) -> bool:
-        """Helper function to check if all upstream operators have finished processing their inputs."""
-        for upstream_op_idx in range(op_idx):
-            upstream_full_op_id = plan.operators[upstream_op_idx].get_full_op_id()
-            if len(input_queues[upstream_full_op_id]) > 0 or len(future_queues[upstream_full_op_id]) > 0:
-                return False
-
-        return True
-
-    def _process_future_results(self, operator: PhysicalOperator, future_queues: dict[str, list], plan_stats: PlanStats) -> list[DataRecord]:
-        """
-        Helper function which takes an operator, the future queues, and plan stats, and performs
-        the updates to plan stats and progress manager before returning the results from the finished futures.
-        """
-        # get the op_id for the operator
-        full_op_id = operator.get_full_op_id()
-
-        # this function is called when the future queue is not empty
-        # and the executor is not busy processing other futures
-        done_futures, not_done_futures = wait(future_queues[full_op_id], timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
-
-        # add the unfinished futures back to the previous op's future queue
-        future_queues[full_op_id] = list(not_done_futures)
-
-        # add the finished futures to the input queue for this operator
-        output_records = []
-        for future in done_futures:
-            record_set: DataRecordSet = future.result()
-            records = record_set.data_records
-            record_op_stats = record_set.record_op_stats
-            num_outputs = sum(record.passed_operator for record in records)
-
-            # update the progress manager
-            self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
-
-            # update plan stats
-            plan_stats.add_record_op_stats(record_op_stats)
-
-            # add records to the cache
-            self._add_records_to_cache(operator.target_cache_id, records)
-
-            # add records which aren't filtered to the output records
-            output_records.extend([record for record in records if record.passed_operator])
-
-        return output_records
-
-    def _execute_plan(
-        self,
-        plan: PhysicalPlan,
-        input_queues: dict[str, list],
-        future_queues: dict[str, list],
-        plan_stats: PlanStats,
-    ) -> tuple[list[DataRecord], PlanStats]:
-        # process all of the input records using a thread pool
-        output_records = []
-        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-            logger.debug(f"Created thread pool with {self.max_workers} workers")
-
-            # execute the plan until either:
-            # 1. all records have been processed, or
-            # 2. the final limit operation has completed (we break out of the loop if this happens)
-            final_op = plan.operators[-1]
-            for op_idx, operator in enumerate(plan.operators):
-                full_op_id = operator.get_full_op_id()
-                input_queue = input_queues[full_op_id]
-
-                # if this operator is an aggregate, process all the records in the input queue
-                if isinstance(operator, AggregateOp):
-                    num_inputs = len(input_queue)
-                    input_records = [input_queue.pop(0) for _ in range(num_inputs)]
-                    future = executor.submit(operator, input_records)
-                    future_queues[full_op_id].append(future)
-
-                else:
-                    while len(input_queue) > 0:
-                        input_record = input_queue.pop(0)
-                        future = executor.submit(operator, input_record)
-                        future_queues[full_op_id].append(future)
-
-                # block until all futures for this operator have completed; and add finished futures to next operator's input
-                while len(future_queues[full_op_id]) > 0:
-                    records = self._process_future_results(operator, future_queues, plan_stats)
-
-                    # get any finished futures from the previous operator and add them to the input queue for this operator
-                    if full_op_id != final_op.get_full_op_id():
-                        next_op_id = plan.operators[op_idx + 1].get_full_op_id()
-                        input_queues[next_op_id].extend(records)
-
-                    # for the final operator, add any finished futures to the output_records
-                    else:
-                        output_records.extend(records)
-
-                # break out of loop if the final operator is a LimitScanOp and we've reached its limit
-                if isinstance(final_op, LimitScanOp) and len(output_records) == final_op.limit:
-                    break
-
-        # close the cache
-        self._close_cache([op.target_cache_id for op in plan.operators])
-
-        # finalize plan stats
-        plan_stats.finish()
-
-        return output_records, plan_stats
-
-
-    def execute_plan(self, plan: PhysicalPlan):
-        """Initialize the stats and execute the plan."""
-        # for now, assert that the first operator in the plan is a ScanPhysicalOp
-        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
-        logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
-        logger.info(f"Plan Details: {plan}")
-
-        # initialize plan stats
-        plan_stats = PlanStats.from_plan(plan)
-        plan_stats.start()
-
-        # initialize input queues and future queues for each operation
-        input_queues = self._create_input_queues(plan)
-        future_queues = {op.get_full_op_id(): [] for op in plan.operators}
+        future_queues = {f"{topo_idx}-{op.get_full_op_id()}": [] for topo_idx, op in enumerate(plan)}
+
+        # precompute which operators are joins and which joins have downstream limit ops
+        self.is_join_op = {f"{topo_idx}-{op.get_full_op_id()}": isinstance(op, JoinOp) for topo_idx, op in enumerate(plan)}
+        self.join_has_downstream_limit_op = {}
+        for topo_idx, op in enumerate(plan):
+            if isinstance(op, JoinOp):
+                unique_full_op_id = f"{topo_idx}-{op.get_full_op_id()}"
+                has_downstream_limit_op = False
+                for inner_topo_idx, op in enumerate(plan):
+                    if inner_topo_idx <= topo_idx:
+                        continue
+                    if isinstance(op, LimitScanOp):
+                        has_downstream_limit_op = True
+                        break
+                self.join_has_downstream_limit_op[unique_full_op_id] = has_downstream_limit_op
 
         # initialize and start the progress manager
         self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
         self.progress_manager.start()
 
-        # NOTE: we must handle progress manager outside of _exeecute_plan to ensure that it is shut down correctly;
+        # NOTE: we must handle progress manager outside of _execute_plan to ensure that it is shut down correctly;
         # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
         # because the progress manager cannot get a handle to the console
         try: