palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.21.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
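The file list above shows the core module reorganization in 0.8.0: `core/data/dataclasses.py` becomes `core/models.py`, `core/data/datareaders.py` becomes `core/data/iter_dataset.py`, and the removed `sets.py` is succeeded by the new `core/data/dataset.py`. For downstream code, the one import change confirmed by the diff below is the `PlanStats` move; here is a minimal migration sketch (the other paths are read off the rename entries above, and their exported names should be verified against the 0.8.0 sources):

```python
# 0.7.21: PlanStats lived in palimpzest.core.data.dataclasses
# from palimpzest.core.data.dataclasses import PlanStats

# 0.8.0: that module was renamed to palimpzest.core.models (confirmed below)
from palimpzest.core.models import PlanStats

# Other renames from the file list (exported names unverified -- check the
# 0.8.0 sources before relying on these paths):
#   palimpzest/core/data/datareaders.py -> palimpzest/core/data/iter_dataset.py
#   palimpzest/sets.py                  -> palimpzest/core/data/dataset.py
```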
```diff
--- a/palimpzest/query/execution/single_threaded_execution_strategy.py
+++ b/palimpzest/query/execution/single_threaded_execution_strategy.py
@@ -1,11 +1,13 @@
 import logging
 
-from palimpzest.core.data.dataclasses import PlanStats
 from palimpzest.core.elements.records import DataRecord
+from palimpzest.core.models import PlanStats
 from palimpzest.query.execution.execution_strategy import ExecutionStrategy
 from palimpzest.query.operators.aggregate import AggregateOp
+from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
-from palimpzest.query.operators.
+from palimpzest.query.operators.physical import PhysicalOperator
+from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.utils.progress import create_progress_manager
 
@@ -25,61 +27,80 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
         super().__init__(*args, **kwargs)
         self.max_workers = 1
 
-    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, list], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
+    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, dict[str, list]], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
         # execute the plan one operator at a time
         output_records = []
-        for
+        for topo_idx, operator in enumerate(plan):
             # if we've filtered out all records, terminate early
-
-
+            source_unique_full_op_ids = (
+                [f"source_{operator.get_full_op_id()}"]
+                if isinstance(operator, (ContextScanOp, ScanPhysicalOp))
+                else plan.get_source_unique_full_op_ids(topo_idx, operator)
+            )
+            unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+            num_inputs = sum(len(input_queues[unique_full_op_id][source_unique_full_op_id]) for source_unique_full_op_id in source_unique_full_op_ids)
             if num_inputs == 0:
                 break
 
             # begin to process this operator
             records, record_op_stats = [], []
-            logger.info(f"Processing operator {operator.op_name()} ({
+            logger.info(f"Processing operator {operator.op_name()} ({unique_full_op_id})")
 
             # if this operator is an aggregate, process all the records in the input_queue
             if isinstance(operator, AggregateOp):
-
+                source_unique_full_op_id = source_unique_full_op_ids[0]
+                record_set = operator(candidates=input_queues[unique_full_op_id][source_unique_full_op_id])
                 records = record_set.data_records
                 record_op_stats = record_set.record_op_stats
                 num_outputs = sum(record.passed_operator for record in records)
 
                 # update the progress manager
-                self.progress_manager.incr(
+                self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+
+            # if this operator is a join, process all pairs of records from the two input queues
+            elif isinstance(operator, JoinOp):
+                left_full_source_op_id = source_unique_full_op_ids[0]
+                left_num_inputs = len(input_queues[unique_full_op_id][left_full_source_op_id])
+                left_input_records = [input_queues[unique_full_op_id][left_full_source_op_id].pop(0) for _ in range(left_num_inputs)]
+
+                right_full_source_op_id = source_unique_full_op_ids[1]
+                right_num_inputs = len(input_queues[unique_full_op_id][right_full_source_op_id])
+                right_input_records = [input_queues[unique_full_op_id][right_full_source_op_id].pop(0) for _ in range(right_num_inputs)]
+
+                record_set, num_inputs_processed = operator(left_input_records, right_input_records)
+                records = record_set.data_records
+                record_op_stats = record_set.record_op_stats
+                num_outputs = sum(record.passed_operator for record in records)
+
+                # update the progress manager
+                self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
             # otherwise, process the records in the input queue for this operator one at a time
             else:
-
+                source_unique_full_op_id = source_unique_full_op_ids[0]
+                for input_record in input_queues[unique_full_op_id][source_unique_full_op_id]:
                     record_set = operator(input_record)
                     records.extend(record_set.data_records)
                     record_op_stats.extend(record_set.record_op_stats)
                     num_outputs = sum(record.passed_operator for record in record_set.data_records)
 
                     # update the progress manager
-                    self.progress_manager.incr(
+                    self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
                     # finish early if this is a limit
                     if isinstance(operator, LimitScanOp) and len(records) == operator.limit:
                         break
 
             # update plan stats
-            plan_stats.add_record_op_stats(record_op_stats)
-
-            # add records to the cache
-            self._add_records_to_cache(operator.target_cache_id, records)
+            plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
 
             # update next input_queue (if it exists)
-            output_records = [record for record in records if record.passed_operator]
-
-
-            input_queues[
+            output_records = [record for record in records if record.passed_operator]
+            next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
+            if next_unique_full_op_id is not None:
+                input_queues[next_unique_full_op_id][unique_full_op_id] = output_records
 
-            logger.info(f"Finished processing operator {operator.op_name()} ({
-
-            # close the cache
-            self._close_cache([op.target_cache_id for op in plan.operators])
+            logger.info(f"Finished processing operator {operator.op_name()} ({unique_full_op_id}), and generated {len(records)} records")
 
         # finalize plan stats
         plan_stats.finish()
```
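The main structural change in `_execute_plan` is the shape of `input_queues`: it is now nested, keyed first by the consuming operator's unique id (`f"{topo_idx}-{operator.get_full_op_id()}"`) and then by each source operator's unique id, so a `JoinOp` can keep its left and right inputs in separate queues. A toy sketch of that layout (the operator ids are fabricated for illustration, not real palimpzest ids):

```python
# Sketch of the nested input_queues layout used by the new _execute_plan.
input_queues: dict[str, dict[str, list]] = {
    # scans read from a synthetic "source_..." queue seeded with their inputs
    "0-scan_a1b2": {"source_scan_a1b2": ["rec0", "rec1"]},
    "1-scan_c3d4": {"source_scan_c3d4": ["rec2"]},
    # a join keeps one inner queue per source operator (left and right)
    "2-join_e5f6": {"0-scan_a1b2": [], "1-scan_c3d4": []},
}

# num_inputs sums over all of an operator's source queues, mirroring the
# sum(...) expression in the hunk above
unique_full_op_id = "2-join_e5f6"
num_inputs = sum(len(queue) for queue in input_queues[unique_full_op_id].values())
print(num_inputs)  # 0 -- the join has no work until the scans push outputs downstream
```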
```diff
@@ -88,8 +109,6 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
 
     def execute_plan(self, plan: PhysicalPlan) -> tuple[list[DataRecord], PlanStats]:
         """Initialize the stats and execute the plan."""
-        # for now, assert that the first operator in the plan is a ScanPhysicalOp
-        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
         logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
         logger.info(f"Plan Details: {plan}")
 
@@ -104,7 +123,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
         self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
         self.progress_manager.start()
 
-        # NOTE: we must handle progress manager outside of
+        # NOTE: we must handle progress manager outside of _execute_plan to ensure that it is shut down correctly;
         # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
         # because the progress manager cannot get a handle to the console
         try:
@@ -139,31 +158,43 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
         super().__init__(*args, **kwargs)
         self.max_workers = 1
 
-    def _any_queue_not_empty(self, queues: dict[str, list]) -> bool:
+    def _any_queue_not_empty(self, queues: dict[str, list] | dict[str, dict[str, list]]) -> bool:
         """Helper function to check if any queue is not empty."""
-
-
-
-
-
-
-
-
-
-
-
-
+        for _, value in queues.items():
+            if isinstance(value, dict):
+                if any(len(subqueue) > 0 for subqueue in value.values()):
+                    return True
+            elif len(value) > 0:
+                return True
+        return False
+
+    def _upstream_ops_finished(self, plan: PhysicalPlan, topo_idx: int, operator: PhysicalOperator, input_queues: dict[str, dict[str, list]]) -> bool:
+        """Helper function to check if agg / join operator is ready to process its inputs."""
+        # for agg / join operator, we can only process it when all upstream operators have finished processing their inputs
+        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+        upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
+        return not self._any_queue_not_empty(upstream_input_queues)
+
+
+    def _execute_plan(self, plan: PhysicalPlan, input_queues: dict[str, dict[str, list]], plan_stats: PlanStats) -> tuple[list[DataRecord], PlanStats]:
         # execute the plan until either:
         # 1. all records have been processed, or
         # 2. the final limit operation has completed (we break out of the loop if this happens)
         final_output_records = []
         while self._any_queue_not_empty(input_queues):
-            for
+            for topo_idx, operator in enumerate(plan):
                 # if this operator does not have enough inputs to execute, then skip it
-
-
-
-
+                source_unique_full_op_ids = (
+                    [f"source_{operator.get_full_op_id()}"]
+                    if isinstance(operator, (ContextScanOp, ScanPhysicalOp))
+                    else plan.get_source_unique_full_op_ids(topo_idx, operator)
+                )
+                unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+
+                num_inputs = sum(len(input_queues[unique_full_op_id][source_unique_full_op_id]) for source_unique_full_op_id in source_unique_full_op_ids)
+                agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues)
+                join_op_not_ready = isinstance(operator, JoinOp) and not self._upstream_ops_finished(plan, topo_idx, operator, input_queues)
+                if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
                     continue
 
                 # create empty lists for records and execution stats generated by executing this operator on its next input(s)
```
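`_any_queue_not_empty` is widened to accept either the flat or the nested queue layout; the check reduces to "some leaf list still holds records". A standalone copy of that logic, exercised on a toy nested dict (ids fabricated for illustration):

```python
# Standalone copy of the emptiness check from the hunk above.
def any_queue_not_empty(queues: dict[str, list] | dict[str, dict[str, list]]) -> bool:
    for _, value in queues.items():
        if isinstance(value, dict):
            if any(len(subqueue) > 0 for subqueue in value.values()):
                return True
        elif len(value) > 0:
            return True
    return False

queues = {"2-join_e5f6": {"0-scan_a1b2": [], "1-scan_c3d4": ["rec"]}}
assert any_queue_not_empty(queues)      # one leaf queue still has work
queues["2-join_e5f6"]["1-scan_c3d4"].clear()
assert not any_queue_not_empty(queues)  # all leaves drained
```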
```diff
@@ -171,49 +202,63 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
 
                 # if the next operator is an aggregate, process all the records in the input_queue
                 if isinstance(operator, AggregateOp):
-
+                    source_unique_full_op_id = source_unique_full_op_ids[0]
+                    input_records = [input_queues[unique_full_op_id][source_unique_full_op_id].pop(0) for _ in range(num_inputs)]
                     record_set = operator(candidates=input_records)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
                     num_outputs = sum(record.passed_operator for record in records)
 
                     # update the progress manager
-                    self.progress_manager.incr(
+                    self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+
+                # if this operator is a join, process all pairs of records from the two input queues
+                elif isinstance(operator, JoinOp):
+                    left_full_source_op_id = source_unique_full_op_ids[0]
+                    left_num_inputs = len(input_queues[unique_full_op_id][left_full_source_op_id])
+                    left_input_records = [input_queues[unique_full_op_id][left_full_source_op_id].pop(0) for _ in range(left_num_inputs)]
+
+                    right_full_source_op_id = source_unique_full_op_ids[1]
+                    right_num_inputs = len(input_queues[unique_full_op_id][right_full_source_op_id])
+                    right_input_records = [input_queues[unique_full_op_id][right_full_source_op_id].pop(0) for _ in range(right_num_inputs)]
+
+                    record_set, num_inputs_processed = operator(left_input_records, right_input_records)
+                    records = record_set.data_records
+                    record_op_stats = record_set.record_op_stats
+                    num_outputs = sum(record.passed_operator for record in records)
+
+                    # update the progress manager
+                    self.progress_manager.incr(unique_full_op_id, num_inputs=num_inputs_processed, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
                 # otherwise, process the next record in the input queue for this operator
                 else:
-
+                    source_unique_full_op_id = source_unique_full_op_ids[0]
+                    input_record = input_queues[unique_full_op_id][source_unique_full_op_id].pop(0)
                     record_set = operator(input_record)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
                     num_outputs = sum(record.passed_operator for record in records)
 
                     # update the progress manager
-                    self.progress_manager.incr(
+                    self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
 
                 # update plan stats
-                plan_stats.add_record_op_stats(record_op_stats)
-
-                # add records to the cache
-                self._add_records_to_cache(operator.target_cache_id, records)
+                plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
 
                 # update next input_queue or final_output_records
-                output_records = [record for record in records if record.passed_operator]
-
-
-                input_queues[
+                output_records = [record for record in records if record.passed_operator]
+                next_unique_full_op_id = plan.get_next_unique_full_op_id(topo_idx, operator)
+                if next_unique_full_op_id is not None:
+                    input_queues[next_unique_full_op_id][unique_full_op_id].extend(output_records)
                 else:
                     final_output_records.extend(output_records)
 
-                logger.info(f"Finished processing operator {operator.op_name()} ({
+                logger.info(f"Finished processing operator {operator.op_name()} ({unique_full_op_id}) on {num_inputs} records")
 
                 # break out of loop if the final operator is a LimitScanOp and we've reached its limit
-                if isinstance(plan.
+                if isinstance(plan.operator, LimitScanOp) and len(final_output_records) == plan.operator.limit:
                     break
 
-        # close the cache
-        self._close_cache([op.target_cache_id for op in plan.operators])
-
         # finalize plan stats
         plan_stats.finish()
 
```
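In both strategies the join path drains the left and right queues with `pop(0)`, sizing each batch before the comprehension so a fixed count is popped, then calls `operator(left_input_records, right_input_records)`, which returns a record set together with the number of inputs it processed. A toy stand-in reproducing just that calling convention (the real `JoinOp.__call__` in `palimpzest/query/operators/join.py` does much more):

```python
# Toy stand-in for the JoinOp calling convention used above; it illustrates
# the drain-then-call pattern and the (record_set, num_inputs_processed)
# return shape, not JoinOp itself.
def toy_join(left_records: list, right_records: list) -> tuple[list, int]:
    outputs = [(left, right) for left in left_records for right in right_records]
    return outputs, len(left_records) + len(right_records)

left_queue, right_queue = ["l0", "l1"], ["r0"]

# size the batch before popping, exactly like the hunks above
left_num_inputs = len(left_queue)
left_batch = [left_queue.pop(0) for _ in range(left_num_inputs)]
right_num_inputs = len(right_queue)
right_batch = [right_queue.pop(0) for _ in range(right_num_inputs)]

record_set, num_inputs_processed = toy_join(left_batch, right_batch)
print(record_set)            # [('l0', 'r0'), ('l1', 'r0')]
print(num_inputs_processed)  # 3
```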
```diff
@@ -221,8 +266,6 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
 
     def execute_plan(self, plan: PhysicalPlan):
         """Initialize the stats and execute the plan."""
-        # for now, assert that the first operator in the plan is a ScanPhysicalOp
-        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
         logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
         logger.info(f"Plan Details: {plan}")
 
@@ -237,7 +280,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
         self.progress_manager = create_progress_manager(plan, self.num_samples, self.progress)
         self.progress_manager.start()
 
-        # NOTE: we must handle progress manager outside of
+        # NOTE: we must handle progress manager outside of _execute_plan to ensure that it is shut down correctly;
         # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
         # because the progress manager cannot get a handle to the console
         try:
```