palimpzest 0.7.7__py3-none-any.whl → 0.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. palimpzest/constants.py +113 -75
  2. palimpzest/core/data/dataclasses.py +55 -38
  3. palimpzest/core/elements/index.py +5 -15
  4. palimpzest/core/elements/records.py +1 -1
  5. palimpzest/prompts/prompt_factory.py +1 -1
  6. palimpzest/query/execution/all_sample_execution_strategy.py +216 -0
  7. palimpzest/query/execution/execution_strategy.py +4 -4
  8. palimpzest/query/execution/execution_strategy_type.py +7 -1
  9. palimpzest/query/execution/mab_execution_strategy.py +184 -72
  10. palimpzest/query/execution/parallel_execution_strategy.py +182 -15
  11. palimpzest/query/execution/single_threaded_execution_strategy.py +21 -21
  12. palimpzest/query/generators/api_client_factory.py +6 -7
  13. palimpzest/query/generators/generators.py +5 -8
  14. palimpzest/query/operators/aggregate.py +4 -3
  15. palimpzest/query/operators/convert.py +1 -1
  16. palimpzest/query/operators/filter.py +1 -1
  17. palimpzest/query/operators/limit.py +1 -1
  18. palimpzest/query/operators/map.py +1 -1
  19. palimpzest/query/operators/physical.py +8 -4
  20. palimpzest/query/operators/project.py +1 -1
  21. palimpzest/query/operators/retrieve.py +7 -23
  22. palimpzest/query/operators/scan.py +1 -1
  23. palimpzest/query/optimizer/cost_model.py +54 -62
  24. palimpzest/query/optimizer/optimizer.py +2 -6
  25. palimpzest/query/optimizer/plan.py +4 -4
  26. palimpzest/query/optimizer/primitives.py +1 -1
  27. palimpzest/query/optimizer/rules.py +8 -26
  28. palimpzest/query/optimizer/tasks.py +3 -3
  29. palimpzest/query/processor/processing_strategy_type.py +2 -2
  30. palimpzest/query/processor/sentinel_processor.py +0 -2
  31. palimpzest/sets.py +2 -3
  32. palimpzest/utils/generation_helpers.py +1 -1
  33. palimpzest/utils/model_helpers.py +27 -9
  34. palimpzest/utils/progress.py +81 -72
  35. {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/METADATA +4 -2
  36. {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/RECORD +39 -38
  37. {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/WHEEL +1 -1
  38. {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/licenses/LICENSE +0 -0
  39. {palimpzest-0.7.7.dist-info → palimpzest-0.7.9.dist-info}/top_level.txt +0 -0
@@ -45,8 +45,8 @@ class ParallelExecutionStrategy(ExecutionStrategy):
     def _upstream_ops_finished(self, plan: PhysicalPlan, op_idx: int, input_queues: dict[str, list], future_queues: dict[str, list]) -> bool:
         """Helper function to check if all upstream operators have finished processing their inputs."""
         for upstream_op_idx in range(op_idx):
-            upstream_op_id = plan.operators[upstream_op_idx].get_op_id()
-            if len(input_queues[upstream_op_id]) > 0 or len(future_queues[upstream_op_id]) > 0:
+            upstream_full_op_id = plan.operators[upstream_op_idx].get_full_op_id()
+            if len(input_queues[upstream_full_op_id]) > 0 or len(future_queues[upstream_full_op_id]) > 0:
                 return False

         return True
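
The change that repeats throughout this release is keying per-operator state on get_full_op_id() instead of get_op_id(). A minimal sketch of the failure mode this guards against, assuming (hypothetically) that two logical operators can yield physical operators with the same op_id:

    # Hypothetical ids: two physical operators share an op_id but belong
    # to different logical operators.
    op_ids = ["9f2c", "9f2c"]
    logical_op_ids = ["filter-001", "filter-002"]

    queues_by_op_id = {op_id: [] for op_id in op_ids}
    assert len(queues_by_op_id) == 1  # collision: two operators, one queue

    full_op_ids = [f"{lid}-{oid}" for lid, oid in zip(logical_op_ids, op_ids)]
    queues_by_full_op_id = {fid: [] for fid in full_op_ids}
    assert len(queues_by_full_op_id) == 2  # one queue per operator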
@@ -57,14 +57,14 @@ class ParallelExecutionStrategy(ExecutionStrategy):
         the updates to plan stats and progress manager before returning the results from the finished futures.
         """
         # get the op_id for the operator
-        op_id = operator.get_op_id()
+        full_op_id = operator.get_full_op_id()

         # this function is called when the future queue is not empty
         # and the executor is not busy processing other futures
-        done_futures, not_done_futures = wait(future_queues[op_id], timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
+        done_futures, not_done_futures = wait(future_queues[full_op_id], timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)

         # add the unfinished futures back to the previous op's future queue
-        future_queues[op_id] = list(not_done_futures)
+        future_queues[full_op_id] = list(not_done_futures)

         # add the finished futures to the input queue for this operator
         output_records = []
@@ -75,7 +75,7 @@ class ParallelExecutionStrategy(ExecutionStrategy):
             num_outputs = sum(record.passed_operator for record in records)

             # update the progress manager
-            self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+            self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

             # update plan stats
             plan_stats.add_record_op_stats(record_op_stats)
@@ -106,35 +106,35 @@ class ParallelExecutionStrategy(ExecutionStrategy):
             final_op = plan.operators[-1]
             while self._any_queue_not_empty(input_queues) or self._any_queue_not_empty(future_queues):
                 for op_idx, operator in enumerate(plan.operators):
-                    op_id = operator.get_op_id()
+                    full_op_id = operator.get_full_op_id()

                     # get any finished futures from the previous operator and add them to the input queue for this operator
                     if not isinstance(operator, ScanPhysicalOp):
                         prev_operator = plan.operators[op_idx - 1]
                         records = self._process_future_results(prev_operator, future_queues, plan_stats)
-                        input_queues[op_id].extend(records)
+                        input_queues[full_op_id].extend(records)

                     # for the final operator, add any finished futures to the output_records
-                    if operator.get_op_id() == final_op.get_op_id():
+                    if full_op_id == final_op.get_full_op_id():
                         records = self._process_future_results(operator, future_queues, plan_stats)
                         output_records.extend(records)

                     # if this operator does not have enough inputs to execute, then skip it
-                    num_inputs = len(input_queues[op_id])
+                    num_inputs = len(input_queues[full_op_id])
                     agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, op_idx, input_queues, future_queues)
                     if num_inputs == 0 or agg_op_not_ready:
                         continue

                     # if this operator is an aggregate, process all the records in the input queue
                     if isinstance(operator, AggregateOp):
-                        input_records = [input_queues[op_id].pop(0) for _ in range(num_inputs)]
+                        input_records = [input_queues[full_op_id].pop(0) for _ in range(num_inputs)]
                         future = executor.submit(operator, input_records)
-                        future_queues[op_id].append(future)
+                        future_queues[full_op_id].append(future)

                     else:
-                        input_record = input_queues[op_id].pop(0)
+                        input_record = input_queues[full_op_id].pop(0)
                         future = executor.submit(operator, input_record)
-                        future_queues[op_id].append(future)
+                        future_queues[full_op_id].append(future)

                 # break out of loop if the final operator is a LimitScanOp and we've reached its limit
                 if isinstance(final_op, LimitScanOp) and len(output_records) == final_op.limit:
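
The scheduling loop above polls each operator's future queue with concurrent.futures.wait, which returns a (done, not_done) pair after the timeout; unfinished futures are re-queued and finished ones are drained. A minimal standalone sketch of that polling idiom, independent of palimpzest:

    # Minimal sketch of the wait()-based polling used by the strategy.
    from concurrent.futures import ThreadPoolExecutor, wait
    import time

    def work(x: int) -> int:
        time.sleep(0.05 * x)
        return x * x

    with ThreadPoolExecutor(max_workers=4) as executor:
        pending = [executor.submit(work, x) for x in range(4)]
        results = []
        while pending:
            done, not_done = wait(pending, timeout=0.1)
            pending = list(not_done)  # re-queue unfinished futures
            results.extend(f.result() for f in done)

    print(sorted(results))  # [0, 1, 4, 9]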
@@ -162,7 +162,174 @@ class ParallelExecutionStrategy(ExecutionStrategy):

         # initialize input queues and future queues for each operation
         input_queues = self._create_input_queues(plan)
-        future_queues = {op.get_op_id(): [] for op in plan.operators}
+        future_queues = {op.get_full_op_id(): [] for op in plan.operators}
+
+        # initialize and start the progress manager
+        self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
+        self.progress_manager.start()
+
+        # NOTE: we must handle progress manager outside of _execute_plan to ensure that it is shut down correctly;
+        # if we don't have the `finally:` branch, then program crashes can cause future program runs to fail
+        # because the progress manager cannot get a handle to the console
+        try:
+            # execute plan
+            output_records, plan_stats = self._execute_plan(plan, input_queues, future_queues, plan_stats)
+
+        finally:
+            # finish progress tracking
+            self.progress_manager.finish()
+
+        logger.info(f"Done executing plan: {plan.plan_id}")
+        logger.debug(f"Plan stats: (plan_cost={plan_stats.total_plan_cost}, plan_time={plan_stats.total_plan_time})")
+
+        return output_records, plan_stats
+
+
+class SequentialParallelExecutionStrategy(ExecutionStrategy):
+    """
+    A parallel execution strategy that processes operators sequentially.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_workers = (
+            self._get_parallel_max_workers()
+            if self.max_workers is None
+            else self.max_workers
+        )
+
+    def _get_parallel_max_workers(self):
+        # for now, return the number of system CPUs;
+        # in the future, we may want to consider the models the user has access to
+        # and whether or not they will encounter rate-limits. If they will, we should
+        # set the max workers in a manner that is designed to avoid hitting them.
+        # Doing this "right" may require considering their logical, physical plan,
+        # and tier status with LLM providers. It may also be worth dynamically
+        # changing the max_workers in response to 429 errors.
+        return max(int(0.8 * multiprocessing.cpu_count()), 1)
+
+    def _any_queue_not_empty(self, queues: dict[str, list]) -> bool:
+        """Helper function to check if any queue is not empty."""
+        return any(len(queue) > 0 for queue in queues.values())
+
+    def _upstream_ops_finished(self, plan: PhysicalPlan, op_idx: int, input_queues: dict[str, list], future_queues: dict[str, list]) -> bool:
+        """Helper function to check if all upstream operators have finished processing their inputs."""
+        for upstream_op_idx in range(op_idx):
+            upstream_full_op_id = plan.operators[upstream_op_idx].get_full_op_id()
+            if len(input_queues[upstream_full_op_id]) > 0 or len(future_queues[upstream_full_op_id]) > 0:
+                return False
+
+        return True
+
+    def _process_future_results(self, operator: PhysicalOperator, future_queues: dict[str, list], plan_stats: PlanStats) -> list[DataRecord]:
+        """
+        Helper function which takes an operator, the future queues, and plan stats, and performs
+        the updates to plan stats and progress manager before returning the results from the finished futures.
+        """
+        # get the op_id for the operator
+        full_op_id = operator.get_full_op_id()
+
+        # this function is called when the future queue is not empty
+        # and the executor is not busy processing other futures
+        done_futures, not_done_futures = wait(future_queues[full_op_id], timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
+
+        # add the unfinished futures back to the previous op's future queue
+        future_queues[full_op_id] = list(not_done_futures)
+
+        # add the finished futures to the input queue for this operator
+        output_records = []
+        for future in done_futures:
+            record_set: DataRecordSet = future.result()
+            records = record_set.data_records
+            record_op_stats = record_set.record_op_stats
+            num_outputs = sum(record.passed_operator for record in records)
+
+            # update the progress manager
+            self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+
+            # update plan stats
+            plan_stats.add_record_op_stats(record_op_stats)
+
+            # add records to the cache
+            self._add_records_to_cache(operator.target_cache_id, records)
+
+            # add records which aren't filtered to the output records
+            output_records.extend([record for record in records if record.passed_operator])
+
+        return output_records
+
+    def _execute_plan(
+        self,
+        plan: PhysicalPlan,
+        input_queues: dict[str, list],
+        future_queues: dict[str, list],
+        plan_stats: PlanStats,
+    ) -> tuple[list[DataRecord], PlanStats]:
+        # process all of the input records using a thread pool
+        output_records = []
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            logger.debug(f"Created thread pool with {self.max_workers} workers")
+
+            # execute the plan until either:
+            # 1. all records have been processed, or
+            # 2. the final limit operation has completed (we break out of the loop if this happens)
+            final_op = plan.operators[-1]
+            for op_idx, operator in enumerate(plan.operators):
+                full_op_id = operator.get_full_op_id()
+                input_queue = input_queues[full_op_id]
+
+                # if this operator is an aggregate, process all the records in the input queue
+                if isinstance(operator, AggregateOp):
+                    num_inputs = len(input_queue)
+                    input_records = [input_queue.pop(0) for _ in range(num_inputs)]
+                    future = executor.submit(operator, input_records)
+                    future_queues[full_op_id].append(future)
+
+                else:
+                    while len(input_queue) > 0:
+                        input_record = input_queue.pop(0)
+                        future = executor.submit(operator, input_record)
+                        future_queues[full_op_id].append(future)
+
+                # block until all futures for this operator have completed; and add finished futures to next operator's input
+                while len(future_queues[full_op_id]) > 0:
+                    records = self._process_future_results(operator, future_queues, plan_stats)
+
+                    # get any finished futures from the previous operator and add them to the input queue for this operator
+                    if full_op_id != final_op.get_full_op_id():
+                        next_op_id = plan.operators[op_idx + 1].get_full_op_id()
+                        input_queues[next_op_id].extend(records)
+
+                    # for the final operator, add any finished futures to the output_records
+                    else:
+                        output_records.extend(records)
+
+                # break out of loop if the final operator is a LimitScanOp and we've reached its limit
+                if isinstance(final_op, LimitScanOp) and len(output_records) == final_op.limit:
+                    break
+
+        # close the cache
+        self._close_cache([op.target_cache_id for op in plan.operators])
+
+        # finalize plan stats
+        plan_stats.finish()
+
+        return output_records, plan_stats
+
+
+    def execute_plan(self, plan: PhysicalPlan):
+        """Initialize the stats and execute the plan."""
+        # for now, assert that the first operator in the plan is a ScanPhysicalOp
+        assert isinstance(plan.operators[0], ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
+        logger.info(f"Executing plan {plan.plan_id} with {self.max_workers} workers")
+        logger.info(f"Plan Details: {plan}")
+
+        # initialize plan stats
+        plan_stats = PlanStats.from_plan(plan)
+        plan_stats.start()
+
+        # initialize input queues and future queues for each operation
+        input_queues = self._create_input_queues(plan)
+        future_queues = {op.get_full_op_id(): [] for op in plan.operators}

         # initialize and start the progress manager
         self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
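
The NOTE in the added code is the interesting part: the progress manager is started before, and shut down after, _execute_plan, with the shutdown in a finally: block so that a crash cannot leave the console handle held. A minimal standalone sketch of the same lifecycle, using a hypothetical stand-in class rather than palimpzest's progress manager:

    # Sketch of the shutdown pattern; ProgressManager is a hypothetical stand-in.
    class ProgressManager:
        def start(self):
            print("progress started")

        def finish(self):
            print("progress finished")  # must run even if execution crashes

    def run_plan(fail: bool):
        pm = ProgressManager()
        pm.start()
        try:
            if fail:
                raise RuntimeError("plan crashed")
            return "ok"
        finally:
            pm.finish()  # releases the console handle in all cases

    run_plan(fail=False)
    try:
        run_plan(fail=True)
    except RuntimeError:
        pass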
@@ -30,35 +30,35 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
         output_records = []
         for op_idx, operator in enumerate(plan.operators):
             # if we've filtered out all records, terminate early
-            op_id = operator.get_op_id()
-            num_inputs = len(input_queues[op_id])
+            full_op_id = operator.get_full_op_id()
+            num_inputs = len(input_queues[full_op_id])
             if num_inputs == 0:
                 break

             # begin to process this operator
             records, record_op_stats = [], []
-            logger.info(f"Processing operator {operator.op_name()} ({op_id})")
+            logger.info(f"Processing operator {operator.op_name()} ({full_op_id})")

             # if this operator is an aggregate, process all the records in the input_queue
             if isinstance(operator, AggregateOp):
-                record_set = operator(candidates=input_queues[op_id])
+                record_set = operator(candidates=input_queues[full_op_id])
                 records = record_set.data_records
                 record_op_stats = record_set.record_op_stats
                 num_outputs = sum(record.passed_operator for record in records)

                 # update the progress manager
-                self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+                self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

             # otherwise, process the records in the input queue for this operator one at a time
             else:
-                for input_record in input_queues[op_id]:
+                for input_record in input_queues[full_op_id]:
                     record_set = operator(input_record)
                     records.extend(record_set.data_records)
                     record_op_stats.extend(record_set.record_op_stats)
                     num_outputs = sum(record.passed_operator for record in record_set.data_records)

                     # update the progress manager
-                    self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+                    self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

                     # finish early if this is a limit
                     if isinstance(operator, LimitScanOp) and len(records) == operator.limit:
@@ -73,10 +73,10 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
             # update next input_queue (if it exists)
             output_records = [record for record in records if record.passed_operator]
             if op_idx + 1 < len(plan.operators):
-                next_op_id = plan.operators[op_idx + 1].get_op_id()
-                input_queues[next_op_id] = output_records
+                next_full_op_id = plan.operators[op_idx + 1].get_full_op_id()
+                input_queues[next_full_op_id] = output_records

-            logger.info(f"Finished processing operator {operator.op_name()} ({operator.get_op_id()}), and generated {len(records)} records")
+            logger.info(f"Finished processing operator {operator.op_name()} ({operator.get_full_op_id()}), and generated {len(records)} records")

         # close the cache
         self._close_cache([op.target_cache_id for op in plan.operators])
@@ -146,8 +146,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
     def _upstream_ops_finished(self, plan: PhysicalPlan, op_idx: int, input_queues: dict[str, list]) -> bool:
         """Helper function to check if all upstream operators have finished processing their inputs."""
         for upstream_op_idx in range(op_idx):
-            upstream_op_id = plan.operators[upstream_op_idx].get_op_id()
-            if len(input_queues[upstream_op_id]) > 0:
+            upstream_full_op_id = plan.operators[upstream_op_idx].get_full_op_id()
+            if len(input_queues[upstream_full_op_id]) > 0:
                 return False

         return True
@@ -160,8 +160,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
         while self._any_queue_not_empty(input_queues):
             for op_idx, operator in enumerate(plan.operators):
                 # if this operator does not have enough inputs to execute, then skip it
-                op_id = operator.get_op_id()
-                num_inputs = len(input_queues[op_id])
+                full_op_id = operator.get_full_op_id()
+                num_inputs = len(input_queues[full_op_id])
                 agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, op_idx, input_queues)
                 if num_inputs == 0 or agg_op_not_ready:
                     continue
@@ -171,25 +171,25 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):

                 # if the next operator is an aggregate, process all the records in the input_queue
                 if isinstance(operator, AggregateOp):
-                    input_records = [input_queues[op_id].pop(0) for _ in range(num_inputs)]
+                    input_records = [input_queues[full_op_id].pop(0) for _ in range(num_inputs)]
                     record_set = operator(candidates=input_records)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
                     num_outputs = sum(record.passed_operator for record in records)

                     # update the progress manager
-                    self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+                    self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

                 # otherwise, process the next record in the input queue for this operator
                 else:
-                    input_record = input_queues[op_id].pop(0)
+                    input_record = input_queues[full_op_id].pop(0)
                     record_set = operator(input_record)
                     records = record_set.data_records
                     record_op_stats = record_set.record_op_stats
                     num_outputs = sum(record.passed_operator for record in records)

                     # update the progress manager
-                    self.progress_manager.incr(op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())
+                    self.progress_manager.incr(full_op_id, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

                 # update plan stats
                 plan_stats.add_record_op_stats(record_op_stats)
@@ -200,12 +200,12 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
                 # update next input_queue or final_output_records
                 output_records = [record for record in records if record.passed_operator]
                 if op_idx + 1 < len(plan.operators):
-                    next_op_id = plan.operators[op_idx + 1].get_op_id()
-                    input_queues[next_op_id].extend(output_records)
+                    next_full_op_id = plan.operators[op_idx + 1].get_full_op_id()
+                    input_queues[next_full_op_id].extend(output_records)
                 else:
                     final_output_records.extend(output_records)

-                logger.info(f"Finished processing operator {operator.op_name()} ({operator.get_op_id()}) on {num_inputs} records")
+                logger.info(f"Finished processing operator {operator.op_name()} ({operator.get_full_op_id()}) on {num_inputs} records")

             # break out of loop if the final operator is a LimitScanOp and we've reached its limit
             if isinstance(plan.operators[-1], LimitScanOp) and len(final_output_records) == plan.operators[-1].limit:
@@ -22,10 +22,9 @@ class APIClientFactory:
     @staticmethod
     def _create_client(api_client: APIClient, api_key: str):
         """Create a new client instance based on the api_client name."""
-        match api_client:
-            case APIClient.OPENAI:
-                return OpenAI(api_key=api_key)
-            case APIClient.TOGETHER:
-                return Together(api_key=api_key)
-            case _:
-                raise ValueError(f"Unknown api_client: {api_client}")
+        if api_client == APIClient.OPENAI:
+            return OpenAI(api_key=api_key)
+        elif api_client == APIClient.TOGETHER:
+            return Together(api_key=api_key)
+        else:
+            raise ValueError(f"Unknown api_client: {api_client}")
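
This rewrite of _create_client is behavior-preserving; match/case requires Python 3.10 or newer, so the if/elif chain presumably widens the range of interpreters the wheel supports. A minimal sketch of the equivalence, using a hypothetical stand-in enum and string "clients" rather than the real OpenAI/Together classes:

    # Dispatch without `match`, runnable on Python < 3.10.
    # APIClient here is a hypothetical stand-in for palimpzest's enum.
    from enum import Enum

    class APIClient(Enum):
        OPENAI = "openai"
        TOGETHER = "together"

    def create_client(api_client: APIClient, api_key: str) -> str:
        if api_client == APIClient.OPENAI:
            return f"OpenAI(api_key={api_key!r})"
        elif api_client == APIClient.TOGETHER:
            return f"Together(api_key={api_key!r})"
        else:
            raise ValueError(f"Unknown api_client: {api_client}")

    assert create_client(APIClient.OPENAI, "sk-test").startswith("OpenAI")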
@@ -49,10 +49,10 @@ def generator_factory(
     """
     Factory function to return the correct generator based on the model, strategy, and cardinality.
     """
-    if model in [Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4o_V, Model.GPT_4o_MINI_V]:
+    if model.is_openai_model():
         return OpenAIGenerator(model, prompt_strategy, cardinality, verbose)

-    elif model in [Model.MIXTRAL, Model.LLAMA3, Model.LLAMA3_V, Model.DEEPSEEK]:
+    elif model.is_together_model():
         return TogetherGenerator(model, prompt_strategy, cardinality, verbose)

     raise Exception(f"Unsupported model: {model}")
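
Several hunks in generators.py replace hard-coded model lists with predicate methods on the Model enum. Those methods are not shown in this diff (they live in palimpzest/constants.py, which changed by +113/-75 above); a hedged sketch of how such family predicates can be attached to a Python enum, with made-up members:

    # Hypothetical sketch only; the real Model enum may differ.
    from enum import Enum

    class Model(Enum):
        GPT_4o = "gpt-4o"
        GPT_4o_MINI = "gpt-4o-mini"
        MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
        LLAMA3 = "meta-llama/Meta-Llama-3-8B-Instruct"

        def is_openai_model(self) -> bool:
            return self in (Model.GPT_4o, Model.GPT_4o_MINI)

        def is_together_model(self) -> bool:
            return self in (Model.MIXTRAL, Model.LLAMA3)

    assert Model.GPT_4o.is_openai_model() and not Model.GPT_4o.is_together_model()

Centralizing the family test means a newly supported model is registered once in the enum instead of being appended to every call-site list.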
@@ -61,8 +61,6 @@ def generator_factory(
 def get_api_key(key: str) -> str:
     # get API key from environment or throw an exception if it's not set
     if key not in os.environ:
-        print(f"KEY: {key}")
-        print(f"{os.environ.keys()}")
         raise ValueError("key not found in environment variables")

     return os.environ[key]
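
The two removed print statements wrote the missing key name and the full list of environment variable names to stdout on every failed lookup; dropping them leaves the ValueError to report the failure without echoing environment details into logs.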
@@ -464,7 +462,7 @@ class OpenAIGenerator(BaseGenerator[str | list[str], str]):
         verbose: bool = False,
     ):
         # assert that model is an OpenAI model
-        assert model in [Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4o_V, Model.GPT_4o_MINI_V]
+        assert model.is_openai_model()
         super().__init__(model, prompt_strategy, cardinality, verbose, "developer")

     def _get_client_or_model(self, **kwargs) -> OpenAI:
@@ -508,7 +506,7 @@ class TogetherGenerator(BaseGenerator[str | list[str], str]):
         verbose: bool = False,
     ):
         # assert that model is a model offered by Together
-        assert model in [Model.MIXTRAL, Model.LLAMA3, Model.LLAMA3_V, Model.DEEPSEEK]
+        assert model.is_together_model()
         super().__init__(model, prompt_strategy, cardinality, verbose, "system")

     def _generate_payload(self, messages: list[dict], **kwargs) -> dict:
@@ -525,7 +523,7 @@ class TogetherGenerator(BaseGenerator[str | list[str], str]):
         For LLAMA3, the payload needs to be in a {"role": <role>, "content": <content>} format.
         """
         # for other models, use our standard payload generation
-        if self.model != Model.LLAMA3:
+        if not self.model.is_llama_model():
             return super()._generate_payload(messages, **kwargs)

         # get basic parameters
@@ -593,7 +591,6 @@ def code_ensemble_execution(
         preds.append(pred)

     preds = [pred for pred in preds if pred is not None]
-    print(preds)

     if len(preds) == 1:
         majority_response = preds[0]
@@ -138,7 +138,7 @@ class ApplyGroupByOp(AggregateOp):
             record_parent_id=dr.parent_id,
             record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=total_time / len(drs),
@@ -198,7 +198,8 @@ class AverageAggregateOp(AggregateOp):
             record_parent_id=dr.parent_id,
             record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
+            logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=time.time() - start_time,
             cost_per_record=0.0,
@@ -251,7 +252,7 @@ class CountAggregateOp(AggregateOp):
             record_parent_id=dr.parent_id,
             record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=time.time() - start_time,
@@ -115,7 +115,7 @@ class ConvertOp(PhysicalOperator, ABC):
             record_parent_id=dr.parent_id,
             record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=time_per_record,
@@ -84,7 +84,7 @@ class FilterOp(PhysicalOperator, ABC):
             record_parent_id=dr.parent_id,
             record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=total_time,
@@ -44,7 +44,7 @@ class LimitScanOp(PhysicalOperator):
             record_parent_id=dr.parent_id,
             record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=0.0,
@@ -47,7 +47,7 @@ class MapOp(PhysicalOperator):
             record_parent_id=record.parent_id,
             record_source_idx=record.source_idx,
             record_state=record.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=total_time,
@@ -58,8 +58,8 @@ class PhysicalOperator:
         return op

     def __eq__(self, other) -> bool:
-        all_id_params_match = all(value == getattr(other, key) for key, value in self.get_id_params().items())
-        return isinstance(other, self.__class__) and all_id_params_match
+        all_op_params_match = all(value == getattr(other, key) for key, value in self.get_op_params().items())
+        return isinstance(other, self.__class__) and all_op_params_match

     def copy(self) -> PhysicalOperator:
         return self.__class__(**self.get_op_params())
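
With __eq__ now driven by get_op_params() (the constructor kwargs) instead of get_id_params(), an operator and its copy() compare equal by construction. A simplified sketch of the idiom with a hypothetical operator class:

    # Simplified sketch; FilterOp here is hypothetical, not palimpzest's.
    class FilterOp:
        def __init__(self, predicate: str, model: str):
            self.predicate = predicate
            self.model = model

        def get_op_params(self) -> dict:
            return {"predicate": self.predicate, "model": self.model}

        def __eq__(self, other) -> bool:
            all_op_params_match = all(
                value == getattr(other, key, None)
                for key, value in self.get_op_params().items()
            )
            return isinstance(other, self.__class__) and all_op_params_match

        def copy(self) -> "FilterOp":
            return self.__class__(**self.get_op_params())

    op = FilterOp("is_spam", "gpt-4o-mini")
    assert op == op.copy()  # copies round-trip under the new __eq__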
@@ -79,7 +79,8 @@ class PhysicalOperator:
         This is particularly true for convert operations, where the output schema
         is now the union of the input and output schemas of the logical operator.
         """
-        return {"generated_fields": self.generated_fields}
+        # return {"generated_fields": self.generated_fields}
+        return {}

     def get_op_params(self) -> dict:
         """
@@ -129,8 +130,11 @@ class PhysicalOperator:
     def get_logical_op_id(self) -> str | None:
         return self.logical_op_id

+    def get_full_op_id(self):
+        return f"{self.get_logical_op_id()}-{self.get_op_id()}"
+
     def __hash__(self):
-        return int(self.op_id, 16)
+        return int(self.op_id, 16)  # NOTE: should we use self.get_full_op_id() instead?

     def get_model_name(self) -> str | None:
         """Returns the name of the model used by the physical operator (if it sets self.model). Otherwise, it returns None."""
@@ -42,7 +42,7 @@ class ProjectOp(PhysicalOperator):
             record_parent_id=dr.parent_id,
             record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=0.0,
@@ -8,7 +8,6 @@ from chromadb.api.models.Collection import Collection
 from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
 from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction
 from openai import OpenAI
-from ragatouille.RAGPretrainedModel import RAGPretrainedModel
 from sentence_transformers import SentenceTransformer

 from palimpzest.constants import MODEL_CARDS, Model
@@ -21,7 +20,7 @@ from palimpzest.query.operators.physical import PhysicalOperator
 class RetrieveOp(PhysicalOperator):
     def __init__(
         self,
-        index: Collection | RAGPretrainedModel,
+        index: Collection,
         search_attr: str,
         output_attrs: list[dict] | type[Schema],
         search_func: Callable | None,
@@ -33,7 +32,7 @@ class RetrieveOp(PhysicalOperator):
         Initialize the RetrieveOp object.

         Args:
-            index (Collection | RAGPretrainedModel): The PZ index to use for retrieval.
+            index (Collection): The PZ index to use for retrieval.
             search_attr (str): The attribute to search on.
             output_attrs (list[dict]): The output fields containing the results of the search.
             search_func (Callable | None): The function to use for searching the index. If None, the default search function will be used.
@@ -100,7 +99,7 @@ class RetrieveOp(PhysicalOperator):
             quality=1.0,
         )

-    def default_search_func(self, index: Collection | RAGPretrainedModel, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
+    def default_search_func(self, index: Collection, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
         """
         Default search function for the Retrieve operation. This function uses the index to
         retrieve the top-k results for the given query. The query will be a (possibly singleton)
@@ -132,24 +131,8 @@ class RetrieveOp(PhysicalOperator):
             # NOTE: self.output_field_names must be a singleton for default_search_func to be used
             return {self.output_field_names[0]: final_results}

-        elif isinstance(index, RAGPretrainedModel):
-            # if the index is a rag model, use the rag model to get the top k results
-            results = index.search(query, k=k)
-
-            # the results will be a list[dict]; if the input is a singleton list, however
-            # it will be a list[list[dict]]; if the input is a list of lists
-            final_results = []
-            if is_singleton_list:
-                final_results = [result["content"] for result in results]
-            else:
-                for query_results in results:
-                    final_results.append([result["content"] for result in query_results])
-
-            # NOTE: self.output_field_names must be a singleton for default_search_func to be used
-            return {self.output_field_names[0]: final_results}
-
         else:
-            raise ValueError("Unsupported index type. Must be either a Collection or RAGPretrainedModel.")
+            raise ValueError("Unsupported index type. Must be a Collection.")

     def _create_record_set(
         self,
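
With the RAGPretrainedModel branch removed, chromadb's Collection is the only index type default_search_func accepts. For orientation, a hedged sketch of the kind of top-k lookup the surviving Collection branch performs (setup elided in the diff; chromadb's query returns a dict whose "documents" entry holds one result list per query):

    # Hedged sketch of a top-k chromadb lookup; collection contents are made up.
    import chromadb

    client = chromadb.Client()  # in-memory client
    collection = client.get_or_create_collection("pz_sketch")
    collection.add(ids=["1", "2"], documents=["a cat photo", "a tax form"])

    results = collection.query(query_texts=["feline picture"], n_results=1)
    top_k_docs = results["documents"][0]  # one list of documents per query
    print(top_k_docs)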
@@ -180,7 +163,7 @@ class RetrieveOp(PhysicalOperator):
             record_parent_id=output_dr.parent_id,
             record_source_idx=output_dr.source_idx,
             record_state=record_state,
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=total_time,
@@ -231,7 +214,8 @@ class RetrieveOp(PhysicalOperator):

         model_name = self.index._embedding_function._model_name if uses_openai_embedding_fcn else "clip-ViT-B-32"
         err_msg = f"For Chromadb, we currently only support `text-embedding-3-small` and `clip-ViT-B-32`; your index uses: {model_name}"
-        assert model_name in [Model.TEXT_EMBEDDING_3_SMALL.value, Model.CLIP_VIT_B_32.value], err_msg
+        embedding_model_names = [model.value for model in Model if model.is_embedding_model()]
+        assert model_name in embedding_model_names, err_msg

         # compute embeddings
         try:
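
Deriving the allowed embedding names from the enum keeps this assertion in sync as models are added, instead of hard-coding the two current values. The is_embedding_model() predicate is not shown in this diff; a hypothetical sketch consistent with the usage above:

    # Hypothetical sketch; the real Model enum lives in palimpzest/constants.py.
    from enum import Enum

    class Model(Enum):
        TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
        CLIP_VIT_B_32 = "clip-ViT-B-32"
        GPT_4o = "gpt-4o"

        def is_embedding_model(self) -> bool:
            return self in (Model.TEXT_EMBEDDING_3_SMALL, Model.CLIP_VIT_B_32)

    embedding_model_names = [m.value for m in Model if m.is_embedding_model()]
    assert "text-embedding-3-small" in embedding_model_names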