PyPI - palimpzest - Versions diffs - 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

palimpzest/constants.py +13 -4
palimpzest/core/data/dataset.py +75 -5
palimpzest/core/elements/groupbysig.py +5 -1
palimpzest/core/elements/records.py +16 -7
palimpzest/core/lib/schemas.py +26 -3
palimpzest/core/models.py +4 -4
palimpzest/prompts/aggregate_prompts.py +99 -0
palimpzest/prompts/prompt_factory.py +162 -75
palimpzest/prompts/utils.py +38 -1
palimpzest/prompts/validator.py +24 -24
palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
palimpzest/query/execution/execution_strategy.py +8 -8
palimpzest/query/execution/mab_execution_strategy.py +30 -11
palimpzest/query/execution/parallel_execution_strategy.py +31 -7
palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
palimpzest/query/generators/generators.py +9 -7
palimpzest/query/operators/__init__.py +10 -6
palimpzest/query/operators/aggregate.py +394 -10
palimpzest/query/operators/convert.py +1 -1
palimpzest/query/operators/join.py +279 -23
palimpzest/query/operators/logical.py +36 -11
palimpzest/query/operators/mixture_of_agents.py +3 -1
palimpzest/query/operators/physical.py +5 -2
palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
palimpzest/query/optimizer/__init__.py +11 -3
palimpzest/query/optimizer/cost_model.py +5 -5
palimpzest/query/optimizer/optimizer.py +3 -2
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/rules.py +73 -13
palimpzest/query/optimizer/tasks.py +4 -4
palimpzest/utils/progress.py +19 -17
palimpzest/validator/validator.py +7 -7
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/RECORD +37 -36
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0

palimpzest/query/operators/join.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import threading
 import time
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -37,10 +38,9 @@ class JoinOp(PhysicalOperator, ABC):
     def __init__(
         self,
         condition: str,
-        model: Model,
-        prompt_strategy: PromptStrategy = PromptStrategy.JOIN,
+        how: str = "inner",
+        on: list[str] | None = None,
         join_parallelism: int = 64,
-        reasoning_effort: str | None = None,
         retain_inputs: bool = True,
         desc: str | None = None,
         *args,
@@ -49,33 +49,37 @@ class JoinOp(PhysicalOperator, ABC):
         super().__init__(*args, **kwargs)
         assert self.input_schema == self.output_schema, "Input and output schemas must match for JoinOp"
         self.condition = condition
-        self.model = model
-        self.prompt_strategy = prompt_strategy
+        self.how = how
+        self.on = on
         self.join_parallelism = join_parallelism
-        self.reasoning_effort = reasoning_effort
         self.retain_inputs = retain_inputs
         self.desc = desc
-        self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
         self.join_idx = 0
+        self.finished = False
         # maintain list(s) of input records for the join
         self._left_input_records: list[DataRecord] = []
         self._right_input_records: list[DataRecord] = []
+        # maintain set of left/right record ids that have been joined (for left/right/outer joins)
+        self._left_joined_record_ids: set[str] = set()
+        self._right_joined_record_ids: set[str] = set()
     def __str__(self):
         op = super().__str__()
         op += f"    Condition: {self.condition}\n"
+        op += f"    How: {self.how}\n"
+        op += f"    On: {self.on}\n"
         return op
     def get_id_params(self):
         id_params = super().get_id_params()
         id_params = {
             "condition": self.condition,
-            "model": self.model.value,
-            "prompt_strategy": self.prompt_strategy.value,
             "join_parallelism": self.join_parallelism,
-            "reasoning_effort": self.reasoning_effort,
             "desc": self.desc,
+            "how": self.how,
+            "on": self.on,
             **id_params,
         }
         return id_params
@@ -84,23 +88,232 @@ class JoinOp(PhysicalOperator, ABC):
         op_params = super().get_op_params()
         op_params = {
             "condition": self.condition,
-            "model": self.model,
-            "prompt_strategy": self.prompt_strategy,
             "join_parallelism": self.join_parallelism,
-            "reasoning_effort": self.reasoning_effort,
             "retain_inputs": self.retain_inputs,
             "desc": self.desc,
+            "how": self.how,
+            "on": self.on,
             **op_params,
         }
         return op_params
-    def get_model_name(self):
-        return self.model.value
+    def _compute_unmatched_records(self) -> DataRecordSet:
+        """Helper function to compute unmatched records for left/right/outer joins."""
+        def join_unmatched_records(input_records: list[DataRecord] | list[tuple[DataRecord, list[float]]], joined_record_ids: set[str], left: bool = True):
+            records, record_op_stats_lst = [], []
+            for record in input_records:
+                start_time = time.time()
+                record = record[0] if isinstance(record, tuple) else record
+                if record._id not in joined_record_ids:
+                    unmatched_dr = (
+                        DataRecord.from_join_parents(self.output_schema, record, None)
+                        if left
+                        else DataRecord.from_join_parents(self.output_schema, None, record)
+                    )
+                    unmatched_dr._passed_operator = True
+                    # compute record stats and add to output_record_op_stats
+                    time_per_record = time.time() - start_time
+                    record_op_stats = RecordOpStats(
+                        record_id=unmatched_dr._id,
+                        record_parent_ids=unmatched_dr._parent_ids,
+                        record_source_indices=unmatched_dr._source_indices,
+                        record_state=unmatched_dr.to_dict(include_bytes=False),
+                        full_op_id=self.get_full_op_id(),
+                        logical_op_id=self.logical_op_id,
+                        op_name=self.op_name(),
+                        time_per_record=time_per_record,
+                        cost_per_record=0.0,
+                        model_name=self.get_model_name(),
+                        join_condition=str(self.on),
+                        fn_call_duration_secs=time_per_record,
+                        answer={"passed_operator": True},
+                        passed_operator=True,
+                        op_details={k: str(v) for k, v in self.get_id_params().items()},
+                    )
+                    records.append(unmatched_dr)
+                    record_op_stats_lst.append(record_op_stats)
+            return records, record_op_stats_lst
+        records, record_op_stats = [], []
+        if self.how == "left":
+            records, record_op_stats = join_unmatched_records(self._left_input_records, self._left_joined_record_ids, left=True)
+        elif self.how == "right":
+            records, record_op_stats = join_unmatched_records(self._right_input_records, self._right_joined_record_ids, left=False)
+        elif self.how == "outer":
+            records, record_op_stats = join_unmatched_records(self._left_input_records, self._left_joined_record_ids, left=True)
+            right_records, right_record_op_stats = join_unmatched_records(self._right_input_records, self._right_joined_record_ids, left=False)
+            records.extend(right_records)
+            record_op_stats.extend(right_record_op_stats)
+        return DataRecordSet(records, record_op_stats)
     @abstractmethod
     def naive_cost_estimates(self, left_source_op_cost_estimates: OperatorCostEstimates, right_source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
         pass
+    def set_finished(self):
+        """Mark the operator as finished after computing left/right/outer join logic."""
+        self.finished = True
+class RelationalJoin(JoinOp):
+    def get_model_name(self):
+        return None
+    def _process_join_candidate_pair(self, left_candidate, right_candidate) -> tuple[DataRecord, RecordOpStats]:
+        start_time = time.time()
+        # determine whether or not the join was satisfied
+        passed_operator = all(
+            left_candidate[field] == right_candidate[field]
+            for field in self.on
+        )
+        # handle different join types
+        if self.how == "left" and passed_operator:
+            self._left_joined_record_ids.add(left_candidate._id)
+        elif self.how == "right" and passed_operator:
+            self._right_joined_record_ids.add(right_candidate._id)
+        elif self.how == "outer" and passed_operator:
+            self._left_joined_record_ids.add(left_candidate._id)
+            self._right_joined_record_ids.add(right_candidate._id)
+        # compute output record and add to output_records
+        join_dr = DataRecord.from_join_parents(self.output_schema, left_candidate, right_candidate)
+        join_dr._passed_operator = passed_operator
+        # compute record stats and add to output_record_op_stats
+        time_per_record = time.time() - start_time
+        record_op_stats = RecordOpStats(
+            record_id=join_dr._id,
+            record_parent_ids=join_dr._parent_ids,
+            record_source_indices=join_dr._source_indices,
+            record_state=join_dr.to_dict(include_bytes=False),
+            full_op_id=self.get_full_op_id(),
+            logical_op_id=self.logical_op_id,
+            op_name=self.op_name(),
+            time_per_record=time_per_record,
+            cost_per_record=0.0,
+            model_name=self.get_model_name(),
+            join_condition=str(self.on),
+            fn_call_duration_secs=time_per_record,
+            answer={"passed_operator": passed_operator},
+            passed_operator=passed_operator,
+            op_details={k: str(v) for k, v in self.get_id_params().items()},
+        )
+        return join_dr, record_op_stats
+    def naive_cost_estimates(self, left_source_op_cost_estimates: OperatorCostEstimates, right_source_op_cost_estimates: OperatorCostEstimates):
+        # estimate output cardinality using a constant assumption of the filter selectivity
+        selectivity = NAIVE_EST_JOIN_SELECTIVITY
+        cardinality = selectivity * (left_source_op_cost_estimates.cardinality * right_source_op_cost_estimates.cardinality)
+        # estimate 1 ms execution time per input record pair
+        time_per_record = 0.001 * (left_source_op_cost_estimates.cardinality + right_source_op_cost_estimates.cardinality)
+        return OperatorCostEstimates(
+            cardinality=cardinality,
+            time_per_record=time_per_record,
+            cost_per_record=0.0,
+            quality=1.0,
+        )
+    def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord], final: bool = False) -> tuple[DataRecordSet, int]:
+        # create the set of candidates to join
+        join_candidates = []
+        for candidate in left_candidates:
+            for right_candidate in right_candidates:
+                join_candidates.append((candidate, right_candidate))
+            for right_candidate in self._right_input_records:
+                join_candidates.append((candidate, right_candidate))
+        for candidate in self._left_input_records:
+            for right_candidate in right_candidates:
+                join_candidates.append((candidate, right_candidate))
+        # apply the join logic to each pair of candidates
+        output_records, output_record_op_stats = [], []
+        with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
+            futures = [
+                executor.submit(self._process_join_candidate_pair, candidate, right_candidate)
+                for candidate, right_candidate in join_candidates
+            ]
+            # collect results as they complete
+            for future in as_completed(futures):
+                self.join_idx += 1
+                join_output_record, join_output_record_op_stats = future.result()
+                output_records.append(join_output_record)
+                output_record_op_stats.append(join_output_record_op_stats)
+        # compute the number of inputs processed
+        num_inputs_processed = len(join_candidates)
+        # store input records to join with new records added later
+        if self.retain_inputs:
+            self._left_input_records.extend(left_candidates)
+            self._right_input_records.extend(right_candidates)
+        # if this is the final call, then add in any left/right/outer join records that did not match
+        if final:
+            return self._compute_unmatched_records(), 0
+        # return empty DataRecordSet if no output records were produced
+        if len(output_records) == 0:
+            return DataRecordSet([], []), num_inputs_processed
+        return DataRecordSet(output_records, output_record_op_stats), num_inputs_processed
+class LLMJoin(JoinOp):
+    def __init__(
+        self,
+        model: Model,
+        prompt_strategy: PromptStrategy = PromptStrategy.JOIN,
+        reasoning_effort: str | None = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.model = model
+        self.prompt_strategy = prompt_strategy
+        self.reasoning_effort = reasoning_effort
+        self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
+    def __str__(self):
+        op = super().__str__()
+        op += f"    Model: {self.model.value}\n"
+        op += f"    Reasoning Effort: {self.reasoning_effort}\n"
+        op += f"    Prompt Strategy: {self.prompt_strategy.value}\n"
+        return op
+    def get_id_params(self):
+        id_params = super().get_id_params()
+        id_params = {
+            "model": self.model.value,
+            "prompt_strategy": self.prompt_strategy.value,
+            "reasoning_effort": self.reasoning_effort,
+            **id_params,
+        }
+        return id_params
+    def get_op_params(self):
+        op_params = super().get_op_params()
+        op_params = {
+            "model": self.model,
+            "prompt_strategy": self.prompt_strategy,
+            "reasoning_effort": self.reasoning_effort,
+            **op_params,
+        }
+        return op_params
+    def get_model_name(self):
+        return self.model.value
     def _process_join_candidate_pair(
         self,
         left_candidate: DataRecord,
@@ -116,6 +329,15 @@ class JoinOp(PhysicalOperator, ABC):
         # determine whether or not the join was satisfied
         passed_operator = field_answers["passed_operator"]
+        # handle different join types
+        if self.how == "left" and passed_operator:
+            self._left_joined_record_ids.add(left_candidate._id)
+        elif self.how == "right" and passed_operator:
+            self._right_joined_record_ids.add(right_candidate._id)
+        elif self.how == "outer" and passed_operator:
+            self._left_joined_record_ids.add(left_candidate._id)
+            self._right_joined_record_ids.add(right_candidate._id)
         # compute output record and add to output_records
         join_dr = DataRecord.from_join_parents(self.output_schema, left_candidate, right_candidate)
         join_dr._passed_operator = passed_operator
@@ -149,7 +371,7 @@ class JoinOp(PhysicalOperator, ABC):
         return join_dr, record_op_stats
-class NestedLoopsJoin(JoinOp):
+class NestedLoopsJoin(LLMJoin):
     def naive_cost_estimates(self, left_source_op_cost_estimates: OperatorCostEstimates, right_source_op_cost_estimates: OperatorCostEstimates):
         # estimate number of input tokens from source
@@ -192,7 +414,7 @@ class NestedLoopsJoin(JoinOp):
             quality=quality,
         )
-    def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord]) -> tuple[DataRecordSet, int]:
+    def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord], final: bool = False) -> tuple[DataRecordSet, int]:
         # get the set of input fields from both records in the join
         input_fields = self.get_input_fields()
@@ -234,6 +456,10 @@ class NestedLoopsJoin(JoinOp):
             self._left_input_records.extend(left_candidates)
             self._right_input_records.extend(right_candidates)
+        # if this is the final call, then add in any left/right/outer join records that did not match
+        if final:
+            return self._compute_unmatched_records(), 0
         # return empty DataRecordSet if no output records were produced
         if len(output_records) == 0:
             return DataRecordSet([], []), num_inputs_processed
@@ -241,7 +467,7 @@ class NestedLoopsJoin(JoinOp):
         return DataRecordSet(output_records, output_record_op_stats), num_inputs_processed
-class EmbeddingJoin(JoinOp):
+class EmbeddingJoin(LLMJoin):
     # NOTE: we currently do not support audio joins as embedding models for audio seem to have
     # specialized use cases (e.g., speech-to-text) with strict requirements on things like e.g. sample rate
     def __init__(
@@ -261,6 +487,8 @@ class EmbeddingJoin(JoinOp):
             if field_name.split(".")[-1] in self.get_input_fields()
         ])
         self.embedding_model = Model.TEXT_EMBEDDING_3_SMALL if self.text_only else Model.CLIP_VIT_B_32
+        self.clip_model = None
+        self._lock = threading.Lock()
         # keep track of embedding costs that could not be amortized if no output records were produced
         self.residual_embedding_cost = 0.0
@@ -276,6 +504,11 @@ class EmbeddingJoin(JoinOp):
         self.min_matching_sim = float("inf")
         self.max_non_matching_sim = float("-inf")
+    def __str__(self):
+        op = super().__str__()
+        op += f"    Num Samples: {self.num_samples}\n"
+        return op
     def get_id_params(self):
         id_params = super().get_id_params()
         id_params = {
@@ -327,6 +560,12 @@ class EmbeddingJoin(JoinOp):
             quality=quality,
         )
+    def _get_clip_model(self):
+        with self._lock:
+            if self.clip_model is None:
+                self.clip_model = SentenceTransformer(self.embedding_model.value)
+            return self.clip_model
     def _compute_embeddings(self, candidates: list[DataRecord], input_fields: list[str]) -> tuple[np.ndarray, GenerationStats]:
         # return empty array and empty stats if no candidates
         if len(candidates) == 0:
@@ -342,7 +581,7 @@ class EmbeddingJoin(JoinOp):
             total_input_tokens = response.usage.total_tokens
             embeddings = np.array([item.embedding for item in response.data])
         else:
-            model = SentenceTransformer(self.embedding_model.value)
+            model = self._get_clip_model()
             embeddings = np.zeros((len(candidates), 512))  # CLIP embeddings are 512-dimensional
             num_input_fields_present = 0
             for field in input_fields:
@@ -389,6 +628,15 @@ class EmbeddingJoin(JoinOp):
         join_dr = DataRecord.from_join_parents(self.output_schema, left_candidate, right_candidate)
         join_dr._passed_operator = passed_operator
+        # handle different join types
+        if self.how == "left" and passed_operator:
+            self._left_joined_record_ids.add(left_candidate._id)
+        elif self.how == "right" and passed_operator:
+            self._right_joined_record_ids.add(right_candidate._id)
+        elif self.how == "outer" and passed_operator:
+            self._left_joined_record_ids.add(left_candidate._id)
+            self._right_joined_record_ids.add(right_candidate._id)
         # NOTE: embedding costs are amortized over all records and added at the end of __call__
         # compute record stats and add to output_record_op_stats
         record_op_stats = RecordOpStats(
@@ -410,7 +658,7 @@ class EmbeddingJoin(JoinOp):
         return join_dr, record_op_stats
-    def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord]) -> tuple[DataRecordSet, int]:
+    def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord], final: bool = False) -> tuple[DataRecordSet, int]:
         # get the set of input fields from both records in the join
         input_fields = self.get_input_fields()
@@ -468,18 +716,22 @@ class EmbeddingJoin(JoinOp):
                         self.max_non_matching_sim = embedding_sim
                     if records_joined and embedding_sim < self.min_matching_sim:
                         self.min_matching_sim = embedding_sim
             # update samples drawn and num_inputs_processed
             self.samples_drawn += samples_to_draw
             num_inputs_processed += samples_to_draw
         # process remaining candidates based on embedding similarity
         if len(join_candidates) > 0:
-             assert self.samples_drawn == self.num_samples, "All samples should have been drawn before processing remaining candidates"
+             assert self.samples_drawn >= self.num_samples, "All samples should have been drawn before processing remaining candidates"
              with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
                 futures = []
                 for left_candidate, right_candidate, embedding_sim in join_candidates:
-                    llm_call_needed = self.min_matching_sim <= embedding_sim <= self.max_non_matching_sim
+                    llm_call_needed = (
+                        self.min_matching_sim == float("inf")
+                        or self.max_non_matching_sim == float("-inf")
+                        or self.min_matching_sim <= embedding_sim <= self.max_non_matching_sim
+                    )
                     if llm_call_needed:
                         futures.append(executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs, embedding_sim))
@@ -526,6 +778,10 @@ class EmbeddingJoin(JoinOp):
             self._left_input_records.extend(zip(left_candidates, left_embeddings))
             self._right_input_records.extend(zip(right_candidates, right_embeddings))
+        # if this is the final call, then add in any left/right/outer join records that did not match
+        if final:
+            return self._compute_unmatched_records(), 0
         # return empty DataRecordSet if no output records were produced
         if len(output_records) == 0:
             self.residual_embedding_cost = total_embedding_cost

palimpzest/query/operators/logical.py CHANGED Viewed

@@ -9,7 +9,7 @@ from palimpzest.constants import AggFunc, Cardinality
 from palimpzest.core.data import context, dataset
 from palimpzest.core.elements.filters import Filter
 from palimpzest.core.elements.groupbysig import GroupBySig
-from palimpzest.core.lib.schemas import Average, Count
+from palimpzest.core.lib.schemas import Average, Count, Max, Min, Sum
 from palimpzest.utils.hash_helpers import hash_for_id
@@ -25,7 +25,7 @@ class LogicalOperator:
     - LimitScan (scans up to N records from a Set)
     - GroupByAggregate (applies a group by on the Set)
     - Aggregate (applies an aggregation on the Set)
-    - RetrieveScan (fetches documents from a provided input for a given query)
+    - TopKScan (fetches documents from a provided input for a given query)
     - Map (applies a function to each record in the Set without adding any new columns)
     - ComputeOperator (executes a computation described in natural language)
     - SearchOperator (executes a search query on the input Context)
@@ -149,27 +149,41 @@ class Aggregate(LogicalOperator):
     def __init__(
         self,
-        agg_func: AggFunc,
+        agg_func: AggFunc | None = None,
+        agg_str: str | None = None,
         *args,
         **kwargs,
     ):
+        assert agg_func is not None or agg_str is not None, "Either agg_func or agg_str must be provided"
         if kwargs.get("output_schema") is None:
             if agg_func == AggFunc.COUNT:
                 kwargs["output_schema"] = Count
             elif agg_func == AggFunc.AVERAGE:
                 kwargs["output_schema"] = Average
+            elif agg_func == AggFunc.SUM:
+                kwargs["output_schema"] = Sum
+            elif agg_func == AggFunc.MIN:
+                kwargs["output_schema"] = Min
+            elif agg_func == AggFunc.MAX:
+                kwargs["output_schema"] = Max
             else:
                 raise ValueError(f"Unsupported aggregation function: {agg_func}")
         super().__init__(*args, **kwargs)
         self.agg_func = agg_func
+        self.agg_str = agg_str
     def __str__(self):
-        return f"{self.__class__.__name__}(function: {str(self.agg_func.value)})"
+        desc = f"function: {str(self.agg_func.value)}" if self.agg_func else f"agg: {self.agg_str}"
+        return f"{self.__class__.__name__}({desc})"
     def get_logical_id_params(self) -> dict:
         logical_id_params = super().get_logical_id_params()
-        logical_id_params = {"agg_func": self.agg_func, **logical_id_params}
+        logical_id_params = {
+            "agg_func": self.agg_func,
+            "agg_str": self.agg_str,
+            **logical_id_params,
+        }
         return logical_id_params
@@ -177,6 +191,7 @@ class Aggregate(LogicalOperator):
         logical_op_params = super().get_logical_op_params()
         logical_op_params = {
             "agg_func": self.agg_func,
+            "agg_str": self.agg_str,
             **logical_op_params,
         }
@@ -398,17 +413,25 @@ class GroupByAggregate(LogicalOperator):
 class JoinOp(LogicalOperator):
-    def __init__(self, condition: str, desc: str | None = None, *args, **kwargs):
+    def __init__(self, condition: str, on: list[str] | None = None, how: str = "inner", desc: str | None = None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.condition = condition
+        self.on = on
+        self.how = how
         self.desc = desc
     def __str__(self):
-        return f"Join(condition={self.condition})"
+        return f"Join(condition={self.condition})" if self.on is None else f"Join(on={self.on}, how={self.how})"
     def get_logical_id_params(self) -> dict:
         logical_id_params = super().get_logical_id_params()
-        logical_id_params = {"condition": self.condition, "desc": self.desc, **logical_id_params}
+        logical_id_params = {
+            "condition": self.condition,
+            "on": self.on,
+            "how": self.how,
+            "desc": self.desc,
+            **logical_id_params,
+        }
         return logical_id_params
@@ -416,6 +439,8 @@ class JoinOp(LogicalOperator):
         logical_op_params = super().get_logical_op_params()
         logical_op_params = {
             "condition": self.condition,
+            "on": self.on,
+            "how": self.how,
             "desc": self.desc,
             **logical_op_params,
         }
@@ -471,8 +496,8 @@ class Project(LogicalOperator):
         return logical_op_params
-class RetrieveScan(LogicalOperator):
-    """A RetrieveScan is a logical operator that represents a scan of a particular input Dataset, with a convert-like retrieve applied."""
+class TopKScan(LogicalOperator):
+    """A TopKScan is a logical operator that represents a scan of a particular input Dataset, with a top-k operation applied."""
     def __init__(
         self,
@@ -492,7 +517,7 @@ class RetrieveScan(LogicalOperator):
         self.k = k
     def __str__(self):
-        return f"RetrieveScan({self.input_schema} -> {str(self.output_schema)})"
+        return f"TopKScan({self.input_schema} -> {str(self.output_schema)})"
     def get_logical_id_params(self) -> dict:
         # NOTE: if we allow optimization over index, then we will need to include it in the id params

palimpzest/query/operators/mixture_of_agents.py CHANGED Viewed

@@ -75,8 +75,9 @@ class MixtureOfAgentsConvert(LLMConvert):
         In practice, this naive quality estimate will be overwritten by the CostModel's estimate
         once it executes a few instances of the operator.
         """
-        # temporarily set self.model so that super().naive_cost_estimates(...) can compute an estimate
+        # temporarily set self.model and self.prompt_strategy so that super().naive_cost_estimates(...) can compute an estimate
         self.model = self.proposer_models[0]
+        self.prompt_strategy = PromptStrategy.MAP_MOA_PROPOSER
         # get naive cost estimates for single LLM call and scale it by number of LLMs used in MoA
         naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
@@ -98,6 +99,7 @@ class MixtureOfAgentsConvert(LLMConvert):
         # reset self.model to be None
         self.model = None
+        self.prompt_strategy = None
         return naive_op_cost_estimates

palimpzest/query/operators/physical.py CHANGED Viewed

@@ -42,10 +42,13 @@ class PhysicalOperator:
         self.op_id = None
         # compute the input modalities (if any) for this physical operator
+        depends_on_short_field_names = [field.split(".")[-1] for field in self.depends_on] if self.depends_on is not None else None
         self.input_modalities = None
         if self.input_schema is not None:
             self.input_modalities = set()
-            for field in self.input_schema.model_fields.values():
+            for field_name, field in self.input_schema.model_fields.items():
+                if self.depends_on is not None and field_name not in depends_on_short_field_names:
+                    continue
                 field_type = field.annotation
                 if field_type in IMAGE_FIELD_TYPES:
                     self.input_modalities.add(Modality.IMAGE)
@@ -191,7 +194,7 @@ class PhysicalOperator:
         in the candidate. This is important for operators with retry logic, where we may only need to
         recompute a subset of self.generated_fields.
-        Right now this is only used by convert and retrieve operators.
+        Right now this is only used by convert and top-k operators.
         """
         fields_to_generate = [
             field_name

palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl

palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl