PyPI - palimpzest - Versions diffs - 0.9.0__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

palimpzest 0.9.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

palimpzest/constants.py +1 -0
palimpzest/core/data/dataset.py +33 -5
palimpzest/core/elements/groupbysig.py +5 -1
palimpzest/core/elements/records.py +16 -7
palimpzest/core/lib/schemas.py +20 -3
palimpzest/core/models.py +4 -4
palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
palimpzest/query/execution/execution_strategy.py +8 -8
palimpzest/query/execution/mab_execution_strategy.py +30 -11
palimpzest/query/execution/parallel_execution_strategy.py +31 -7
palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
palimpzest/query/operators/__init__.py +7 -6
palimpzest/query/operators/aggregate.py +110 -5
palimpzest/query/operators/convert.py +1 -1
palimpzest/query/operators/join.py +279 -23
palimpzest/query/operators/logical.py +20 -8
palimpzest/query/operators/mixture_of_agents.py +3 -1
palimpzest/query/operators/physical.py +5 -2
palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
palimpzest/query/optimizer/__init__.py +7 -3
palimpzest/query/optimizer/cost_model.py +5 -5
palimpzest/query/optimizer/optimizer.py +3 -2
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/rules.py +31 -11
palimpzest/query/optimizer/tasks.py +4 -4
palimpzest/utils/progress.py +19 -17
palimpzest/validator/validator.py +7 -7
{palimpzest-0.9.0.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
{palimpzest-0.9.0.dist-info → palimpzest-1.0.0.dist-info}/RECORD +32 -32
{palimpzest-0.9.0.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
{palimpzest-0.9.0.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.9.0.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0

palimpzest/query/operators/logical.py CHANGED Viewed

@@ -9,7 +9,7 @@ from palimpzest.constants import AggFunc, Cardinality
 from palimpzest.core.data import context, dataset
 from palimpzest.core.elements.filters import Filter
 from palimpzest.core.elements.groupbysig import GroupBySig
-from palimpzest.core.lib.schemas import Average, Count, Max, Min
+from palimpzest.core.lib.schemas import Average, Count, Max, Min, Sum
 from palimpzest.utils.hash_helpers import hash_for_id
@@ -25,7 +25,7 @@ class LogicalOperator:
     - LimitScan (scans up to N records from a Set)
     - GroupByAggregate (applies a group by on the Set)
     - Aggregate (applies an aggregation on the Set)
-    - RetrieveScan (fetches documents from a provided input for a given query)
+    - TopKScan (fetches documents from a provided input for a given query)
     - Map (applies a function to each record in the Set without adding any new columns)
     - ComputeOperator (executes a computation described in natural language)
     - SearchOperator (executes a search query on the input Context)
@@ -160,6 +160,8 @@ class Aggregate(LogicalOperator):
                 kwargs["output_schema"] = Count
             elif agg_func == AggFunc.AVERAGE:
                 kwargs["output_schema"] = Average
+            elif agg_func == AggFunc.SUM:
+                kwargs["output_schema"] = Sum
             elif agg_func == AggFunc.MIN:
                 kwargs["output_schema"] = Min
             elif agg_func == AggFunc.MAX:
@@ -411,17 +413,25 @@ class GroupByAggregate(LogicalOperator):
 class JoinOp(LogicalOperator):
-    def __init__(self, condition: str, desc: str | None = None, *args, **kwargs):
+    def __init__(self, condition: str, on: list[str] | None = None, how: str = "inner", desc: str | None = None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.condition = condition
+        self.on = on
+        self.how = how
         self.desc = desc
     def __str__(self):
-        return f"Join(condition={self.condition})"
+        return f"Join(condition={self.condition})" if self.on is None else f"Join(on={self.on}, how={self.how})"
     def get_logical_id_params(self) -> dict:
         logical_id_params = super().get_logical_id_params()
-        logical_id_params = {"condition": self.condition, "desc": self.desc, **logical_id_params}
+        logical_id_params = {
+            "condition": self.condition,
+            "on": self.on,
+            "how": self.how,
+            "desc": self.desc,
+            **logical_id_params,
+        }
         return logical_id_params
@@ -429,6 +439,8 @@ class JoinOp(LogicalOperator):
         logical_op_params = super().get_logical_op_params()
         logical_op_params = {
             "condition": self.condition,
+            "on": self.on,
+            "how": self.how,
             "desc": self.desc,
             **logical_op_params,
         }
@@ -484,8 +496,8 @@ class Project(LogicalOperator):
         return logical_op_params
-class RetrieveScan(LogicalOperator):
-    """A RetrieveScan is a logical operator that represents a scan of a particular input Dataset, with a convert-like retrieve applied."""
+class TopKScan(LogicalOperator):
+    """A TopKScan is a logical operator that represents a scan of a particular input Dataset, with a top-k operation applied."""
     def __init__(
         self,
@@ -505,7 +517,7 @@ class RetrieveScan(LogicalOperator):
         self.k = k
     def __str__(self):
-        return f"RetrieveScan({self.input_schema} -> {str(self.output_schema)})"
+        return f"TopKScan({self.input_schema} -> {str(self.output_schema)})"
     def get_logical_id_params(self) -> dict:
         # NOTE: if we allow optimization over index, then we will need to include it in the id params

palimpzest/query/operators/mixture_of_agents.py CHANGED Viewed

@@ -75,8 +75,9 @@ class MixtureOfAgentsConvert(LLMConvert):
         In practice, this naive quality estimate will be overwritten by the CostModel's estimate
         once it executes a few instances of the operator.
         """
-        # temporarily set self.model so that super().naive_cost_estimates(...) can compute an estimate
+        # temporarily set self.model and self.prompt_strategy so that super().naive_cost_estimates(...) can compute an estimate
         self.model = self.proposer_models[0]
+        self.prompt_strategy = PromptStrategy.MAP_MOA_PROPOSER
         # get naive cost estimates for single LLM call and scale it by number of LLMs used in MoA
         naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
@@ -98,6 +99,7 @@ class MixtureOfAgentsConvert(LLMConvert):
         # reset self.model to be None
         self.model = None
+        self.prompt_strategy = None
         return naive_op_cost_estimates

palimpzest/query/operators/physical.py CHANGED Viewed

@@ -42,10 +42,13 @@ class PhysicalOperator:
         self.op_id = None
         # compute the input modalities (if any) for this physical operator
+        depends_on_short_field_names = [field.split(".")[-1] for field in self.depends_on] if self.depends_on is not None else None
         self.input_modalities = None
         if self.input_schema is not None:
             self.input_modalities = set()
-            for field in self.input_schema.model_fields.values():
+            for field_name, field in self.input_schema.model_fields.items():
+                if self.depends_on is not None and field_name not in depends_on_short_field_names:
+                    continue
                 field_type = field.annotation
                 if field_type in IMAGE_FIELD_TYPES:
                     self.input_modalities.add(Modality.IMAGE)
@@ -191,7 +194,7 @@ class PhysicalOperator:
         in the candidate. This is important for operators with retry logic, where we may only need to
         recompute a subset of self.generated_fields.
-        Right now this is only used by convert and retrieve operators.
+        Right now this is only used by convert and top-k operators.
         """
         fields_to_generate = [
             field_name

palimpzest/query/operators/{retrieve.py → topk.py} RENAMED Viewed

@@ -17,7 +17,7 @@ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, Recor
 from palimpzest.query.operators.physical import PhysicalOperator
-class RetrieveOp(PhysicalOperator):
+class TopKOp(PhysicalOperator):
     def __init__(
         self,
         index: Collection,
@@ -29,7 +29,7 @@ class RetrieveOp(PhysicalOperator):
         **kwargs,
     ) -> None:
         """
-        Initialize the RetrieveOp object.
+        Initialize the TopKOp object.
         Args:
             index (Collection): The PZ index to use for retrieval.
@@ -59,7 +59,7 @@ class RetrieveOp(PhysicalOperator):
     def __str__(self):
         op = super().__str__()
-        op += f"    Retrieve: {self.index.__class__.__name__} with top {self.k}\n"
+        op += f"    Top-K: {self.index.__class__.__name__} with k={self.k}\n"
         return op
     def get_id_params(self):
@@ -89,8 +89,8 @@ class RetrieveOp(PhysicalOperator):
     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
         """
-        Compute naive cost estimates for the Retrieve operation. These estimates assume
-        that the Retrieve (1) has no cost and (2) has perfect quality.
+        Compute naive cost estimates for the Top-K operation. These estimates assume
+        that the Top-K (1) has negligible cost and (2) has perfect quality.
         """
         return OperatorCostEstimates(
             cardinality=source_op_cost_estimates.cardinality,
@@ -101,7 +101,7 @@ class RetrieveOp(PhysicalOperator):
     def default_search_func(self, index: Collection, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
         """
-        Default search function for the Retrieve operation. This function uses the index to
+        Default search function for the Top-K operation. This function uses the index to
         retrieve the top-k results for the given query. The query will be a (possibly singleton)
         list of strings or a list of lists of floats (i.e., embeddings). The function will return
         the top-k results per-query in (descending) sorted order. If the input is a singleton list,
@@ -111,7 +111,7 @@ class RetrieveOp(PhysicalOperator):
         Args:
             index (PZIndex): The index to use for retrieval.
             query (list[str] | list[list[float]]): The query (or queries) to search for.
-            k (int): The maximum number of results the retrieve operator will return.
+            k (int): The maximum number of results the top-k operator will return.
         Returns:
             list[str] | list[list[str]]: The top results in (descending) sorted order per query.
@@ -260,10 +260,10 @@ class RetrieveOp(PhysicalOperator):
             top_results = self.search_func(self.index, inputs, self.k)
         except Exception:
-            top_results = ["error-in-retrieve"]
-            os.makedirs("retrieve-errors", exist_ok=True)
+            top_results = ["error-in-topk"]
+            os.makedirs("topk-errors", exist_ok=True)
             ts = time.time()
-            with open(f"retrieve-errors/error-{ts}.txt", "w") as f:
+            with open(f"topk-errors/error-{ts}.txt", "w") as f:
                 f.write(str(query))
         # TODO: the user is always right! let's drop this post-processing in the future

palimpzest/query/optimizer/__init__.py CHANGED Viewed

@@ -39,10 +39,10 @@ from palimpzest.query.optimizer.rules import (
     RAGRule as _RAGRule,
 )
 from palimpzest.query.optimizer.rules import (
-    ReorderConverts as _ReorderConverts,
+    RelationalJoinRule as _RelationalJoinRule,
 )
 from palimpzest.query.optimizer.rules import (
-    RetrieveRule as _RetrieveRule,
+    ReorderConverts as _ReorderConverts,
 )
 from palimpzest.query.optimizer.rules import (
     Rule as _Rule,
@@ -53,6 +53,9 @@ from palimpzest.query.optimizer.rules import (
 from palimpzest.query.optimizer.rules import (
     SplitRule as _SplitRule,
 )
+from palimpzest.query.optimizer.rules import (
+    TopKRule as _TopKRule,
+)
 from palimpzest.query.optimizer.rules import (
     TransformationRule as _TransformationRule,
 )
@@ -72,8 +75,9 @@ ALL_RULES = [
     _NonLLMFilterRule,
     _PushDownFilter,
     _RAGRule,
+    _RelationalJoinRule,
     _ReorderConverts,
-    _RetrieveRule,
+    _TopKRule,
     _Rule,
     _SemanticAggregateRule,
     _SplitRule,

palimpzest/query/optimizer/cost_model.py CHANGED Viewed

@@ -131,17 +131,17 @@ class SampleBasedCostModel:
                 # compute selectivity
                 selectivity = physical_op_df.passed_operator.sum() / num_source_records
+                # compute quality; if all qualities are None then this will be NaN
+                quality = physical_op_df.quality.mean()
+                # set operator stats for this physical operator
                 operator_to_stats[unique_logical_op_id][full_op_id] = {
                     "cost": physical_op_df.cost_per_record.mean(),
                     "time": physical_op_df.time_per_record.mean(),
-                    "quality": physical_op_df.quality.mean(),
+                    "quality": 1.0 if pd.isna(quality) else quality,
                     "selectivity": selectivity,
                 }
-        # if this is an experiment, log the dataframe and operator_to_stats dictionary
-        if self.exp_name is not None:
-            operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
         logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
         return operator_to_stats

palimpzest/query/optimizer/optimizer.py CHANGED Viewed

@@ -284,10 +284,11 @@ class Optimizer:
                 all_properties["filters"] = set([op_filter_str])
         elif isinstance(op, JoinOp):
+            unique_join_str = str(sorted(op.on)) if op.condition is None else op.condition
             if "joins" in all_properties:
-                all_properties["joins"].add(op.condition)
+                all_properties["joins"].add(unique_join_str)
             else:
-                all_properties["joins"] = set([op.condition])
+                all_properties["joins"] = set([unique_join_str])
         elif isinstance(op, LimitScan):
             op_limit_str = op.get_logical_op_id()

palimpzest/query/optimizer/plan.py CHANGED Viewed

@@ -203,9 +203,8 @@ class PhysicalPlan(Plan):
         # return the current index and the upstream unique full_op_ids for this operator
         return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]
-    def get_upstream_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
-        """Return the list of unique full_op_ids for the upstream operators of this operator."""
-        unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+    def get_upstream_unique_full_op_ids(self, unique_full_op_id: str) -> list[str]:
+        """Return the list of unique full_op_ids for the upstream operators of the operator specified by `unique_full_op_id`."""
         return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]
     def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:

palimpzest/query/optimizer/rules.py CHANGED Viewed

@@ -19,13 +19,14 @@ from palimpzest.query.operators.aggregate import (
     MaxAggregateOp,
     MinAggregateOp,
     SemanticAggregate,
+    SumAggregateOp,
 )
 from palimpzest.query.operators.compute import SmolAgentsCompute
 from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
 from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert, CritiqueAndRefineFilter
 from palimpzest.query.operators.distinct import DistinctOp
 from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
-from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin
+from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin, RelationalJoin
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.logical import (
     Aggregate,
@@ -39,19 +40,19 @@ from palimpzest.query.operators.logical import (
     JoinOp,
     LimitScan,
     Project,
-    RetrieveScan,
     SearchOperator,
+    TopKScan,
 )
 from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert, MixtureOfAgentsFilter
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.project import ProjectOp
 from palimpzest.query.operators.rag import RAGConvert, RAGFilter
-from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
 from palimpzest.query.operators.search import (
     SmolAgentsSearch,  # SmolAgentsCustomManagedSearch,  # SmolAgentsManagedSearch
 )
 from palimpzest.query.operators.split import SplitConvert, SplitFilter
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression
 logger = logging.getLogger(__name__)
@@ -796,26 +797,26 @@ class SplitRule(ImplementationRule):
         return cls._perform_substitution(logical_expression, phys_op_cls, runtime_kwargs, variable_op_kwargs)
-class RetrieveRule(ImplementationRule):
+class TopKRule(ImplementationRule):
     """
-    Substitute a logical expression for a RetrieveScan with a Retrieve physical implementation.
+    Substitute a logical expression for a TopKScan with a TopK physical implementation.
     """
     k_budgets = [1, 3, 5, 10, 15, 20, 25]
     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator, RetrieveScan)
-        logger.debug(f"RetrieveRule matches_pattern: {is_match} for {logical_expression}")
+        is_match = isinstance(logical_expression.operator, TopKScan)
+        logger.debug(f"TopKRule matches_pattern: {is_match} for {logical_expression}")
         return is_match
     @classmethod
     def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
-        logger.debug(f"Substituting RetrieveRule for {logical_expression}")
+        logger.debug(f"Substituting TopKRule for {logical_expression}")
         # create variable physical operator kwargs for each model which can implement this logical_expression
         ks = cls.k_budgets if logical_expression.operator.k == -1 else [logical_expression.operator.k]
         variable_op_kwargs = [{"k": k} for k in ks]
-        return cls._perform_substitution(logical_expression, RetrieveOp, runtime_kwargs, variable_op_kwargs)
+        return cls._perform_substitution(logical_expression, TopKOp, runtime_kwargs, variable_op_kwargs)
 class NonLLMFilterRule(ImplementationRule):
@@ -867,6 +868,23 @@ class LLMFilterRule(ImplementationRule):
         return cls._perform_substitution(logical_expression, LLMFilter, runtime_kwargs, variable_op_kwargs)
+class RelationalJoinRule(ImplementationRule):
+    """
+    Substitute a logical expression for a JoinOp with a RelationalJoin physical implementation.
+    """
+    @classmethod
+    def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition == ""
+        logger.debug(f"RelationalJoinRule matches_pattern: {is_match} for {logical_expression}")
+        return is_match
+    @classmethod
+    def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
+        logger.debug(f"Substituting RelationalJoinRule for {logical_expression}")
+        return cls._perform_substitution(logical_expression, RelationalJoin, runtime_kwargs)
 class NestedLoopsJoinRule(ImplementationRule):
     """
     Substitute a logical expression for a JoinOp with an (LLM) NestedLoopsJoin physical implementation.
@@ -874,7 +892,7 @@ class NestedLoopsJoinRule(ImplementationRule):
     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator, JoinOp)
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != ""
         logger.debug(f"NestedLoopsJoinRule matches_pattern: {is_match} for {logical_expression}")
         return is_match
@@ -906,7 +924,7 @@ class EmbeddingJoinRule(ImplementationRule):
     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator, JoinOp) and not cls._is_audio_operation(logical_expression)
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != "" and not cls._is_audio_operation(logical_expression)
         logger.debug(f"EmbeddingJoinRule matches_pattern: {is_match} for {logical_expression}")
         return is_match
@@ -982,6 +1000,8 @@ class AggregateRule(ImplementationRule):
             physical_op_class = CountAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.AVERAGE:
             physical_op_class = AverageAggregateOp
+        elif logical_expression.operator.agg_func == AggFunc.SUM:
+            physical_op_class = SumAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.MIN:
             physical_op_class = MinAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.MAX:

palimpzest/query/optimizer/tasks.py CHANGED Viewed

@@ -501,8 +501,8 @@ class OptimizePhysicalExpression(Task):
                         # compute the total cost for this physical expression by summing its operator's PlanCost
                         # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
-                        execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
-                        full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy)
+                        execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
+                        full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy_str)
                         full_plan_cost.op_estimates = op_plan_cost.op_estimates
                         all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))
@@ -570,8 +570,8 @@ class OptimizePhysicalExpression(Task):
                 # compute the total cost for this physical expression by summing its operator's PlanCost
                 # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
-                execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
-                full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy)
+                execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
+                full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy_str)
                 full_plan_cost.op_estimates = op_plan_cost.op_estimates
             else:

palimpzest/utils/progress.py CHANGED Viewed

@@ -24,7 +24,7 @@ from palimpzest.query.operators.filter import LLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.retrieve import RetrieveOp
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
@@ -225,20 +225,22 @@ class PZProgressManager(ProgressManager):
             current_unique_full_op_id = unique_full_op_id
             next_op, next_unique_full_op_id = self.unique_full_op_id_to_next_op_and_id[unique_full_op_id]
             while next_op is not None:
-                if not isinstance(next_op, (AggregateOp, LimitScanOp)):
-                    next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
-                    multiplier = 1
-                    if isinstance(next_op, JoinOp):
-                        # for joins, scale the delta by the number of inputs from the other side of the join
-                        left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
-                        if current_unique_full_op_id == left_input_unique_full_op_id:
-                            multiplier = self.get_task_total(right_input_unique_input_op_id)
-                        elif current_unique_full_op_id == right_input_unique_input_op_id:
-                            multiplier = self.get_task_total(left_input_unique_full_op_id)
-                        else:
-                            raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
-                    delta_adjusted = delta * multiplier
-                    self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
+                if isinstance(next_op, (AggregateOp, LimitScanOp)):
+                    break
+                next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
+                multiplier = 1
+                if isinstance(next_op, JoinOp):
+                    # for joins, scale the delta by the number of inputs from the other side of the join
+                    left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
+                    if current_unique_full_op_id == left_input_unique_full_op_id:
+                        multiplier = self.get_task_total(right_input_unique_input_op_id)
+                    elif current_unique_full_op_id == right_input_unique_input_op_id:
+                        multiplier = self.get_task_total(left_input_unique_full_op_id)
+                    else:
+                        raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
+                delta_adjusted = delta * multiplier
+                self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
                 # move to the next operator in the plan
                 current_unique_full_op_id = next_unique_full_op_id
@@ -348,9 +350,9 @@ class PZSentinelProgressManager(ProgressManager):
     def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
         is_llm_convert = isinstance(physical_op, LLMConvert)
         is_llm_filter = isinstance(physical_op, LLMFilter)
-        is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
+        is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
         is_llm_join = isinstance(physical_op, JoinOp)
-        return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join
+        return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join
     def get_task_description(self, unique_logical_op_id: str) -> str:
         """Return the current description for the given task."""

palimpzest/validator/validator.py CHANGED Viewed

@@ -19,7 +19,7 @@ from palimpzest.query.generators.generators import get_json_from_answer
 from palimpzest.query.operators.convert import LLMConvert
 from palimpzest.query.operators.filter import LLMFilter
 from palimpzest.query.operators.join import JoinOp
-from palimpzest.query.operators.retrieve import RetrieveOp
+from palimpzest.query.operators.topk import TopKOp
 class Validator:
@@ -47,7 +47,7 @@ class Validator:
     def join_score_fn(self, condition: str, left_input_record: dict, right_input_record: dict, output: bool) -> float | None:
         raise NotImplementedError("Validator.join_score_fn not implemented.")
-    def retrieve_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
+    def topk_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
         raise NotImplementedError("Validator.map_score_fn not implemented.")
     def _get_gen_stats_from_completion(self, completion, start_time: float) -> GenerationStats:
@@ -218,11 +218,11 @@ class Validator:
         return score, gen_stats
-    def _default_retrieve_score_fn(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
+    def _default_topk_score_fn(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
         """
         Compute the quality of the generated output for the given fields and input_record.
         """
-        # TODO: retrieve k=25; score each item based on relevance; compute F1
+        # TODO: top-k k=25; score each item based on relevance; compute F1
         # TODO: support retrieval over images
         # create prompt factory
         factory = PromptFactory(PromptStrategy.MAP, self.model, Cardinality.ONE_TO_ONE)
@@ -294,11 +294,11 @@ class Validator:
             score, gen_stats = self._default_join_score_fn(op, condition, left_input_record, right_input_record, output)
             return score, gen_stats, full_hash
-    def _score_retrieve(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
+    def _score_topk(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
         try:
-            out = self.retrieve_score_fn(fields, input_record.to_dict(), output)
+            out = self.topk_score_fn(fields, input_record.to_dict(), output)
             score, gen_stats = out if isinstance(out, tuple) else (out, GenerationStats())
             return score, gen_stats, full_hash
         except NotImplementedError:
-            score, gen_stats = self._default_retrieve_score_fn(op, fields, input_record, output)
+            score, gen_stats = self._default_topk_score_fn(op, fields, input_record, output)
             return score, gen_stats, full_hash

palimpzest 0.9.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

palimpzest 0.9.0__py3-none-any.whl → 1.0.0__py3-none-any.whl