PyPI - palimpzest - Versions diffs - 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

palimpzest 0.9.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

palimpzest/constants.py +1 -0
palimpzest/core/data/dataset.py +33 -5
palimpzest/core/elements/groupbysig.py +10 -1
palimpzest/core/elements/records.py +16 -7
palimpzest/core/lib/schemas.py +20 -3
palimpzest/core/models.py +10 -4
palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
palimpzest/query/execution/execution_strategy.py +13 -11
palimpzest/query/execution/mab_execution_strategy.py +40 -14
palimpzest/query/execution/parallel_execution_strategy.py +31 -7
palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
palimpzest/query/generators/generators.py +1 -1
palimpzest/query/operators/__init__.py +7 -6
palimpzest/query/operators/aggregate.py +110 -5
palimpzest/query/operators/convert.py +1 -1
palimpzest/query/operators/join.py +279 -23
palimpzest/query/operators/logical.py +20 -8
palimpzest/query/operators/mixture_of_agents.py +3 -1
palimpzest/query/operators/physical.py +5 -2
palimpzest/query/operators/rag.py +5 -4
palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
palimpzest/query/optimizer/__init__.py +7 -3
palimpzest/query/optimizer/cost_model.py +5 -5
palimpzest/query/optimizer/optimizer.py +3 -2
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/rules.py +31 -11
palimpzest/query/optimizer/tasks.py +4 -4
palimpzest/query/processor/config.py +1 -0
palimpzest/utils/progress.py +51 -23
palimpzest/validator/validator.py +7 -7
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/METADATA +26 -66
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/RECORD +35 -35
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/WHEEL +0 -0
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/top_level.txt +0 -0

palimpzest/query/operators/{retrieve.py → topk.py} RENAMED Viewed

@@ -17,7 +17,7 @@ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, Recor
 from palimpzest.query.operators.physical import PhysicalOperator
-class RetrieveOp(PhysicalOperator):
+class TopKOp(PhysicalOperator):
     def __init__(
         self,
         index: Collection,
@@ -29,7 +29,7 @@ class RetrieveOp(PhysicalOperator):
         **kwargs,
     ) -> None:
         """
-        Initialize the RetrieveOp object.
+        Initialize the TopKOp object.
         Args:
             index (Collection): The PZ index to use for retrieval.
@@ -59,7 +59,7 @@ class RetrieveOp(PhysicalOperator):
     def __str__(self):
         op = super().__str__()
-        op += f"    Retrieve: {self.index.__class__.__name__} with top {self.k}\n"
+        op += f"    Top-K: {self.index.__class__.__name__} with k={self.k}\n"
         return op
     def get_id_params(self):
@@ -89,8 +89,8 @@ class RetrieveOp(PhysicalOperator):
     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
         """
-        Compute naive cost estimates for the Retrieve operation. These estimates assume
-        that the Retrieve (1) has no cost and (2) has perfect quality.
+        Compute naive cost estimates for the Top-K operation. These estimates assume
+        that the Top-K (1) has negligible cost and (2) has perfect quality.
         """
         return OperatorCostEstimates(
             cardinality=source_op_cost_estimates.cardinality,
@@ -101,7 +101,7 @@ class RetrieveOp(PhysicalOperator):
     def default_search_func(self, index: Collection, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
         """
-        Default search function for the Retrieve operation. This function uses the index to
+        Default search function for the Top-K operation. This function uses the index to
         retrieve the top-k results for the given query. The query will be a (possibly singleton)
         list of strings or a list of lists of floats (i.e., embeddings). The function will return
         the top-k results per-query in (descending) sorted order. If the input is a singleton list,
@@ -111,7 +111,7 @@ class RetrieveOp(PhysicalOperator):
         Args:
             index (PZIndex): The index to use for retrieval.
             query (list[str] | list[list[float]]): The query (or queries) to search for.
-            k (int): The maximum number of results the retrieve operator will return.
+            k (int): The maximum number of results the top-k operator will return.
         Returns:
             list[str] | list[list[str]]: The top results in (descending) sorted order per query.
@@ -260,10 +260,10 @@ class RetrieveOp(PhysicalOperator):
             top_results = self.search_func(self.index, inputs, self.k)
         except Exception:
-            top_results = ["error-in-retrieve"]
-            os.makedirs("retrieve-errors", exist_ok=True)
+            top_results = ["error-in-topk"]
+            os.makedirs("topk-errors", exist_ok=True)
             ts = time.time()
-            with open(f"retrieve-errors/error-{ts}.txt", "w") as f:
+            with open(f"topk-errors/error-{ts}.txt", "w") as f:
                 f.write(str(query))
         # TODO: the user is always right! let's drop this post-processing in the future

palimpzest/query/optimizer/__init__.py CHANGED Viewed

@@ -39,10 +39,10 @@ from palimpzest.query.optimizer.rules import (
     RAGRule as _RAGRule,
 )
 from palimpzest.query.optimizer.rules import (
-    ReorderConverts as _ReorderConverts,
+    RelationalJoinRule as _RelationalJoinRule,
 )
 from palimpzest.query.optimizer.rules import (
-    RetrieveRule as _RetrieveRule,
+    ReorderConverts as _ReorderConverts,
 )
 from palimpzest.query.optimizer.rules import (
     Rule as _Rule,
@@ -53,6 +53,9 @@ from palimpzest.query.optimizer.rules import (
 from palimpzest.query.optimizer.rules import (
     SplitRule as _SplitRule,
 )
+from palimpzest.query.optimizer.rules import (
+    TopKRule as _TopKRule,
+)
 from palimpzest.query.optimizer.rules import (
     TransformationRule as _TransformationRule,
 )
@@ -72,8 +75,9 @@ ALL_RULES = [
     _NonLLMFilterRule,
     _PushDownFilter,
     _RAGRule,
+    _RelationalJoinRule,
     _ReorderConverts,
-    _RetrieveRule,
+    _TopKRule,
     _Rule,
     _SemanticAggregateRule,
     _SplitRule,

palimpzest/query/optimizer/cost_model.py CHANGED Viewed

@@ -131,17 +131,17 @@ class SampleBasedCostModel:
                 # compute selectivity
                 selectivity = physical_op_df.passed_operator.sum() / num_source_records
+                # compute quality; if all qualities are None then this will be NaN
+                quality = physical_op_df.quality.mean()
+                # set operator stats for this physical operator
                 operator_to_stats[unique_logical_op_id][full_op_id] = {
                     "cost": physical_op_df.cost_per_record.mean(),
                     "time": physical_op_df.time_per_record.mean(),
-                    "quality": physical_op_df.quality.mean(),
+                    "quality": 1.0 if pd.isna(quality) else quality,
                     "selectivity": selectivity,
                 }
-        # if this is an experiment, log the dataframe and operator_to_stats dictionary
-        if self.exp_name is not None:
-            operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
         logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
         return operator_to_stats

palimpzest/query/optimizer/optimizer.py CHANGED Viewed

@@ -284,10 +284,11 @@ class Optimizer:
                 all_properties["filters"] = set([op_filter_str])
         elif isinstance(op, JoinOp):
+            unique_join_str = str(sorted(op.on)) if op.condition is None else op.condition
             if "joins" in all_properties:
-                all_properties["joins"].add(op.condition)
+                all_properties["joins"].add(unique_join_str)
             else:
-                all_properties["joins"] = set([op.condition])
+                all_properties["joins"] = set([unique_join_str])
         elif isinstance(op, LimitScan):
             op_limit_str = op.get_logical_op_id()

palimpzest/query/optimizer/plan.py CHANGED Viewed

@@ -203,9 +203,8 @@ class PhysicalPlan(Plan):
         # return the current index and the upstream unique full_op_ids for this operator
         return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]
-    def get_upstream_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
-        """Return the list of unique full_op_ids for the upstream operators of this operator."""
-        unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+    def get_upstream_unique_full_op_ids(self, unique_full_op_id: str) -> list[str]:
+        """Return the list of unique full_op_ids for the upstream operators of the operator specified by `unique_full_op_id`."""
         return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]
     def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:

palimpzest/query/optimizer/rules.py CHANGED Viewed

@@ -19,13 +19,14 @@ from palimpzest.query.operators.aggregate import (
     MaxAggregateOp,
     MinAggregateOp,
     SemanticAggregate,
+    SumAggregateOp,
 )
 from palimpzest.query.operators.compute import SmolAgentsCompute
 from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
 from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert, CritiqueAndRefineFilter
 from palimpzest.query.operators.distinct import DistinctOp
 from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
-from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin
+from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin, RelationalJoin
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.logical import (
     Aggregate,
@@ -39,19 +40,19 @@ from palimpzest.query.operators.logical import (
     JoinOp,
     LimitScan,
     Project,
-    RetrieveScan,
     SearchOperator,
+    TopKScan,
 )
 from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert, MixtureOfAgentsFilter
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.project import ProjectOp
 from palimpzest.query.operators.rag import RAGConvert, RAGFilter
-from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
 from palimpzest.query.operators.search import (
     SmolAgentsSearch,  # SmolAgentsCustomManagedSearch,  # SmolAgentsManagedSearch
 )
 from palimpzest.query.operators.split import SplitConvert, SplitFilter
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression
 logger = logging.getLogger(__name__)
@@ -796,26 +797,26 @@ class SplitRule(ImplementationRule):
         return cls._perform_substitution(logical_expression, phys_op_cls, runtime_kwargs, variable_op_kwargs)
-class RetrieveRule(ImplementationRule):
+class TopKRule(ImplementationRule):
     """
-    Substitute a logical expression for a RetrieveScan with a Retrieve physical implementation.
+    Substitute a logical expression for a TopKScan with a TopK physical implementation.
     """
     k_budgets = [1, 3, 5, 10, 15, 20, 25]
     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator, RetrieveScan)
-        logger.debug(f"RetrieveRule matches_pattern: {is_match} for {logical_expression}")
+        is_match = isinstance(logical_expression.operator, TopKScan)
+        logger.debug(f"TopKRule matches_pattern: {is_match} for {logical_expression}")
         return is_match
     @classmethod
     def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
-        logger.debug(f"Substituting RetrieveRule for {logical_expression}")
+        logger.debug(f"Substituting TopKRule for {logical_expression}")
         # create variable physical operator kwargs for each model which can implement this logical_expression
         ks = cls.k_budgets if logical_expression.operator.k == -1 else [logical_expression.operator.k]
         variable_op_kwargs = [{"k": k} for k in ks]
-        return cls._perform_substitution(logical_expression, RetrieveOp, runtime_kwargs, variable_op_kwargs)
+        return cls._perform_substitution(logical_expression, TopKOp, runtime_kwargs, variable_op_kwargs)
 class NonLLMFilterRule(ImplementationRule):
@@ -867,6 +868,23 @@ class LLMFilterRule(ImplementationRule):
         return cls._perform_substitution(logical_expression, LLMFilter, runtime_kwargs, variable_op_kwargs)
+class RelationalJoinRule(ImplementationRule):
+    """
+    Substitute a logical expression for a JoinOp with a RelationalJoin physical implementation.
+    """
+    @classmethod
+    def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition == ""
+        logger.debug(f"RelationalJoinRule matches_pattern: {is_match} for {logical_expression}")
+        return is_match
+    @classmethod
+    def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
+        logger.debug(f"Substituting RelationalJoinRule for {logical_expression}")
+        return cls._perform_substitution(logical_expression, RelationalJoin, runtime_kwargs)
 class NestedLoopsJoinRule(ImplementationRule):
     """
     Substitute a logical expression for a JoinOp with an (LLM) NestedLoopsJoin physical implementation.
@@ -874,7 +892,7 @@ class NestedLoopsJoinRule(ImplementationRule):
     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator, JoinOp)
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != ""
         logger.debug(f"NestedLoopsJoinRule matches_pattern: {is_match} for {logical_expression}")
         return is_match
@@ -906,7 +924,7 @@ class EmbeddingJoinRule(ImplementationRule):
     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator, JoinOp) and not cls._is_audio_operation(logical_expression)
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != "" and not cls._is_audio_operation(logical_expression)
         logger.debug(f"EmbeddingJoinRule matches_pattern: {is_match} for {logical_expression}")
         return is_match
@@ -982,6 +1000,8 @@ class AggregateRule(ImplementationRule):
             physical_op_class = CountAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.AVERAGE:
             physical_op_class = AverageAggregateOp
+        elif logical_expression.operator.agg_func == AggFunc.SUM:
+            physical_op_class = SumAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.MIN:
             physical_op_class = MinAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.MAX:

palimpzest/query/optimizer/tasks.py CHANGED Viewed

@@ -501,8 +501,8 @@ class OptimizePhysicalExpression(Task):
                         # compute the total cost for this physical expression by summing its operator's PlanCost
                         # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
-                        execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
-                        full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy)
+                        execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
+                        full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy_str)
                         full_plan_cost.op_estimates = op_plan_cost.op_estimates
                         all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))
@@ -570,8 +570,8 @@ class OptimizePhysicalExpression(Task):
                 # compute the total cost for this physical expression by summing its operator's PlanCost
                 # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
-                execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
-                full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy)
+                execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
+                full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy_str)
                 full_plan_cost.op_estimates = op_plan_cost.op_estimates
             else:

palimpzest/query/processor/config.py CHANGED Viewed

@@ -44,6 +44,7 @@ class QueryProcessorConfig(BaseModel):
     k: int = Field(default=6)
     j: int = Field(default=4)
     sample_budget: int = Field(default=100)
+    sample_cost_budget: float | None = Field(default=None)
     seed: int = Field(default=42)
     exp_name: str | None = Field(default=None)
     priors: dict | None = Field(default=None)

palimpzest/utils/progress.py CHANGED Viewed

@@ -24,7 +24,7 @@ from palimpzest.query.operators.filter import LLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.retrieve import RetrieveOp
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
@@ -225,20 +225,22 @@ class PZProgressManager(ProgressManager):
             current_unique_full_op_id = unique_full_op_id
             next_op, next_unique_full_op_id = self.unique_full_op_id_to_next_op_and_id[unique_full_op_id]
             while next_op is not None:
-                if not isinstance(next_op, (AggregateOp, LimitScanOp)):
-                    next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
-                    multiplier = 1
-                    if isinstance(next_op, JoinOp):
-                        # for joins, scale the delta by the number of inputs from the other side of the join
-                        left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
-                        if current_unique_full_op_id == left_input_unique_full_op_id:
-                            multiplier = self.get_task_total(right_input_unique_input_op_id)
-                        elif current_unique_full_op_id == right_input_unique_input_op_id:
-                            multiplier = self.get_task_total(left_input_unique_full_op_id)
-                        else:
-                            raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
-                    delta_adjusted = delta * multiplier
-                    self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
+                if isinstance(next_op, (AggregateOp, LimitScanOp)):
+                    break
+                next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
+                multiplier = 1
+                if isinstance(next_op, JoinOp):
+                    # for joins, scale the delta by the number of inputs from the other side of the join
+                    left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
+                    if current_unique_full_op_id == left_input_unique_full_op_id:
+                        multiplier = self.get_task_total(right_input_unique_input_op_id)
+                    elif current_unique_full_op_id == right_input_unique_input_op_id:
+                        multiplier = self.get_task_total(left_input_unique_full_op_id)
+                    else:
+                        raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
+                delta_adjusted = delta * multiplier
+                self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
                 # move to the next operator in the plan
                 current_unique_full_op_id = next_unique_full_op_id
@@ -281,7 +283,7 @@ class PZProgressManager(ProgressManager):
         self.unique_full_op_id_to_stats[unique_full_op_id].memory_usage_mb = get_memory_usage()
 class PZSentinelProgressManager(ProgressManager):
-    def __init__(self, plan: SentinelPlan, sample_budget: int):
+    def __init__(self, plan: SentinelPlan, sample_budget: int | None, sample_cost_budget: float | None):
         # overall progress bar
         self.overall_progress = RichProgress(
             SpinnerColumn(),
@@ -296,7 +298,9 @@ class PZSentinelProgressManager(ProgressManager):
             refresh_per_second=10,
             expand=True,   # Use full width
         )
-        self.overall_task_id = self.overall_progress.add_task("", total=sample_budget, cost=0.0, recent="")
+        self.use_cost_budget = sample_cost_budget is not None
+        total = sample_cost_budget if self.use_cost_budget else sample_budget
+        self.overall_task_id = self.overall_progress.add_task("", total=total, cost=0.0, recent="")
         # logical operator progress bars
         self.op_progress = RichProgress(
@@ -332,6 +336,9 @@ class PZSentinelProgressManager(ProgressManager):
         # initialize start time
         self.start_time = None
+        # initialize validation cost
+        self.validation_cost = 0.0
         # add a task to the progress manager for each operator in the plan
         for topo_idx, (logical_op_id, op_set) in enumerate(plan):
             unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
@@ -348,9 +355,9 @@ class PZSentinelProgressManager(ProgressManager):
     def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
         is_llm_convert = isinstance(physical_op, LLMConvert)
         is_llm_filter = isinstance(physical_op, LLMFilter)
-        is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
+        is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
         is_llm_join = isinstance(physical_op, JoinOp)
-        return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join
+        return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join
     def get_task_description(self, unique_logical_op_id: str) -> str:
         """Return the current description for the given task."""
@@ -385,15 +392,34 @@ class PZSentinelProgressManager(ProgressManager):
         # start progress bars
         self.live_display.start()
+    def incr_overall_progress_cost(self, cost_delta: float):
+        """Advance the overall progress bar by the given cost delta"""
+        self.validation_cost += cost_delta
+        self.overall_progress.update(
+            self.overall_task_id,
+            advance=cost_delta,
+            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
+            refresh=True,
+        )
+        # force the live display to refresh
+        self.live_display.refresh()
     def incr(self, unique_logical_op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
         # TODO: (above) organize progress bars into a Live / Table / Panel or something
         # get the task for the given operation
         task = self.unique_logical_op_id_to_task.get(unique_logical_op_id)
+        # store the cost before updating stats
+        previous_total_cost = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost
         # update statistics with any additional keyword arguments
         if kwargs != {}:
             self.update_stats(unique_logical_op_id, **kwargs)
+        # compute the cost delta
+        cost_delta = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost - previous_total_cost
         # update progress bar and recent text in one update
         if display_text is not None:
             self.unique_logical_op_id_to_stats[unique_logical_op_id].recent_text = display_text
@@ -412,10 +438,11 @@ class PZSentinelProgressManager(ProgressManager):
         )
         # advance the overall progress bar
+        advance = cost_delta if self.use_cost_budget else num_samples
         self.overall_progress.update(
             self.overall_task_id,
-            advance=num_samples,
-            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()),
+            advance=advance,
+            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
             refresh=True,
         )
@@ -449,6 +476,7 @@ def create_progress_manager(
     plan: PhysicalPlan | SentinelPlan,
     num_samples: int | None = None,
     sample_budget: int | None = None,
+    sample_cost_budget: float | None = None,
     progress: bool = True,
 ) -> ProgressManager:
     """Factory function to create appropriate progress manager based on environment"""
@@ -456,7 +484,7 @@ def create_progress_manager(
         return MockProgressManager(plan, num_samples)
     if isinstance(plan, SentinelPlan):
-        assert sample_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
-        return PZSentinelProgressManager(plan, sample_budget)
+        assert sample_budget is not None or sample_cost_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
+        return PZSentinelProgressManager(plan, sample_budget, sample_cost_budget)
     return PZProgressManager(plan, num_samples)

palimpzest/validator/validator.py CHANGED Viewed

@@ -19,7 +19,7 @@ from palimpzest.query.generators.generators import get_json_from_answer
 from palimpzest.query.operators.convert import LLMConvert
 from palimpzest.query.operators.filter import LLMFilter
 from palimpzest.query.operators.join import JoinOp
-from palimpzest.query.operators.retrieve import RetrieveOp
+from palimpzest.query.operators.topk import TopKOp
 class Validator:
@@ -47,7 +47,7 @@ class Validator:
     def join_score_fn(self, condition: str, left_input_record: dict, right_input_record: dict, output: bool) -> float | None:
         raise NotImplementedError("Validator.join_score_fn not implemented.")
-    def retrieve_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
+    def topk_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
         raise NotImplementedError("Validator.map_score_fn not implemented.")
     def _get_gen_stats_from_completion(self, completion, start_time: float) -> GenerationStats:
@@ -218,11 +218,11 @@ class Validator:
         return score, gen_stats
-    def _default_retrieve_score_fn(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
+    def _default_topk_score_fn(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
         """
         Compute the quality of the generated output for the given fields and input_record.
         """
-        # TODO: retrieve k=25; score each item based on relevance; compute F1
+        # TODO: top-k k=25; score each item based on relevance; compute F1
         # TODO: support retrieval over images
         # create prompt factory
         factory = PromptFactory(PromptStrategy.MAP, self.model, Cardinality.ONE_TO_ONE)
@@ -294,11 +294,11 @@ class Validator:
             score, gen_stats = self._default_join_score_fn(op, condition, left_input_record, right_input_record, output)
             return score, gen_stats, full_hash
-    def _score_retrieve(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
+    def _score_topk(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
         try:
-            out = self.retrieve_score_fn(fields, input_record.to_dict(), output)
+            out = self.topk_score_fn(fields, input_record.to_dict(), output)
             score, gen_stats = out if isinstance(out, tuple) else (out, GenerationStats())
             return score, gen_stats, full_hash
         except NotImplementedError:
-            score, gen_stats = self._default_retrieve_score_fn(op, fields, input_record, output)
+            score, gen_stats = self._default_topk_score_fn(op, fields, input_record, output)
             return score, gen_stats, full_hash

{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.9.0
+Version: 1.1.0
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.8
-Requires-Python: >=3.10
+Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: anthropic>=0.55.0
@@ -59,15 +59,20 @@ Dynamic: license-file
 <!-- [![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b?logo=arxiv)](https://arxiv.org/pdf/2405.14696) -->
 <!-- [![Video](https://img.shields.io/badge/YouTube-Talk-red?logo=youtube)](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
-## Learn How to Use PZ
-Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
+## 📚 Learn How to Use PZ
+Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.
-## Getting started
+## 🚀 Getting started
 You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
 ```bash
 $ pip install palimpzest
 ```
+You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
+```bash
+$ uv pip install palimpzest
+```
 Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
 ```bash
 $ git clone git@github.com:mitdbg/palimpzest.git
@@ -75,7 +80,7 @@ $ cd palimpzest
 $ pip install .
 ```
-## Join the PZ Community
+## 🙋🏽 Join the PZ Community
 We are actively hacking on PZ and would love to have you join our community [![Discord](https://img.shields.io/discord/1245561987480420445?logo=discord)](https://discord.gg/dN85JJ6jaH)
 [Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
@@ -86,66 +91,8 @@ We are actively hacking on PZ and would love to have you join our community [![D
 We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
-## Quick Start
-The easiest way to get started with Palimpzest is to run the `quickstart.ipynb` jupyter notebook. We demonstrate the full workflow of working with PZ, including registering a dataset, composing and executing a pipeline, and accessing the results.
-To run the notebook, you can use the following command:
-```bash
-$ jupyter notebook
-```
-And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
-### Even Quicker Start
-For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
-```python
-import palimpzest as pz
-# define the fields we wish to compute
-email_cols = [
-    {"name": "sender", "type": str, "desc": "The email address of the sender"},
-    {"name": "subject", "type": str, "desc": "The subject of the email"},
-    {"name": "date", "type": str, "desc": "The date the email was sent"},
-]
-# lazily construct the computation to get emails about holidays sent in July
-dataset = pz.Dataset("testdata/enron-tiny/")
-dataset = dataset.sem_add_columns(email_cols)
-dataset = dataset.sem_filter("The email was sent in July")
-dataset = dataset.sem_filter("The email is about holidays")
-# execute the computation w/the MinCost policy
-config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
-output = dataset.run(config)
-# display output (if using Jupyter, otherwise use print(output_df))
-output_df = output.to_df(cols=["date", "sender", "subject"])
-display(output_df)
-```
-## Python Demos
-Below are simple instructions to run PZ on a test data set of enron emails that is included with the system.
-### Downloading test data
-To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
-```
-chmod +x testdata/download-testdata.sh
-./testdata/download-testdata.sh
-```
-### Running the Demos
-Set your OpenAI (or Together.ai) api key at the command line:
-```bash
-# set one (or both) of the following:
-export OPENAI_API_KEY=<your-api-key>
-export TOGETHER_API_KEY=<your-api-key>
-```
-Now you can run the simple test program with:
-```bash
-$ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
-```
-### Citation
-If you would like to cite our work, please use the following citation:
+### 📓 Citation
+If you would like to cite our original paper on Palimpzest, please use the following citation:
 ```
 @inproceedings{palimpzestCIDR,
     title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
@@ -154,3 +101,16 @@ If you would like to cite our work, please use the following citation:
     date = 2025,
 }
 ```
+If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
+```
+@misc{russo2025abacuscostbasedoptimizersemantic,
+      title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
+      author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
+      year={2025},
+      eprint={2505.14661},
+      archivePrefix={arXiv},
+      primaryClass={cs.DB},
+      url={https://arxiv.org/abs/2505.14661},
+}
+```

palimpzest 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

palimpzest 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl