palimpzest 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. palimpzest/constants.py +1 -0
  2. palimpzest/core/data/dataset.py +33 -5
  3. palimpzest/core/elements/groupbysig.py +10 -1
  4. palimpzest/core/elements/records.py +16 -7
  5. palimpzest/core/lib/schemas.py +20 -3
  6. palimpzest/core/models.py +10 -4
  7. palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  8. palimpzest/query/execution/execution_strategy.py +13 -11
  9. palimpzest/query/execution/mab_execution_strategy.py +40 -14
  10. palimpzest/query/execution/parallel_execution_strategy.py +31 -7
  11. palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
  12. palimpzest/query/generators/generators.py +1 -1
  13. palimpzest/query/operators/__init__.py +7 -6
  14. palimpzest/query/operators/aggregate.py +110 -5
  15. palimpzest/query/operators/convert.py +1 -1
  16. palimpzest/query/operators/join.py +279 -23
  17. palimpzest/query/operators/logical.py +20 -8
  18. palimpzest/query/operators/mixture_of_agents.py +3 -1
  19. palimpzest/query/operators/physical.py +5 -2
  20. palimpzest/query/operators/rag.py +5 -4
  21. palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
  22. palimpzest/query/optimizer/__init__.py +7 -3
  23. palimpzest/query/optimizer/cost_model.py +5 -5
  24. palimpzest/query/optimizer/optimizer.py +3 -2
  25. palimpzest/query/optimizer/plan.py +2 -3
  26. palimpzest/query/optimizer/rules.py +31 -11
  27. palimpzest/query/optimizer/tasks.py +4 -4
  28. palimpzest/query/processor/config.py +1 -0
  29. palimpzest/utils/progress.py +51 -23
  30. palimpzest/validator/validator.py +7 -7
  31. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/METADATA +26 -66
  32. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/RECORD +35 -35
  33. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/WHEEL +0 -0
  34. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/licenses/LICENSE +0 -0
  35. {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,7 @@ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, Recor
17
17
  from palimpzest.query.operators.physical import PhysicalOperator
18
18
 
19
19
 
20
- class RetrieveOp(PhysicalOperator):
20
+ class TopKOp(PhysicalOperator):
21
21
  def __init__(
22
22
  self,
23
23
  index: Collection,
@@ -29,7 +29,7 @@ class RetrieveOp(PhysicalOperator):
29
29
  **kwargs,
30
30
  ) -> None:
31
31
  """
32
- Initialize the RetrieveOp object.
32
+ Initialize the TopKOp object.
33
33
 
34
34
  Args:
35
35
  index (Collection): The PZ index to use for retrieval.
@@ -59,7 +59,7 @@ class RetrieveOp(PhysicalOperator):
59
59
 
60
60
  def __str__(self):
61
61
  op = super().__str__()
62
- op += f" Retrieve: {self.index.__class__.__name__} with top {self.k}\n"
62
+ op += f" Top-K: {self.index.__class__.__name__} with k={self.k}\n"
63
63
  return op
64
64
 
65
65
  def get_id_params(self):
@@ -89,8 +89,8 @@ class RetrieveOp(PhysicalOperator):
89
89
 
90
90
  def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
91
91
  """
92
- Compute naive cost estimates for the Retrieve operation. These estimates assume
93
- that the Retrieve (1) has no cost and (2) has perfect quality.
92
+ Compute naive cost estimates for the Top-K operation. These estimates assume
93
+ that the Top-K (1) has negligible cost and (2) has perfect quality.
94
94
  """
95
95
  return OperatorCostEstimates(
96
96
  cardinality=source_op_cost_estimates.cardinality,
@@ -101,7 +101,7 @@ class RetrieveOp(PhysicalOperator):
101
101
 
102
102
  def default_search_func(self, index: Collection, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
103
103
  """
104
- Default search function for the Retrieve operation. This function uses the index to
104
+ Default search function for the Top-K operation. This function uses the index to
105
105
  retrieve the top-k results for the given query. The query will be a (possibly singleton)
106
106
  list of strings or a list of lists of floats (i.e., embeddings). The function will return
107
107
  the top-k results per-query in (descending) sorted order. If the input is a singleton list,
@@ -111,7 +111,7 @@ class RetrieveOp(PhysicalOperator):
111
111
  Args:
112
112
  index (PZIndex): The index to use for retrieval.
113
113
  query (list[str] | list[list[float]]): The query (or queries) to search for.
114
- k (int): The maximum number of results the retrieve operator will return.
114
+ k (int): The maximum number of results the top-k operator will return.
115
115
 
116
116
  Returns:
117
117
  list[str] | list[list[str]]: The top results in (descending) sorted order per query.
@@ -260,10 +260,10 @@ class RetrieveOp(PhysicalOperator):
260
260
  top_results = self.search_func(self.index, inputs, self.k)
261
261
 
262
262
  except Exception:
263
- top_results = ["error-in-retrieve"]
264
- os.makedirs("retrieve-errors", exist_ok=True)
263
+ top_results = ["error-in-topk"]
264
+ os.makedirs("topk-errors", exist_ok=True)
265
265
  ts = time.time()
266
- with open(f"retrieve-errors/error-{ts}.txt", "w") as f:
266
+ with open(f"topk-errors/error-{ts}.txt", "w") as f:
267
267
  f.write(str(query))
268
268
 
269
269
  # TODO: the user is always right! let's drop this post-processing in the future
@@ -39,10 +39,10 @@ from palimpzest.query.optimizer.rules import (
39
39
  RAGRule as _RAGRule,
40
40
  )
41
41
  from palimpzest.query.optimizer.rules import (
42
- ReorderConverts as _ReorderConverts,
42
+ RelationalJoinRule as _RelationalJoinRule,
43
43
  )
44
44
  from palimpzest.query.optimizer.rules import (
45
- RetrieveRule as _RetrieveRule,
45
+ ReorderConverts as _ReorderConverts,
46
46
  )
47
47
  from palimpzest.query.optimizer.rules import (
48
48
  Rule as _Rule,
@@ -53,6 +53,9 @@ from palimpzest.query.optimizer.rules import (
53
53
  from palimpzest.query.optimizer.rules import (
54
54
  SplitRule as _SplitRule,
55
55
  )
56
+ from palimpzest.query.optimizer.rules import (
57
+ TopKRule as _TopKRule,
58
+ )
56
59
  from palimpzest.query.optimizer.rules import (
57
60
  TransformationRule as _TransformationRule,
58
61
  )
@@ -72,8 +75,9 @@ ALL_RULES = [
72
75
  _NonLLMFilterRule,
73
76
  _PushDownFilter,
74
77
  _RAGRule,
78
+ _RelationalJoinRule,
75
79
  _ReorderConverts,
76
- _RetrieveRule,
80
+ _TopKRule,
77
81
  _Rule,
78
82
  _SemanticAggregateRule,
79
83
  _SplitRule,
@@ -131,17 +131,17 @@ class SampleBasedCostModel:
131
131
  # compute selectivity
132
132
  selectivity = physical_op_df.passed_operator.sum() / num_source_records
133
133
 
134
+ # compute quality; if all qualities are None then this will be NaN
135
+ quality = physical_op_df.quality.mean()
136
+
137
+ # set operator stats for this physical operator
134
138
  operator_to_stats[unique_logical_op_id][full_op_id] = {
135
139
  "cost": physical_op_df.cost_per_record.mean(),
136
140
  "time": physical_op_df.time_per_record.mean(),
137
- "quality": physical_op_df.quality.mean(),
141
+ "quality": 1.0 if pd.isna(quality) else quality,
138
142
  "selectivity": selectivity,
139
143
  }
140
144
 
141
- # if this is an experiment, log the dataframe and operator_to_stats dictionary
142
- if self.exp_name is not None:
143
- operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
144
-
145
145
  logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
146
146
  return operator_to_stats
147
147
 
@@ -284,10 +284,11 @@ class Optimizer:
284
284
  all_properties["filters"] = set([op_filter_str])
285
285
 
286
286
  elif isinstance(op, JoinOp):
287
+ unique_join_str = str(sorted(op.on)) if op.condition is None else op.condition
287
288
  if "joins" in all_properties:
288
- all_properties["joins"].add(op.condition)
289
+ all_properties["joins"].add(unique_join_str)
289
290
  else:
290
- all_properties["joins"] = set([op.condition])
291
+ all_properties["joins"] = set([unique_join_str])
291
292
 
292
293
  elif isinstance(op, LimitScan):
293
294
  op_limit_str = op.get_logical_op_id()
@@ -203,9 +203,8 @@ class PhysicalPlan(Plan):
203
203
  # return the current index and the upstream unique full_op_ids for this operator
204
204
  return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]
205
205
 
206
- def get_upstream_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
207
- """Return the list of unique full_op_ids for the upstream operators of this operator."""
208
- unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
206
+ def get_upstream_unique_full_op_ids(self, unique_full_op_id: str) -> list[str]:
207
+ """Return the list of unique full_op_ids for the upstream operators of the operator specified by `unique_full_op_id`."""
209
208
  return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]
210
209
 
211
210
  def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
@@ -19,13 +19,14 @@ from palimpzest.query.operators.aggregate import (
19
19
  MaxAggregateOp,
20
20
  MinAggregateOp,
21
21
  SemanticAggregate,
22
+ SumAggregateOp,
22
23
  )
23
24
  from palimpzest.query.operators.compute import SmolAgentsCompute
24
25
  from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
25
26
  from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert, CritiqueAndRefineFilter
26
27
  from palimpzest.query.operators.distinct import DistinctOp
27
28
  from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
28
- from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin
29
+ from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin, RelationalJoin
29
30
  from palimpzest.query.operators.limit import LimitScanOp
30
31
  from palimpzest.query.operators.logical import (
31
32
  Aggregate,
@@ -39,19 +40,19 @@ from palimpzest.query.operators.logical import (
39
40
  JoinOp,
40
41
  LimitScan,
41
42
  Project,
42
- RetrieveScan,
43
43
  SearchOperator,
44
+ TopKScan,
44
45
  )
45
46
  from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert, MixtureOfAgentsFilter
46
47
  from palimpzest.query.operators.physical import PhysicalOperator
47
48
  from palimpzest.query.operators.project import ProjectOp
48
49
  from palimpzest.query.operators.rag import RAGConvert, RAGFilter
49
- from palimpzest.query.operators.retrieve import RetrieveOp
50
50
  from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
51
51
  from palimpzest.query.operators.search import (
52
52
  SmolAgentsSearch, # SmolAgentsCustomManagedSearch, # SmolAgentsManagedSearch
53
53
  )
54
54
  from palimpzest.query.operators.split import SplitConvert, SplitFilter
55
+ from palimpzest.query.operators.topk import TopKOp
55
56
  from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression
56
57
 
57
58
  logger = logging.getLogger(__name__)
@@ -796,26 +797,26 @@ class SplitRule(ImplementationRule):
796
797
  return cls._perform_substitution(logical_expression, phys_op_cls, runtime_kwargs, variable_op_kwargs)
797
798
 
798
799
 
799
- class RetrieveRule(ImplementationRule):
800
+ class TopKRule(ImplementationRule):
800
801
  """
801
- Substitute a logical expression for a RetrieveScan with a Retrieve physical implementation.
802
+ Substitute a logical expression for a TopKScan with a TopK physical implementation.
802
803
  """
803
804
  k_budgets = [1, 3, 5, 10, 15, 20, 25]
804
805
 
805
806
  @classmethod
806
807
  def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
807
- is_match = isinstance(logical_expression.operator, RetrieveScan)
808
- logger.debug(f"RetrieveRule matches_pattern: {is_match} for {logical_expression}")
808
+ is_match = isinstance(logical_expression.operator, TopKScan)
809
+ logger.debug(f"TopKRule matches_pattern: {is_match} for {logical_expression}")
809
810
  return is_match
810
811
 
811
812
  @classmethod
812
813
  def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
813
- logger.debug(f"Substituting RetrieveRule for {logical_expression}")
814
+ logger.debug(f"Substituting TopKRule for {logical_expression}")
814
815
 
815
816
  # create variable physical operator kwargs for each model which can implement this logical_expression
816
817
  ks = cls.k_budgets if logical_expression.operator.k == -1 else [logical_expression.operator.k]
817
818
  variable_op_kwargs = [{"k": k} for k in ks]
818
- return cls._perform_substitution(logical_expression, RetrieveOp, runtime_kwargs, variable_op_kwargs)
819
+ return cls._perform_substitution(logical_expression, TopKOp, runtime_kwargs, variable_op_kwargs)
819
820
 
820
821
 
821
822
  class NonLLMFilterRule(ImplementationRule):
@@ -867,6 +868,23 @@ class LLMFilterRule(ImplementationRule):
867
868
  return cls._perform_substitution(logical_expression, LLMFilter, runtime_kwargs, variable_op_kwargs)
868
869
 
869
870
 
871
+ class RelationalJoinRule(ImplementationRule):
872
+ """
873
+ Substitute a logical expression for a JoinOp with a RelationalJoin physical implementation.
874
+ """
875
+
876
+ @classmethod
877
+ def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
878
+ is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition == ""
879
+ logger.debug(f"RelationalJoinRule matches_pattern: {is_match} for {logical_expression}")
880
+ return is_match
881
+
882
+ @classmethod
883
+ def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
884
+ logger.debug(f"Substituting RelationalJoinRule for {logical_expression}")
885
+ return cls._perform_substitution(logical_expression, RelationalJoin, runtime_kwargs)
886
+
887
+
870
888
  class NestedLoopsJoinRule(ImplementationRule):
871
889
  """
872
890
  Substitute a logical expression for a JoinOp with an (LLM) NestedLoopsJoin physical implementation.
@@ -874,7 +892,7 @@ class NestedLoopsJoinRule(ImplementationRule):
874
892
 
875
893
  @classmethod
876
894
  def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
877
- is_match = isinstance(logical_expression.operator, JoinOp)
895
+ is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != ""
878
896
  logger.debug(f"NestedLoopsJoinRule matches_pattern: {is_match} for {logical_expression}")
879
897
  return is_match
880
898
 
@@ -906,7 +924,7 @@ class EmbeddingJoinRule(ImplementationRule):
906
924
 
907
925
  @classmethod
908
926
  def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
909
- is_match = isinstance(logical_expression.operator, JoinOp) and not cls._is_audio_operation(logical_expression)
927
+ is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != "" and not cls._is_audio_operation(logical_expression)
910
928
  logger.debug(f"EmbeddingJoinRule matches_pattern: {is_match} for {logical_expression}")
911
929
  return is_match
912
930
 
@@ -982,6 +1000,8 @@ class AggregateRule(ImplementationRule):
982
1000
  physical_op_class = CountAggregateOp
983
1001
  elif logical_expression.operator.agg_func == AggFunc.AVERAGE:
984
1002
  physical_op_class = AverageAggregateOp
1003
+ elif logical_expression.operator.agg_func == AggFunc.SUM:
1004
+ physical_op_class = SumAggregateOp
985
1005
  elif logical_expression.operator.agg_func == AggFunc.MIN:
986
1006
  physical_op_class = MinAggregateOp
987
1007
  elif logical_expression.operator.agg_func == AggFunc.MAX:
@@ -501,8 +501,8 @@ class OptimizePhysicalExpression(Task):
501
501
 
502
502
  # compute the total cost for this physical expression by summing its operator's PlanCost
503
503
  # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
504
- execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
505
- full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy)
504
+ execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
505
+ full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy_str)
506
506
  full_plan_cost.op_estimates = op_plan_cost.op_estimates
507
507
  all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))
508
508
 
@@ -570,8 +570,8 @@ class OptimizePhysicalExpression(Task):
570
570
 
571
571
  # compute the total cost for this physical expression by summing its operator's PlanCost
572
572
  # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
573
- execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
574
- full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy)
573
+ execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
574
+ full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy_str)
575
575
  full_plan_cost.op_estimates = op_plan_cost.op_estimates
576
576
 
577
577
  else:
@@ -44,6 +44,7 @@ class QueryProcessorConfig(BaseModel):
44
44
  k: int = Field(default=6)
45
45
  j: int = Field(default=4)
46
46
  sample_budget: int = Field(default=100)
47
+ sample_cost_budget: float | None = Field(default=None)
47
48
  seed: int = Field(default=42)
48
49
  exp_name: str | None = Field(default=None)
49
50
  priors: dict | None = Field(default=None)
@@ -24,7 +24,7 @@ from palimpzest.query.operators.filter import LLMFilter
24
24
  from palimpzest.query.operators.join import JoinOp
25
25
  from palimpzest.query.operators.limit import LimitScanOp
26
26
  from palimpzest.query.operators.physical import PhysicalOperator
27
- from palimpzest.query.operators.retrieve import RetrieveOp
27
+ from palimpzest.query.operators.topk import TopKOp
28
28
  from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
29
29
 
30
30
 
@@ -225,20 +225,22 @@ class PZProgressManager(ProgressManager):
225
225
  current_unique_full_op_id = unique_full_op_id
226
226
  next_op, next_unique_full_op_id = self.unique_full_op_id_to_next_op_and_id[unique_full_op_id]
227
227
  while next_op is not None:
228
- if not isinstance(next_op, (AggregateOp, LimitScanOp)):
229
- next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
230
- multiplier = 1
231
- if isinstance(next_op, JoinOp):
232
- # for joins, scale the delta by the number of inputs from the other side of the join
233
- left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
234
- if current_unique_full_op_id == left_input_unique_full_op_id:
235
- multiplier = self.get_task_total(right_input_unique_input_op_id)
236
- elif current_unique_full_op_id == right_input_unique_input_op_id:
237
- multiplier = self.get_task_total(left_input_unique_full_op_id)
238
- else:
239
- raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
240
- delta_adjusted = delta * multiplier
241
- self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
228
+ if isinstance(next_op, (AggregateOp, LimitScanOp)):
229
+ break
230
+
231
+ next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
232
+ multiplier = 1
233
+ if isinstance(next_op, JoinOp):
234
+ # for joins, scale the delta by the number of inputs from the other side of the join
235
+ left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
236
+ if current_unique_full_op_id == left_input_unique_full_op_id:
237
+ multiplier = self.get_task_total(right_input_unique_input_op_id)
238
+ elif current_unique_full_op_id == right_input_unique_input_op_id:
239
+ multiplier = self.get_task_total(left_input_unique_full_op_id)
240
+ else:
241
+ raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
242
+ delta_adjusted = delta * multiplier
243
+ self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
242
244
 
243
245
  # move to the next operator in the plan
244
246
  current_unique_full_op_id = next_unique_full_op_id
@@ -281,7 +283,7 @@ class PZProgressManager(ProgressManager):
281
283
  self.unique_full_op_id_to_stats[unique_full_op_id].memory_usage_mb = get_memory_usage()
282
284
 
283
285
  class PZSentinelProgressManager(ProgressManager):
284
- def __init__(self, plan: SentinelPlan, sample_budget: int):
286
+ def __init__(self, plan: SentinelPlan, sample_budget: int | None, sample_cost_budget: float | None):
285
287
  # overall progress bar
286
288
  self.overall_progress = RichProgress(
287
289
  SpinnerColumn(),
@@ -296,7 +298,9 @@ class PZSentinelProgressManager(ProgressManager):
296
298
  refresh_per_second=10,
297
299
  expand=True, # Use full width
298
300
  )
299
- self.overall_task_id = self.overall_progress.add_task("", total=sample_budget, cost=0.0, recent="")
301
+ self.use_cost_budget = sample_cost_budget is not None
302
+ total = sample_cost_budget if self.use_cost_budget else sample_budget
303
+ self.overall_task_id = self.overall_progress.add_task("", total=total, cost=0.0, recent="")
300
304
 
301
305
  # logical operator progress bars
302
306
  self.op_progress = RichProgress(
@@ -332,6 +336,9 @@ class PZSentinelProgressManager(ProgressManager):
332
336
  # initialize start time
333
337
  self.start_time = None
334
338
 
339
+ # initialize validation cost
340
+ self.validation_cost = 0.0
341
+
335
342
  # add a task to the progress manager for each operator in the plan
336
343
  for topo_idx, (logical_op_id, op_set) in enumerate(plan):
337
344
  unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
@@ -348,9 +355,9 @@ class PZSentinelProgressManager(ProgressManager):
348
355
  def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
349
356
  is_llm_convert = isinstance(physical_op, LLMConvert)
350
357
  is_llm_filter = isinstance(physical_op, LLMFilter)
351
- is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
358
+ is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
352
359
  is_llm_join = isinstance(physical_op, JoinOp)
353
- return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join
360
+ return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join
354
361
 
355
362
  def get_task_description(self, unique_logical_op_id: str) -> str:
356
363
  """Return the current description for the given task."""
@@ -385,15 +392,34 @@ class PZSentinelProgressManager(ProgressManager):
385
392
  # start progress bars
386
393
  self.live_display.start()
387
394
 
395
+ def incr_overall_progress_cost(self, cost_delta: float):
396
+ """Advance the overall progress bar by the given cost delta"""
397
+ self.validation_cost += cost_delta
398
+ self.overall_progress.update(
399
+ self.overall_task_id,
400
+ advance=cost_delta,
401
+ cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
402
+ refresh=True,
403
+ )
404
+
405
+ # force the live display to refresh
406
+ self.live_display.refresh()
407
+
388
408
  def incr(self, unique_logical_op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
389
409
  # TODO: (above) organize progress bars into a Live / Table / Panel or something
390
410
  # get the task for the given operation
391
411
  task = self.unique_logical_op_id_to_task.get(unique_logical_op_id)
392
412
 
413
+ # store the cost before updating stats
414
+ previous_total_cost = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost
415
+
393
416
  # update statistics with any additional keyword arguments
394
417
  if kwargs != {}:
395
418
  self.update_stats(unique_logical_op_id, **kwargs)
396
419
 
420
+ # compute the cost delta
421
+ cost_delta = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost - previous_total_cost
422
+
397
423
  # update progress bar and recent text in one update
398
424
  if display_text is not None:
399
425
  self.unique_logical_op_id_to_stats[unique_logical_op_id].recent_text = display_text
@@ -412,10 +438,11 @@ class PZSentinelProgressManager(ProgressManager):
412
438
  )
413
439
 
414
440
  # advance the overall progress bar
441
+ advance = cost_delta if self.use_cost_budget else num_samples
415
442
  self.overall_progress.update(
416
443
  self.overall_task_id,
417
- advance=num_samples,
418
- cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()),
444
+ advance=advance,
445
+ cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
419
446
  refresh=True,
420
447
  )
421
448
 
@@ -449,6 +476,7 @@ def create_progress_manager(
449
476
  plan: PhysicalPlan | SentinelPlan,
450
477
  num_samples: int | None = None,
451
478
  sample_budget: int | None = None,
479
+ sample_cost_budget: float | None = None,
452
480
  progress: bool = True,
453
481
  ) -> ProgressManager:
454
482
  """Factory function to create appropriate progress manager based on environment"""
@@ -456,7 +484,7 @@ def create_progress_manager(
456
484
  return MockProgressManager(plan, num_samples)
457
485
 
458
486
  if isinstance(plan, SentinelPlan):
459
- assert sample_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
460
- return PZSentinelProgressManager(plan, sample_budget)
487
+ assert sample_budget is not None or sample_cost_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
488
+ return PZSentinelProgressManager(plan, sample_budget, sample_cost_budget)
461
489
 
462
490
  return PZProgressManager(plan, num_samples)
@@ -19,7 +19,7 @@ from palimpzest.query.generators.generators import get_json_from_answer
19
19
  from palimpzest.query.operators.convert import LLMConvert
20
20
  from palimpzest.query.operators.filter import LLMFilter
21
21
  from palimpzest.query.operators.join import JoinOp
22
- from palimpzest.query.operators.retrieve import RetrieveOp
22
+ from palimpzest.query.operators.topk import TopKOp
23
23
 
24
24
 
25
25
  class Validator:
@@ -47,7 +47,7 @@ class Validator:
47
47
  def join_score_fn(self, condition: str, left_input_record: dict, right_input_record: dict, output: bool) -> float | None:
48
48
  raise NotImplementedError("Validator.join_score_fn not implemented.")
49
49
 
50
- def retrieve_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
50
+ def topk_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
51
51
  raise NotImplementedError("Validator.map_score_fn not implemented.")
52
52
 
53
53
  def _get_gen_stats_from_completion(self, completion, start_time: float) -> GenerationStats:
@@ -218,11 +218,11 @@ class Validator:
218
218
 
219
219
  return score, gen_stats
220
220
 
221
- def _default_retrieve_score_fn(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
221
+ def _default_topk_score_fn(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
222
222
  """
223
223
  Compute the quality of the generated output for the given fields and input_record.
224
224
  """
225
- # TODO: retrieve k=25; score each item based on relevance; compute F1
225
+ # TODO: top-k k=25; score each item based on relevance; compute F1
226
226
  # TODO: support retrieval over images
227
227
  # create prompt factory
228
228
  factory = PromptFactory(PromptStrategy.MAP, self.model, Cardinality.ONE_TO_ONE)
@@ -294,11 +294,11 @@ class Validator:
294
294
  score, gen_stats = self._default_join_score_fn(op, condition, left_input_record, right_input_record, output)
295
295
  return score, gen_stats, full_hash
296
296
 
297
- def _score_retrieve(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
297
+ def _score_topk(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
298
298
  try:
299
- out = self.retrieve_score_fn(fields, input_record.to_dict(), output)
299
+ out = self.topk_score_fn(fields, input_record.to_dict(), output)
300
300
  score, gen_stats = out if isinstance(out, tuple) else (out, GenerationStats())
301
301
  return score, gen_stats, full_hash
302
302
  except NotImplementedError:
303
- score, gen_stats = self._default_retrieve_score_fn(op, fields, input_record, output)
303
+ score, gen_stats = self._default_topk_score_fn(op, fields, input_record, output)
304
304
  return score, gen_stats, full_hash
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: palimpzest
3
- Version: 0.9.0
3
+ Version: 1.1.0
4
4
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
5
5
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
6
6
  Project-URL: homepage, https://palimpzest.org
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
13
13
  Classifier: Programming Language :: Python :: 3
14
14
  Classifier: Programming Language :: Python :: 3.8
15
- Requires-Python: >=3.10
15
+ Requires-Python: >=3.12
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: anthropic>=0.55.0
@@ -59,15 +59,20 @@ Dynamic: license-file
59
59
  <!-- [![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b?logo=arxiv)](https://arxiv.org/pdf/2405.14696) -->
60
60
  <!-- [![Video](https://img.shields.io/badge/YouTube-Talk-red?logo=youtube)](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
61
61
 
62
- ## Learn How to Use PZ
63
- Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
62
+ ## 📚 Learn How to Use PZ
63
+ Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.
64
64
 
65
- ## Getting started
65
+ ## 🚀 Getting started
66
66
  You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
67
67
  ```bash
68
68
  $ pip install palimpzest
69
69
  ```
70
70
 
71
+ You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
72
+ ```bash
73
+ $ uv pip install palimpzest
74
+ ```
75
+
71
76
  Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
72
77
  ```bash
73
78
  $ git clone git@github.com:mitdbg/palimpzest.git
@@ -75,7 +80,7 @@ $ cd palimpzest
75
80
  $ pip install .
76
81
  ```
77
82
 
78
- ## Join the PZ Community
83
+ ## 🙋🏽 Join the PZ Community
79
84
  We are actively hacking on PZ and would love to have you join our community [![Discord](https://img.shields.io/discord/1245561987480420445?logo=discord)](https://discord.gg/dN85JJ6jaH)
80
85
 
81
86
  [Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
@@ -86,66 +91,8 @@ We are actively hacking on PZ and would love to have you join our community [![D
86
91
 
87
92
  We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
88
93
 
89
- ## Quick Start
90
- The easiest way to get started with Palimpzest is to run the `quickstart.ipynb` Jupyter notebook. We demonstrate the full workflow of working with PZ, including registering a dataset, composing and executing a pipeline, and accessing the results.
91
- To run the notebook, you can use the following command:
92
- ```bash
93
- $ jupyter notebook
94
- ```
95
- Then access the notebook from the Jupyter interface in your browser at `localhost:8888`.
96
-
97
- ### Even Quicker Start
98
- For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
99
- ```python
100
- import palimpzest as pz
101
-
102
- # define the fields we wish to compute
103
- email_cols = [
104
- {"name": "sender", "type": str, "desc": "The email address of the sender"},
105
- {"name": "subject", "type": str, "desc": "The subject of the email"},
106
- {"name": "date", "type": str, "desc": "The date the email was sent"},
107
- ]
108
-
109
- # lazily construct the computation to get emails about holidays sent in July
110
- dataset = pz.Dataset("testdata/enron-tiny/")
111
- dataset = dataset.sem_add_columns(email_cols)
112
- dataset = dataset.sem_filter("The email was sent in July")
113
- dataset = dataset.sem_filter("The email is about holidays")
114
-
115
- # execute the computation w/the MinCost policy
116
- config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
117
- output = dataset.run(config)
118
-
119
- # display output (if using Jupyter, otherwise use print(output_df))
120
- output_df = output.to_df(cols=["date", "sender", "subject"])
121
- display(output_df)
122
- ```
123
-
124
- ## Python Demos
125
- Below are simple instructions to run PZ on a test data set of Enron emails that is included with the system.
126
-
127
- ### Downloading test data
128
- To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a Unix terminal (requires `wget` and `tar`):
129
- ```
130
- chmod +x testdata/download-testdata.sh
131
- ./testdata/download-testdata.sh
132
- ```
133
-
134
- ### Running the Demos
135
- Set your OpenAI (or Together.ai) API key at the command line:
136
- ```bash
137
- # set one (or both) of the following:
138
- export OPENAI_API_KEY=<your-api-key>
139
- export TOGETHER_API_KEY=<your-api-key>
140
- ```
141
-
142
- Now you can run the simple test program with:
143
- ```bash
144
- $ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
145
- ```
146
-
147
- ### Citation
148
- If you would like to cite our work, please use the following citation:
94
+ ### 📓 Citation
95
+ If you would like to cite our original paper on Palimpzest, please use the following citation:
149
96
  ```
150
97
  @inproceedings{palimpzestCIDR,
151
98
  title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
@@ -154,3 +101,16 @@ If you would like to cite our work, please use the following citation:
154
101
  date = 2025,
155
102
  }
156
103
  ```
104
+
105
+ If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
106
+ ```
107
+ @misc{russo2025abacuscostbasedoptimizersemantic,
108
+ title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
109
+ author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
110
+ year={2025},
111
+ eprint={2505.14661},
112
+ archivePrefix={arXiv},
113
+ primaryClass={cs.DB},
114
+ url={https://arxiv.org/abs/2505.14661},
115
+ }
116
+ ```