palimpzest-0.8.7-py3-none-any.whl → palimpzest-1.0.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. palimpzest/constants.py +13 -4
  2. palimpzest/core/data/dataset.py +75 -5
  3. palimpzest/core/elements/groupbysig.py +5 -1
  4. palimpzest/core/elements/records.py +16 -7
  5. palimpzest/core/lib/schemas.py +26 -3
  6. palimpzest/core/models.py +4 -4
  7. palimpzest/prompts/aggregate_prompts.py +99 -0
  8. palimpzest/prompts/prompt_factory.py +162 -75
  9. palimpzest/prompts/utils.py +38 -1
  10. palimpzest/prompts/validator.py +24 -24
  11. palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  12. palimpzest/query/execution/execution_strategy.py +8 -8
  13. palimpzest/query/execution/mab_execution_strategy.py +30 -11
  14. palimpzest/query/execution/parallel_execution_strategy.py +31 -7
  15. palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
  16. palimpzest/query/generators/generators.py +9 -7
  17. palimpzest/query/operators/__init__.py +10 -6
  18. palimpzest/query/operators/aggregate.py +394 -10
  19. palimpzest/query/operators/convert.py +1 -1
  20. palimpzest/query/operators/join.py +279 -23
  21. palimpzest/query/operators/logical.py +36 -11
  22. palimpzest/query/operators/mixture_of_agents.py +3 -1
  23. palimpzest/query/operators/physical.py +5 -2
  24. palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
  25. palimpzest/query/optimizer/__init__.py +11 -3
  26. palimpzest/query/optimizer/cost_model.py +5 -5
  27. palimpzest/query/optimizer/optimizer.py +3 -2
  28. palimpzest/query/optimizer/plan.py +2 -3
  29. palimpzest/query/optimizer/rules.py +73 -13
  30. palimpzest/query/optimizer/tasks.py +4 -4
  31. palimpzest/utils/progress.py +19 -17
  32. palimpzest/validator/validator.py +7 -7
  33. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
  34. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/RECORD +37 -36
  35. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
  36. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,7 @@ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, Recor
17
17
  from palimpzest.query.operators.physical import PhysicalOperator
18
18
 
19
19
 
20
- class RetrieveOp(PhysicalOperator):
20
+ class TopKOp(PhysicalOperator):
21
21
  def __init__(
22
22
  self,
23
23
  index: Collection,
@@ -29,7 +29,7 @@ class RetrieveOp(PhysicalOperator):
29
29
  **kwargs,
30
30
  ) -> None:
31
31
  """
32
- Initialize the RetrieveOp object.
32
+ Initialize the TopKOp object.
33
33
 
34
34
  Args:
35
35
  index (Collection): The PZ index to use for retrieval.
@@ -59,7 +59,7 @@ class RetrieveOp(PhysicalOperator):
59
59
 
60
60
  def __str__(self):
61
61
  op = super().__str__()
62
- op += f" Retrieve: {self.index.__class__.__name__} with top {self.k}\n"
62
+ op += f" Top-K: {self.index.__class__.__name__} with k={self.k}\n"
63
63
  return op
64
64
 
65
65
  def get_id_params(self):
@@ -89,8 +89,8 @@ class RetrieveOp(PhysicalOperator):
89
89
 
90
90
  def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
91
91
  """
92
- Compute naive cost estimates for the Retrieve operation. These estimates assume
93
- that the Retrieve (1) has no cost and (2) has perfect quality.
92
+ Compute naive cost estimates for the Top-K operation. These estimates assume
93
+ that the Top-K (1) has negligible cost and (2) has perfect quality.
94
94
  """
95
95
  return OperatorCostEstimates(
96
96
  cardinality=source_op_cost_estimates.cardinality,
@@ -101,7 +101,7 @@ class RetrieveOp(PhysicalOperator):
101
101
 
102
102
  def default_search_func(self, index: Collection, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
103
103
  """
104
- Default search function for the Retrieve operation. This function uses the index to
104
+ Default search function for the Top-K operation. This function uses the index to
105
105
  retrieve the top-k results for the given query. The query will be a (possibly singleton)
106
106
  list of strings or a list of lists of floats (i.e., embeddings). The function will return
107
107
  the top-k results per-query in (descending) sorted order. If the input is a singleton list,
@@ -111,7 +111,7 @@ class RetrieveOp(PhysicalOperator):
111
111
  Args:
112
112
  index (PZIndex): The index to use for retrieval.
113
113
  query (list[str] | list[list[float]]): The query (or queries) to search for.
114
- k (int): The maximum number of results the retrieve operator will return.
114
+ k (int): The maximum number of results the top-k operator will return.
115
115
 
116
116
  Returns:
117
117
  list[str] | list[list[str]]: The top results in (descending) sorted order per query.
@@ -260,10 +260,10 @@ class RetrieveOp(PhysicalOperator):
260
260
  top_results = self.search_func(self.index, inputs, self.k)
261
261
 
262
262
  except Exception:
263
- top_results = ["error-in-retrieve"]
264
- os.makedirs("retrieve-errors", exist_ok=True)
263
+ top_results = ["error-in-topk"]
264
+ os.makedirs("topk-errors", exist_ok=True)
265
265
  ts = time.time()
266
- with open(f"retrieve-errors/error-{ts}.txt", "w") as f:
266
+ with open(f"topk-errors/error-{ts}.txt", "w") as f:
267
267
  f.write(str(query))
268
268
 
269
269
  # TODO: the user is always right! let's drop this post-processing in the future
@@ -39,17 +39,23 @@ from palimpzest.query.optimizer.rules import (
39
39
  RAGRule as _RAGRule,
40
40
  )
41
41
  from palimpzest.query.optimizer.rules import (
42
- ReorderConverts as _ReorderConverts,
42
+ RelationalJoinRule as _RelationalJoinRule,
43
43
  )
44
44
  from palimpzest.query.optimizer.rules import (
45
- RetrieveRule as _RetrieveRule,
45
+ ReorderConverts as _ReorderConverts,
46
46
  )
47
47
  from palimpzest.query.optimizer.rules import (
48
48
  Rule as _Rule,
49
49
  )
50
+ from palimpzest.query.optimizer.rules import (
51
+ SemanticAggregateRule as _SemanticAggregateRule,
52
+ )
50
53
  from palimpzest.query.optimizer.rules import (
51
54
  SplitRule as _SplitRule,
52
55
  )
56
+ from palimpzest.query.optimizer.rules import (
57
+ TopKRule as _TopKRule,
58
+ )
53
59
  from palimpzest.query.optimizer.rules import (
54
60
  TransformationRule as _TransformationRule,
55
61
  )
@@ -69,9 +75,11 @@ ALL_RULES = [
69
75
  _NonLLMFilterRule,
70
76
  _PushDownFilter,
71
77
  _RAGRule,
78
+ _RelationalJoinRule,
72
79
  _ReorderConverts,
73
- _RetrieveRule,
80
+ _TopKRule,
74
81
  _Rule,
82
+ _SemanticAggregateRule,
75
83
  _SplitRule,
76
84
  _TransformationRule,
77
85
  ]
@@ -131,17 +131,17 @@ class SampleBasedCostModel:
131
131
  # compute selectivity
132
132
  selectivity = physical_op_df.passed_operator.sum() / num_source_records
133
133
 
134
+ # compute quality; if all qualities are None then this will be NaN
135
+ quality = physical_op_df.quality.mean()
136
+
137
+ # set operator stats for this physical operator
134
138
  operator_to_stats[unique_logical_op_id][full_op_id] = {
135
139
  "cost": physical_op_df.cost_per_record.mean(),
136
140
  "time": physical_op_df.time_per_record.mean(),
137
- "quality": physical_op_df.quality.mean(),
141
+ "quality": 1.0 if pd.isna(quality) else quality,
138
142
  "selectivity": selectivity,
139
143
  }
140
144
 
141
- # if this is an experiment, log the dataframe and operator_to_stats dictionary
142
- if self.exp_name is not None:
143
- operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
144
-
145
145
  logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
146
146
  return operator_to_stats
147
147
 
@@ -284,10 +284,11 @@ class Optimizer:
284
284
  all_properties["filters"] = set([op_filter_str])
285
285
 
286
286
  elif isinstance(op, JoinOp):
287
+ unique_join_str = str(sorted(op.on)) if op.condition is None else op.condition
287
288
  if "joins" in all_properties:
288
- all_properties["joins"].add(op.condition)
289
+ all_properties["joins"].add(unique_join_str)
289
290
  else:
290
- all_properties["joins"] = set([op.condition])
291
+ all_properties["joins"] = set([unique_join_str])
291
292
 
292
293
  elif isinstance(op, LimitScan):
293
294
  op_limit_str = op.get_logical_op_id()
@@ -203,9 +203,8 @@ class PhysicalPlan(Plan):
203
203
  # return the current index and the upstream unique full_op_ids for this operator
204
204
  return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]
205
205
 
206
- def get_upstream_unique_full_op_ids(self, topo_idx: int, operator: PhysicalOperator) -> list[str]:
207
- """Return the list of unique full_op_ids for the upstream operators of this operator."""
208
- unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
206
+ def get_upstream_unique_full_op_ids(self, unique_full_op_id: str) -> list[str]:
207
+ """Return the list of unique full_op_ids for the upstream operators of the operator specified by `unique_full_op_id`."""
209
208
  return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]
210
209
 
211
210
  def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
@@ -12,13 +12,21 @@ from palimpzest.core.lib.schemas import (
12
12
  IMAGE_LIST_FIELD_TYPES,
13
13
  )
14
14
  from palimpzest.prompts import CONTEXT_SEARCH_PROMPT
15
- from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
15
+ from palimpzest.query.operators.aggregate import (
16
+ ApplyGroupByOp,
17
+ AverageAggregateOp,
18
+ CountAggregateOp,
19
+ MaxAggregateOp,
20
+ MinAggregateOp,
21
+ SemanticAggregate,
22
+ SumAggregateOp,
23
+ )
16
24
  from palimpzest.query.operators.compute import SmolAgentsCompute
17
25
  from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
18
26
  from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert, CritiqueAndRefineFilter
19
27
  from palimpzest.query.operators.distinct import DistinctOp
20
28
  from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
21
- from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin
29
+ from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin, RelationalJoin
22
30
  from palimpzest.query.operators.limit import LimitScanOp
23
31
  from palimpzest.query.operators.logical import (
24
32
  Aggregate,
@@ -32,19 +40,19 @@ from palimpzest.query.operators.logical import (
32
40
  JoinOp,
33
41
  LimitScan,
34
42
  Project,
35
- RetrieveScan,
36
43
  SearchOperator,
44
+ TopKScan,
37
45
  )
38
46
  from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert, MixtureOfAgentsFilter
39
47
  from palimpzest.query.operators.physical import PhysicalOperator
40
48
  from palimpzest.query.operators.project import ProjectOp
41
49
  from palimpzest.query.operators.rag import RAGConvert, RAGFilter
42
- from palimpzest.query.operators.retrieve import RetrieveOp
43
50
  from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
44
51
  from palimpzest.query.operators.search import (
45
52
  SmolAgentsSearch, # SmolAgentsCustomManagedSearch, # SmolAgentsManagedSearch
46
53
  )
47
54
  from palimpzest.query.operators.split import SplitConvert, SplitFilter
55
+ from palimpzest.query.operators.topk import TopKOp
48
56
  from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression
49
57
 
50
58
  logger = logging.getLogger(__name__)
@@ -789,26 +797,26 @@ class SplitRule(ImplementationRule):
789
797
  return cls._perform_substitution(logical_expression, phys_op_cls, runtime_kwargs, variable_op_kwargs)
790
798
 
791
799
 
792
- class RetrieveRule(ImplementationRule):
800
+ class TopKRule(ImplementationRule):
793
801
  """
794
- Substitute a logical expression for a RetrieveScan with a Retrieve physical implementation.
802
+ Substitute a logical expression for a TopKScan with a TopK physical implementation.
795
803
  """
796
804
  k_budgets = [1, 3, 5, 10, 15, 20, 25]
797
805
 
798
806
  @classmethod
799
807
  def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
800
- is_match = isinstance(logical_expression.operator, RetrieveScan)
801
- logger.debug(f"RetrieveRule matches_pattern: {is_match} for {logical_expression}")
808
+ is_match = isinstance(logical_expression.operator, TopKScan)
809
+ logger.debug(f"TopKRule matches_pattern: {is_match} for {logical_expression}")
802
810
  return is_match
803
811
 
804
812
  @classmethod
805
813
  def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
806
- logger.debug(f"Substituting RetrieveRule for {logical_expression}")
814
+ logger.debug(f"Substituting TopKRule for {logical_expression}")
807
815
 
808
816
  # create variable physical operator kwargs for each model which can implement this logical_expression
809
817
  ks = cls.k_budgets if logical_expression.operator.k == -1 else [logical_expression.operator.k]
810
818
  variable_op_kwargs = [{"k": k} for k in ks]
811
- return cls._perform_substitution(logical_expression, RetrieveOp, runtime_kwargs, variable_op_kwargs)
819
+ return cls._perform_substitution(logical_expression, TopKOp, runtime_kwargs, variable_op_kwargs)
812
820
 
813
821
 
814
822
  class NonLLMFilterRule(ImplementationRule):
@@ -860,6 +868,23 @@ class LLMFilterRule(ImplementationRule):
860
868
  return cls._perform_substitution(logical_expression, LLMFilter, runtime_kwargs, variable_op_kwargs)
861
869
 
862
870
 
871
+ class RelationalJoinRule(ImplementationRule):
872
+ """
873
+ Substitute a logical expression for a JoinOp with a RelationalJoin physical implementation.
874
+ """
875
+
876
+ @classmethod
877
+ def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
878
+ is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition == ""
879
+ logger.debug(f"RelationalJoinRule matches_pattern: {is_match} for {logical_expression}")
880
+ return is_match
881
+
882
+ @classmethod
883
+ def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
884
+ logger.debug(f"Substituting RelationalJoinRule for {logical_expression}")
885
+ return cls._perform_substitution(logical_expression, RelationalJoin, runtime_kwargs)
886
+
887
+
863
888
  class NestedLoopsJoinRule(ImplementationRule):
864
889
  """
865
890
  Substitute a logical expression for a JoinOp with an (LLM) NestedLoopsJoin physical implementation.
@@ -867,7 +892,7 @@ class NestedLoopsJoinRule(ImplementationRule):
867
892
 
868
893
  @classmethod
869
894
  def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
870
- is_match = isinstance(logical_expression.operator, JoinOp)
895
+ is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != ""
871
896
  logger.debug(f"NestedLoopsJoinRule matches_pattern: {is_match} for {logical_expression}")
872
897
  return is_match
873
898
 
@@ -899,7 +924,7 @@ class EmbeddingJoinRule(ImplementationRule):
899
924
 
900
925
  @classmethod
901
926
  def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
902
- is_match = isinstance(logical_expression.operator, JoinOp) and not cls._is_audio_operation(logical_expression)
927
+ is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != "" and not cls._is_audio_operation(logical_expression)
903
928
  logger.debug(f"EmbeddingJoinRule matches_pattern: {is_match} for {logical_expression}")
904
929
  return is_match
905
930
 
@@ -924,6 +949,35 @@ class EmbeddingJoinRule(ImplementationRule):
924
949
 
925
950
  return cls._perform_substitution(logical_expression, EmbeddingJoin, runtime_kwargs, variable_op_kwargs)
926
951
 
952
+ class SemanticAggregateRule(ImplementationRule):
953
+ """
954
+ Substitute a logical expression for a SemanticAggregate with an llm physical implementation.
955
+ """
956
+
957
+ @classmethod
958
+ def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
959
+ is_match = isinstance(logical_expression.operator, Aggregate) and logical_expression.operator.agg_str is not None
960
+ logger.debug(f"SemanticAggregateRule matches_pattern: {is_match} for {logical_expression}")
961
+ return is_match
962
+
963
+ @classmethod
964
+ def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
965
+ logger.debug(f"Substituting SemanticAggregateRule for {logical_expression}")
966
+
967
+ # create variable physical operator kwargs for each model which can implement this logical_expression
968
+ models = [model for model in runtime_kwargs["available_models"] if cls._model_matches_input(model, logical_expression) and not model.is_llama_model()]
969
+ no_reasoning = runtime_kwargs["reasoning_effort"] in [None, "minimal", "low"]
970
+ variable_op_kwargs = [
971
+ {
972
+ "model": model,
973
+ "prompt_strategy": PromptStrategy.AGG_NO_REASONING if model.is_reasoning_model() and no_reasoning else PromptStrategy.AGG,
974
+ "reasoning_effort": runtime_kwargs["reasoning_effort"]
975
+ }
976
+ for model in models
977
+ ]
978
+
979
+ return cls._perform_substitution(logical_expression, SemanticAggregate, runtime_kwargs, variable_op_kwargs)
980
+
927
981
 
928
982
  class AggregateRule(ImplementationRule):
929
983
  """
@@ -932,7 +986,7 @@ class AggregateRule(ImplementationRule):
932
986
 
933
987
  @classmethod
934
988
  def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
935
- is_match = isinstance(logical_expression.operator, Aggregate)
989
+ is_match = isinstance(logical_expression.operator, Aggregate) and logical_expression.operator.agg_func is not None
936
990
  logger.debug(f"AggregateRule matches_pattern: {is_match} for {logical_expression}")
937
991
  return is_match
938
992
 
@@ -946,6 +1000,12 @@ class AggregateRule(ImplementationRule):
946
1000
  physical_op_class = CountAggregateOp
947
1001
  elif logical_expression.operator.agg_func == AggFunc.AVERAGE:
948
1002
  physical_op_class = AverageAggregateOp
1003
+ elif logical_expression.operator.agg_func == AggFunc.SUM:
1004
+ physical_op_class = SumAggregateOp
1005
+ elif logical_expression.operator.agg_func == AggFunc.MIN:
1006
+ physical_op_class = MinAggregateOp
1007
+ elif logical_expression.operator.agg_func == AggFunc.MAX:
1008
+ physical_op_class = MaxAggregateOp
949
1009
  else:
950
1010
  raise Exception(f"Cannot support aggregate function: {logical_expression.operator.agg_func}")
951
1011
 
@@ -501,8 +501,8 @@ class OptimizePhysicalExpression(Task):
501
501
 
502
502
  # compute the total cost for this physical expression by summing its operator's PlanCost
503
503
  # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
504
- execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
505
- full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy)
504
+ execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
505
+ full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy_str)
506
506
  full_plan_cost.op_estimates = op_plan_cost.op_estimates
507
507
  all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))
508
508
 
@@ -570,8 +570,8 @@ class OptimizePhysicalExpression(Task):
570
570
 
571
571
  # compute the total cost for this physical expression by summing its operator's PlanCost
572
572
  # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
573
- execution_strategy = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
574
- full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy)
573
+ execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
574
+ full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy_str)
575
575
  full_plan_cost.op_estimates = op_plan_cost.op_estimates
576
576
 
577
577
  else:
@@ -24,7 +24,7 @@ from palimpzest.query.operators.filter import LLMFilter
24
24
  from palimpzest.query.operators.join import JoinOp
25
25
  from palimpzest.query.operators.limit import LimitScanOp
26
26
  from palimpzest.query.operators.physical import PhysicalOperator
27
- from palimpzest.query.operators.retrieve import RetrieveOp
27
+ from palimpzest.query.operators.topk import TopKOp
28
28
  from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
29
29
 
30
30
 
@@ -225,20 +225,22 @@ class PZProgressManager(ProgressManager):
225
225
  current_unique_full_op_id = unique_full_op_id
226
226
  next_op, next_unique_full_op_id = self.unique_full_op_id_to_next_op_and_id[unique_full_op_id]
227
227
  while next_op is not None:
228
- if not isinstance(next_op, (AggregateOp, LimitScanOp)):
229
- next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
230
- multiplier = 1
231
- if isinstance(next_op, JoinOp):
232
- # for joins, scale the delta by the number of inputs from the other side of the join
233
- left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
234
- if current_unique_full_op_id == left_input_unique_full_op_id:
235
- multiplier = self.get_task_total(right_input_unique_input_op_id)
236
- elif current_unique_full_op_id == right_input_unique_input_op_id:
237
- multiplier = self.get_task_total(left_input_unique_full_op_id)
238
- else:
239
- raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
240
- delta_adjusted = delta * multiplier
241
- self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
228
+ if isinstance(next_op, (AggregateOp, LimitScanOp)):
229
+ break
230
+
231
+ next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
232
+ multiplier = 1
233
+ if isinstance(next_op, JoinOp):
234
+ # for joins, scale the delta by the number of inputs from the other side of the join
235
+ left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
236
+ if current_unique_full_op_id == left_input_unique_full_op_id:
237
+ multiplier = self.get_task_total(right_input_unique_input_op_id)
238
+ elif current_unique_full_op_id == right_input_unique_input_op_id:
239
+ multiplier = self.get_task_total(left_input_unique_full_op_id)
240
+ else:
241
+ raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
242
+ delta_adjusted = delta * multiplier
243
+ self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
242
244
 
243
245
  # move to the next operator in the plan
244
246
  current_unique_full_op_id = next_unique_full_op_id
@@ -348,9 +350,9 @@ class PZSentinelProgressManager(ProgressManager):
348
350
  def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
349
351
  is_llm_convert = isinstance(physical_op, LLMConvert)
350
352
  is_llm_filter = isinstance(physical_op, LLMFilter)
351
- is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
353
+ is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
352
354
  is_llm_join = isinstance(physical_op, JoinOp)
353
- return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join
355
+ return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join
354
356
 
355
357
  def get_task_description(self, unique_logical_op_id: str) -> str:
356
358
  """Return the current description for the given task."""
@@ -19,7 +19,7 @@ from palimpzest.query.generators.generators import get_json_from_answer
19
19
  from palimpzest.query.operators.convert import LLMConvert
20
20
  from palimpzest.query.operators.filter import LLMFilter
21
21
  from palimpzest.query.operators.join import JoinOp
22
- from palimpzest.query.operators.retrieve import RetrieveOp
22
+ from palimpzest.query.operators.topk import TopKOp
23
23
 
24
24
 
25
25
  class Validator:
@@ -47,7 +47,7 @@ class Validator:
47
47
  def join_score_fn(self, condition: str, left_input_record: dict, right_input_record: dict, output: bool) -> float | None:
48
48
  raise NotImplementedError("Validator.join_score_fn not implemented.")
49
49
 
50
- def retrieve_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
50
+ def topk_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
51
51
  raise NotImplementedError("Validator.map_score_fn not implemented.")
52
52
 
53
53
  def _get_gen_stats_from_completion(self, completion, start_time: float) -> GenerationStats:
@@ -218,11 +218,11 @@ class Validator:
218
218
 
219
219
  return score, gen_stats
220
220
 
221
- def _default_retrieve_score_fn(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
221
+ def _default_topk_score_fn(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
222
222
  """
223
223
  Compute the quality of the generated output for the given fields and input_record.
224
224
  """
225
- # TODO: retrieve k=25; score each item based on relevance; compute F1
225
+ # TODO: top-k k=25; score each item based on relevance; compute F1
226
226
  # TODO: support retrieval over images
227
227
  # create prompt factory
228
228
  factory = PromptFactory(PromptStrategy.MAP, self.model, Cardinality.ONE_TO_ONE)
@@ -294,11 +294,11 @@ class Validator:
294
294
  score, gen_stats = self._default_join_score_fn(op, condition, left_input_record, right_input_record, output)
295
295
  return score, gen_stats, full_hash
296
296
 
297
- def _score_retrieve(self, op: RetrieveOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
297
+ def _score_topk(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
298
298
  try:
299
- out = self.retrieve_score_fn(fields, input_record.to_dict(), output)
299
+ out = self.topk_score_fn(fields, input_record.to_dict(), output)
300
300
  score, gen_stats = out if isinstance(out, tuple) else (out, GenerationStats())
301
301
  return score, gen_stats, full_hash
302
302
  except NotImplementedError:
303
- score, gen_stats = self._default_retrieve_score_fn(op, fields, input_record, output)
303
+ score, gen_stats = self._default_topk_score_fn(op, fields, input_record, output)
304
304
  return score, gen_stats, full_hash
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: palimpzest
3
- Version: 0.8.7
3
+ Version: 1.0.0
4
4
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
5
5
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
6
6
  Project-URL: homepage, https://palimpzest.org
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
13
13
  Classifier: Programming Language :: Python :: 3
14
14
  Classifier: Programming Language :: Python :: 3.8
15
- Requires-Python: >=3.10
15
+ Requires-Python: >=3.12
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: anthropic>=0.55.0
@@ -59,15 +59,20 @@ Dynamic: license-file
59
59
  <!-- [![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b?logo=arxiv)](https://arxiv.org/pdf/2405.14696) -->
60
60
  <!-- [![Video](https://img.shields.io/badge/YouTube-Talk-red?logo=youtube)](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
61
61
 
62
- ## Learn How to Use PZ
63
- Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
62
+ ## 📚 Learn How to Use PZ
63
+ Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.
64
64
 
65
- ## Getting started
65
+ ## 🚀 Getting started
66
66
  You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
67
67
  ```bash
68
68
  $ pip install palimpzest
69
69
  ```
70
70
 
71
+ You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
72
+ ```bash
73
+ $ uv pip install palimpzest
74
+ ```
75
+
71
76
  Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
72
77
  ```bash
73
78
  $ git clone git@github.com:mitdbg/palimpzest.git
@@ -75,7 +80,7 @@ $ cd palimpzest
75
80
  $ pip install .
76
81
  ```
77
82
 
78
- ## Join the PZ Community
83
+ ## 🙋🏽 Join the PZ Community
79
84
  We are actively hacking on PZ and would love to have you join our community [![Discord](https://img.shields.io/discord/1245561987480420445?logo=discord)](https://discord.gg/dN85JJ6jaH)
80
85
 
81
86
  [Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
@@ -86,66 +91,8 @@ We are actively hacking on PZ and would love to have you join our community [![D
86
91
 
87
92
  We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
88
93
 
89
- ## Quick Start
90
- The easiest way to get started with Palimpzest is to run the `quickstart.ipynb` jupyter notebook. We demonstrate the full workflow of working with PZ, including registering a dataset, composing and executing a pipeline, and accessing the results.
91
- To run the notebook, you can use the following command:
92
- ```bash
93
- $ jupyter notebook
94
- ```
95
- And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
96
-
97
- ### Even Quicker Start
98
- For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
99
- ```python
100
- import palimpzest as pz
101
-
102
- # define the fields we wish to compute
103
- email_cols = [
104
- {"name": "sender", "type": str, "desc": "The email address of the sender"},
105
- {"name": "subject", "type": str, "desc": "The subject of the email"},
106
- {"name": "date", "type": str, "desc": "The date the email was sent"},
107
- ]
108
-
109
- # lazily construct the computation to get emails about holidays sent in July
110
- dataset = pz.Dataset("testdata/enron-tiny/")
111
- dataset = dataset.sem_add_columns(email_cols)
112
- dataset = dataset.sem_filter("The email was sent in July")
113
- dataset = dataset.sem_filter("The email is about holidays")
114
-
115
- # execute the computation w/the MinCost policy
116
- config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
117
- output = dataset.run(config)
118
-
119
- # display output (if using Jupyter, otherwise use print(output_df))
120
- output_df = output.to_df(cols=["date", "sender", "subject"])
121
- display(output_df)
122
- ```
123
-
124
- ## Python Demos
125
- Below are simple instructions to run PZ on a test data set of enron emails that is included with the system.
126
-
127
- ### Downloading test data
128
- To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
129
- ```
130
- chmod +x testdata/download-testdata.sh
131
- ./testdata/download-testdata.sh
132
- ```
133
-
134
- ### Running the Demos
135
- Set your OpenAI (or Together.ai) api key at the command line:
136
- ```bash
137
- # set one (or both) of the following:
138
- export OPENAI_API_KEY=<your-api-key>
139
- export TOGETHER_API_KEY=<your-api-key>
140
- ```
141
-
142
- Now you can run the simple test program with:
143
- ```bash
144
- $ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
145
- ```
146
-
147
- ### Citation
148
- If you would like to cite our work, please use the following citation:
94
+ ### 📓 Citation
95
+ If you would like to cite our original paper on Palimpzest, please use the following citation:
149
96
  ```
150
97
  @inproceedings{palimpzestCIDR,
151
98
  title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
@@ -154,3 +101,16 @@ If you would like to cite our work, please use the following citation:
154
101
  date = 2025,
155
102
  }
156
103
  ```
104
+
105
+ If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
106
+ ```
107
+ @misc{russo2025abacuscostbasedoptimizersemantic,
108
+ title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
109
+ author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
110
+ year={2025},
111
+ eprint={2505.14661},
112
+ archivePrefix={arXiv},
113
+ primaryClass={cs.DB},
114
+ url={https://arxiv.org/abs/2505.14661},
115
+ }
116
+ ```