palimpzest 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/constants.py +1 -0
- palimpzest/core/data/dataset.py +33 -5
- palimpzest/core/elements/groupbysig.py +10 -1
- palimpzest/core/elements/records.py +16 -7
- palimpzest/core/lib/schemas.py +20 -3
- palimpzest/core/models.py +10 -4
- palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
- palimpzest/query/execution/execution_strategy.py +13 -11
- palimpzest/query/execution/mab_execution_strategy.py +40 -14
- palimpzest/query/execution/parallel_execution_strategy.py +31 -7
- palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
- palimpzest/query/generators/generators.py +1 -1
- palimpzest/query/operators/__init__.py +7 -6
- palimpzest/query/operators/aggregate.py +110 -5
- palimpzest/query/operators/convert.py +1 -1
- palimpzest/query/operators/join.py +279 -23
- palimpzest/query/operators/logical.py +20 -8
- palimpzest/query/operators/mixture_of_agents.py +3 -1
- palimpzest/query/operators/physical.py +5 -2
- palimpzest/query/operators/rag.py +5 -4
- palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
- palimpzest/query/optimizer/__init__.py +7 -3
- palimpzest/query/optimizer/cost_model.py +5 -5
- palimpzest/query/optimizer/optimizer.py +3 -2
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/rules.py +31 -11
- palimpzest/query/optimizer/tasks.py +4 -4
- palimpzest/query/processor/config.py +1 -0
- palimpzest/utils/progress.py +51 -23
- palimpzest/validator/validator.py +7 -7
- {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/METADATA +26 -66
- {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/RECORD +35 -35
- {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/top_level.txt +0 -0
palimpzest/query/operators/{retrieve.py → topk.py}
CHANGED

```diff
@@ -17,7 +17,7 @@ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, Recor
 from palimpzest.query.operators.physical import PhysicalOperator


-class RetrieveOp(PhysicalOperator):
+class TopKOp(PhysicalOperator):
     def __init__(
         self,
         index: Collection,
@@ -29,7 +29,7 @@ class RetrieveOp(PhysicalOperator):
         **kwargs,
     ) -> None:
         """
-        Initialize the
+        Initialize the TopKOp object.

         Args:
             index (Collection): The PZ index to use for retrieval.
@@ -59,7 +59,7 @@ class RetrieveOp(PhysicalOperator):

     def __str__(self):
         op = super().__str__()
-        op += f"
+        op += f" Top-K: {self.index.__class__.__name__} with k={self.k}\n"
         return op

     def get_id_params(self):
@@ -89,8 +89,8 @@ class RetrieveOp(PhysicalOperator):

     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
         """
-        Compute naive cost estimates for the
-        that the
+        Compute naive cost estimates for the Top-K operation. These estimates assume
+        that the Top-K (1) has negligible cost and (2) has perfect quality.
         """
         return OperatorCostEstimates(
             cardinality=source_op_cost_estimates.cardinality,
@@ -101,7 +101,7 @@ class RetrieveOp(PhysicalOperator):

     def default_search_func(self, index: Collection, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
         """
-        Default search function for the
+        Default search function for the Top-K operation. This function uses the index to
         retrieve the top-k results for the given query. The query will be a (possibly singleton)
         list of strings or a list of lists of floats (i.e., embeddings). The function will return
         the top-k results per-query in (descending) sorted order. If the input is a singleton list,
@@ -111,7 +111,7 @@ class RetrieveOp(PhysicalOperator):
         Args:
             index (PZIndex): The index to use for retrieval.
             query (list[str] | list[list[float]]): The query (or queries) to search for.
-            k (int): The maximum number of results the
+            k (int): The maximum number of results the top-k operator will return.

         Returns:
             list[str] | list[list[str]]: The top results in (descending) sorted order per query.
@@ -260,10 +260,10 @@ class RetrieveOp(PhysicalOperator):
             top_results = self.search_func(self.index, inputs, self.k)

         except Exception:
-            top_results = ["error-in-
-            os.makedirs("
+            top_results = ["error-in-topk"]
+            os.makedirs("topk-errors", exist_ok=True)
             ts = time.time()
-            with open(f"
+            with open(f"topk-errors/error-{ts}.txt", "w") as f:
                 f.write(str(query))

         # TODO: the user is always right! let's drop this post-processing in the future
```
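The hunks above rename `RetrieveOp` to `TopKOp` and document the contract for its search function: given the index, a (possibly singleton) list of queries, and `k`, return the top-k results per query in descending order. A minimal sketch of a search function satisfying that contract, assuming the index is a chromadb `Collection` (its import is not shown in this diff) and that a custom `search_func` can be supplied to the operator, as the call `self.search_func(self.index, inputs, self.k)` suggests:

```python
from chromadb.api.models.Collection import Collection

def my_search_func(index: Collection, query: list[str], k: int) -> list[list[str]]:
    # chromadb returns one result list per query, already sorted by descending similarity
    results = index.query(query_texts=query, n_results=k)
    # per the (truncated) docstring above, a singleton query may expect a flattened result
    return results["documents"]
```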
palimpzest/query/optimizer/__init__.py
CHANGED

```diff
@@ -39,10 +39,10 @@ from palimpzest.query.optimizer.rules import (
     RAGRule as _RAGRule,
 )
 from palimpzest.query.optimizer.rules import (
-
+    RelationalJoinRule as _RelationalJoinRule,
 )
 from palimpzest.query.optimizer.rules import (
-
+    ReorderConverts as _ReorderConverts,
 )
 from palimpzest.query.optimizer.rules import (
     Rule as _Rule,
@@ -53,6 +53,9 @@ from palimpzest.query.optimizer.rules import (
 from palimpzest.query.optimizer.rules import (
     SplitRule as _SplitRule,
 )
+from palimpzest.query.optimizer.rules import (
+    TopKRule as _TopKRule,
+)
 from palimpzest.query.optimizer.rules import (
     TransformationRule as _TransformationRule,
 )
@@ -72,8 +75,9 @@ ALL_RULES = [
     _NonLLMFilterRule,
     _PushDownFilter,
     _RAGRule,
+    _RelationalJoinRule,
     _ReorderConverts,
-
+    _TopKRule,
     _Rule,
     _SemanticAggregateRule,
     _SplitRule,
```
palimpzest/query/optimizer/cost_model.py
CHANGED

```diff
@@ -131,17 +131,17 @@ class SampleBasedCostModel:
             # compute selectivity
             selectivity = physical_op_df.passed_operator.sum() / num_source_records

+            # compute quality; if all qualities are None then this will be NaN
+            quality = physical_op_df.quality.mean()
+
+            # set operator stats for this physical operator
             operator_to_stats[unique_logical_op_id][full_op_id] = {
                 "cost": physical_op_df.cost_per_record.mean(),
                 "time": physical_op_df.time_per_record.mean(),
-                "quality":
+                "quality": 1.0 if pd.isna(quality) else quality,
                 "selectivity": selectivity,
             }

-        # if this is an experiment, log the dataframe and operator_to_stats dictionary
-        if self.exp_name is not None:
-            operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
-
         logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
         return operator_to_stats

```
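A minimal illustration (plain pandas, not library code) of the quality fallback introduced above: when no sampled record has a quality score, the column mean is `NaN` and the cost model assumes perfect quality.

```python
import pandas as pd

# all sampled qualities are None, so the column is all-NaN
physical_op_df = pd.DataFrame({"quality": [None, None, None]}, dtype="float64")

quality = physical_op_df.quality.mean()         # NaN
quality = 1.0 if pd.isna(quality) else quality  # fall back to perfect quality
print(quality)                                  # 1.0
```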
palimpzest/query/optimizer/optimizer.py
CHANGED

```diff
@@ -284,10 +284,11 @@ class Optimizer:
                 all_properties["filters"] = set([op_filter_str])

             elif isinstance(op, JoinOp):
+                unique_join_str = str(sorted(op.on)) if op.condition is None else op.condition
                 if "joins" in all_properties:
-                    all_properties["joins"].add(
+                    all_properties["joins"].add(unique_join_str)
                 else:
-                    all_properties["joins"] = set([
+                    all_properties["joins"] = set([unique_join_str])

             elif isinstance(op, LimitScan):
                 op_limit_str = op.get_logical_op_id()
```
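A small illustration (plain Python, not library code) of the `unique_join_str` added above: key-based joins are identified by their sorted key list, so the same join written with keys in a different order maps to the same plan property, while condition-based (LLM) joins keep using their natural-language condition.

```python
def unique_join_str(on: list[str] | None, condition: str | None) -> str:
    # mirrors the expression in the hunk above
    return str(sorted(on)) if condition is None else condition

assert unique_join_str(["id", "date"], None) == unique_join_str(["date", "id"], None)
assert unique_join_str(None, "the two records describe the same product") == "the two records describe the same product"
```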
palimpzest/query/optimizer/plan.py
CHANGED

```diff
@@ -203,9 +203,8 @@ class PhysicalPlan(Plan):
         # return the current index and the upstream unique full_op_ids for this operator
         return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]

-    def get_upstream_unique_full_op_ids(self,
-        """Return the list of unique full_op_ids for the upstream operators of
-        unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
+    def get_upstream_unique_full_op_ids(self, unique_full_op_id: str) -> list[str]:
+        """Return the list of unique full_op_ids for the upstream operators of the operator specified by `unique_full_op_id`."""
         return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]

     def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
```
palimpzest/query/optimizer/rules.py
CHANGED

```diff
@@ -19,13 +19,14 @@ from palimpzest.query.operators.aggregate import (
     MaxAggregateOp,
     MinAggregateOp,
     SemanticAggregate,
+    SumAggregateOp,
 )
 from palimpzest.query.operators.compute import SmolAgentsCompute
 from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
 from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert, CritiqueAndRefineFilter
 from palimpzest.query.operators.distinct import DistinctOp
 from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
-from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin
+from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin, RelationalJoin
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.logical import (
     Aggregate,
@@ -39,19 +40,19 @@ from palimpzest.query.operators.logical import (
     JoinOp,
     LimitScan,
     Project,
-    RetrieveScan,
     SearchOperator,
+    TopKScan,
 )
 from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert, MixtureOfAgentsFilter
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.project import ProjectOp
 from palimpzest.query.operators.rag import RAGConvert, RAGFilter
-from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
 from palimpzest.query.operators.search import (
     SmolAgentsSearch, # SmolAgentsCustomManagedSearch, # SmolAgentsManagedSearch
 )
 from palimpzest.query.operators.split import SplitConvert, SplitFilter
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression

 logger = logging.getLogger(__name__)
@@ -796,26 +797,26 @@ class SplitRule(ImplementationRule):
         return cls._perform_substitution(logical_expression, phys_op_cls, runtime_kwargs, variable_op_kwargs)


-class
+class TopKRule(ImplementationRule):
     """
-    Substitute a logical expression for a
+    Substitute a logical expression for a TopKScan with a TopK physical implementation.
     """
     k_budgets = [1, 3, 5, 10, 15, 20, 25]

     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator,
-        logger.debug(f"
+        is_match = isinstance(logical_expression.operator, TopKScan)
+        logger.debug(f"TopKRule matches_pattern: {is_match} for {logical_expression}")
         return is_match

     @classmethod
     def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
-        logger.debug(f"Substituting
+        logger.debug(f"Substituting TopKRule for {logical_expression}")

         # create variable physical operator kwargs for each model which can implement this logical_expression
         ks = cls.k_budgets if logical_expression.operator.k == -1 else [logical_expression.operator.k]
         variable_op_kwargs = [{"k": k} for k in ks]
-        return cls._perform_substitution(logical_expression,
+        return cls._perform_substitution(logical_expression, TopKOp, runtime_kwargs, variable_op_kwargs)


 class NonLLMFilterRule(ImplementationRule):
```
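A short sketch (plain Python, not library code) of the `k` enumeration in `TopKRule.substitute` above: when the logical `TopKScan` leaves `k` unspecified (`k == -1`), the rule proposes one physical `TopKOp` candidate per budget; otherwise it emits a single candidate with the user's `k`.

```python
k_budgets = [1, 3, 5, 10, 15, 20, 25]

def candidate_kwargs(logical_k: int) -> list[dict]:
    # same selection logic as the rule above, extracted for illustration
    ks = k_budgets if logical_k == -1 else [logical_k]
    return [{"k": k} for k in ks]

print(len(candidate_kwargs(-1)))  # 7 candidate operator configurations
print(candidate_kwargs(5))        # [{'k': 5}]
```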
```diff
@@ -867,6 +868,23 @@ class LLMFilterRule(ImplementationRule):
         return cls._perform_substitution(logical_expression, LLMFilter, runtime_kwargs, variable_op_kwargs)


+class RelationalJoinRule(ImplementationRule):
+    """
+    Substitute a logical expression for a JoinOp with a RelationalJoin physical implementation.
+    """
+
+    @classmethod
+    def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition == ""
+        logger.debug(f"RelationalJoinRule matches_pattern: {is_match} for {logical_expression}")
+        return is_match
+
+    @classmethod
+    def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
+        logger.debug(f"Substituting RelationalJoinRule for {logical_expression}")
+        return cls._perform_substitution(logical_expression, RelationalJoin, runtime_kwargs)
+
+
 class NestedLoopsJoinRule(ImplementationRule):
     """
     Substitute a logical expression for a JoinOp with an (LLM) NestedLoopsJoin physical implementation.
@@ -874,7 +892,7 @@ class NestedLoopsJoinRule(ImplementationRule):

     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator, JoinOp)
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != ""
         logger.debug(f"NestedLoopsJoinRule matches_pattern: {is_match} for {logical_expression}")
         return is_match

@@ -906,7 +924,7 @@ class EmbeddingJoinRule(ImplementationRule):

     @classmethod
     def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        is_match = isinstance(logical_expression.operator, JoinOp) and not cls._is_audio_operation(logical_expression)
+        is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != "" and not cls._is_audio_operation(logical_expression)
         logger.debug(f"EmbeddingJoinRule matches_pattern: {is_match} for {logical_expression}")
         return is_match

```
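Taken together, the three join rules above partition the search space by the join's condition string: an empty condition means a purely relational (key-based) join, while a non-empty natural-language condition routes to the LLM-based joins. A plain-Python summary (not library code; the audio check in `EmbeddingJoinRule` is reduced to a flag here):

```python
def candidate_join_rules(condition: str, is_audio: bool = False) -> list[str]:
    # empty condition -> relational join only
    if condition == "":
        return ["RelationalJoinRule"]
    # non-empty condition -> LLM join implementations
    rules = ["NestedLoopsJoinRule"]
    if not is_audio:
        rules.append("EmbeddingJoinRule")
    return rules
```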
```diff
@@ -982,6 +1000,8 @@ class AggregateRule(ImplementationRule):
             physical_op_class = CountAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.AVERAGE:
             physical_op_class = AverageAggregateOp
+        elif logical_expression.operator.agg_func == AggFunc.SUM:
+            physical_op_class = SumAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.MIN:
             physical_op_class = MinAggregateOp
         elif logical_expression.operator.agg_func == AggFunc.MAX:
```
palimpzest/query/optimizer/tasks.py
CHANGED

```diff
@@ -501,8 +501,8 @@ class OptimizePhysicalExpression(Task):

             # compute the total cost for this physical expression by summing its operator's PlanCost
             # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
-
-            full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost,
+            execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
+            full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy_str)
             full_plan_cost.op_estimates = op_plan_cost.op_estimates
             all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))

@@ -570,8 +570,8 @@ class OptimizePhysicalExpression(Task):

             # compute the total cost for this physical expression by summing its operator's PlanCost
             # with the input groups' total PlanCost; also set the op_estimates for this expression's operator
-
-            full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost,
+            execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
+            full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy_str)
             full_plan_cost.op_estimates = op_plan_cost.op_estimates

         else:
```
palimpzest/query/processor/config.py
CHANGED

```diff
@@ -44,6 +44,7 @@ class QueryProcessorConfig(BaseModel):
     k: int = Field(default=6)
     j: int = Field(default=4)
     sample_budget: int = Field(default=100)
+    sample_cost_budget: float | None = Field(default=None)
     seed: int = Field(default=42)
     exp_name: str | None = Field(default=None)
     priors: dict | None = Field(default=None)
```
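A hedged usage sketch of the new `sample_cost_budget` field: it lets the optimizer's sampling phase be capped by cost rather than by a fixed number of samples (see the progress-manager changes below, which advance by cost delta when a cost budget is set). The surrounding calls (`pz.Dataset`, `pz.MinCost`, `dataset.run`) follow the quick-start snippet that appears later in this diff; the dataset path and budget value are placeholders.

```python
import palimpzest as pz

dataset = pz.Dataset("testdata/enron-tiny/")                 # placeholder dataset path
dataset = dataset.sem_filter("The email is about holidays")

config = pz.QueryProcessorConfig(
    policy=pz.MinCost(),
    sample_cost_budget=2.50,  # new in 1.1.0: cap the sampling spend; default None keeps sample_budget
    verbose=True,
)
output = dataset.run(config)
```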
palimpzest/utils/progress.py
CHANGED
```diff
@@ -24,7 +24,7 @@ from palimpzest.query.operators.filter import LLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.retrieve import RetrieveOp
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan


@@ -225,20 +225,22 @@ class PZProgressManager(ProgressManager):
         current_unique_full_op_id = unique_full_op_id
         next_op, next_unique_full_op_id = self.unique_full_op_id_to_next_op_and_id[unique_full_op_id]
         while next_op is not None:
-            if
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if isinstance(next_op, (AggregateOp, LimitScanOp)):
+                break
+
+            next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
+            multiplier = 1
+            if isinstance(next_op, JoinOp):
+                # for joins, scale the delta by the number of inputs from the other side of the join
+                left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
+                if current_unique_full_op_id == left_input_unique_full_op_id:
+                    multiplier = self.get_task_total(right_input_unique_input_op_id)
+                elif current_unique_full_op_id == right_input_unique_input_op_id:
+                    multiplier = self.get_task_total(left_input_unique_full_op_id)
+                else:
+                    raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
+            delta_adjusted = delta * multiplier
+            self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)

             # move to the next operator in the plan
             current_unique_full_op_id = next_unique_full_op_id
```
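A worked example (plain arithmetic, not library code) of the join scaling above: when one new record arrives on the left input of a join, the join's expected total grows by the number of records expected from the right input, and vice versa.

```python
delta = 1           # one new record produced by the left input
right_total = 50    # records currently expected from the right input
multiplier = right_total
delta_adjusted = delta * multiplier
print(delta_adjusted)  # the join task's total grows by 50 candidate pairs
```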
```diff
@@ -281,7 +283,7 @@ class PZProgressManager(ProgressManager):
         self.unique_full_op_id_to_stats[unique_full_op_id].memory_usage_mb = get_memory_usage()

 class PZSentinelProgressManager(ProgressManager):
-    def __init__(self, plan: SentinelPlan, sample_budget: int):
+    def __init__(self, plan: SentinelPlan, sample_budget: int | None, sample_cost_budget: float | None):
         # overall progress bar
         self.overall_progress = RichProgress(
             SpinnerColumn(),
@@ -296,7 +298,9 @@ class PZSentinelProgressManager(ProgressManager):
             refresh_per_second=10,
             expand=True,  # Use full width
         )
-        self.
+        self.use_cost_budget = sample_cost_budget is not None
+        total = sample_cost_budget if self.use_cost_budget else sample_budget
+        self.overall_task_id = self.overall_progress.add_task("", total=total, cost=0.0, recent="")

         # logical operator progress bars
         self.op_progress = RichProgress(
@@ -332,6 +336,9 @@ class PZSentinelProgressManager(ProgressManager):
         # initialize start time
         self.start_time = None

+        # initialize validation cost
+        self.validation_cost = 0.0
+
         # add a task to the progress manager for each operator in the plan
         for topo_idx, (logical_op_id, op_set) in enumerate(plan):
             unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
@@ -348,9 +355,9 @@ class PZSentinelProgressManager(ProgressManager):
     def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
         is_llm_convert = isinstance(physical_op, LLMConvert)
         is_llm_filter = isinstance(physical_op, LLMFilter)
-
+        is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
         is_llm_join = isinstance(physical_op, JoinOp)
-        return is_llm_convert or is_llm_filter or
+        return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join

     def get_task_description(self, unique_logical_op_id: str) -> str:
         """Return the current description for the given task."""
@@ -385,15 +392,34 @@ class PZSentinelProgressManager(ProgressManager):
         # start progress bars
         self.live_display.start()

+    def incr_overall_progress_cost(self, cost_delta: float):
+        """Advance the overall progress bar by the given cost delta"""
+        self.validation_cost += cost_delta
+        self.overall_progress.update(
+            self.overall_task_id,
+            advance=cost_delta,
+            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
+            refresh=True,
+        )
+
+        # force the live display to refresh
+        self.live_display.refresh()
+
     def incr(self, unique_logical_op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
         # TODO: (above) organize progress bars into a Live / Table / Panel or something
         # get the task for the given operation
         task = self.unique_logical_op_id_to_task.get(unique_logical_op_id)

+        # store the cost before updating stats
+        previous_total_cost = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost
+
         # update statistics with any additional keyword arguments
         if kwargs != {}:
             self.update_stats(unique_logical_op_id, **kwargs)

+        # compute the cost delta
+        cost_delta = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost - previous_total_cost
+
         # update progress bar and recent text in one update
         if display_text is not None:
             self.unique_logical_op_id_to_stats[unique_logical_op_id].recent_text = display_text
@@ -412,10 +438,11 @@ class PZSentinelProgressManager(ProgressManager):
         )

         # advance the overall progress bar
+        advance = cost_delta if self.use_cost_budget else num_samples
         self.overall_progress.update(
             self.overall_task_id,
-            advance=
-            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()),
+            advance=advance,
+            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
             refresh=True,
         )

@@ -449,6 +476,7 @@ def create_progress_manager(
     plan: PhysicalPlan | SentinelPlan,
     num_samples: int | None = None,
     sample_budget: int | None = None,
+    sample_cost_budget: float | None = None,
     progress: bool = True,
 ) -> ProgressManager:
     """Factory function to create appropriate progress manager based on environment"""
@@ -456,7 +484,7 @@ def create_progress_manager(
         return MockProgressManager(plan, num_samples)

     if isinstance(plan, SentinelPlan):
-        assert sample_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
-        return PZSentinelProgressManager(plan, sample_budget)
+        assert sample_budget is not None or sample_cost_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
+        return PZSentinelProgressManager(plan, sample_budget, sample_cost_budget)

     return PZProgressManager(plan, num_samples)
```
palimpzest/validator/validator.py
CHANGED

```diff
@@ -19,7 +19,7 @@ from palimpzest.query.generators.generators import get_json_from_answer
 from palimpzest.query.operators.convert import LLMConvert
 from palimpzest.query.operators.filter import LLMFilter
 from palimpzest.query.operators.join import JoinOp
-from palimpzest.query.operators.retrieve import RetrieveOp
+from palimpzest.query.operators.topk import TopKOp


 class Validator:
@@ -47,7 +47,7 @@ class Validator:
     def join_score_fn(self, condition: str, left_input_record: dict, right_input_record: dict, output: bool) -> float | None:
         raise NotImplementedError("Validator.join_score_fn not implemented.")

-    def
+    def topk_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
         raise NotImplementedError("Validator.map_score_fn not implemented.")

     def _get_gen_stats_from_completion(self, completion, start_time: float) -> GenerationStats:
```
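The new `topk_score_fn` hook above can be overridden by users; when it raises `NotImplementedError`, the default LLM-based scorer is used instead (see `_score_topk` below). A hypothetical subclass matching the signature shown in the diff; the scoring heuristic is an illustration, not library code:

```python
from palimpzest.validator.validator import Validator

class MyValidator(Validator):
    def topk_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
        # toy heuristic: fraction of requested fields that are present and non-empty in the output
        if not fields:
            return None
        filled = sum(1 for field in fields if output.get(field))
        return filled / len(fields)
```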
```diff
@@ -218,11 +218,11 @@ class Validator:

         return score, gen_stats

-    def
+    def _default_topk_score_fn(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
         """
         Compute the quality of the generated output for the given fields and input_record.
         """
-        # TODO:
+        # TODO: top-k k=25; score each item based on relevance; compute F1
         # TODO: support retrieval over images
         # create prompt factory
         factory = PromptFactory(PromptStrategy.MAP, self.model, Cardinality.ONE_TO_ONE)
@@ -294,11 +294,11 @@ class Validator:
             score, gen_stats = self._default_join_score_fn(op, condition, left_input_record, right_input_record, output)
         return score, gen_stats, full_hash

-    def
+    def _score_topk(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
         try:
-            out = self.
+            out = self.topk_score_fn(fields, input_record.to_dict(), output)
             score, gen_stats = out if isinstance(out, tuple) else (out, GenerationStats())
             return score, gen_stats, full_hash
         except NotImplementedError:
-            score, gen_stats = self.
+            score, gen_stats = self._default_topk_score_fn(op, fields, input_record, output)
             return score, gen_stats, full_hash
```
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/METADATA
CHANGED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.9.0
+Version: 1.1.0
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.8
-Requires-Python: >=3.
+Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: anthropic>=0.55.0
@@ -59,15 +59,20 @@ Dynamic: license-file
 <!-- [](https://arxiv.org/pdf/2405.14696) -->
 <!-- [](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->

-## Learn How to Use PZ
-Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
+## 📚 Learn How to Use PZ
+Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.

-## Getting started
+## 🚀 Getting started
 You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
 ```bash
 $ pip install palimpzest
 ```

+You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
+```bash
+$ uv pip install palimpzest
+```
+
 Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
 ```bash
 $ git clone git@github.com:mitdbg/palimpzest.git
@@ -75,7 +80,7 @@ $ cd palimpzest
 $ pip install .
 ```

-## Join the PZ Community
+## 🙋🏽 Join the PZ Community
 We are actively hacking on PZ and would love to have you join our community [](https://discord.gg/dN85JJ6jaH)

 [Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
@@ -86,66 +91,8 @@ We are actively hacking on PZ and would love to have you join our community [![D

 We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.

-
-
-To run the notebook, you can use the following command:
-```bash
-$ jupyter notebook
-```
-And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
-
-### Even Quicker Start
-For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
-```python
-import palimpzest as pz
-
-# define the fields we wish to compute
-email_cols = [
-    {"name": "sender", "type": str, "desc": "The email address of the sender"},
-    {"name": "subject", "type": str, "desc": "The subject of the email"},
-    {"name": "date", "type": str, "desc": "The date the email was sent"},
-]
-
-# lazily construct the computation to get emails about holidays sent in July
-dataset = pz.Dataset("testdata/enron-tiny/")
-dataset = dataset.sem_add_columns(email_cols)
-dataset = dataset.sem_filter("The email was sent in July")
-dataset = dataset.sem_filter("The email is about holidays")
-
-# execute the computation w/the MinCost policy
-config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
-output = dataset.run(config)
-
-# display output (if using Jupyter, otherwise use print(output_df))
-output_df = output.to_df(cols=["date", "sender", "subject"])
-display(output_df)
-```
-
-## Python Demos
-Below are simple instructions to run PZ on a test data set of enron emails that is included with the system.
-
-### Downloading test data
-To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
-```
-chmod +x testdata/download-testdata.sh
-./testdata/download-testdata.sh
-```
-
-### Running the Demos
-Set your OpenAI (or Together.ai) api key at the command line:
-```bash
-# set one (or both) of the following:
-export OPENAI_API_KEY=<your-api-key>
-export TOGETHER_API_KEY=<your-api-key>
-```
-
-Now you can run the simple test program with:
-```bash
-$ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
-```
-
-### Citation
-If you would like to cite our work, please use the following citation:
+### 📓 Citation
+If you would like to cite our original paper on Palimpzest, please use the following citation:
 ```
 @inproceedings{palimpzestCIDR,
     title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
@@ -154,3 +101,16 @@ If you would like to cite our work, please use the following citation:
     date = 2025,
 }
 ```
+
+If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
+```
+@misc{russo2025abacuscostbasedoptimizersemantic,
+    title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
+    author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
+    year={2025},
+    eprint={2505.14661},
+    archivePrefix={arXiv},
+    primaryClass={cs.DB},
+    url={https://arxiv.org/abs/2505.14661},
+}
+```
````