palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/constants.py +13 -4
- palimpzest/core/data/dataset.py +75 -5
- palimpzest/core/elements/groupbysig.py +5 -1
- palimpzest/core/elements/records.py +16 -7
- palimpzest/core/lib/schemas.py +26 -3
- palimpzest/core/models.py +4 -4
- palimpzest/prompts/aggregate_prompts.py +99 -0
- palimpzest/prompts/prompt_factory.py +162 -75
- palimpzest/prompts/utils.py +38 -1
- palimpzest/prompts/validator.py +24 -24
- palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
- palimpzest/query/execution/execution_strategy.py +8 -8
- palimpzest/query/execution/mab_execution_strategy.py +30 -11
- palimpzest/query/execution/parallel_execution_strategy.py +31 -7
- palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
- palimpzest/query/generators/generators.py +9 -7
- palimpzest/query/operators/__init__.py +10 -6
- palimpzest/query/operators/aggregate.py +394 -10
- palimpzest/query/operators/convert.py +1 -1
- palimpzest/query/operators/join.py +279 -23
- palimpzest/query/operators/logical.py +36 -11
- palimpzest/query/operators/mixture_of_agents.py +3 -1
- palimpzest/query/operators/physical.py +5 -2
- palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
- palimpzest/query/optimizer/__init__.py +11 -3
- palimpzest/query/optimizer/cost_model.py +5 -5
- palimpzest/query/optimizer/optimizer.py +3 -2
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/rules.py +73 -13
- palimpzest/query/optimizer/tasks.py +4 -4
- palimpzest/utils/progress.py +19 -17
- palimpzest/validator/validator.py +7 -7
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/RECORD +37 -36
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -17,7 +17,7 @@ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, Recor
|
|
|
17
17
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
class
|
|
20
|
+
class TopKOp(PhysicalOperator):
|
|
21
21
|
def __init__(
|
|
22
22
|
self,
|
|
23
23
|
index: Collection,
|
|
@@ -29,7 +29,7 @@ class RetrieveOp(PhysicalOperator):
|
|
|
29
29
|
**kwargs,
|
|
30
30
|
) -> None:
|
|
31
31
|
"""
|
|
32
|
-
Initialize the
|
|
32
|
+
Initialize the TopKOp object.
|
|
33
33
|
|
|
34
34
|
Args:
|
|
35
35
|
index (Collection): The PZ index to use for retrieval.
|
|
@@ -59,7 +59,7 @@ class RetrieveOp(PhysicalOperator):
|
|
|
59
59
|
|
|
60
60
|
def __str__(self):
|
|
61
61
|
op = super().__str__()
|
|
62
|
-
op += f"
|
|
62
|
+
op += f" Top-K: {self.index.__class__.__name__} with k={self.k}\n"
|
|
63
63
|
return op
|
|
64
64
|
|
|
65
65
|
def get_id_params(self):
|
|
@@ -89,8 +89,8 @@ class RetrieveOp(PhysicalOperator):
|
|
|
89
89
|
|
|
90
90
|
def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
|
|
91
91
|
"""
|
|
92
|
-
Compute naive cost estimates for the
|
|
93
|
-
that the
|
|
92
|
+
Compute naive cost estimates for the Top-K operation. These estimates assume
|
|
93
|
+
that the Top-K (1) has negligible cost and (2) has perfect quality.
|
|
94
94
|
"""
|
|
95
95
|
return OperatorCostEstimates(
|
|
96
96
|
cardinality=source_op_cost_estimates.cardinality,
|
|
@@ -101,7 +101,7 @@ class RetrieveOp(PhysicalOperator):
|
|
|
101
101
|
|
|
102
102
|
def default_search_func(self, index: Collection, query: list[str] | list[list[float]], k: int) -> list[str] | list[list[str]]:
|
|
103
103
|
"""
|
|
104
|
-
Default search function for the
|
|
104
|
+
Default search function for the Top-K operation. This function uses the index to
|
|
105
105
|
retrieve the top-k results for the given query. The query will be a (possibly singleton)
|
|
106
106
|
list of strings or a list of lists of floats (i.e., embeddings). The function will return
|
|
107
107
|
the top-k results per-query in (descending) sorted order. If the input is a singleton list,
|
|
@@ -111,7 +111,7 @@ class RetrieveOp(PhysicalOperator):
|
|
|
111
111
|
Args:
|
|
112
112
|
index (PZIndex): The index to use for retrieval.
|
|
113
113
|
query (list[str] | list[list[float]]): The query (or queries) to search for.
|
|
114
|
-
k (int): The maximum number of results the
|
|
114
|
+
k (int): The maximum number of results the top-k operator will return.
|
|
115
115
|
|
|
116
116
|
Returns:
|
|
117
117
|
list[str] | list[list[str]]: The top results in (descending) sorted order per query.
|
|
@@ -260,10 +260,10 @@ class RetrieveOp(PhysicalOperator):
|
|
|
260
260
|
top_results = self.search_func(self.index, inputs, self.k)
|
|
261
261
|
|
|
262
262
|
except Exception:
|
|
263
|
-
top_results = ["error-in-
|
|
264
|
-
os.makedirs("
|
|
263
|
+
top_results = ["error-in-topk"]
|
|
264
|
+
os.makedirs("topk-errors", exist_ok=True)
|
|
265
265
|
ts = time.time()
|
|
266
|
-
with open(f"
|
|
266
|
+
with open(f"topk-errors/error-{ts}.txt", "w") as f:
|
|
267
267
|
f.write(str(query))
|
|
268
268
|
|
|
269
269
|
# TODO: the user is always right! let's drop this post-processing in the future
|
|
@@ -39,17 +39,23 @@ from palimpzest.query.optimizer.rules import (
|
|
|
39
39
|
RAGRule as _RAGRule,
|
|
40
40
|
)
|
|
41
41
|
from palimpzest.query.optimizer.rules import (
|
|
42
|
-
|
|
42
|
+
RelationalJoinRule as _RelationalJoinRule,
|
|
43
43
|
)
|
|
44
44
|
from palimpzest.query.optimizer.rules import (
|
|
45
|
-
|
|
45
|
+
ReorderConverts as _ReorderConverts,
|
|
46
46
|
)
|
|
47
47
|
from palimpzest.query.optimizer.rules import (
|
|
48
48
|
Rule as _Rule,
|
|
49
49
|
)
|
|
50
|
+
from palimpzest.query.optimizer.rules import (
|
|
51
|
+
SemanticAggregateRule as _SemanticAggregateRule,
|
|
52
|
+
)
|
|
50
53
|
from palimpzest.query.optimizer.rules import (
|
|
51
54
|
SplitRule as _SplitRule,
|
|
52
55
|
)
|
|
56
|
+
from palimpzest.query.optimizer.rules import (
|
|
57
|
+
TopKRule as _TopKRule,
|
|
58
|
+
)
|
|
53
59
|
from palimpzest.query.optimizer.rules import (
|
|
54
60
|
TransformationRule as _TransformationRule,
|
|
55
61
|
)
|
|
@@ -69,9 +75,11 @@ ALL_RULES = [
|
|
|
69
75
|
_NonLLMFilterRule,
|
|
70
76
|
_PushDownFilter,
|
|
71
77
|
_RAGRule,
|
|
78
|
+
_RelationalJoinRule,
|
|
72
79
|
_ReorderConverts,
|
|
73
|
-
|
|
80
|
+
_TopKRule,
|
|
74
81
|
_Rule,
|
|
82
|
+
_SemanticAggregateRule,
|
|
75
83
|
_SplitRule,
|
|
76
84
|
_TransformationRule,
|
|
77
85
|
]
|
|
@@ -131,17 +131,17 @@ class SampleBasedCostModel:
|
|
|
131
131
|
# compute selectivity
|
|
132
132
|
selectivity = physical_op_df.passed_operator.sum() / num_source_records
|
|
133
133
|
|
|
134
|
+
# compute quality; if all qualities are None then this will be NaN
|
|
135
|
+
quality = physical_op_df.quality.mean()
|
|
136
|
+
|
|
137
|
+
# set operator stats for this physical operator
|
|
134
138
|
operator_to_stats[unique_logical_op_id][full_op_id] = {
|
|
135
139
|
"cost": physical_op_df.cost_per_record.mean(),
|
|
136
140
|
"time": physical_op_df.time_per_record.mean(),
|
|
137
|
-
"quality":
|
|
141
|
+
"quality": 1.0 if pd.isna(quality) else quality,
|
|
138
142
|
"selectivity": selectivity,
|
|
139
143
|
}
|
|
140
144
|
|
|
141
|
-
# if this is an experiment, log the dataframe and operator_to_stats dictionary
|
|
142
|
-
if self.exp_name is not None:
|
|
143
|
-
operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
|
|
144
|
-
|
|
145
145
|
logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
|
|
146
146
|
return operator_to_stats
|
|
147
147
|
|
|
@@ -284,10 +284,11 @@ class Optimizer:
|
|
|
284
284
|
all_properties["filters"] = set([op_filter_str])
|
|
285
285
|
|
|
286
286
|
elif isinstance(op, JoinOp):
|
|
287
|
+
unique_join_str = str(sorted(op.on)) if op.condition is None else op.condition
|
|
287
288
|
if "joins" in all_properties:
|
|
288
|
-
all_properties["joins"].add(
|
|
289
|
+
all_properties["joins"].add(unique_join_str)
|
|
289
290
|
else:
|
|
290
|
-
all_properties["joins"] = set([
|
|
291
|
+
all_properties["joins"] = set([unique_join_str])
|
|
291
292
|
|
|
292
293
|
elif isinstance(op, LimitScan):
|
|
293
294
|
op_limit_str = op.get_logical_op_id()
|
|
@@ -203,9 +203,8 @@ class PhysicalPlan(Plan):
|
|
|
203
203
|
# return the current index and the upstream unique full_op_ids for this operator
|
|
204
204
|
return current_idx, self.operator.get_full_op_id(), upstream_map[this_unique_full_op_id]
|
|
205
205
|
|
|
206
|
-
def get_upstream_unique_full_op_ids(self,
|
|
207
|
-
"""Return the list of unique full_op_ids for the upstream operators of
|
|
208
|
-
unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"
|
|
206
|
+
def get_upstream_unique_full_op_ids(self, unique_full_op_id: str) -> list[str]:
|
|
207
|
+
"""Return the list of unique full_op_ids for the upstream operators of the operator specified by `unique_full_op_id`."""
|
|
209
208
|
return self.unique_full_op_id_to_upstream_full_op_ids[unique_full_op_id]
|
|
210
209
|
|
|
211
210
|
def _compute_source_unique_full_op_ids_map(self, source_map: dict[str, list[str]], current_idx: int | None = None) -> tuple[int, str]:
|
|
@@ -12,13 +12,21 @@ from palimpzest.core.lib.schemas import (
|
|
|
12
12
|
IMAGE_LIST_FIELD_TYPES,
|
|
13
13
|
)
|
|
14
14
|
from palimpzest.prompts import CONTEXT_SEARCH_PROMPT
|
|
15
|
-
from palimpzest.query.operators.aggregate import
|
|
15
|
+
from palimpzest.query.operators.aggregate import (
|
|
16
|
+
ApplyGroupByOp,
|
|
17
|
+
AverageAggregateOp,
|
|
18
|
+
CountAggregateOp,
|
|
19
|
+
MaxAggregateOp,
|
|
20
|
+
MinAggregateOp,
|
|
21
|
+
SemanticAggregate,
|
|
22
|
+
SumAggregateOp,
|
|
23
|
+
)
|
|
16
24
|
from palimpzest.query.operators.compute import SmolAgentsCompute
|
|
17
25
|
from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
|
|
18
26
|
from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert, CritiqueAndRefineFilter
|
|
19
27
|
from palimpzest.query.operators.distinct import DistinctOp
|
|
20
28
|
from palimpzest.query.operators.filter import LLMFilter, NonLLMFilter
|
|
21
|
-
from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin
|
|
29
|
+
from palimpzest.query.operators.join import EmbeddingJoin, NestedLoopsJoin, RelationalJoin
|
|
22
30
|
from palimpzest.query.operators.limit import LimitScanOp
|
|
23
31
|
from palimpzest.query.operators.logical import (
|
|
24
32
|
Aggregate,
|
|
@@ -32,19 +40,19 @@ from palimpzest.query.operators.logical import (
|
|
|
32
40
|
JoinOp,
|
|
33
41
|
LimitScan,
|
|
34
42
|
Project,
|
|
35
|
-
RetrieveScan,
|
|
36
43
|
SearchOperator,
|
|
44
|
+
TopKScan,
|
|
37
45
|
)
|
|
38
46
|
from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert, MixtureOfAgentsFilter
|
|
39
47
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
40
48
|
from palimpzest.query.operators.project import ProjectOp
|
|
41
49
|
from palimpzest.query.operators.rag import RAGConvert, RAGFilter
|
|
42
|
-
from palimpzest.query.operators.retrieve import RetrieveOp
|
|
43
50
|
from palimpzest.query.operators.scan import ContextScanOp, MarshalAndScanDataOp
|
|
44
51
|
from palimpzest.query.operators.search import (
|
|
45
52
|
SmolAgentsSearch, # SmolAgentsCustomManagedSearch, # SmolAgentsManagedSearch
|
|
46
53
|
)
|
|
47
54
|
from palimpzest.query.operators.split import SplitConvert, SplitFilter
|
|
55
|
+
from palimpzest.query.operators.topk import TopKOp
|
|
48
56
|
from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression
|
|
49
57
|
|
|
50
58
|
logger = logging.getLogger(__name__)
|
|
@@ -789,26 +797,26 @@ class SplitRule(ImplementationRule):
|
|
|
789
797
|
return cls._perform_substitution(logical_expression, phys_op_cls, runtime_kwargs, variable_op_kwargs)
|
|
790
798
|
|
|
791
799
|
|
|
792
|
-
class
|
|
800
|
+
class TopKRule(ImplementationRule):
|
|
793
801
|
"""
|
|
794
|
-
Substitute a logical expression for a
|
|
802
|
+
Substitute a logical expression for a TopKScan with a TopK physical implementation.
|
|
795
803
|
"""
|
|
796
804
|
k_budgets = [1, 3, 5, 10, 15, 20, 25]
|
|
797
805
|
|
|
798
806
|
@classmethod
|
|
799
807
|
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
800
|
-
is_match = isinstance(logical_expression.operator,
|
|
801
|
-
logger.debug(f"
|
|
808
|
+
is_match = isinstance(logical_expression.operator, TopKScan)
|
|
809
|
+
logger.debug(f"TopKRule matches_pattern: {is_match} for {logical_expression}")
|
|
802
810
|
return is_match
|
|
803
811
|
|
|
804
812
|
@classmethod
|
|
805
813
|
def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
|
|
806
|
-
logger.debug(f"Substituting
|
|
814
|
+
logger.debug(f"Substituting TopKRule for {logical_expression}")
|
|
807
815
|
|
|
808
816
|
# create variable physical operator kwargs for each model which can implement this logical_expression
|
|
809
817
|
ks = cls.k_budgets if logical_expression.operator.k == -1 else [logical_expression.operator.k]
|
|
810
818
|
variable_op_kwargs = [{"k": k} for k in ks]
|
|
811
|
-
return cls._perform_substitution(logical_expression,
|
|
819
|
+
return cls._perform_substitution(logical_expression, TopKOp, runtime_kwargs, variable_op_kwargs)
|
|
812
820
|
|
|
813
821
|
|
|
814
822
|
class NonLLMFilterRule(ImplementationRule):
|
|
@@ -860,6 +868,23 @@ class LLMFilterRule(ImplementationRule):
|
|
|
860
868
|
return cls._perform_substitution(logical_expression, LLMFilter, runtime_kwargs, variable_op_kwargs)
|
|
861
869
|
|
|
862
870
|
|
|
871
|
+
class RelationalJoinRule(ImplementationRule):
|
|
872
|
+
"""
|
|
873
|
+
Substitute a logical expression for a JoinOp with a RelationalJoin physical implementation.
|
|
874
|
+
"""
|
|
875
|
+
|
|
876
|
+
@classmethod
|
|
877
|
+
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
878
|
+
is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition == ""
|
|
879
|
+
logger.debug(f"RelationalJoinRule matches_pattern: {is_match} for {logical_expression}")
|
|
880
|
+
return is_match
|
|
881
|
+
|
|
882
|
+
@classmethod
|
|
883
|
+
def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
|
|
884
|
+
logger.debug(f"Substituting RelationalJoinRule for {logical_expression}")
|
|
885
|
+
return cls._perform_substitution(logical_expression, RelationalJoin, runtime_kwargs)
|
|
886
|
+
|
|
887
|
+
|
|
863
888
|
class NestedLoopsJoinRule(ImplementationRule):
|
|
864
889
|
"""
|
|
865
890
|
Substitute a logical expression for a JoinOp with an (LLM) NestedLoopsJoin physical implementation.
|
|
@@ -867,7 +892,7 @@ class NestedLoopsJoinRule(ImplementationRule):
|
|
|
867
892
|
|
|
868
893
|
@classmethod
|
|
869
894
|
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
870
|
-
is_match = isinstance(logical_expression.operator, JoinOp)
|
|
895
|
+
is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != ""
|
|
871
896
|
logger.debug(f"NestedLoopsJoinRule matches_pattern: {is_match} for {logical_expression}")
|
|
872
897
|
return is_match
|
|
873
898
|
|
|
@@ -899,7 +924,7 @@ class EmbeddingJoinRule(ImplementationRule):
|
|
|
899
924
|
|
|
900
925
|
@classmethod
|
|
901
926
|
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
902
|
-
is_match = isinstance(logical_expression.operator, JoinOp) and not cls._is_audio_operation(logical_expression)
|
|
927
|
+
is_match = isinstance(logical_expression.operator, JoinOp) and logical_expression.operator.condition != "" and not cls._is_audio_operation(logical_expression)
|
|
903
928
|
logger.debug(f"EmbeddingJoinRule matches_pattern: {is_match} for {logical_expression}")
|
|
904
929
|
return is_match
|
|
905
930
|
|
|
@@ -924,6 +949,35 @@ class EmbeddingJoinRule(ImplementationRule):
|
|
|
924
949
|
|
|
925
950
|
return cls._perform_substitution(logical_expression, EmbeddingJoin, runtime_kwargs, variable_op_kwargs)
|
|
926
951
|
|
|
952
|
+
class SemanticAggregateRule(ImplementationRule):
|
|
953
|
+
"""
|
|
954
|
+
Substitute a logical expression for a SemanticAggregate with an llm physical implementation.
|
|
955
|
+
"""
|
|
956
|
+
|
|
957
|
+
@classmethod
|
|
958
|
+
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
959
|
+
is_match = isinstance(logical_expression.operator, Aggregate) and logical_expression.operator.agg_str is not None
|
|
960
|
+
logger.debug(f"SemanticAggregateRule matches_pattern: {is_match} for {logical_expression}")
|
|
961
|
+
return is_match
|
|
962
|
+
|
|
963
|
+
@classmethod
|
|
964
|
+
def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
|
|
965
|
+
logger.debug(f"Substituting SemanticAggregateRule for {logical_expression}")
|
|
966
|
+
|
|
967
|
+
# create variable physical operator kwargs for each model which can implement this logical_expression
|
|
968
|
+
models = [model for model in runtime_kwargs["available_models"] if cls._model_matches_input(model, logical_expression) and not model.is_llama_model()]
|
|
969
|
+
no_reasoning = runtime_kwargs["reasoning_effort"] in [None, "minimal", "low"]
|
|
970
|
+
variable_op_kwargs = [
|
|
971
|
+
{
|
|
972
|
+
"model": model,
|
|
973
|
+
"prompt_strategy": PromptStrategy.AGG_NO_REASONING if model.is_reasoning_model() and no_reasoning else PromptStrategy.AGG,
|
|
974
|
+
"reasoning_effort": runtime_kwargs["reasoning_effort"]
|
|
975
|
+
}
|
|
976
|
+
for model in models
|
|
977
|
+
]
|
|
978
|
+
|
|
979
|
+
return cls._perform_substitution(logical_expression, SemanticAggregate, runtime_kwargs, variable_op_kwargs)
|
|
980
|
+
|
|
927
981
|
|
|
928
982
|
class AggregateRule(ImplementationRule):
|
|
929
983
|
"""
|
|
@@ -932,7 +986,7 @@ class AggregateRule(ImplementationRule):
|
|
|
932
986
|
|
|
933
987
|
@classmethod
|
|
934
988
|
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
935
|
-
is_match = isinstance(logical_expression.operator, Aggregate)
|
|
989
|
+
is_match = isinstance(logical_expression.operator, Aggregate) and logical_expression.operator.agg_func is not None
|
|
936
990
|
logger.debug(f"AggregateRule matches_pattern: {is_match} for {logical_expression}")
|
|
937
991
|
return is_match
|
|
938
992
|
|
|
@@ -946,6 +1000,12 @@ class AggregateRule(ImplementationRule):
|
|
|
946
1000
|
physical_op_class = CountAggregateOp
|
|
947
1001
|
elif logical_expression.operator.agg_func == AggFunc.AVERAGE:
|
|
948
1002
|
physical_op_class = AverageAggregateOp
|
|
1003
|
+
elif logical_expression.operator.agg_func == AggFunc.SUM:
|
|
1004
|
+
physical_op_class = SumAggregateOp
|
|
1005
|
+
elif logical_expression.operator.agg_func == AggFunc.MIN:
|
|
1006
|
+
physical_op_class = MinAggregateOp
|
|
1007
|
+
elif logical_expression.operator.agg_func == AggFunc.MAX:
|
|
1008
|
+
physical_op_class = MaxAggregateOp
|
|
949
1009
|
else:
|
|
950
1010
|
raise Exception(f"Cannot support aggregate function: {logical_expression.operator.agg_func}")
|
|
951
1011
|
|
|
@@ -501,8 +501,8 @@ class OptimizePhysicalExpression(Task):
|
|
|
501
501
|
|
|
502
502
|
# compute the total cost for this physical expression by summing its operator's PlanCost
|
|
503
503
|
# with the input groups' total PlanCost; also set the op_estimates for this expression's operator
|
|
504
|
-
|
|
505
|
-
full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost,
|
|
504
|
+
execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
|
|
505
|
+
full_plan_cost = op_plan_cost.join_add(left_input_plan_cost, right_input_plan_cost, execution_strategy_str)
|
|
506
506
|
full_plan_cost.op_estimates = op_plan_cost.op_estimates
|
|
507
507
|
all_possible_plan_costs.append((full_plan_cost, (left_input_plan_cost, right_input_plan_cost)))
|
|
508
508
|
|
|
@@ -570,8 +570,8 @@ class OptimizePhysicalExpression(Task):
|
|
|
570
570
|
|
|
571
571
|
# compute the total cost for this physical expression by summing its operator's PlanCost
|
|
572
572
|
# with the input groups' total PlanCost; also set the op_estimates for this expression's operator
|
|
573
|
-
|
|
574
|
-
full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost,
|
|
573
|
+
execution_strategy_str = "parallel" if execution_strategy.is_fully_parallel() else "sequential"
|
|
574
|
+
full_plan_cost = op_plan_cost.join_add(left_best_input_plan_cost, right_best_input_plan_cost, execution_strategy_str)
|
|
575
575
|
full_plan_cost.op_estimates = op_plan_cost.op_estimates
|
|
576
576
|
|
|
577
577
|
else:
|
palimpzest/utils/progress.py
CHANGED
|
@@ -24,7 +24,7 @@ from palimpzest.query.operators.filter import LLMFilter
|
|
|
24
24
|
from palimpzest.query.operators.join import JoinOp
|
|
25
25
|
from palimpzest.query.operators.limit import LimitScanOp
|
|
26
26
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
27
|
-
from palimpzest.query.operators.
|
|
27
|
+
from palimpzest.query.operators.topk import TopKOp
|
|
28
28
|
from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
|
|
29
29
|
|
|
30
30
|
|
|
@@ -225,20 +225,22 @@ class PZProgressManager(ProgressManager):
|
|
|
225
225
|
current_unique_full_op_id = unique_full_op_id
|
|
226
226
|
next_op, next_unique_full_op_id = self.unique_full_op_id_to_next_op_and_id[unique_full_op_id]
|
|
227
227
|
while next_op is not None:
|
|
228
|
-
if
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
228
|
+
if isinstance(next_op, (AggregateOp, LimitScanOp)):
|
|
229
|
+
break
|
|
230
|
+
|
|
231
|
+
next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
|
|
232
|
+
multiplier = 1
|
|
233
|
+
if isinstance(next_op, JoinOp):
|
|
234
|
+
# for joins, scale the delta by the number of inputs from the other side of the join
|
|
235
|
+
left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
|
|
236
|
+
if current_unique_full_op_id == left_input_unique_full_op_id:
|
|
237
|
+
multiplier = self.get_task_total(right_input_unique_input_op_id)
|
|
238
|
+
elif current_unique_full_op_id == right_input_unique_input_op_id:
|
|
239
|
+
multiplier = self.get_task_total(left_input_unique_full_op_id)
|
|
240
|
+
else:
|
|
241
|
+
raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
|
|
242
|
+
delta_adjusted = delta * multiplier
|
|
243
|
+
self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
|
|
242
244
|
|
|
243
245
|
# move to the next operator in the plan
|
|
244
246
|
current_unique_full_op_id = next_unique_full_op_id
|
|
@@ -348,9 +350,9 @@ class PZSentinelProgressManager(ProgressManager):
|
|
|
348
350
|
def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
|
|
349
351
|
is_llm_convert = isinstance(physical_op, LLMConvert)
|
|
350
352
|
is_llm_filter = isinstance(physical_op, LLMFilter)
|
|
351
|
-
|
|
353
|
+
is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
|
|
352
354
|
is_llm_join = isinstance(physical_op, JoinOp)
|
|
353
|
-
return is_llm_convert or is_llm_filter or
|
|
355
|
+
return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join
|
|
354
356
|
|
|
355
357
|
def get_task_description(self, unique_logical_op_id: str) -> str:
|
|
356
358
|
"""Return the current description for the given task."""
|
|
@@ -19,7 +19,7 @@ from palimpzest.query.generators.generators import get_json_from_answer
|
|
|
19
19
|
from palimpzest.query.operators.convert import LLMConvert
|
|
20
20
|
from palimpzest.query.operators.filter import LLMFilter
|
|
21
21
|
from palimpzest.query.operators.join import JoinOp
|
|
22
|
-
from palimpzest.query.operators.
|
|
22
|
+
from palimpzest.query.operators.topk import TopKOp
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
class Validator:
|
|
@@ -47,7 +47,7 @@ class Validator:
|
|
|
47
47
|
def join_score_fn(self, condition: str, left_input_record: dict, right_input_record: dict, output: bool) -> float | None:
|
|
48
48
|
raise NotImplementedError("Validator.join_score_fn not implemented.")
|
|
49
49
|
|
|
50
|
-
def
|
|
50
|
+
def topk_score_fn(self, fields: list[str], input_record: dict, output: dict) -> float | None:
|
|
51
51
|
raise NotImplementedError("Validator.map_score_fn not implemented.")
|
|
52
52
|
|
|
53
53
|
def _get_gen_stats_from_completion(self, completion, start_time: float) -> GenerationStats:
|
|
@@ -218,11 +218,11 @@ class Validator:
|
|
|
218
218
|
|
|
219
219
|
return score, gen_stats
|
|
220
220
|
|
|
221
|
-
def
|
|
221
|
+
def _default_topk_score_fn(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict) -> tuple[float | None, GenerationStats]:
|
|
222
222
|
"""
|
|
223
223
|
Compute the quality of the generated output for the given fields and input_record.
|
|
224
224
|
"""
|
|
225
|
-
# TODO:
|
|
225
|
+
# TODO: top-k k=25; score each item based on relevance; compute F1
|
|
226
226
|
# TODO: support retrieval over images
|
|
227
227
|
# create prompt factory
|
|
228
228
|
factory = PromptFactory(PromptStrategy.MAP, self.model, Cardinality.ONE_TO_ONE)
|
|
@@ -294,11 +294,11 @@ class Validator:
|
|
|
294
294
|
score, gen_stats = self._default_join_score_fn(op, condition, left_input_record, right_input_record, output)
|
|
295
295
|
return score, gen_stats, full_hash
|
|
296
296
|
|
|
297
|
-
def
|
|
297
|
+
def _score_topk(self, op: TopKOp, fields: list[str], input_record: DataRecord, output: dict, full_hash: str) -> tuple[float | None, GenerationStats, str]:
|
|
298
298
|
try:
|
|
299
|
-
out = self.
|
|
299
|
+
out = self.topk_score_fn(fields, input_record.to_dict(), output)
|
|
300
300
|
score, gen_stats = out if isinstance(out, tuple) else (out, GenerationStats())
|
|
301
301
|
return score, gen_stats, full_hash
|
|
302
302
|
except NotImplementedError:
|
|
303
|
-
score, gen_stats = self.
|
|
303
|
+
score, gen_stats = self._default_topk_score_fn(op, fields, input_record, output)
|
|
304
304
|
return score, gen_stats, full_hash
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: palimpzest
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
|
|
5
5
|
Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
|
|
6
6
|
Project-URL: homepage, https://palimpzest.org
|
|
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
|
|
|
12
12
|
Classifier: License :: OSI Approved :: MIT License
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
-
Requires-Python: >=3.
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
License-File: LICENSE
|
|
18
18
|
Requires-Dist: anthropic>=0.55.0
|
|
@@ -59,15 +59,20 @@ Dynamic: license-file
|
|
|
59
59
|
<!-- [](https://arxiv.org/pdf/2405.14696) -->
|
|
60
60
|
<!-- [](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu) -->
|
|
61
61
|
|
|
62
|
-
## Learn How to Use PZ
|
|
63
|
-
Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation, and much more.
|
|
62
|
+
## 📚 Learn How to Use PZ
|
|
63
|
+
Our [full documentation](https://palimpzest.org) is the definitive resource for learning how to use PZ. It contains all of the installation and quickstart materials on this page, as well as user guides, full API documentation (coming soon), and much more.
|
|
64
64
|
|
|
65
|
-
## Getting started
|
|
65
|
+
## 🚀 Getting started
|
|
66
66
|
You can find a stable version of the PZ package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
|
|
67
67
|
```bash
|
|
68
68
|
$ pip install palimpzest
|
|
69
69
|
```
|
|
70
70
|
|
|
71
|
+
You can also install PZ with [uv](https://docs.astral.sh/uv/) for a faster installation:
|
|
72
|
+
```bash
|
|
73
|
+
$ uv pip install palimpzest
|
|
74
|
+
```
|
|
75
|
+
|
|
71
76
|
Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
|
|
72
77
|
```bash
|
|
73
78
|
$ git clone git@github.com:mitdbg/palimpzest.git
|
|
@@ -75,7 +80,7 @@ $ cd palimpzest
|
|
|
75
80
|
$ pip install .
|
|
76
81
|
```
|
|
77
82
|
|
|
78
|
-
## Join the PZ Community
|
|
83
|
+
## 🙋🏽 Join the PZ Community
|
|
79
84
|
We are actively hacking on PZ and would love to have you join our community on [Discord](https://discord.gg/dN85JJ6jaH)
|
|
80
85
|
|
|
81
86
|
[Our Discord server](https://discord.gg/dN85JJ6jaH) is the best place to:
|
|
@@ -86,66 +91,8 @@ We are actively hacking on PZ and would love to have you join our community [![D
|
|
|
86
91
|
|
|
87
92
|
We are eager to learn more about your workloads and use cases, and will take them into consideration in planning our future roadmap.
|
|
88
93
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
To run the notebook, you can use the following command:
|
|
92
|
-
```bash
|
|
93
|
-
$ jupyter notebook
|
|
94
|
-
```
|
|
95
|
-
And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
|
|
96
|
-
|
|
97
|
-
### Even Quicker Start
|
|
98
|
-
For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
|
|
99
|
-
```python
|
|
100
|
-
import palimpzest as pz
|
|
101
|
-
|
|
102
|
-
# define the fields we wish to compute
|
|
103
|
-
email_cols = [
|
|
104
|
-
{"name": "sender", "type": str, "desc": "The email address of the sender"},
|
|
105
|
-
{"name": "subject", "type": str, "desc": "The subject of the email"},
|
|
106
|
-
{"name": "date", "type": str, "desc": "The date the email was sent"},
|
|
107
|
-
]
|
|
108
|
-
|
|
109
|
-
# lazily construct the computation to get emails about holidays sent in July
|
|
110
|
-
dataset = pz.Dataset("testdata/enron-tiny/")
|
|
111
|
-
dataset = dataset.sem_add_columns(email_cols)
|
|
112
|
-
dataset = dataset.sem_filter("The email was sent in July")
|
|
113
|
-
dataset = dataset.sem_filter("The email is about holidays")
|
|
114
|
-
|
|
115
|
-
# execute the computation w/the MinCost policy
|
|
116
|
-
config = pz.QueryProcessorConfig(policy=pz.MinCost(), verbose=True)
|
|
117
|
-
output = dataset.run(config)
|
|
118
|
-
|
|
119
|
-
# display output (if using Jupyter, otherwise use print(output_df))
|
|
120
|
-
output_df = output.to_df(cols=["date", "sender", "subject"])
|
|
121
|
-
display(output_df)
|
|
122
|
-
```
|
|
123
|
-
|
|
124
|
-
## Python Demos
|
|
125
|
-
Below are simple instructions to run PZ on a test data set of enron emails that is included with the system.
|
|
126
|
-
|
|
127
|
-
### Downloading test data
|
|
128
|
-
To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
|
|
129
|
-
```
|
|
130
|
-
chmod +x testdata/download-testdata.sh
|
|
131
|
-
./testdata/download-testdata.sh
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
### Running the Demos
|
|
135
|
-
Set your OpenAI (or Together.ai) api key at the command line:
|
|
136
|
-
```bash
|
|
137
|
-
# set one (or both) of the following:
|
|
138
|
-
export OPENAI_API_KEY=<your-api-key>
|
|
139
|
-
export TOGETHER_API_KEY=<your-api-key>
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
Now you can run the simple test program with:
|
|
143
|
-
```bash
|
|
144
|
-
$ python demos/simple-demo.py --task enron --dataset testdata/enron-eval-tiny --verbose
|
|
145
|
-
```
|
|
146
|
-
|
|
147
|
-
### Citation
|
|
148
|
-
If you would like to cite our work, please use the following citation:
|
|
94
|
+
### 📓 Citation
|
|
95
|
+
If you would like to cite our original paper on Palimpzest, please use the following citation:
|
|
149
96
|
```
|
|
150
97
|
@inproceedings{palimpzestCIDR,
|
|
151
98
|
title={Palimpzest: Optimizing AI-Powered Analytics with Declarative Query Processing},
|
|
@@ -154,3 +101,16 @@ If you would like to cite our work, please use the following citation:
|
|
|
154
101
|
date = 2025,
|
|
155
102
|
}
|
|
156
103
|
```
|
|
104
|
+
|
|
105
|
+
If you would like to cite our paper on Palimpzest's optimizer Abacus, please use the following citation:
|
|
106
|
+
```
|
|
107
|
+
@misc{russo2025abacuscostbasedoptimizersemantic,
|
|
108
|
+
title={Abacus: A Cost-Based Optimizer for Semantic Operator Systems},
|
|
109
|
+
author={Matthew Russo and Sivaprasad Sudhir and Gerardo Vitagliano and Chunwei Liu and Tim Kraska and Samuel Madden and Michael Cafarella},
|
|
110
|
+
year={2025},
|
|
111
|
+
eprint={2505.14661},
|
|
112
|
+
archivePrefix={arXiv},
|
|
113
|
+
primaryClass={cs.DB},
|
|
114
|
+
url={https://arxiv.org/abs/2505.14661},
|
|
115
|
+
}
|
|
116
|
+
```
|