palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl
- palimpzest/constants.py +13 -4
- palimpzest/core/data/dataset.py +75 -5
- palimpzest/core/elements/groupbysig.py +5 -1
- palimpzest/core/elements/records.py +16 -7
- palimpzest/core/lib/schemas.py +26 -3
- palimpzest/core/models.py +4 -4
- palimpzest/prompts/aggregate_prompts.py +99 -0
- palimpzest/prompts/prompt_factory.py +162 -75
- palimpzest/prompts/utils.py +38 -1
- palimpzest/prompts/validator.py +24 -24
- palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
- palimpzest/query/execution/execution_strategy.py +8 -8
- palimpzest/query/execution/mab_execution_strategy.py +30 -11
- palimpzest/query/execution/parallel_execution_strategy.py +31 -7
- palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
- palimpzest/query/generators/generators.py +9 -7
- palimpzest/query/operators/__init__.py +10 -6
- palimpzest/query/operators/aggregate.py +394 -10
- palimpzest/query/operators/convert.py +1 -1
- palimpzest/query/operators/join.py +279 -23
- palimpzest/query/operators/logical.py +36 -11
- palimpzest/query/operators/mixture_of_agents.py +3 -1
- palimpzest/query/operators/physical.py +5 -2
- palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
- palimpzest/query/optimizer/__init__.py +11 -3
- palimpzest/query/optimizer/cost_model.py +5 -5
- palimpzest/query/optimizer/optimizer.py +3 -2
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/rules.py +73 -13
- palimpzest/query/optimizer/tasks.py +4 -4
- palimpzest/utils/progress.py +19 -17
- palimpzest/validator/validator.py +7 -7
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/RECORD +37 -36
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0
palimpzest/prompts/validator.py
CHANGED
@@ -22,17 +22,17 @@ OUTPUT FIELDS:
 - birth_year: the year the scientist was born

 CONTEXT:
-{
+{
   "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
   "birthday": "December 10, 1815"
-}
+}

 OUTPUT:
 --------
-{
+{
   "name": "Charles Babbage",
   "birth_year": 1815
-}
+}

 EVALUATION: {"name": 0.0, "birth_year": 1.0}

@@ -66,18 +66,18 @@ OUTPUT FIELDS:
 - person_in_image: true if a person is in the image and false otherwise

 CONTEXT:
-{
+{
   "image": <bytes>,
   "photographer": "CameraEnthusiast1"
-}
+}
 <image content provided here; assume in this example the image shows a dog and a cat playing>

 OUTPUT:
 --------
-{
+{
   "dog_in_image": true,
   "person_in_image": true
-}
+}

 EVALUATION: {"dog_in_image": 1.0, "person_in_image": 0.0}

@@ -113,22 +113,22 @@ OUTPUT FIELDS:
 - birth_year: the year the scientist was born

 CONTEXT:
-{
+{
   "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
   "birthdays": "...Lovelace was born on December 10, 1815, almost exactly 24 years after Babbage's birth on 26 December 1791..."
-}
+}

 OUTPUTS:
 --------
 [
-{
+{
   "name": "Ada Lovelace",
   "birth_year": 1815
-}
-{
+},
+{
   "name": "Charles Babbage",
   "birth_year": 1790
-}
+}
 ]

 EVALUATION: [{"name": 1.0, "birth_year": 1.0}, {"name": 1.0, "birth_year": 0.0}]

@@ -163,23 +163,23 @@ OUTPUT FIELDS:
 - animal_is_canine: true if the animal is a canine and false otherwise

 CONTEXT:
-{
+{
   "image": <bytes>,
   "photographer": "CameraEnthusiast1"
-}
+}
 <image content provided here; assume in this example the image shows a dog and a cat playing>

 OUTPUT:
 --------
 [
-{
+{
   "animal": "dog",
   "animal_is_canine": true
-}
-{
+},
+{
   "animal": "cat",
   "animal_is_canine": true
-}
+}
 ]

 EVALUATION: [{"animal": 1.0, "animal_is_canine": 1.0}, {"animal": 1.0, "animal_is_canine": 0.0}]

@@ -214,20 +214,20 @@ OUTPUT FIELDS:
 - related_scientists: list of scientists who perform similar work as the scientist described in the text

 CONTEXT:
-{
+{
   "text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
-}
+}

 OUTPUT:
 --------
-{
+{
   "related_scientists": [
     "Charles Babbage",
     "Alan Turing",
     "Charles Darwin",
     "John von Neumann",
   ]
-}
+}

 EVALUATION: {"related_scientists": 0.75}
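The EVALUATION lines in these few-shot examples score each output field in [0.0, 1.0]. A minimal sketch (a hypothetical helper, not part of the package) of how such per-field scores can be collapsed into a single record-level quality value:

```python
# Hypothetical helper: collapse a per-field EVALUATION dict (as in the
# examples above) into one record-level score by averaging field scores.
def record_quality(field_scores: dict[str, float]) -> float:
    if not field_scores:
        return 0.0
    return sum(field_scores.values()) / len(field_scores)

# e.g. the first example above: name wrong (0.0), birth_year right (1.0)
print(record_quality({"name": 0.0, "birth_year": 1.0}))  # 0.5
```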
palimpzest/query/execution/all_sample_execution_strategy.py
CHANGED

@@ -225,7 +225,7 @@ class AllSamplingExecutionStrategy(SentinelExecutionStrategy):
         dataset_id_to_source_indices = {}
         for dataset_id, dataset in train_dataset.items():
             total_num_samples = len(dataset)
-            source_indices = [f"{dataset_id}
+            source_indices = [f"{dataset_id}---{int(idx)}" for idx in np.arange(total_num_samples)]
             dataset_id_to_source_indices[dataset_id] = source_indices

         # initialize set of physical operators for each logical operator
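The change above standardizes sample identifiers as "{dataset_id}---{idx}"; the matching split("---")[0] appears in mab_execution_strategy.py below. A quick sketch of the round trip:

```python
import numpy as np

# build source indices the way the new code does: "<dataset_id>---<idx>"
dataset_id, total_num_samples = "train_ds", 3  # illustrative values
source_indices = [f"{dataset_id}---{int(idx)}" for idx in np.arange(total_num_samples)]
print(source_indices)  # ['train_ds---0', 'train_ds---1', 'train_ds---2']

# the dataset id is recovered later by splitting on the "---" separator
assert all(src.split("---")[0] == dataset_id for src in source_indices)
```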
palimpzest/query/execution/execution_strategy.py
CHANGED

@@ -14,8 +14,8 @@ from palimpzest.query.operators.convert import LLMConvert
 from palimpzest.query.operators.filter import LLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
 from palimpzest.utils.progress import PZSentinelProgressManager
 from palimpzest.validator.validator import Validator

@@ -123,7 +123,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         return (
             not isinstance(op, LLMConvert)
             and not isinstance(op, LLMFilter)
-            and not isinstance(op,
+            and not isinstance(op, TopKOp)
             and not isinstance(op, JoinOp)
         )

@@ -167,8 +167,8 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
                 full_hashes.add(full_hash)
                 futures.append(executor.submit(validator._score_flat_map, op, fields, input_record, output, full_hash))

-        # create future for
-        elif isinstance(op,
+        # create future for top-k
+        elif isinstance(op, TopKOp):
             fields = op.generated_fields
             input_record: DataRecord = record_set.input
             output = record_set.data_records[0].to_dict(project_cols=fields)

@@ -176,7 +176,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
             full_hash = f"{hash(input_record)}{hash(output_str)}"
             if full_hash not in full_hashes:
                 full_hashes.add(full_hash)
-                futures.append(executor.submit(validator.
+                futures.append(executor.submit(validator._score_topk, op, fields, input_record, output, full_hash))

         # create future for filter
         elif isinstance(op, LLMFilter):

@@ -235,7 +235,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):

         # TODO: this scoring function will (likely) bias towards small values of k since it
         # measures precision and not recall / F1; will need to revisit this in the future
-        elif isinstance(op,
+        elif isinstance(op, TopKOp):
             fields = op.generated_fields
             input_record: DataRecord = record_set.input
             output_str = record_set.data_records[0].to_json_str(project_cols=fields, bytes_to_str=True, sorted=True)

@@ -341,9 +341,9 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
     def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
         is_llm_convert = isinstance(physical_op, LLMConvert)
         is_llm_filter = isinstance(physical_op, LLMFilter)
-
+        is_llm_topk = isinstance(physical_op, TopKOp) and isinstance(physical_op.index, Collection)
         is_llm_join = isinstance(physical_op, JoinOp)
-        return is_llm_convert or is_llm_filter or
+        return is_llm_convert or is_llm_filter or is_llm_topk or is_llm_join

     @abstractmethod
     def execute_sentinel_plan(self, sentinel_plan: SentinelPlan, train_dataset: dict[str, Dataset], validator: Validator) -> SentinelPlanStats:
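The scoring loop above avoids re-scoring identical (input, output) pairs by keying futures on a combined hash. A toy version of the same deduplication pattern (plain strings stand in for DataRecord and the serialized output):

```python
# Toy dedup: only "submit" a scoring task the first time we see a pair.
full_hashes: set[str] = set()

def maybe_submit(input_record: str, output_str: str) -> bool:
    full_hash = f"{hash(input_record)}{hash(output_str)}"
    if full_hash in full_hashes:
        return False  # this exact (input, output) pair was already scored
    full_hashes.add(full_hash)
    return True  # the real code submits a validator future here

assert maybe_submit("rec-1", '{"name": "Ada"}') is True
assert maybe_submit("rec-1", '{"name": "Ada"}') is False  # duplicate skipped
```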
palimpzest/query/execution/mab_execution_strategy.py
CHANGED

@@ -14,8 +14,8 @@ from palimpzest.query.operators.convert import LLMConvert
 from palimpzest.query.operators.filter import FilterOp, LLMFilter, NonLLMFilter
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
+from palimpzest.query.operators.topk import TopKOp
 from palimpzest.query.optimizer.plan import SentinelPlan
 from palimpzest.utils.progress import create_progress_manager
 from palimpzest.validator.validator import Validator

@@ -66,8 +66,8 @@ class OpFrontier:
         self.is_llm_join = isinstance(sample_op, JoinOp)
         is_llm_convert = isinstance(sample_op, LLMConvert)
         is_llm_filter = isinstance(sample_op, LLMFilter)
-
-        self.is_llm_op = is_llm_convert or is_llm_filter or
+        is_llm_topk = isinstance(sample_op, TopKOp) and isinstance(sample_op.index, Collection)
+        self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_topk or self.is_llm_join

         # get order in which we will sample physical operators for this logical operator
         sample_op_indices = self._get_op_index_order(op_set, seed)

@@ -96,6 +96,12 @@ class OpFrontier:
         """
         return self.frontier_ops

+    def get_off_frontier_ops(self) -> list[PhysicalOperator]:
+        """
+        Returns the set of off-frontier operators for this OpFrontier.
+        """
+        return self.off_frontier_ops
+
     def _compute_op_id_to_pareto_distance(self, priors: dict[str, dict[str, float]]) -> dict[str, float]:
         """
         Return l2-distance for each operator from the pareto frontier.

@@ -298,7 +304,7 @@ class OpFrontier:
         def remove_unavailable_root_datasets(source_indices: str | tuple) -> str | tuple | None:
             # base case: source_indices is a string
             if isinstance(source_indices, str):
-                return source_indices if source_indices.split("
+                return source_indices if source_indices.split("---")[0] in self.root_dataset_ids else None

             # recursive case: source_indices is a tuple
             left_indices = source_indices[0]

@@ -383,6 +389,12 @@ class OpFrontier:
         # compute final list of record op stats
         full_op_id_to_record_op_stats[full_op_id] = list(record_id_to_max_quality_record_op_stats.values())

+        # NOTE: it is possible for the full_op_id_to_record_op_stats to be empty if there is a duplicate operator
+        # (e.g. a scan of the same dataset) which has all of its results cached and no new_record_op_stats;
+        # in this case, we do not update the frontier
+        if full_op_id_to_record_op_stats == {}:
+            return
+
         # update the set of source indices processed by each physical operator
         for full_op_id, source_indices_processed in full_op_id_to_source_indices_processed.items():
             # update the set of source indices processed

@@ -641,8 +653,8 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         """
         Returns the operator in the frontier with the highest (estimated) quality.
         """
-        # get the
-        frontier_ops = op_frontiers[unique_logical_op_id].get_frontier_ops()
+        # get the (off) frontier operators for this logical_op_id
+        frontier_ops = op_frontiers[unique_logical_op_id].get_frontier_ops() + op_frontiers[unique_logical_op_id].get_off_frontier_ops()

         # get a mapping from full_op_id --> list[RecordOpStats]
         full_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(unique_logical_op_id, {})

@@ -693,14 +705,21 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         max_quality_op = self._get_max_quality_op(unique_logical_op_id, op_frontiers, plan_stats)

         # get frontier ops and their next input
-        def
-
+        def filter_and_clean_inputs(frontier_op_inputs: list[tuple]) -> list[tuple]:
+            cleaned_inputs = []
+            for tup in frontier_op_inputs:
+                input = tup[-1]
+                if isinstance(input, list):
+                    input = [record for record in input if record is not None]
+                if input is not None and input != []:
+                    cleaned_inputs.append((tup[0], tup[1], input))
+            return cleaned_inputs
         frontier_op_inputs = op_frontiers[unique_logical_op_id].get_frontier_op_inputs(source_indices_to_sample, max_quality_op)
-        frontier_op_inputs =
+        frontier_op_inputs = filter_and_clean_inputs(frontier_op_inputs)

         # break out of the loop if frontier_op_inputs is empty, as this means all records have been filtered out
         if len(frontier_op_inputs) == 0:
-
+            continue

         # run sampled operators on sampled inputs and update the number of samples drawn
         source_indices_to_record_set_tuples, num_llm_ops = self._execute_op_set(unique_logical_op_id, frontier_op_inputs)

@@ -764,7 +783,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         dataset_id_to_shuffled_source_indices = {}
         for dataset_id, dataset in train_dataset.items():
             total_num_samples = len(dataset)
-            shuffled_source_indices = [f"{dataset_id}
+            shuffled_source_indices = [f"{dataset_id}---{int(idx)}" for idx in np.arange(total_num_samples)]
             self.rng.shuffle(shuffled_source_indices)
             dataset_id_to_shuffled_source_indices[dataset_id] = shuffled_source_indices
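The new filter_and_clean_inputs helper strips None records (and lists that become empty) from each input tuple before the sampled operators run. A standalone demonstration of that cleaning logic:

```python
# Standalone copy of the cleaning logic: the last element of each tuple is
# the input; drop Nones, filter Nones out of lists, and keep the rest.
def filter_and_clean_inputs(frontier_op_inputs: list[tuple]) -> list[tuple]:
    cleaned_inputs = []
    for tup in frontier_op_inputs:
        input = tup[-1]
        if isinstance(input, list):
            input = [record for record in input if record is not None]
        if input is not None and input != []:
            cleaned_inputs.append((tup[0], tup[1], input))
    return cleaned_inputs

inputs = [("op1", "src1", None), ("op2", "src2", [None]), ("op3", "src3", ["rec"])]
print(filter_and_clean_inputs(inputs))  # [('op3', 'src3', ['rec'])]
```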
palimpzest/query/execution/parallel_execution_strategy.py
CHANGED

@@ -9,7 +9,6 @@ from palimpzest.query.operators.aggregate import AggregateOp
 from palimpzest.query.operators.distinct import DistinctOp
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
-from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.utils.progress import create_progress_manager

@@ -35,14 +34,27 @@ class ParallelExecutionStrategy(ExecutionStrategy):
             return True
         return False

-    def _upstream_ops_finished(self, plan: PhysicalPlan,
+    def _upstream_ops_finished(self, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> bool:
         """Helper function to check if agg / join operator is ready to process its inputs."""
-
-        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(unique_full_op_id)
         upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
         upstream_future_queues = {upstream_unique_full_op_id: future_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
         return not (self._any_queue_not_empty(upstream_input_queues) or self._any_queue_not_empty(upstream_future_queues))

+    def _finish_outer_join(self, executor: ThreadPoolExecutor, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]], future_queues: dict[str, list]) -> None:
+        join_op_upstream_finished = self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
+        join_input_queues_empty = all(len(inputs) == 0 for inputs in input_queues[unique_full_op_id].values())
+        join_future_queue_empty = len(future_queues[unique_full_op_id]) == 0
+        if join_op_upstream_finished and join_input_queues_empty and join_future_queue_empty:
+            # process the join one last time with final=True to handle any left/right/outer join logic
+            operator = self.unique_full_op_id_to_operator[unique_full_op_id]
+            if not operator.finished:
+                def finalize_op(operator):
+                    return operator([], [], final=True)
+                future = executor.submit(finalize_op, operator)
+                future_queues[unique_full_op_id].append(future)
+                operator.set_finished()
+
     def _process_future_results(self, unique_full_op_id: str, future_queues: dict[str, list], plan_stats: PlanStats) -> list[DataRecord]:
         """
         Helper function which takes a full operator id, the future queues, and plan stats, and performs

@@ -117,15 +129,23 @@ class ParallelExecutionStrategy(ExecutionStrategy):
                 records = self._process_future_results(source_unique_full_op_id, future_queues, plan_stats)
                 input_queues[unique_full_op_id][source_unique_full_op_id].extend(records)

+                # if the source is a left/right/outer join operator with no more inputs to process, then finish it
+                if self.is_outer_join_op[source_unique_full_op_id]:
+                    self._finish_outer_join(executor, plan, source_unique_full_op_id, input_queues, future_queues)
+
             # for the final operator, add any finished futures to the output_records
             if unique_full_op_id == f"{topo_idx}-{final_op.get_full_op_id()}":
                 records = self._process_future_results(unique_full_op_id, future_queues, plan_stats)
                 output_records.extend(records)

+                # if this is a left/right/outer join operator with no more inputs to process, then finish it
+                if self.is_outer_join_op[unique_full_op_id]:
+                    self._finish_outer_join(executor, plan, unique_full_op_id, input_queues, future_queues)
+
             # if this operator does not have enough inputs to execute, then skip it
             num_inputs = sum(len(inputs) for inputs in input_queues[unique_full_op_id].values())
-            agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan,
-            join_op_not_ready = isinstance(operator, JoinOp) and not self.join_has_downstream_limit_op[unique_full_op_id] and not self._upstream_ops_finished(plan,
+            agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
+            join_op_not_ready = isinstance(operator, JoinOp) and not self.join_has_downstream_limit_op[unique_full_op_id] and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues, future_queues)
             if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
                 continue

@@ -225,8 +245,9 @@ class ParallelExecutionStrategy(ExecutionStrategy):
         input_queues = self._create_input_queues(plan)
         future_queues = {f"{topo_idx}-{op.get_full_op_id()}": [] for topo_idx, op in enumerate(plan)}

-        # precompute which operators are joins and which joins have downstream limit ops
+        # precompute which operators are (outer) joins and which joins have downstream limit ops
         self.is_join_op = {f"{topo_idx}-{op.get_full_op_id()}": isinstance(op, JoinOp) for topo_idx, op in enumerate(plan)}
+        self.is_outer_join_op = {f"{topo_idx}-{op.get_full_op_id()}": isinstance(op, JoinOp) and op.how in ("left", "right", "outer") for topo_idx, op in enumerate(plan)}
         self.join_has_downstream_limit_op = {}
         for topo_idx, op in enumerate(plan):
             if isinstance(op, JoinOp):

@@ -240,6 +261,9 @@ class ParallelExecutionStrategy(ExecutionStrategy):
                     break
             self.join_has_downstream_limit_op[unique_full_op_id] = has_downstream_limit_op

+        # precompute mapping from unique_full_op_id to operator instance
+        self.unique_full_op_id_to_operator = {f"{topo_idx}-{op.get_full_op_id()}": op for topo_idx, op in enumerate(plan)}
+
         # initialize and start the progress manager
         self.progress_manager = create_progress_manager(plan, num_samples=self.num_samples, progress=self.progress)
         self.progress_manager.start()
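The final=True pass exists because a streaming left/right/outer join cannot know which rows are unmatched until every input has arrived. A toy streaming left join (hypothetical; it does not use JoinOp's real interface) that shows why the final flush is needed:

```python
# Toy streaming left join: matched pairs stream out as right rows arrive;
# unmatched left rows can only be emitted on the final flush.
class StreamingLeftJoin:
    def __init__(self, key: str):
        self.key = key
        self.left_rows: list[dict] = []
        self.matched: set[int] = set()

    def __call__(self, left_batch, right_batch, final=False):
        out = []
        self.left_rows.extend(left_batch)
        for r in right_batch:
            for i, l in enumerate(self.left_rows):
                if l[self.key] == r[self.key]:
                    self.matched.add(i)
                    out.append({**l, **r})
        if final:  # emit left rows that never matched, with no right columns
            out += [l for i, l in enumerate(self.left_rows) if i not in self.matched]
        return out

join = StreamingLeftJoin("id")
print(join([{"id": 1}, {"id": 2}], [{"id": 1, "x": "a"}]))  # [{'id': 1, 'x': 'a'}]
print(join([], [], final=True))  # [{'id': 2}]
```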
palimpzest/query/execution/single_threaded_execution_strategy.py
CHANGED

@@ -6,7 +6,6 @@ from palimpzest.query.execution.execution_strategy import ExecutionStrategy
 from palimpzest.query.operators.aggregate import AggregateOp
 from palimpzest.query.operators.join import JoinOp
 from palimpzest.query.operators.limit import LimitScanOp
-from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
 from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.utils.progress import create_progress_manager

@@ -70,6 +69,13 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
             record_set, num_inputs_processed = operator(left_input_records, right_input_records)
             records = record_set.data_records
             record_op_stats = record_set.record_op_stats
+
+            # process the join one last time with final=True to handle any left/right/outer join logic
+            if operator.how in ("left", "right", "outer"):
+                record_set, num_inputs_processed = operator([], [], final=True)
+                records.extend(record_set.data_records)
+                record_op_stats.extend(record_set.record_op_stats)
+
             num_outputs = sum(record._passed_operator for record in records)

             # update the progress manager

@@ -168,10 +174,9 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
             return True
         return False

-    def _upstream_ops_finished(self, plan: PhysicalPlan,
+    def _upstream_ops_finished(self, plan: PhysicalPlan, unique_full_op_id: str, input_queues: dict[str, dict[str, list]]) -> bool:
         """Helper function to check if agg / join operator is ready to process its inputs."""
-
-        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(topo_idx, operator)
+        upstream_unique_full_op_ids = plan.get_upstream_unique_full_op_ids(unique_full_op_id)
         upstream_input_queues = {upstream_unique_full_op_id: input_queues[upstream_unique_full_op_id] for upstream_unique_full_op_id in upstream_unique_full_op_ids}
         return not self._any_queue_not_empty(upstream_input_queues)

@@ -192,8 +197,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
             unique_full_op_id = f"{topo_idx}-{operator.get_full_op_id()}"

             num_inputs = sum(len(input_queues[unique_full_op_id][source_unique_full_op_id]) for source_unique_full_op_id in source_unique_full_op_ids)
-            agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan,
-            join_op_not_ready = isinstance(operator, JoinOp) and not self._upstream_ops_finished(plan,
+            agg_op_not_ready = isinstance(operator, AggregateOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
+            join_op_not_ready = isinstance(operator, JoinOp) and not self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
             if num_inputs == 0 or agg_op_not_ready or join_op_not_ready:
                 continue

@@ -242,6 +247,18 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
             # update the progress manager
             self.progress_manager.incr(unique_full_op_id, num_inputs=1, num_outputs=num_outputs, total_cost=record_set.get_total_cost())

+            # if this is a join operator with no more inputs to process, then finish it
+            if isinstance(operator, JoinOp) and operator.how in ("left", "right", "outer"):
+                join_op_upstream_finished = self._upstream_ops_finished(plan, unique_full_op_id, input_queues)
+                join_input_queues_empty = all(len(inputs) == 0 for inputs in input_queues[unique_full_op_id].values())
+                if join_op_upstream_finished and join_input_queues_empty and not operator.finished:
+                    # process the join one last time with final=True to handle any left/right/outer join logic
+                    record_set, num_inputs_processed = operator([], [], final=True)
+                    records.extend(record_set.data_records)
+                    record_op_stats.extend(record_set.record_op_stats)
+                    num_outputs += sum(record._passed_operator for record in record_set.data_records)
+                    operator.set_finished()
+
             # update plan stats
             plan_stats.add_record_op_stats(unique_full_op_id, record_op_stats)
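Both single-threaded strategies gate aggregates and joins on the same condition: every upstream operator's queue must have drained. A toy version of that readiness check:

```python
# Toy readiness check in the spirit of _upstream_ops_finished: an
# aggregate/join may only fire once all upstream queues are empty.
def upstream_ops_finished(upstream_input_queues: dict[str, list]) -> bool:
    return all(len(queue) == 0 for queue in upstream_input_queues.values())

queues = {"0-scan": ["rec1"], "1-convert": []}
assert not upstream_ops_finished(queues)  # the scan still has pending output
queues["0-scan"].clear()
assert upstream_ops_finished(queues)
```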
palimpzest/query/generators/generators.py
CHANGED

@@ -296,9 +296,9 @@ class Generator(Generic[ContextType, InputType]):

         return field_answers

-    def __call__(self, candidate: DataRecord, fields: dict[str, FieldInfo] | None, right_candidate: DataRecord | None = None, json_output: bool=True, **kwargs) -> GenerationOutput:
-        """Take the input record (`candidate`), generate the output `fields`, and return the generated output."""
-        logger.debug(f"Generating for candidate {candidate} with fields {fields}")
+    def __call__(self, candidate: DataRecord | list[DataRecord], fields: dict[str, FieldInfo] | None, right_candidate: DataRecord | None = None, json_output: bool=True, **kwargs) -> GenerationOutput:
+        """Take the input record(s) (`candidate`), generate the output `fields`, and return the generated output."""
+        logger.debug(f"Generating for candidate(s) {candidate} with fields {fields}")

         # fields can only be None if the user provides an answer parser
         fields_check = fields is not None or "parse_answer" in kwargs

@@ -338,7 +338,7 @@ class Generator(Generic[ContextType, InputType]):
         reasoning_effort = "minimal" if self.reasoning_effort is None else self.reasoning_effort
         completion_kwargs = {"reasoning_effort": reasoning_effort, **completion_kwargs}
         if self.model.is_vllm_model():
-            completion_kwargs = {"api_base": self.api_base, **completion_kwargs}
+            completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key"), **completion_kwargs}
         completion = litellm.completion(model=self.model_name, messages=messages, **completion_kwargs)
         end_time = time.time()
         logger.debug(f"Generated completion in {end_time - start_time:.2f} seconds")

@@ -405,15 +405,17 @@ class Generator(Generic[ContextType, InputType]):

         # pretty print prompt + full completion output for debugging
         completion_text = completion.choices[0].message.content
-        prompt = ""
+        prompt, system_prompt = "", ""
         for message in messages:
+            if message["role"] == "system":
+                system_prompt += message["content"] + "\n"
             if message["role"] == "user":
                 if message["type"] == "text":
                     prompt += message["content"] + "\n"
                 elif message["type"] == "image":
-                    prompt += "<image>\n"
+                    prompt += "<image>\n" * len(message["content"])
                 elif message["type"] == "input_audio":
-                    prompt += "<audio>\n"
+                    prompt += "<audio>\n" * len(message["content"])
         logger.debug(f"PROMPT:\n{prompt}")
         logger.debug(Fore.GREEN + f"{completion_text}\n" + Style.RESET_ALL)
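The vLLM branch now forwards an API key, defaulting to a placeholder because vLLM servers typically accept any key unless one is configured. A hedged sketch of the equivalent direct litellm call (the model name and URL are illustrative, not from the package):

```python
import os

import litellm

# Illustrative call to an OpenAI-compatible vLLM server via litellm; the
# "hosted_vllm/" prefix routes the request to the endpoint at api_base.
completion = litellm.completion(
    model="hosted_vllm/meta-llama/Llama-3.1-8B-Instruct",  # example model
    messages=[{"role": "user", "content": "Say hello."}],
    api_base="http://localhost:8000/v1",  # wherever the server listens
    api_key=os.environ.get("VLLM_API_KEY", "fake-api-key"),  # mirrors the diff
)
print(completion.choices[0].message.content)
```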
palimpzest/query/operators/__init__.py
CHANGED

@@ -2,6 +2,10 @@ from palimpzest.query.operators.aggregate import AggregateOp as _AggregateOp
 from palimpzest.query.operators.aggregate import ApplyGroupByOp as _ApplyGroupByOp
 from palimpzest.query.operators.aggregate import AverageAggregateOp as _AverageAggregateOp
 from palimpzest.query.operators.aggregate import CountAggregateOp as _CountAggregateOp
+from palimpzest.query.operators.aggregate import MaxAggregateOp as _MaxAggregateOp
+from palimpzest.query.operators.aggregate import MinAggregateOp as _MinAggregateOp
+from palimpzest.query.operators.aggregate import SemanticAggregate as _SemanticAggregate
+from palimpzest.query.operators.aggregate import SumAggregateOp as _SumAggregateOp
 from palimpzest.query.operators.convert import ConvertOp as _ConvertOp
 from palimpzest.query.operators.convert import LLMConvert as _LLMConvert
 from palimpzest.query.operators.convert import LLMConvertBonded as _LLMConvertBonded

@@ -47,7 +51,7 @@ from palimpzest.query.operators.logical import (
     Project as _Project,
 )
 from palimpzest.query.operators.logical import (
-
+    TopKScan as _TopKScan,
 )
 from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert as _MixtureOfAgentsConvert
 from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsFilter as _MixtureOfAgentsFilter

@@ -55,11 +59,11 @@ from palimpzest.query.operators.physical import PhysicalOperator as _PhysicalOpe
 from palimpzest.query.operators.project import ProjectOp as _ProjectOp
 from palimpzest.query.operators.rag import RAGConvert as _RAGConvert
 from palimpzest.query.operators.rag import RAGFilter as _RAGFilter
-from palimpzest.query.operators.retrieve import RetrieveOp as _RetrieveOp
 from palimpzest.query.operators.scan import MarshalAndScanDataOp as _MarshalAndScanDataOp
 from palimpzest.query.operators.scan import ScanPhysicalOp as _ScanPhysicalOp
 from palimpzest.query.operators.split import SplitConvert as _SplitConvert
 from palimpzest.query.operators.split import SplitFilter as _SplitFilter
+from palimpzest.query.operators.topk import TopKOp as _TopKOp

 LOGICAL_OPERATORS = [
     _LogicalOperator,

@@ -72,12 +76,12 @@ LOGICAL_OPERATORS = [
     _LogicalJoinOp,
     _LimitScan,
     _Project,
-
+    _TopKScan,
 ]

 PHYSICAL_OPERATORS = (
     # aggregate
-    [_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp]
+    [_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp, _MaxAggregateOp, _MinAggregateOp, _SemanticAggregate, _SumAggregateOp]
     # convert
     + [_ConvertOp, _NonLLMConvert, _LLMConvert, _LLMConvertBonded]
     # critique and refine

@@ -100,8 +104,8 @@ PHYSICAL_OPERATORS = (
     + [_ProjectOp]
     # rag
     + [_RAGConvert, _RAGFilter]
-    #
-    + [
+    # top-k
+    + [_TopKOp]
     # split
     + [_SplitConvert, _SplitFilter]
 )