palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
- palimpzest-0.7.0.dist-info/RECORD +96 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.3.dist-info/RECORD +0 -87
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
--- a/palimpzest/query/processor/random_sampling_sentinel_processor.py
+++ /dev/null
@@ -1,639 +0,0 @@
-import time
-from concurrent.futures import ThreadPoolExecutor, wait
-from copy import deepcopy
-
-import numpy as np
-
-from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
-from palimpzest.core.data.dataclasses import (
-    ExecutionStats,
-    OperatorCostEstimates,
-    OperatorStats,
-    PlanStats,
-    RecordOpStats,
-)
-from palimpzest.core.elements.records import DataRecordCollection, DataRecordSet
-from palimpzest.policy import Policy
-from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy
-from palimpzest.query.execution.single_threaded_execution_strategy import (
-    PipelinedSingleThreadExecutionStrategy,
-    SequentialSingleThreadExecutionStrategy,
-)
-from palimpzest.query.operators.convert import ConvertOp, LLMConvert
-from palimpzest.query.operators.filter import FilterOp, LLMFilter
-from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.query.operators.retrieve import RetrieveOp
-from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp
-from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
-from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
-from palimpzest.query.optimizer.plan import SentinelPlan
-from palimpzest.query.processor.query_processor import QueryProcessor
-from palimpzest.sets import Set
-
-
-class RandomSamplingSentinelQueryProcessor(QueryProcessor):
-    """
-
-    """
-    def __init__(
-        self,
-        k: int,
-        sample_budget: int,
-        sample_all_ops: bool = False,
-        sample_all_records: bool = False,
-        sample_start_idx: int | None = None,
-        sample_end_idx: int | None = None,
-        use_final_op_quality: bool = False,
-        seed: int = 42,
-        exp_name: str | None = None,
-        *args,
-        **kwargs,
-    ):
-        super().__init__(*args, **kwargs)
-        # self.max_workers = self.get_parallel_max_workers()
-        # TODO: undo
-        # self.max_workers = 1
-        self.k = k
-        self.sample_budget = sample_budget
-        self.j = int(sample_budget / k)
-        self.sample_all_ops = sample_all_ops
-        self.sample_all_records = sample_all_records
-        self.sample_start_idx = sample_start_idx
-        self.sample_end_idx = sample_end_idx
-        self.use_final_op_quality = use_final_op_quality
-        self.pick_output_fn = self.pick_ensemble_output
-        self.rng = np.random.default_rng(seed=seed)
-        self.exp_name = exp_name
-
-
-    def compute_quality(
-        self,
-        record_set: DataRecordSet,
-        expected_output: dict | None = None,
-        champion_record_set: DataRecordSet | None = None,
-        is_filter_op: bool = False,
-        is_convert_op: bool = False,
-    ) -> DataRecordSet:
-        """
-        Compute the quality for the given `record_set` by comparing it to the `expected_output`.
-
-        Update the record_set by assigning the quality to each entry in its record_op_stats and
-        returning the updated record_set.
-        """
-        # compute whether we can only use the champion
-        only_using_champion = expected_output is None
-
-        # if this operation is a failed convert
-        if is_convert_op and len(record_set) == 0:
-            record_set.record_op_stats[0].quality = 0.0
-
-        # if this operation is a filter:
-        # - we assign a quality of 1.0 if the record is in the expected outputs and it passes this filter
-        # - we assign a quality of 0.0 if the record is in the expected outputs and it does NOT pass this filter
-        # - we assign a quality relative to the champion / ensemble output if the record is not in the expected outputs
-        # we cannot know for certain what the correct behavior is a given filter on a record which is not in the output
-        # (unless it is the only filter in the plan), thus we only evaluate the filter based on its performance on
-        # records which are in the output
-        elif is_filter_op:
-            # NOTE:
-            # - we know that record_set.record_op_stats will contain a single entry for a filter op
-            # - if we are using the champion, then champion_record_set will also contain a single entry for a filter op
-            record_op_stats = record_set.record_op_stats[0]
-            if only_using_champion:
-                champion_record = champion_record_set[0]
-                record_op_stats.quality = int(record_op_stats.passed_operator == champion_record.passed_operator)
-
-            # - if we are using validation data, we may have multiple expected records in the expected_output for this source_idx,
-            #   thus, if we can identify an exact match, we can use that to evaluate the filter's quality
-            # - if we are using validation data but we *cannot* find an exact match, then we will once again use the champion record set
-            else:
-                # compute number of matches between this record's computed fields and this expected record's outputs
-                found_match_in_output = False
-                labels_dict_lst = expected_output["labels"] if isinstance(expected_output["labels"], list) else [expected_output["labels"]]
-                for labels_dict in labels_dict_lst:
-                    all_correct = True
-                    for field, value in record_op_stats.record_state.items():
-                        if value != labels_dict[field]:
-                            all_correct = False
-                            break
-
-                    if all_correct:
-                        found_match_in_output = True
-                        break
-
-                if found_match_in_output:
-                    record_op_stats.quality = int(record_op_stats.passed_operator)
-                else:
-                    champion_record = champion_record_set[0]
-                    record_op_stats.quality = int(record_op_stats.passed_operator == champion_record.passed_operator)
-
-        # if this is a successful convert operation
-        else:
-            # NOTE: the following computation assumes we do not project out computed values
-            # (and that the validation examples provide all computed fields); even if
-            # a user program does add projection, we can ignore the projection on the
-            # validation dataset and use the champion model (as opposed to the validation
-            # output) for scoring fields which have their values projected out
-
-            # create list of dictionaries of labels for each expected / champion output
-            labels_dict_lst = []
-            if only_using_champion:
-                for champion_record in champion_record_set:
-                    labels_dict_lst.append(champion_record.to_dict())
-            else:
-                labels_dict_lst = (
-                    expected_output["labels"]
-                    if isinstance(expected_output["labels"], list)
-                    else [expected_output["labels"]]
-                )
-
-            # GREEDY ALGORITHM
-            # for each record in the expected output, we look for the computed record which maximizes the quality metric;
-            # once we've identified that computed record we remove it from consideration for the next expected output
-            field_to_score_fn = {} if only_using_champion else expected_output["score_fn"]
-            for labels_dict in labels_dict_lst:
-                best_quality, best_record_op_stats = 0.0, None
-                for record_op_stats in record_set.record_op_stats:
-                    # if we already assigned this record a quality, skip it
-                    if record_op_stats.quality is not None:
-                        continue
-
-                    # compute number of matches between this record's computed fields and this expected record's outputs
-                    total_quality = 0
-                    for field in record_op_stats.generated_fields:
-                        computed_value = record_op_stats.record_state.get(field, None)
-                        expected_value = labels_dict[field]
-
-                        # get the metric function for this field
-                        score_fn = field_to_score_fn.get(field, "exact")
-
-                        # compute exact match
-                        if score_fn == "exact":
-                            total_quality += int(computed_value == expected_value)
-
-                        # compute UDF metric
-                        elif callable(score_fn):
-                            total_quality += score_fn(computed_value, expected_value)
-
-                        # otherwise, throw an exception
-                        else:
-                            raise Exception(f"Unrecognized score_fn: {score_fn}")
-
-                    # compute recall and update best seen so far
-                    quality = total_quality / len(record_op_stats.generated_fields)
-                    if quality > best_quality:
-                        best_quality = quality
-                        best_record_op_stats = record_op_stats
-
-                # set best_quality as quality for the best_record_op_stats
-                if best_record_op_stats is not None:
-                    best_record_op_stats.quality = best_quality
-
-            # for any records which did not receive a quality, set it to 0.0 as these are unexpected extras
-            for record_op_stats in record_set.record_op_stats:
-                if record_op_stats.quality is None:
-                    record_op_stats.quality = 0.0
-
-        return record_set
-
-
-    def score_quality(
-        self,
-        operator_sets: list[list[PhysicalOperator]],
-        execution_data: dict[str, dict[str, list[DataRecordSet]]],
-        champion_outputs: dict[str, dict[str, DataRecordSet]],
-        expected_outputs: dict[str, dict],
-    ) -> list[RecordOpStats]:
-        """
-        NOTE: This approach to cost modeling does not work directly for aggregation queries;
-        for these queries, we would ask the user to provide validation data for the step immediately
-        before a final aggregation
-
-        NOTE: This function currently assumes that one-to-many converts do NOT create duplicate outputs.
-        This assumption would break if, for example, we extracted the breed of every dog in an image.
-        If there were two golden retrievers and a bernoodle in an image and we extracted:
-
-        {"image": "file1.png", "breed": "Golden Retriever"}
-        {"image": "file1.png", "breed": "Golden Retriever"}
-        {"image": "file1.png", "breed": "Bernedoodle"}
-
-        This function would currently give perfect accuracy to the following output:
-
-        {"image": "file1.png", "breed": "Golden Retriever"}
-        {"image": "file1.png", "breed": "Bernedoodle"}
-
-        Even though it is missing one of the golden retrievers.
-        """
-        # extract information about the logical operation performed at this stage of the sentinel plan;
-        # NOTE: we can infer these fields from context clues, but in the long-term we should have a more
-        # principled way of getting these directly from attributes either stored in the sentinel_plan
-        # or in the PhysicalOperator
-        op_set = operator_sets[-1]
-        physical_op = op_set[0]
-        is_source_op = isinstance(physical_op, (MarshalAndScanDataOp, CacheScanDataOp))
-        is_filter_op = isinstance(physical_op, FilterOp)
-        is_convert_op = isinstance(physical_op, ConvertOp)
-        is_perfect_quality_op = (
-            not isinstance(physical_op, LLMConvert)
-            and not isinstance(physical_op, LLMFilter)
-            and not isinstance(physical_op, RetrieveOp)
-        )
-        logical_op_id = physical_op.logical_op_id
-
-        # if this logical_op_id is not in the execution_data (because all upstream records were filtered), return
-        if logical_op_id not in execution_data:
-            return execution_data
-
-        # pull out the execution data from this operator; place the upstream execution data in a new list
-        this_op_execution_data = execution_data[logical_op_id]
-
-        # compute quality of each output computed by this operator
-        for source_idx, record_sets in this_op_execution_data.items():
-            # NOTE
-            # source_idx is a particular input, for which we may have computed multiple output record_sets;
-            # each of these record_sets may contain more than one record (b/c one-to-many) and we have one
-            # record_set per operator in the op_set
-
-            # if this operation does not involve an LLM, every record_op_stats object gets perfect quality
-            if is_perfect_quality_op:
-                for record_set in record_sets:
-                    for record_op_stats in record_set.record_op_stats:
-                        record_op_stats.quality = 1.0
-                continue
-
-            # get the expected output for this source_idx if we have one
-            expected_output = (
-                expected_outputs[source_idx]
-                if expected_outputs is not None and source_idx in expected_outputs
-                else None
-            )
-
-            # extract champion output for this record set
-            champion_record_set = champion_outputs[logical_op_id][source_idx]
-
-            # for each record_set produced by an operation, compute its quality
-            for record_set in record_sets:
-                record_set = self.compute_quality(record_set, expected_output, champion_record_set, is_filter_op, is_convert_op)
-
-        # if this operator is a source op (i.e. has no input logical operator), return the execution data
-        if is_source_op:
-            return execution_data
-
-        # recursively call the function on the next logical operator until you reach a scan
-        execution_data = self.score_quality(operator_sets[:-1], execution_data, champion_outputs, expected_outputs)
-
-        # return the quality annotated record op stats
-        return execution_data
-
-    def pick_champion_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
-        # if there's only one operator in the set, we return its record_set
-        if len(op_set_record_sets) == 1:
-            record_set, _ = op_set_record_sets[0]
-            return record_set
-
-        # find the operator with the highest average quality and return its record_set
-        base_op_cost_est = OperatorCostEstimates(cardinality=1.0, cost_per_record=0.0, time_per_record=0.0, quality=1.0)
-        champion_record_set, champion_quality = None, -1.0
-        for record_set, op in op_set_record_sets:
-            op_cost_estimates = op.naive_cost_estimates(base_op_cost_est)
-            if op_cost_estimates.quality > champion_quality:
-                champion_record_set, champion_quality = record_set, op_cost_estimates.quality
-
-        return champion_record_set
-
-    def pick_ensemble_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
-        # if there's only one operator in the set, we return its record_set
-        if len(op_set_record_sets) == 1:
-            record_set, _ = op_set_record_sets[0]
-            return record_set
-
-        # NOTE: I don't like that this assumes the models are consistent in
-        # how they order their record outputs for one-to-many converts;
-        # eventually we can try out more robust schemes to account for
-        # differences in ordering
-        # aggregate records at each index in the response
-        idx_to_records = {}
-        for record_set, _ in op_set_record_sets:
-            for idx, record in enumerate(record_set):
-                if idx not in idx_to_records:
-                    idx_to_records[idx] = [record]
-                else:
-                    idx_to_records[idx].append(record)
-
-        # compute most common answer at each index
-        out_records = []
-        for idx in range(len(idx_to_records)):
-            records = idx_to_records[idx]
-            most_common_record = max(set(records), key=records.count)
-            out_records.append(most_common_record)
-
-        # create and return final DataRecordSet
-        return DataRecordSet(out_records, [])
-
-
-    def execute_op_set(self, candidates, op_set):
-        # TODO: post-submission we will need to modify this to:
-        # - submit all candidates for aggregate operators
-        # - handle limits
-        # create thread pool w/max workers and run futures over worker pool
-        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-            # create futures
-            futures = []
-            for candidate in candidates:
-                for operator in op_set:
-                    future = executor.submit(PhysicalOperator.execute_op_wrapper, operator, candidate)
-                    futures.append(future)
-
-            # compute output record_set for each (operator, candidate) pair
-            output_record_sets = []
-            while len(futures) > 0:
-                # get the set of futures that have (and have not) finished in the last PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
-                done_futures, not_done_futures = wait(futures, timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
-
-                # cast not_done_futures from a set to a list so we can append to it
-                not_done_futures = list(not_done_futures)
-
-                # process finished futures
-                for future in done_futures:
-                    # get the result and add it to the output records set
-                    record_set, operator, candidate = future.result()
-                    output_record_sets.append((record_set, operator, candidate))
-
-                # update list of futures
-                futures = not_done_futures
-
-        # compute mapping from source_idx to record sets for all operators and for champion operator
-        all_record_sets, champion_record_sets = {}, {}
-        for candidate in candidates:
-            candidate_output_record_sets = []
-            for record_set, operator, candidate_ in output_record_sets:
-                if candidate == candidate_:
-                    candidate_output_record_sets.append((record_set, operator))
-
-            # select the champion (i.e. best) record_set from all the record sets computed for this operator
-            champion_record_set = self.pick_output_fn(candidate_output_record_sets)
-
-            # get the source_idx associated with this input record
-            source_idx = candidate.source_idx
-
-            # add champion record_set to mapping from source_idx --> champion record_set
-            champion_record_sets[source_idx] = champion_record_set
-
-            # add all record_sets computed for this source_idx to mapping from source_idx --> record_sets
-            all_record_sets[source_idx] = [tup[0] for tup in candidate_output_record_sets]
-
-        return all_record_sets, champion_record_sets
-
-
-    def execute_sentinel_plan(self, plan: SentinelPlan, expected_outputs: dict[str, dict], policy: Policy):
-        """
-        """
-        if self.verbose:
-            print("----------------------")
-            print(f"PLAN[{plan.plan_id}] (sentinel):")
-            print(plan)
-            print("---")
-
-        plan_start_time = time.time()
-
-        # initialize plan stats and operator stats
-        plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
-        for logical_op_id, logical_op_name, op_set in plan:
-            op_set_details = {
-                op.op_name(): {k: str(v) for k, v in op.get_id_params().items()}
-                for op in op_set
-            }
-            plan_stats.operator_stats[logical_op_id] = OperatorStats(
-                op_id=logical_op_id,
-                op_name=logical_op_name,
-                op_details=op_set_details,
-            )
-
-        # sample validation records
-        total_num_samples = len(self.val_datasource)
-        source_indices = np.arange(total_num_samples)
-        if self.sample_start_idx is not None:
-            assert self.sample_end_idx is not None
-            source_indices = source_indices[self.sample_start_idx:self.sample_end_idx]
-        elif not self.sample_all_records:
-            self.rng.shuffle(source_indices)
-            j = min(self.j, len(source_indices))
-            source_indices = source_indices[:j]
-
-        # initialize output variables
-        all_outputs, champion_outputs = {}, {}
-
-        # create initial set of candidates for source scan operator
-        candidates = []
-        for source_idx in source_indices:
-            candidates.append(source_idx)
-
-        # NOTE: because we need to dynamically create sample matrices for each operator,
-        # sentinel execution must be executed one operator at a time (i.e. sequentially)
-        # execute operator sets in sequence
-        for op_idx, (logical_op_id, _, op_set) in enumerate(plan):
-            prev_logical_op_id = plan.logical_op_ids[op_idx - 1] if op_idx > 0 else None
-            next_logical_op_id = plan.logical_op_ids[op_idx + 1] if op_idx + 1 < len(plan) else None
-
-            # sample k optimizations
-            k = min(self.k, len(op_set)) if not self.sample_all_ops else len(op_set)
-            sampled_ops = self.rng.choice(op_set, size=k, replace=False)
-
-            # run sampled operators on sampled candidates
-            source_idx_to_record_sets, source_idx_to_champion_record_set = self.execute_op_set(candidates, sampled_ops)
-
-            # update all_outputs and champion_outputs dictionary
-            if logical_op_id not in all_outputs:
-                all_outputs[logical_op_id] = source_idx_to_record_sets
-                champion_outputs[logical_op_id] = source_idx_to_champion_record_set
-            else:
-                for source_idx, record_sets in source_idx_to_record_sets.items():
-                    if source_idx not in all_outputs[logical_op_id]:
-                        all_outputs[logical_op_id][source_idx] = record_sets
-                        champion_outputs[logical_op_id][source_idx] = source_idx_to_champion_record_set[source_idx]
-                    else:
-                        all_outputs[logical_op_id][source_idx].extend(record_sets)
-                        champion_outputs[logical_op_id][source_idx].extend(source_idx_to_champion_record_set[source_idx])
-
-            # flatten lists of records and record_op_stats
-            all_records, all_record_op_stats = [], []
-            for _, record_sets in source_idx_to_record_sets.items():
-                for record_set in record_sets:
-                    all_records.extend(record_set.data_records)
-                    all_record_op_stats.extend(record_set.record_op_stats)
-
-            # update plan stats
-            plan_stats.operator_stats[logical_op_id].add_record_op_stats(
-                all_record_op_stats,
-                source_op_id=prev_logical_op_id,
-                plan_id=plan.plan_id,
-            )
-
-            # add records (which are not filtered) to the cache, if allowed
-            if not self.nocache:
-                for record in all_records:
-                    if getattr(record, "passed_operator", True):
-                        # self.datadir.append_cache(logical_op_id, record)
-                        pass
-
-            # update candidates for next operator; we use champion outputs as input
-            candidates = []
-            if next_logical_op_id is not None:
-                for _, record_set in source_idx_to_champion_record_set.items():
-                    for record in record_set:
-                        if isinstance(op_set[0], FilterOp) and not record.passed_operator:
-                            continue
-                        candidates.append(record)
-
-            # if we've filtered out all records, terminate early
-            if next_logical_op_id is not None and candidates == []:
-                break
-
-        # compute quality for each operator
-        all_outputs = self.score_quality(plan.operator_sets, all_outputs, champion_outputs, expected_outputs)
-
-        # if caching was allowed, close the cache
-        if not self.nocache:
-            for _, _, _ in plan:
-                # self.datadir.close_cache(logical_op_id)
-                pass
-
-        # finalize plan stats
-        total_plan_time = time.time() - plan_start_time
-        plan_stats.finalize(total_plan_time)
-
-        return all_outputs, plan_stats
-
-
-    def generate_sample_observations(self, sentinel_plan: SentinelPlan, policy: Policy):
-        """
-        This function is responsible for generating sample observation data which can be
-        consumed by the CostModel.
-
-        To accomplish this, we construct a special sentinel plan using the Optimizer which is
-        capable of executing any valid physical implementation of a Filter or Convert operator
-        on each record.
-        """
-        # if we're using validation data, get the set of expected output records
-        expected_outputs = {}
-        for source_idx in range(len(self.val_datasource)):
-            # TODO: make sure execute_op_set uses self.val_datasource
-            expected_output = self.val_datasource[source_idx]
-            expected_outputs[source_idx] = expected_output
-
-        # run sentinel plan
-        execution_data, plan_stats = self.execute_sentinel_plan(sentinel_plan, expected_outputs, policy)
-
-        return execution_data, plan_stats
-
-
-    def create_sentinel_plan(self, dataset: Set, policy: Policy) -> SentinelPlan:
-        """
-        Generates and returns a SentinelPlan for the given dataset.
-        """
-        # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up
-
-        # create a new optimizer and update its strategy to SENTINEL
-        optimizer = self.optimizer.deepcopy_clean()
-        optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
-
-        # create copy of dataset, but change its data source to the validation data source
-        dataset = deepcopy(dataset)
-        dataset._set_data_source(self.val_datasource)
-
-        # get the sentinel plan for the given dataset
-        sentinel_plans = optimizer.optimize(dataset, policy)
-        sentinel_plan = sentinel_plans[0]
-
-        return sentinel_plan
-
-
-    def execute(self) -> DataRecordCollection:
-        execution_start_time = time.time()
-
-        # for now, enforce that we are using validation data; we can relax this after paper submission
-        if self.val_datasource is None:
-            raise Exception("Make sure you are using validation data with MABSentinelExecutionEngine")
-
-        # if nocache is True, make sure we do not re-use codegen examples
-        if self.nocache:
-            # self.clear_cached_examples()
-            pass
-
-        # create sentinel plan
-        sentinel_plan = self.create_sentinel_plan(self.dataset, self.policy)
-
-        # generate sample execution data
-        all_execution_data, plan_stats = self.generate_sample_observations(sentinel_plan, self.policy)
-
-        # put sentinel plan execution stats into list and prepare list of output records
-        all_plan_stats = [plan_stats]
-        all_records = []
-
-        # construct the CostModel with any sample execution data we've gathered
-        cost_model = SampleBasedCostModel(sentinel_plan, all_execution_data, self.verbose, self.exp_name)
-        optimizer = self.optimizer.deepcopy_clean()
-        optimizer.update_cost_model(cost_model)
-        total_optimization_time = time.time() - execution_start_time
-
-        # execute plan(s) according to the optimization strategy
-        records, plan_stats = self._execute_with_strategy(self.dataset, self.policy, optimizer)
-        all_records.extend(records)
-        all_plan_stats.extend(plan_stats)
-
-        # aggregate plan stats
-        aggregate_plan_stats = self.aggregate_plan_stats(all_plan_stats)
-
-        # add sentinel records and plan stats (if captured) to plan execution data
-        execution_stats = ExecutionStats(
-            execution_id=self.execution_id(),
-            plan_stats=aggregate_plan_stats,
-            total_optimization_time=total_optimization_time,
-            total_execution_time=time.time() - execution_start_time,
-            total_execution_cost=sum(list(map(lambda plan_stats: plan_stats.total_plan_cost, aggregate_plan_stats.values()))),
-            plan_strs={plan_id: plan_stats.plan_str for plan_id, plan_stats in aggregate_plan_stats.items()},
-        )
-
-        return DataRecordCollection(all_records, execution_stats=execution_stats)
-
-
-class RandomSamplingSentinelSequentialSingleThreadProcessor(RandomSamplingSentinelQueryProcessor, SequentialSingleThreadExecutionStrategy):
-    """
-    This class performs sentinel execution while executing plans in a sequential, single-threaded fashion.
-    """
-    def __init__(self, *args, **kwargs):
-        RandomSamplingSentinelQueryProcessor.__init__(self, *args, **kwargs)
-        SequentialSingleThreadExecutionStrategy.__init__(
-            self,
-            scan_start_idx=self.scan_start_idx,
-            max_workers=self.max_workers,
-            verbose=self.verbose
-        )
-
-
-class RandomSamplingSentinelPipelinedParallelProcessor(RandomSamplingSentinelQueryProcessor, PipelinedParallelExecutionStrategy):
-    """
-    This class performs sentinel execution while executing plans in a pipelined, parallel fashion.
-    """
-    def __init__(self, *args, **kwargs):
-        RandomSamplingSentinelQueryProcessor.__init__(self, *args, **kwargs)
-        PipelinedParallelExecutionStrategy.__init__(
-            self,
-            scan_start_idx=self.scan_start_idx,
-            max_workers=self.max_workers,
-            verbose=self.verbose
-        )
-
-
-class RandomSamplingSentinelPipelinedSingleThreadProcessor(RandomSamplingSentinelQueryProcessor, PipelinedSingleThreadExecutionStrategy):
-    """
-    This class performs sentinel execution while executing plans in a pipelined, parallel fashion.
-    """
-    def __init__(self, *args, **kwargs):
-        RandomSamplingSentinelQueryProcessor.__init__(self, *args, **kwargs)
-        PipelinedSingleThreadExecutionStrategy.__init__(
-            self,
-            scan_start_idx=self.scan_start_idx,
-            max_workers=self.max_workers,
-            verbose=self.verbose
-        )
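The `compute_quality` method in the removed processor scores a convert operator's outputs with a greedy matching pass: each expected label dict claims whichever still-unscored output record maximizes the per-field metric, and outputs that no label claims are treated as unexpected extras with quality 0.0. The following is a minimal standalone sketch of that matching logic, not code from the package: it uses plain dicts in place of palimpzest's `DataRecordSet`/`RecordOpStats` types, and the `greedy_quality` name and `score_fns` parameter are hypothetical stand-ins for the processor's `score_fn` plumbing.

```python
from typing import Callable


def greedy_quality(
    outputs: list[dict],
    labels: list[dict],
    score_fns: dict[str, Callable] | None = None,
) -> list[float]:
    """Return one quality score per output record, in input order.

    Each expected label dict greedily claims the unscored output that matches it
    best; outputs that no label claims are unexpected extras and score 0.0.
    `score_fns` optionally maps a field name to a custom metric; the default is
    exact match, mirroring the "exact" score_fn in the removed processor.
    """
    score_fns = score_fns or {}
    qualities: list[float | None] = [None] * len(outputs)

    for label in labels:
        best_idx, best_quality = None, 0.0
        for idx, output in enumerate(outputs):
            if qualities[idx] is not None:
                continue  # already claimed by an earlier expected record
            # average the per-field scores for this (output, label) pair
            per_field = [
                score_fns.get(field, lambda computed, expected: float(computed == expected))(
                    output.get(field), expected
                )
                for field, expected in label.items()
            ]
            quality = sum(per_field) / len(per_field)
            if quality > best_quality:
                best_idx, best_quality = idx, quality
        if best_idx is not None:
            qualities[best_idx] = best_quality

    # unmatched outputs are unexpected extras and score 0.0
    return [q if q is not None else 0.0 for q in qualities]


if __name__ == "__main__":
    outputs = [{"breed": "Golden Retriever"}, {"breed": "Bernedoodle"}]
    labels = [
        {"breed": "Golden Retriever"},
        {"breed": "Golden Retriever"},
        {"breed": "Bernedoodle"},
    ]
    # prints [1.0, 1.0]: the missing duplicate Golden Retriever is not penalized
    print(greedy_quality(outputs, labels))
```

This also illustrates the caveat in `score_quality`'s docstring: because matching is one-to-one and unmatched expected records are simply dropped, a one-to-many convert that omits a duplicate (the second Golden Retriever in the docstring's example) still receives perfect per-record quality.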