palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
- palimpzest-0.7.0.dist-info/RECORD +96 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.3.dist-info/RECORD +0 -87
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
|
@@ -1,884 +0,0 @@
|
|
|
1
|
-
import time
|
|
2
|
-
from concurrent.futures import ThreadPoolExecutor, wait
|
|
3
|
-
from copy import deepcopy
|
|
4
|
-
|
|
5
|
-
import numpy as np
|
|
6
|
-
|
|
7
|
-
from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
|
|
8
|
-
from palimpzest.core.data.dataclasses import (
|
|
9
|
-
ExecutionStats,
|
|
10
|
-
OperatorCostEstimates,
|
|
11
|
-
OperatorStats,
|
|
12
|
-
PlanStats,
|
|
13
|
-
RecordOpStats,
|
|
14
|
-
)
|
|
15
|
-
from palimpzest.core.elements.records import DataRecordCollection, DataRecordSet
|
|
16
|
-
from palimpzest.policy import Policy
|
|
17
|
-
from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy
|
|
18
|
-
from palimpzest.query.execution.single_threaded_execution_strategy import SequentialSingleThreadExecutionStrategy
|
|
19
|
-
from palimpzest.query.operators.convert import ConvertOp, LLMConvert
|
|
20
|
-
from palimpzest.query.operators.filter import FilterOp, LLMFilter
|
|
21
|
-
from palimpzest.query.operators.physical import PhysicalOperator
|
|
22
|
-
from palimpzest.query.operators.retrieve import RetrieveOp
|
|
23
|
-
from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp
|
|
24
|
-
from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
|
|
25
|
-
from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
|
|
26
|
-
from palimpzest.query.optimizer.plan import SentinelPlan
|
|
27
|
-
from palimpzest.query.processor.query_processor import QueryProcessor
|
|
28
|
-
from palimpzest.sets import Set
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
class MABSentinelQueryProcessor(QueryProcessor):
|
|
32
|
-
"""
|
|
33
|
-
Specialized query processor that implements MAB sentinel strategy
|
|
34
|
-
for coordinating optimization and execution.
|
|
35
|
-
"""
|
|
36
|
-
def __init__(
|
|
37
|
-
self,
|
|
38
|
-
k: int,
|
|
39
|
-
j: int,
|
|
40
|
-
sample_budget: int,
|
|
41
|
-
early_stop_iters: int = 3,
|
|
42
|
-
use_final_op_quality: bool = False,
|
|
43
|
-
seed: int = 42,
|
|
44
|
-
*args,
|
|
45
|
-
**kwargs,
|
|
46
|
-
):
|
|
47
|
-
super().__init__(*args, **kwargs)
|
|
48
|
-
# self.max_workers = self.get_parallel_max_workers()
|
|
49
|
-
# TODO: undo
|
|
50
|
-
# self.max_workers = 4
|
|
51
|
-
self.k = k
|
|
52
|
-
self.j = j
|
|
53
|
-
self.sample_budget = sample_budget
|
|
54
|
-
self.early_stop_iters = early_stop_iters
|
|
55
|
-
self.use_final_op_quality = use_final_op_quality
|
|
56
|
-
self.pick_output_fn = self.pick_champion_output
|
|
57
|
-
self.rng = np.random.default_rng(seed=seed)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def update_frontier_ops(
|
|
61
|
-
self,
|
|
62
|
-
frontier_ops,
|
|
63
|
-
reservoir_ops,
|
|
64
|
-
policy,
|
|
65
|
-
all_outputs,
|
|
66
|
-
logical_op_id_to_num_samples,
|
|
67
|
-
phys_op_id_to_num_samples,
|
|
68
|
-
is_filter_op_dict,
|
|
69
|
-
):
|
|
70
|
-
"""
|
|
71
|
-
Update the set of frontier operators, pulling in new ones from the reservoir as needed.
|
|
72
|
-
This function will (for each op_set):
|
|
73
|
-
1. Compute the mean, LCB, and UCB for the cost, time, quality, and selectivity of each operator
|
|
74
|
-
2. Compute the pareto optimal set of operators (using the mean values)
|
|
75
|
-
3. Update the frontier and reservoir sets of operators based on their LCB/UCB overlap with the pareto frontier
|
|
76
|
-
"""
|
|
77
|
-
# compute metrics for each operator in all_outputs
|
|
78
|
-
logical_op_id_to_op_metrics = {}
|
|
79
|
-
for logical_op_id, source_idx_to_record_sets in all_outputs.items():
|
|
80
|
-
# compute selectivity for each physical operator
|
|
81
|
-
phys_op_to_num_inputs, phys_op_to_num_outputs = {}, {}
|
|
82
|
-
for _, record_sets in source_idx_to_record_sets.items():
|
|
83
|
-
for record_set in record_sets:
|
|
84
|
-
op_id = record_set.record_op_stats[0].op_id
|
|
85
|
-
num_outputs = sum([record_op_stats.passed_operator for record_op_stats in record_set.record_op_stats])
|
|
86
|
-
if op_id not in phys_op_to_num_inputs:
|
|
87
|
-
phys_op_to_num_inputs[op_id] = 1
|
|
88
|
-
phys_op_to_num_outputs[op_id] = num_outputs
|
|
89
|
-
else:
|
|
90
|
-
phys_op_to_num_inputs[op_id] += 1
|
|
91
|
-
phys_op_to_num_outputs[op_id] += num_outputs
|
|
92
|
-
|
|
93
|
-
phys_op_to_mean_selectivity = {
|
|
94
|
-
op_id: phys_op_to_num_outputs[op_id] / phys_op_to_num_inputs[op_id]
|
|
95
|
-
for op_id in phys_op_to_num_inputs
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
# compute average cost, time, and quality
|
|
99
|
-
phys_op_to_costs, phys_op_to_times, phys_op_to_qualities = {}, {}, {}
|
|
100
|
-
for _, record_sets in source_idx_to_record_sets.items():
|
|
101
|
-
for record_set in record_sets:
|
|
102
|
-
for record_op_stats in record_set.record_op_stats:
|
|
103
|
-
op_id = record_op_stats.op_id
|
|
104
|
-
cost = record_op_stats.cost_per_record
|
|
105
|
-
time = record_op_stats.time_per_record
|
|
106
|
-
quality = record_op_stats.quality
|
|
107
|
-
if op_id not in phys_op_to_costs:
|
|
108
|
-
phys_op_to_costs[op_id] = [cost]
|
|
109
|
-
phys_op_to_times[op_id] = [time]
|
|
110
|
-
phys_op_to_qualities[op_id] = [quality]
|
|
111
|
-
else:
|
|
112
|
-
phys_op_to_costs[op_id].append(cost)
|
|
113
|
-
phys_op_to_times[op_id].append(time)
|
|
114
|
-
phys_op_to_qualities[op_id].append(quality)
|
|
115
|
-
|
|
116
|
-
phys_op_to_mean_cost = {op: np.mean(costs) for op, costs in phys_op_to_costs.items()}
|
|
117
|
-
phys_op_to_mean_time = {op: np.mean(times) for op, times in phys_op_to_times.items()}
|
|
118
|
-
phys_op_to_mean_quality = {op: np.mean(qualities) for op, qualities in phys_op_to_qualities.items()}
|
|
119
|
-
|
|
120
|
-
# compute average, LCB, and UCB of each operator; the confidence bounds depend upon
|
|
121
|
-
# the computation of the alpha parameter, which we scale to be 0.5 * the mean (of means)
|
|
122
|
-
# of the metric across all operators in this operator set
|
|
123
|
-
cost_alpha = 0.5 * np.mean([mean_cost for mean_cost in phys_op_to_mean_cost.values()])
|
|
124
|
-
time_alpha = 0.5 * np.mean([mean_time for mean_time in phys_op_to_mean_time.values()])
|
|
125
|
-
quality_alpha = 0.5 * np.mean([mean_quality for mean_quality in phys_op_to_mean_quality.values()])
|
|
126
|
-
selectivity_alpha = 0.5 * np.mean([mean_selectivity for mean_selectivity in phys_op_to_mean_selectivity.values()])
|
|
127
|
-
|
|
128
|
-
op_metrics = {}
|
|
129
|
-
for op_id in phys_op_to_costs:
|
|
130
|
-
sample_ratio = np.sqrt(np.log(logical_op_id_to_num_samples[logical_op_id]) / phys_op_id_to_num_samples[op_id])
|
|
131
|
-
exploration_terms = np.array([cost_alpha * sample_ratio, time_alpha * sample_ratio, quality_alpha * sample_ratio, selectivity_alpha * sample_ratio])
|
|
132
|
-
mean_terms = (phys_op_to_mean_cost[op_id], phys_op_to_mean_time[op_id], phys_op_to_mean_quality[op_id], phys_op_to_mean_selectivity[op_id])
|
|
133
|
-
|
|
134
|
-
# NOTE: we could clip these; however I will not do so for now to allow for arbitrary quality metric(s)
|
|
135
|
-
lcb_terms = mean_terms - exploration_terms
|
|
136
|
-
ucb_terms = mean_terms + exploration_terms
|
|
137
|
-
op_metrics[op_id] = {"mean": mean_terms, "lcb": lcb_terms, "ucb": ucb_terms}
|
|
138
|
-
|
|
139
|
-
# store average metrics for each operator in the op_set
|
|
140
|
-
logical_op_id_to_op_metrics[logical_op_id] = op_metrics
|
|
141
|
-
|
|
142
|
-
# get the tuple representation of this policy
|
|
143
|
-
policy_dict = policy.get_dict()
|
|
144
|
-
|
|
145
|
-
# compute the pareto optimal set of operators for each logical_op_id
|
|
146
|
-
pareto_op_sets = {}
|
|
147
|
-
for logical_op_id, op_metrics in logical_op_id_to_op_metrics.items():
|
|
148
|
-
pareto_op_sets[logical_op_id] = set()
|
|
149
|
-
for op_id, metrics in op_metrics.items():
|
|
150
|
-
cost, time, quality, selectivity = metrics["mean"]
|
|
151
|
-
pareto_frontier = True
|
|
152
|
-
|
|
153
|
-
# check if any other operator dominates op_id
|
|
154
|
-
for other_op_id, other_metrics in op_metrics.items():
|
|
155
|
-
other_cost, other_time, other_quality, other_selectivity = other_metrics["mean"]
|
|
156
|
-
if op_id == other_op_id:
|
|
157
|
-
continue
|
|
158
|
-
|
|
159
|
-
# if op_id is dominated by other_op_id, set pareto_frontier = False and break
|
|
160
|
-
# NOTE: here we use a strict inequality (instead of the usual <= or >=) because
|
|
161
|
-
# all ops which have equal cost / time / quality / sel. should not be
|
|
162
|
-
# filtered out from sampling by our logic in this function
|
|
163
|
-
cost_dominated = True if policy_dict["cost"] == 0.0 else other_cost < cost
|
|
164
|
-
time_dominated = True if policy_dict["time"] == 0.0 else other_time < time
|
|
165
|
-
quality_dominated = True if policy_dict["quality"] == 0.0 else other_quality > quality
|
|
166
|
-
selectivity_dominated = True if not is_filter_op_dict[logical_op_id] else other_selectivity < selectivity
|
|
167
|
-
if cost_dominated and time_dominated and quality_dominated and selectivity_dominated:
|
|
168
|
-
pareto_frontier = False
|
|
169
|
-
break
|
|
170
|
-
|
|
171
|
-
# add op_id to pareto frontier if it's not dominated
|
|
172
|
-
if pareto_frontier:
|
|
173
|
-
pareto_op_sets[logical_op_id].add(op_id)
|
|
174
|
-
|
|
175
|
-
# iterate over frontier ops and replace any which do not overlap with pareto frontier
|
|
176
|
-
new_frontier_ops = {logical_op_id: [] for logical_op_id in frontier_ops}
|
|
177
|
-
new_reservoir_ops = {logical_op_id: [] for logical_op_id in reservoir_ops}
|
|
178
|
-
for logical_op_id, pareto_op_set in pareto_op_sets.items():
|
|
179
|
-
num_dropped_from_frontier = 0
|
|
180
|
-
for op, next_shuffled_sample_idx, new_operator, fully_sampled in frontier_ops[logical_op_id]:
|
|
181
|
-
op_id = op.get_op_id()
|
|
182
|
-
|
|
183
|
-
# if this op is fully sampled, remove it from the frontier
|
|
184
|
-
if fully_sampled:
|
|
185
|
-
num_dropped_from_frontier += 1
|
|
186
|
-
continue
|
|
187
|
-
|
|
188
|
-
# if this op is pareto optimal keep it in our frontier ops
|
|
189
|
-
if op_id in pareto_op_set:
|
|
190
|
-
new_frontier_ops[logical_op_id].append((op, next_shuffled_sample_idx, new_operator, fully_sampled))
|
|
191
|
-
continue
|
|
192
|
-
|
|
193
|
-
# otherwise, if this op overlaps with an op on the pareto frontier, keep it in our frontier ops
|
|
194
|
-
# NOTE: for now, we perform an optimistic comparison with the ucb/lcb
|
|
195
|
-
pareto_frontier = True
|
|
196
|
-
op_cost = logical_op_id_to_op_metrics[logical_op_id][op_id]["lcb"][0]
|
|
197
|
-
op_time = logical_op_id_to_op_metrics[logical_op_id][op_id]["lcb"][1]
|
|
198
|
-
op_quality = logical_op_id_to_op_metrics[logical_op_id][op_id]["ucb"][2]
|
|
199
|
-
op_selectivity = logical_op_id_to_op_metrics[logical_op_id][op_id]["lcb"][3]
|
|
200
|
-
for pareto_op_id in pareto_op_set:
|
|
201
|
-
pareto_cost = logical_op_id_to_op_metrics[logical_op_id][pareto_op_id]["ucb"][0]
|
|
202
|
-
pareto_time = logical_op_id_to_op_metrics[logical_op_id][pareto_op_id]["ucb"][1]
|
|
203
|
-
pareto_quality = logical_op_id_to_op_metrics[logical_op_id][pareto_op_id]["lcb"][2]
|
|
204
|
-
pareto_selectivity = logical_op_id_to_op_metrics[logical_op_id][pareto_op_id]["ucb"][3]
|
|
205
|
-
|
|
206
|
-
# if op_id is dominated by pareto_op_id, set pareto_frontier = False and break
|
|
207
|
-
cost_dominated = True if policy_dict["cost"] == 0.0 else pareto_cost <= op_cost
|
|
208
|
-
time_dominated = True if policy_dict["time"] == 0.0 else pareto_time <= op_time
|
|
209
|
-
quality_dominated = True if policy_dict["quality"] == 0.0 else pareto_quality >= op_quality
|
|
210
|
-
selectivity_dominated = True if not is_filter_op_dict[logical_op_id] else pareto_selectivity <= op_selectivity
|
|
211
|
-
if cost_dominated and time_dominated and quality_dominated and selectivity_dominated:
|
|
212
|
-
pareto_frontier = False
|
|
213
|
-
break
|
|
214
|
-
|
|
215
|
-
# add op_id to pareto frontier if it's not dominated
|
|
216
|
-
if pareto_frontier:
|
|
217
|
-
new_frontier_ops[logical_op_id].append((op, next_shuffled_sample_idx, new_operator, fully_sampled))
|
|
218
|
-
else:
|
|
219
|
-
num_dropped_from_frontier += 1
|
|
220
|
-
|
|
221
|
-
# replace the ops dropped from the frontier with new ops from the reservoir
|
|
222
|
-
num_dropped_from_frontier = min(num_dropped_from_frontier, len(reservoir_ops[logical_op_id]))
|
|
223
|
-
for idx in range(num_dropped_from_frontier):
|
|
224
|
-
new_frontier_ops[logical_op_id].append((reservoir_ops[logical_op_id][idx], 0, True, False))
|
|
225
|
-
|
|
226
|
-
# update reservoir ops for this logical_op_id
|
|
227
|
-
new_reservoir_ops[logical_op_id] = reservoir_ops[logical_op_id][num_dropped_from_frontier:]
|
|
228
|
-
|
|
229
|
-
return new_frontier_ops, new_reservoir_ops
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def compute_quality(
|
|
233
|
-
self,
|
|
234
|
-
record_set: DataRecordSet,
|
|
235
|
-
expected_output: dict | None = None,
|
|
236
|
-
champion_record_set: DataRecordSet | None = None,
|
|
237
|
-
is_filter_op: bool = False,
|
|
238
|
-
is_convert_op: bool = False,
|
|
239
|
-
) -> DataRecordSet:
|
|
240
|
-
"""
|
|
241
|
-
Compute the quality for the given `record_set` by comparing it to the `expected_output`.
|
|
242
|
-
|
|
243
|
-
Update the record_set by assigning the quality to each entry in its record_op_stats and
|
|
244
|
-
returning the updated record_set.
|
|
245
|
-
"""
|
|
246
|
-
# compute whether we can only use the champion
|
|
247
|
-
only_using_champion = expected_output is None
|
|
248
|
-
|
|
249
|
-
# if this operation is a failed convert
|
|
250
|
-
if is_convert_op and len(record_set) == 0:
|
|
251
|
-
record_set.record_op_stats[0].quality = 0.0
|
|
252
|
-
|
|
253
|
-
# if this operation is a filter:
|
|
254
|
-
# - we assign a quality of 1.0 if the record is in the expected outputs and it passes this filter
|
|
255
|
-
# - we assign a quality of 0.0 if the record is in the expected outputs and it does NOT pass this filter
|
|
256
|
-
# - we assign a quality relative to the champion / ensemble output if the record is not in the expected outputs
|
|
257
|
-
# we cannot know for certain what the correct behavior is for a given filter on a record which is not in the output
|
|
258
|
-
# (unless it is the only filter in the plan), thus we only evaluate the filter based on its performance on
|
|
259
|
-
# records which are in the output
|
|
260
|
-
elif is_filter_op:
|
|
261
|
-
# NOTE:
|
|
262
|
-
# - we know that record_set.record_op_stats will contain a single entry for a filter op
|
|
263
|
-
# - if we are using the champion, then champion_record_set will also contain a single entry for a filter op
|
|
264
|
-
record_op_stats = record_set.record_op_stats[0]
|
|
265
|
-
if only_using_champion:
|
|
266
|
-
champion_record = champion_record_set[0]
|
|
267
|
-
record_op_stats.quality = int(record_op_stats.passed_operator == champion_record.passed_operator)
|
|
268
|
-
|
|
269
|
-
# - if we are using validation data, we may have multiple expected records in the expected_output for this source_idx,
|
|
270
|
-
# thus, if we can identify an exact match, we can use that to evaluate the filter's quality
|
|
271
|
-
# - if we are using validation data but we *cannot* find an exact match, then we will once again use the champion record set
|
|
272
|
-
else:
|
|
273
|
-
# compute number of matches between this record's computed fields and this expected record's outputs
|
|
274
|
-
found_match_in_output = False
|
|
275
|
-
labels_dict_lst = expected_output["labels"] if isinstance(expected_output["labels"], list) else [expected_output["labels"]]
|
|
276
|
-
for labels_dict in labels_dict_lst:
|
|
277
|
-
all_correct = True
|
|
278
|
-
for field, value in record_op_stats.record_state.items():
|
|
279
|
-
if value != labels_dict[field]:
|
|
280
|
-
all_correct = False
|
|
281
|
-
break
|
|
282
|
-
|
|
283
|
-
if all_correct:
|
|
284
|
-
found_match_in_output = True
|
|
285
|
-
break
|
|
286
|
-
|
|
287
|
-
if found_match_in_output:
|
|
288
|
-
record_op_stats.quality = int(record_op_stats.passed_operator)
|
|
289
|
-
else:
|
|
290
|
-
champion_record = champion_record_set[0]
|
|
291
|
-
record_op_stats.quality = int(record_op_stats.passed_operator == champion_record.passed_operator)
|
|
292
|
-
|
|
293
|
-
# if this is a successful convert operation
|
|
294
|
-
else:
|
|
295
|
-
# NOTE: the following computation assumes we do not project out computed values
|
|
296
|
-
# (and that the validation examples provide all computed fields); even if
|
|
297
|
-
# a user program does add projection, we can ignore the projection on the
|
|
298
|
-
# validation dataset and use the champion model (as opposed to the validation
|
|
299
|
-
# output) for scoring fields which have their values projected out
|
|
300
|
-
|
|
301
|
-
# create list of dictionaries of labels for each expected / champion output
|
|
302
|
-
labels_dict_lst = []
|
|
303
|
-
if only_using_champion:
|
|
304
|
-
for champion_record in champion_record_set:
|
|
305
|
-
labels_dict_lst.append(champion_record.to_dict())
|
|
306
|
-
else:
|
|
307
|
-
labels_dict_lst = (
|
|
308
|
-
expected_output["labels"]
|
|
309
|
-
if isinstance(expected_output["labels"], list)
|
|
310
|
-
else [expected_output["labels"]]
|
|
311
|
-
)
|
|
312
|
-
|
|
313
|
-
# GREEDY ALGORITHM
|
|
314
|
-
# for each record in the expected output, we look for the computed record which maximizes the quality metric;
|
|
315
|
-
# once we've identified that computed record we remove it from consideration for the next expected output
|
|
316
|
-
field_to_score_fn = {} if only_using_champion else expected_output["score_fn"]
|
|
317
|
-
for labels_dict in labels_dict_lst:
|
|
318
|
-
best_quality, best_record_op_stats = 0.0, None
|
|
319
|
-
for record_op_stats in record_set.record_op_stats:
|
|
320
|
-
# if we already assigned this record a quality, skip it
|
|
321
|
-
if record_op_stats.quality is not None:
|
|
322
|
-
continue
|
|
323
|
-
|
|
324
|
-
# compute number of matches between this record's computed fields and this expected record's outputs
|
|
325
|
-
total_quality = 0
|
|
326
|
-
for field in record_op_stats.generated_fields:
|
|
327
|
-
computed_value = record_op_stats.record_state.get(field, None)
|
|
328
|
-
expected_value = labels_dict[field]
|
|
329
|
-
|
|
330
|
-
# get the metric function for this field
|
|
331
|
-
score_fn = field_to_score_fn.get(field, "exact")
|
|
332
|
-
|
|
333
|
-
# compute exact match
|
|
334
|
-
if score_fn == "exact":
|
|
335
|
-
total_quality += int(computed_value == expected_value)
|
|
336
|
-
|
|
337
|
-
# compute UDF metric
|
|
338
|
-
elif callable(score_fn):
|
|
339
|
-
total_quality += score_fn(computed_value, expected_value)
|
|
340
|
-
|
|
341
|
-
# otherwise, throw an exception
|
|
342
|
-
else:
|
|
343
|
-
raise Exception(f"Unrecognized score_fn: {score_fn}")
|
|
344
|
-
|
|
345
|
-
# compute recall and update best seen so far
|
|
346
|
-
quality = total_quality / len(record_op_stats.generated_fields)
|
|
347
|
-
if quality > best_quality:
|
|
348
|
-
best_quality = quality
|
|
349
|
-
best_record_op_stats = record_op_stats
|
|
350
|
-
|
|
351
|
-
# set best_quality as quality for the best_record_op_stats
|
|
352
|
-
if best_record_op_stats is not None:
|
|
353
|
-
best_record_op_stats.quality = best_quality
|
|
354
|
-
|
|
355
|
-
# for any records which did not receive a quality, set it to 0.0 as these are unexpected extras
|
|
356
|
-
for record_op_stats in record_set.record_op_stats:
|
|
357
|
-
if record_op_stats.quality is None:
|
|
358
|
-
record_op_stats.quality = 0.0
|
|
359
|
-
|
|
360
|
-
return record_set
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
def score_quality(
|
|
364
|
-
self,
|
|
365
|
-
op_set: list[PhysicalOperator],
|
|
366
|
-
logical_op_id: str,
|
|
367
|
-
execution_data: dict[str, dict[str, list[DataRecordSet]]],
|
|
368
|
-
champion_outputs: dict[str, dict[str, DataRecordSet]],
|
|
369
|
-
expected_outputs: dict[str, dict],
|
|
370
|
-
) -> list[RecordOpStats]:
|
|
371
|
-
"""
|
|
372
|
-
NOTE: This approach to cost modeling does not work directly for aggregation queries;
|
|
373
|
-
for these queries, we would ask the user to provide validation data for the step immediately
|
|
374
|
-
before a final aggregation
|
|
375
|
-
|
|
376
|
-
NOTE: This function currently assumes that one-to-many converts do NOT create duplicate outputs.
|
|
377
|
-
This assumption would break if, for example, we extracted the breed of every dog in an image.
|
|
378
|
-
If there were two golden retrievers and a Bernedoodle in an image and we extracted:
|
|
379
|
-
|
|
380
|
-
{"image": "file1.png", "breed": "Golden Retriever"}
|
|
381
|
-
{"image": "file1.png", "breed": "Golden Retriever"}
|
|
382
|
-
{"image": "file1.png", "breed": "Bernedoodle"}
|
|
383
|
-
|
|
384
|
-
This function would currently give perfect accuracy to the following output:
|
|
385
|
-
|
|
386
|
-
{"image": "file1.png", "breed": "Golden Retriever"}
|
|
387
|
-
{"image": "file1.png", "breed": "Bernedoodle"}
|
|
388
|
-
|
|
389
|
-
Even though it is missing one of the golden retrievers.
|
|
390
|
-
"""
|
|
391
|
-
# extract information about the logical operation performed at this stage of the sentinel plan;
|
|
392
|
-
# NOTE: we can infer these fields from context clues, but in the long-term we should have a more
|
|
393
|
-
# principled way of getting these directly from attributes either stored in the sentinel_plan
|
|
394
|
-
# or in the PhysicalOperator
|
|
395
|
-
physical_op = op_set[0]
|
|
396
|
-
is_filter_op = isinstance(physical_op, FilterOp)
|
|
397
|
-
is_convert_op = isinstance(physical_op, ConvertOp)
|
|
398
|
-
is_perfect_quality_op = (
|
|
399
|
-
not isinstance(physical_op, LLMConvert)
|
|
400
|
-
and not isinstance(physical_op, LLMFilter)
|
|
401
|
-
and not isinstance(physical_op, RetrieveOp)
|
|
402
|
-
)
|
|
403
|
-
|
|
404
|
-
# pull out the execution data from this operator; place the upstream execution data in a new list
|
|
405
|
-
this_op_execution_data = execution_data[logical_op_id]
|
|
406
|
-
|
|
407
|
-
# compute quality of each output computed by this operator
|
|
408
|
-
for source_idx, record_sets in this_op_execution_data.items():
|
|
409
|
-
# NOTE
|
|
410
|
-
# source_idx is a particular input, for which we may have computed multiple output record_sets;
|
|
411
|
-
# each of these record_sets may contain more than one record (b/c one-to-many) and we have one
|
|
412
|
-
# record_set per operator in the op_set
|
|
413
|
-
|
|
414
|
-
# if this operation does not involve an LLM, every record_op_stats object gets perfect quality
|
|
415
|
-
if is_perfect_quality_op:
|
|
416
|
-
for record_set in record_sets:
|
|
417
|
-
for record_op_stats in record_set.record_op_stats:
|
|
418
|
-
record_op_stats.quality = 1.0
|
|
419
|
-
continue
|
|
420
|
-
|
|
421
|
-
# get the expected output for this source_idx if we have one
|
|
422
|
-
expected_output = (
|
|
423
|
-
expected_outputs[source_idx]
|
|
424
|
-
if expected_outputs is not None and source_idx in expected_outputs
|
|
425
|
-
else None
|
|
426
|
-
)
|
|
427
|
-
|
|
428
|
-
# extract champion output for this record set
|
|
429
|
-
champion_record_set = champion_outputs[logical_op_id][source_idx]
|
|
430
|
-
|
|
431
|
-
# for each record_set produced by an operation, compute its quality
|
|
432
|
-
for record_set in record_sets:
|
|
433
|
-
record_set = self.compute_quality(record_set, expected_output, champion_record_set, is_filter_op, is_convert_op)
|
|
434
|
-
|
|
435
|
-
# return the quality annotated record op stats
|
|
436
|
-
return execution_data
|
|
437
|
-
|
|
438
|
-
def pick_champion_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
|
|
439
|
-
# if there's only one operator in the set, we return its record_set
|
|
440
|
-
if len(op_set_record_sets) == 1:
|
|
441
|
-
record_set, _ = op_set_record_sets[0]
|
|
442
|
-
return record_set
|
|
443
|
-
|
|
444
|
-
# find the operator with the highest average quality and return its record_set
|
|
445
|
-
base_op_cost_est = OperatorCostEstimates(cardinality=1.0, cost_per_record=0.0, time_per_record=0.0, quality=1.0)
|
|
446
|
-
champion_record_set, champion_quality = None, -1.0
|
|
447
|
-
for record_set, op in op_set_record_sets:
|
|
448
|
-
op_cost_estimates = op.naive_cost_estimates(base_op_cost_est)
|
|
449
|
-
if op_cost_estimates.quality > champion_quality:
|
|
450
|
-
champion_record_set, champion_quality = record_set, op_cost_estimates.quality
|
|
451
|
-
|
|
452
|
-
return champion_record_set
|
|
453
|
-
|
|
454
|
-
def pick_ensemble_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
|
|
455
|
-
# if there's only one operator in the set, we return its record_set
|
|
456
|
-
if len(op_set_record_sets) == 1:
|
|
457
|
-
record_set, _ = op_set_record_sets[0]
|
|
458
|
-
return record_set
|
|
459
|
-
|
|
460
|
-
# NOTE: I don't like that this assumes the models are consistent in
|
|
461
|
-
# how they order their record outputs for one-to-many converts;
|
|
462
|
-
# eventually we can try out more robust schemes to account for
|
|
463
|
-
# differences in ordering
|
|
464
|
-
# aggregate records at each index in the response
|
|
465
|
-
idx_to_records = {}
|
|
466
|
-
for record_set, _ in op_set_record_sets:
|
|
467
|
-
for idx, record in enumerate(record_set):
|
|
468
|
-
if idx not in idx_to_records:
|
|
469
|
-
idx_to_records[idx] = [record]
|
|
470
|
-
else:
|
|
471
|
-
idx_to_records[idx].append(record)
|
|
472
|
-
|
|
473
|
-
# compute most common answer at each index
|
|
474
|
-
out_records = []
|
|
475
|
-
for idx in range(len(idx_to_records)):
|
|
476
|
-
records = idx_to_records[idx]
|
|
477
|
-
most_common_record = max(set(records), key=records.count)
|
|
478
|
-
out_records.append(most_common_record)
|
|
479
|
-
|
|
480
|
-
# create and return final DataRecordSet
|
|
481
|
-
return DataRecordSet(out_records, [])
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
def pick_highest_quality_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
|
|
485
|
-
# if there's only one operator in the set, we return its record_set
|
|
486
|
-
if len(op_set_record_sets) == 1:
|
|
487
|
-
record_set, _ = op_set_record_sets[0]
|
|
488
|
-
return record_set
|
|
489
|
-
|
|
490
|
-
# NOTE: I don't like that this assumes the models are consistent in
|
|
491
|
-
# how they order their record outputs for one-to-many converts;
|
|
492
|
-
# eventually we can try out more robust schemes to account for
|
|
493
|
-
# differences in ordering
|
|
494
|
-
# aggregate records at each index in the response
|
|
495
|
-
idx_to_records = {}
|
|
496
|
-
for record_set, _ in op_set_record_sets:
|
|
497
|
-
for idx in range(len(record_set)):
|
|
498
|
-
record, record_op_stats = record_set[idx], record_set.record_op_stats[idx]
|
|
499
|
-
if idx not in idx_to_records:
|
|
500
|
-
idx_to_records[idx] = [(record, record_op_stats)]
|
|
501
|
-
else:
|
|
502
|
-
idx_to_records[idx].append((record, record_op_stats))
|
|
503
|
-
|
|
504
|
-
# compute highest quality answer at each index
|
|
505
|
-
out_records = []
|
|
506
|
-
out_record_op_stats = []
|
|
507
|
-
for idx in range(len(idx_to_records)):
|
|
508
|
-
records_lst, record_op_stats_lst = zip(*idx_to_records[idx])
|
|
509
|
-
max_quality_record, max_quality = records_lst[0], record_op_stats_lst[0].quality
|
|
510
|
-
max_quality_stats = record_op_stats_lst[0]
|
|
511
|
-
for record, record_op_stats in zip(records_lst[1:], record_op_stats_lst[1:]):
|
|
512
|
-
record_quality = record_op_stats.quality
|
|
513
|
-
if record_quality > max_quality:
|
|
514
|
-
max_quality_record = record
|
|
515
|
-
max_quality = record_quality
|
|
516
|
-
max_quality_stats = record_op_stats
|
|
517
|
-
out_records.append(max_quality_record)
|
|
518
|
-
out_record_op_stats.append(max_quality_stats)
|
|
519
|
-
|
|
520
|
-
# create and return final DataRecordSet
|
|
521
|
-
return DataRecordSet(out_records, out_record_op_stats)
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
def execute_op_set(self, op_candidate_pairs):
    """
    Run every (operator, candidate) pair on a thread pool and group the results by
    the candidate's source_idx.

    Args:
        op_candidate_pairs: list of (operator, candidate) tuples to execute.

    Returns:
        A pair of dicts keyed by source_idx: the first maps to the list of all record
        sets computed for that candidate, the second to the single record set chosen
        by self.pick_output_fn.
    """
    # TODO: post-submission we will need to modify this to:
    # - submit all candidates for aggregate operators
    # - handle limits
    finished = []
    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
        # one future per (operator, candidate) pair
        pending = [
            executor.submit(PhysicalOperator.execute_op_wrapper, operator, candidate)
            for operator, candidate in op_candidate_pairs
        ]

        # poll until nothing is left, collecting results in completion order
        while pending:
            done, still_running = wait(pending, timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
            for future in done:
                record_set, operator, candidate = future.result()
                finished.append((record_set, operator, candidate))

            # wait() returns sets; keep the unfinished futures as a list for the next round
            pending = list(still_running)

    # build source_idx --> record sets (all operators) and source_idx --> champion record set
    all_record_sets, champion_record_sets = {}, {}
    for _, candidate in op_candidate_pairs:
        # every (record_set, operator) result that was computed for this candidate
        matches = [
            (record_set, operator)
            for record_set, operator, finished_candidate in finished
            if candidate == finished_candidate
        ]

        # the candidate carries the index of the source record it came from
        source_idx = candidate.source_idx

        # the champion is whichever record set the configured pick function prefers
        champion_record_sets[source_idx] = self.pick_output_fn(matches)
        all_record_sets[source_idx] = [record_set for record_set, _ in matches]

    return all_record_sets, champion_record_sets
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
def execute_sentinel_plan(self, plan: SentinelPlan, expected_outputs: dict[str, dict], policy: Policy):
    """
    Execute a sentinel plan by repeatedly sampling (operator, record) pairs until
    self.sample_budget samples have been drawn.

    Each pass of the outer loop walks the plan's operator sets in order. For every
    operator currently on the frontier of a set it picks the next sampled input(s),
    runs them through self.execute_op_set, records the resulting stats, and scores the
    outputs with self.score_quality. After all operator sets have run, the frontier and
    reservoir are refreshed with self.update_frontier_ops.

    Args:
        plan: the sentinel plan; iterated as (logical_op_id, logical_op_name, op_set) triples.
        expected_outputs: passed through to self.score_quality.
            NOTE(review): annotated dict[str, dict], but generate_sample_observations
            builds it with integer source indices as keys -- confirm the intended key type.
        policy: passed through to self.update_frontier_ops.

    Returns:
        (all_outputs, plan_stats): all_outputs is the value last returned by
        self.score_quality (or an empty dict if nothing was executed); plan_stats is the
        finalized PlanStats for this plan.
    """
    if self.verbose:
        print("----------------------")
        print(f"PLAN[{plan.plan_id}] (sentinel):")
        print(plan)
        print("---")

    plan_start_time = time.time()

    # initialize plan stats and operator stats
    plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
    for logical_op_id, logical_op_name, op_set in plan:
        # stringify each physical operator's id params, keyed by the operator's name
        op_set_details = {
            op.op_name(): {k: str(v) for k, v in op.get_id_params().items()}
            for op in op_set
        }
        plan_stats.operator_stats[logical_op_id] = OperatorStats(
            op_id=logical_op_id,
            op_name=logical_op_name,
            op_details=op_set_details,
        )

    # shuffle the indices of records to sample
    total_num_samples = len(self.val_datasource)
    shuffled_source_indices = [int(idx) for idx in np.arange(total_num_samples)]
    self.rng.shuffle(shuffled_source_indices)

    # sample k initial operators for each operator set; for each operator maintain a tuple of:
    # (operator, next_shuffled_sample_idx, new_operator, fully_sampled); new_operator is True
    # when an operator is added to the frontier, and fully_sampled is set once the operator
    # has reached the end of shuffled_source_indices
    frontier_ops, reservoir_ops = {}, {}
    for logical_op_id, _, op_set in plan:
        op_set_copy = [op for op in op_set]
        self.rng.shuffle(op_set_copy)
        k = min(self.k, len(op_set_copy))
        frontier_ops[logical_op_id] = [(op, 0, True, False) for op in op_set_copy[:k]]
        reservoir_ops[logical_op_id] = [op for op in op_set_copy[k:]]

    # create mapping from logical and physical op ids to the number of samples drawn
    logical_op_id_to_num_samples = {logical_op_id: 0 for logical_op_id, _, _ in plan}
    phys_op_id_to_num_samples = {op.get_op_id(): 0 for _, _, op_set in plan for op in op_set}
    # an operator set is classified as a filter based on its first operator only
    is_filter_op_dict = {
        logical_op_id: isinstance(op_set[0], FilterOp)
        for logical_op_id, _, op_set in plan
    }

    # NOTE: to maintain parity with our count of samples drawn in the random sampling execution,
    # for each logical_op_id, we count the number of (record, op) executions as the number of samples within that op_set;
    # the samples drawn is equal to the max of that number across all operator sets
    samples_drawn = 0
    all_outputs, champion_outputs = {}, {}
    while samples_drawn < self.sample_budget:
        # execute operator sets in sequence
        for op_idx, (logical_op_id, _, op_set) in enumerate(plan):
            prev_logical_op_id = plan.logical_op_ids[op_idx - 1] if op_idx > 0 else None
            prev_logical_op_is_filter = prev_logical_op_id is not None and is_filter_op_dict[prev_logical_op_id]

            # create list of tuples for (op, candidate) which we should execute
            op_candidate_pairs = []
            updated_frontier_ops_lst = []
            for op, next_shuffled_sample_idx, new_operator, fully_sampled in frontier_ops[logical_op_id]:
                # execute new operators on first j candidates, and previously sampled operators on one additional candidate
                j = min(self.j, len(shuffled_source_indices)) if new_operator else 1
                for j_idx in range(j):
                    candidates = []
                    if isinstance(op, (MarshalAndScanDataOp, CacheScanDataOp)):
                        # scan operators take the raw source index as their candidate and
                        # wrap around the shuffled indices instead of stopping at the end
                        source_idx = shuffled_source_indices[(next_shuffled_sample_idx + j_idx) % len(shuffled_source_indices)]
                        candidates = [source_idx]
                        logical_op_id_to_num_samples[logical_op_id] += 1
                        phys_op_id_to_num_samples[op.get_op_id()] += 1
                    else:
                        # non-scan operators stop once they reach the end of the shuffled indices
                        if next_shuffled_sample_idx + j_idx == len(shuffled_source_indices):
                            fully_sampled = True
                            break

                        # pick best output from all_outputs from previous logical operator
                        source_idx = shuffled_source_indices[next_shuffled_sample_idx + j_idx]
                        record_sets = all_outputs[prev_logical_op_id][source_idx]
                        all_source_record_sets = [(record_set, None) for record_set in record_sets]
                        max_quality_record_set = self.pick_highest_quality_output(all_source_record_sets)
                        # forward the records unless the previous operator set is a filter
                        # and the first record's stats say it did not pass
                        if (
                            not prev_logical_op_is_filter
                            or (
                                prev_logical_op_is_filter
                                and max_quality_record_set.record_op_stats[0].passed_operator
                            )
                        ):
                            candidates = [record for record in max_quality_record_set]

                        # increment number of samples drawn for this logical and physical op id; even if we get multiple
                        # candidates from the previous stage in the pipeline, we only count this as one sample
                        logical_op_id_to_num_samples[logical_op_id] += 1
                        phys_op_id_to_num_samples[op.get_op_id()] += 1

                    if len(candidates) > 0:
                        op_candidate_pairs.extend([(op, candidate) for candidate in candidates])

                # set new_operator = False and update next_shuffled_sample_idx
                # NOTE(review): the index advances by the full j even when the loop above broke
                # early, so it can end up past len(shuffled_source_indices); the equality check
                # above would then not trigger again -- presumably update_frontier_ops retires
                # fully_sampled operators, confirm
                updated_frontier_ops_lst.append((op, next_shuffled_sample_idx + j, False, fully_sampled))

            frontier_ops[logical_op_id] = updated_frontier_ops_lst

            # continue if op_candidate_pairs is an empty list, as this means all records have been filtered out
            if len(op_candidate_pairs) == 0:
                continue

            # run sampled operators on sampled candidates
            source_idx_to_record_sets, source_idx_to_champion_record_set = self.execute_op_set(op_candidate_pairs)

            # update all_outputs and champion_outputs dictionary
            if logical_op_id not in all_outputs:
                all_outputs[logical_op_id] = source_idx_to_record_sets
                champion_outputs[logical_op_id] = source_idx_to_champion_record_set
            else:
                for source_idx, record_sets in source_idx_to_record_sets.items():
                    if source_idx not in all_outputs[logical_op_id]:
                        all_outputs[logical_op_id][source_idx] = record_sets
                        champion_outputs[logical_op_id][source_idx] = source_idx_to_champion_record_set[source_idx]
                    else:
                        all_outputs[logical_op_id][source_idx].extend(record_sets)
                        # NOTE: short-term solution; in practice we can get multiple champion records from different
                        # sets of operators, so we should try to find a way to only take one
                        champion_outputs[logical_op_id][source_idx] = source_idx_to_champion_record_set[source_idx]

            # flatten lists of records and record_op_stats
            all_records, all_record_op_stats = [], []
            for _, record_sets in source_idx_to_record_sets.items():
                for record_set in record_sets:
                    all_records.extend(record_set.data_records)
                    all_record_op_stats.extend(record_set.record_op_stats)

            # update plan stats
            plan_stats.operator_stats[logical_op_id].add_record_op_stats(
                all_record_op_stats,
                source_op_id=prev_logical_op_id,
                plan_id=plan.plan_id,
            )

            # add records (which are not filtered) to the cache, if allowed
            # (currently a no-op: the cache write below is commented out)
            if not self.nocache:
                for record in all_records:
                    if getattr(record, "passed_operator", True):
                        # self.datadir.append_cache(logical_op_id, record)
                        pass

            # compute quality for each operator
            all_outputs = self.score_quality(
                op_set,
                logical_op_id,
                all_outputs,
                champion_outputs,
                expected_outputs,
            )

        # update the (pareto) frontier for each set of operators
        frontier_ops, reservoir_ops = self.update_frontier_ops(
            frontier_ops,
            reservoir_ops,
            policy,
            all_outputs,
            logical_op_id_to_num_samples,
            phys_op_id_to_num_samples,
            is_filter_op_dict,
        )

        # update the number of samples drawn to be the max across all logical operators
        samples_drawn = max(logical_op_id_to_num_samples.values())

    # if caching was allowed, close the cache
    # (currently a no-op: the cache close below is commented out)
    if not self.nocache:
        for _, _, _ in plan:
            # self.datadir.close_cache(logical_op_id)
            pass

    # finalize plan stats
    total_plan_time = time.time() - plan_start_time
    plan_stats.finalize(total_plan_time)

    return all_outputs, plan_stats
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
def generate_sample_observations(self, sentinel_plan: SentinelPlan, policy: Policy):
    """
    Produce the sample observation data consumed by the CostModel.

    The validation data source is read item by item to build the expected outputs,
    keyed by source index, and the sentinel plan is then executed against them.

    Args:
        sentinel_plan: the plan handed to self.execute_sentinel_plan.
        policy: the policy handed to self.execute_sentinel_plan.

    Returns:
        The (execution_data, plan_stats) pair returned by self.execute_sentinel_plan.
    """
    # TODO: make sure execute_op_set uses self.val_datasource
    # expected output for each source index, taken straight from the validation data
    expected_outputs = {
        source_idx: self.val_datasource[source_idx]
        for source_idx in range(len(self.val_datasource))
    }

    # run the sentinel plan and pass its results through unchanged
    execution_data, plan_stats = self.execute_sentinel_plan(sentinel_plan, expected_outputs, policy)
    return execution_data, plan_stats
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
def create_sentinel_plan(self, dataset: Set, policy: Policy) -> SentinelPlan:
    """
    Build the SentinelPlan for the given dataset.

    A clean copy of the optimizer is switched to the SENTINEL strategy and run on a
    deep copy of the dataset whose data source is replaced by the validation data
    source; the first plan it returns is used.

    Args:
        dataset: the dataset to plan for; it is deep-copied, not modified.
        policy: the policy passed to the optimizer.

    Returns:
        The first plan returned by the optimizer.
    """
    # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up

    # work on a fresh optimizer configured for sentinel optimization
    sentinel_optimizer = self.optimizer.deepcopy_clean()
    sentinel_optimizer.update_strategy(OptimizationStrategyType.SENTINEL)

    # point a private copy of the dataset at the validation data source
    validation_dataset = deepcopy(dataset)
    validation_dataset._set_data_source(self.val_datasource)

    # optimize and keep the first plan
    return sentinel_optimizer.optimize(validation_dataset, policy)[0]
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
def execute(self) -> DataRecordCollection:
    """
    Run sentinel sampling, feed the observations to the optimizer's cost model, and
    then execute the dataset with that optimizer.

    Returns:
        A DataRecordCollection holding the records from the final execution and an
        ExecutionStats object covering both the sentinel run and the final run.

    Raises:
        Exception: if no validation data source is configured.
    """
    started_at = time.time()

    # for now, enforce that we are using validation data; we can relax this after paper submission
    if self.val_datasource is None:
        raise Exception("Make sure you are using validation data with MABSentinelExecutionEngine")

    # if nocache is True, make sure we do not re-use codegen examples
    # (placeholder: the clearing call is currently disabled)
    if self.nocache:
        # self.clear_cached_examples()
        pass

    # build the sentinel plan and gather sample execution data from it
    sentinel_plan = self.create_sentinel_plan(self.dataset, self.policy)
    sample_execution_data, sentinel_plan_stats = self.generate_sample_observations(sentinel_plan, self.policy)

    # start from a clean optimizer and give it a cost model built from the samples
    optimizer = self.optimizer.deepcopy_clean()
    optimizer.update_cost_model(SampleBasedCostModel(sentinel_plan, sample_execution_data, self.verbose))
    total_optimization_time = time.time() - started_at

    # execute plan(s) according to the optimization strategy
    records, final_plan_stats = self._execute_with_strategy(self.dataset, self.policy, optimizer)
    all_records = [*records]

    # sentinel stats come first, followed by the stats of the executed plan(s)
    all_plan_stats = [sentinel_plan_stats, *final_plan_stats]
    aggregate_plan_stats = self.aggregate_plan_stats(all_plan_stats)

    # add sentinel records and plan stats (if captured) to plan execution data
    execution_stats = ExecutionStats(
        execution_id=self.execution_id(),
        plan_stats=aggregate_plan_stats,
        total_optimization_time=total_optimization_time,
        total_execution_time=time.time() - started_at,
        total_execution_cost=sum(stats.total_plan_cost for stats in aggregate_plan_stats.values()),
        plan_strs={plan_id: stats.plan_str for plan_id, stats in aggregate_plan_stats.items()},
    )

    return DataRecordCollection(all_records, execution_stats=execution_stats)
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
class MABSentinelSequentialSingleThreadProcessor(MABSentinelQueryProcessor, SequentialSingleThreadExecutionStrategy):
    """
    This class performs sentinel execution while executing plans in a sequential, single-threaded fashion.
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the query-processor side first, then the execution-strategy side.

        All positional and keyword arguments are forwarded to
        MABSentinelQueryProcessor.__init__; the execution strategy is then configured
        from the attributes that initializer is expected to have set on the instance.
        """
        # Call the processor initializer explicitly with a single `self`. The previous
        # `super().__init__(self, ...)` passed the instance twice, so it was bound to the
        # initializer's first real parameter and every other argument shifted by one.
        MABSentinelQueryProcessor.__init__(self, *args, **kwargs)

        # the strategy reuses the settings stored on the instance by the call above
        SequentialSingleThreadExecutionStrategy.__init__(
            self,
            scan_start_idx=self.scan_start_idx,
            max_workers=self.max_workers,
            nocache=self.nocache,
            verbose=self.verbose
        )
        self.progress_manager = None
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
class MABSentinelPipelinedParallelProcessor(MABSentinelQueryProcessor, PipelinedParallelExecutionStrategy):
    """
    Sentinel query processor whose plans are executed in a pipelined, parallel fashion.
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the query-processor side with the given arguments, then configure
        the pipelined parallel execution strategy from attributes on the instance.
        """
        MABSentinelQueryProcessor.__init__(self, *args, **kwargs)

        # gather the strategy settings from the instance, then hand them over in one call
        strategy_settings = {
            "scan_start_idx": self.scan_start_idx,
            "max_workers": self.max_workers,
            "nocache": self.nocache,
            "verbose": self.verbose,
        }
        PipelinedParallelExecutionStrategy.__init__(self, **strategy_settings)
        self.progress_manager = None
|