palimpzest 0.7.7__py3-none-any.whl → 0.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/constants.py +113 -75
- palimpzest/core/data/dataclasses.py +55 -38
- palimpzest/core/elements/index.py +5 -15
- palimpzest/core/elements/records.py +1 -1
- palimpzest/prompts/prompt_factory.py +1 -1
- palimpzest/query/execution/all_sample_execution_strategy.py +216 -0
- palimpzest/query/execution/execution_strategy.py +4 -4
- palimpzest/query/execution/execution_strategy_type.py +7 -1
- palimpzest/query/execution/mab_execution_strategy.py +184 -72
- palimpzest/query/execution/parallel_execution_strategy.py +182 -15
- palimpzest/query/execution/single_threaded_execution_strategy.py +21 -21
- palimpzest/query/generators/api_client_factory.py +6 -7
- palimpzest/query/generators/generators.py +5 -8
- palimpzest/query/operators/aggregate.py +4 -3
- palimpzest/query/operators/convert.py +1 -1
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/map.py +1 -1
- palimpzest/query/operators/physical.py +8 -4
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/retrieve.py +7 -23
- palimpzest/query/operators/scan.py +1 -1
- palimpzest/query/optimizer/cost_model.py +54 -62
- palimpzest/query/optimizer/optimizer.py +2 -6
- palimpzest/query/optimizer/plan.py +4 -4
- palimpzest/query/optimizer/primitives.py +1 -1
- palimpzest/query/optimizer/rules.py +8 -26
- palimpzest/query/optimizer/tasks.py +3 -3
- palimpzest/query/processor/processing_strategy_type.py +2 -2
- palimpzest/query/processor/sentinel_processor.py +0 -2
- palimpzest/sets.py +2 -3
- palimpzest/utils/generation_helpers.py +1 -1
- palimpzest/utils/model_helpers.py +27 -9
- palimpzest/utils/progress.py +81 -72
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/METADATA +4 -2
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/RECORD +39 -38
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/WHEEL +1 -1
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/top_level.txt +0 -0
|
@@ -23,7 +23,7 @@ class OpFrontier:
|
|
|
23
23
|
2. has been sampled fewer than j times
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
|
-
def __init__(self, op_set: list[PhysicalOperator], source_indices: list[int], k: int, j: int, seed: int, policy: Policy):
|
|
26
|
+
def __init__(self, op_set: list[PhysicalOperator], source_indices: list[int], k: int, j: int, seed: int, policy: Policy, priors: dict | None = None):
|
|
27
27
|
# set k and j, which are the initial number of operators in the frontier and the
|
|
28
28
|
# initial number of records to sample for each frontier operator
|
|
29
29
|
self.k = min(k, len(op_set))
|
|
@@ -32,19 +32,22 @@ class OpFrontier:
|
|
|
32
32
|
# store the policy that we are optimizing under
|
|
33
33
|
self.policy = policy
|
|
34
34
|
|
|
35
|
+
# store the prior beliefs on operator performance (if provided)
|
|
36
|
+
self.priors = priors
|
|
37
|
+
|
|
35
38
|
# get order in which we will sample physical operators for this logical operator
|
|
36
39
|
sample_op_indices = self._get_op_index_order(op_set, seed)
|
|
37
40
|
|
|
38
41
|
# construct the initial set of frontier and reservoir operators
|
|
39
42
|
self.frontier_ops = [op_set[sample_idx] for sample_idx in sample_op_indices[:self.k]]
|
|
40
43
|
self.reservoir_ops = [op_set[sample_idx] for sample_idx in sample_op_indices[self.k:]]
|
|
41
|
-
self.off_frontier_ops = []
|
|
44
|
+
self.off_frontier_ops: list[PhysicalOperator] = []
|
|
42
45
|
|
|
43
46
|
# store the order in which we will sample the source records
|
|
44
47
|
self.source_indices = source_indices
|
|
45
48
|
|
|
46
49
|
# keep track of the source ids processed by each physical operator
|
|
47
|
-
self.
|
|
50
|
+
self.full_op_id_to_sources_processed = {op.get_full_op_id(): set() for op in op_set}
|
|
48
51
|
|
|
49
52
|
# set the initial inputs for this logical operator
|
|
50
53
|
is_scan_op = isinstance(op_set[0], ScanPhysicalOp)
|
|
@@ -59,13 +62,122 @@ class OpFrontier:
|
|
|
59
62
|
"""
|
|
60
63
|
return self.frontier_ops
|
|
61
64
|
|
|
65
|
+
def _compute_op_id_to_pareto_distance(self, priors: dict[str, dict[str, float]]) -> dict[str, float]:
    """
    Return the l2-distance from each operator to the pareto frontier.

    Args:
        priors: mapping of op_id --> {"cost": cost, "time": time, "quality": quality}.

    Returns:
        Mapping of op_id --> distance to the closest pareto-optimal operator.
        Operators on the pareto frontier map to 0.0.
    """
    # get the dictionary representation of this policy; a metric whose weight
    # is 0.0 is ignored for both the dominance test and the distance
    policy_dict = self.policy.get_dict()
    use_cost = policy_dict["cost"] != 0.0
    use_time = policy_dict["time"] != 0.0
    use_quality = policy_dict["quality"] != 0.0

    def is_dominated(metrics: dict[str, float], other_metrics: dict[str, float]) -> bool:
        # NOTE: here we use a strict inequality (instead of the usual <= or >=) because
        #       all ops which have equal cost / time / quality should not be
        #       filtered out from sampling by our logic in this function
        return (
            (not use_cost or other_metrics["cost"] < metrics["cost"])
            and (not use_time or other_metrics["time"] < metrics["time"])
            and (not use_quality or other_metrics["quality"] > metrics["quality"])
        )

    def distance(metrics: dict[str, float], pareto_metrics: dict[str, float]) -> float:
        cost_dist_squared = (metrics["cost"] - pareto_metrics["cost"]) ** 2 if use_cost else 0.0
        time_dist_squared = (metrics["time"] - pareto_metrics["time"]) ** 2 if use_time else 0.0
        quality_dist_squared = (metrics["quality"] - pareto_metrics["quality"]) ** 2 if use_quality else 0.0
        return np.sqrt(cost_dist_squared + time_dist_squared + quality_dist_squared)

    # compute the pareto optimal set of operators: those not dominated by any other operator
    pareto_op_set = {
        op_id
        for op_id, metrics in priors.items()
        if not any(
            is_dominated(metrics, other_metrics)
            for other_op_id, other_metrics in priors.items()
            if other_op_id != op_id
        )
    }

    # compute the shortest distance from each operator to the pareto frontier
    op_id_to_pareto_distance = {}
    for op_id, metrics in priors.items():
        # set distance to 0.0 if this operator is on the pareto frontier
        if op_id in pareto_op_set:
            op_id_to_pareto_distance[op_id] = 0.0
            continue

        # otherwise, take the minimum distance to any pareto operator; the pareto set
        # is empty only when every policy weight is 0.0 (every operator then "dominates"
        # every other), in which case all metrics are ignored and the distance is 0.0
        op_id_to_pareto_distance[op_id] = min(
            (distance(metrics, priors[pareto_op_id]) for pareto_op_id in pareto_op_set),
            default=0.0,
        )

    return op_id_to_pareto_distance
|
|
124
|
+
|
|
62
125
|
def _get_op_index_order(self, op_set: list[PhysicalOperator], seed: int) -> list[int]:
    """
    Returns a list of indices for the operators in the op_set, in the order in which
    they should be sampled.

    If no priors were provided, or a prior is missing for any operator in op_set, the
    order is a random permutation seeded with `seed`. Otherwise operators are ordered by
    their l2-distance from the pareto frontier of the (normalized) priors, with ties
    broken by the policy's primary metric.

    Raises:
        ValueError: if the policy's primary metric is not "quality", "cost", or "time".
    """
    op_ids = [op.get_op_id() for op in op_set]

    if self.priors is None or any(op_id not in self.priors for op_id in op_ids):
        rng = np.random.default_rng(seed=seed)
        op_indices = np.arange(len(op_set))
        rng.shuffle(op_indices)
        # return a plain list so both branches honor the declared return type
        return op_indices.tolist()

    # NOTE: self.priors is a dictionary with format:
    # {op_id: {"quality": quality, "cost": cost, "time": time}}
    metrics = ("quality", "cost", "time")

    # compute mean and std. dev. for each field
    metric_to_values = {metric: [op_priors[metric] for op_priors in self.priors.values()] for metric in metrics}
    metric_to_mean = {metric: np.mean(values) for metric, values in metric_to_values.items()}
    metric_to_std = {metric: np.std(values) for metric, values in metric_to_values.items()}

    # normalize the scale of each field to be the same; this is done on a copy because
    # self.priors is supplied by the caller and must not be modified in place
    normalized_priors = {
        op_id: {
            metric: (
                metric_to_mean[metric]
                if metric_to_std[metric] == 0.0
                else (op_priors[metric] - metric_to_mean[metric]) / metric_to_std[metric]
            )
            for metric in metrics
        }
        for op_id, op_priors in self.priors.items()
    }

    # then, we compute the l2-distance from the pareto frontier for each operator
    op_id_to_distance = self._compute_op_id_to_pareto_distance(normalized_priors)

    # determine the metric used to break ties between operators at the same distance
    primary_metric = self.policy.get_primary_metric()
    if primary_metric not in metrics:
        raise ValueError(f"Unsupported primary metric for ordering operators: {primary_metric!r}")

    def sort_key(idx: int) -> tuple:
        op_id = op_ids[idx]
        value = normalized_priors[op_id][primary_metric]
        # invert quality so that an ascending sort puts the best operator first
        return (op_id_to_distance[op_id], -1 * value if primary_metric == "quality" else value)

    # sort based on distance from pareto frontier; break ties with performance on max / min metric;
    # sorting the indices directly (a stable sort) keeps operators with equal keys in op_set order
    return sorted(range(len(op_set)), key=sort_key)
|
|
70
182
|
|
|
71
183
|
def _get_op_source_idx_pairs(self) -> list[tuple[PhysicalOperator, int]]:
|
|
@@ -76,7 +188,7 @@ class OpFrontier:
|
|
|
76
188
|
op_source_idx_pairs = []
|
|
77
189
|
for op in self.frontier_ops:
|
|
78
190
|
# execute new operators on first j source indices, and previously sampled operators on one additional source_idx
|
|
79
|
-
num_processed = len(self.
|
|
191
|
+
num_processed = len(self.full_op_id_to_sources_processed[op.get_full_op_id()])
|
|
80
192
|
num_new_samples = 1 if num_processed > 0 else self.j
|
|
81
193
|
num_new_samples = min(num_new_samples, len(self.source_indices) - num_processed)
|
|
82
194
|
assert num_new_samples >= 0, "Number of new samples must be non-negative"
|
|
@@ -84,7 +196,7 @@ class OpFrontier:
|
|
|
84
196
|
# construct list of inputs by looking up the input for the given source_idx
|
|
85
197
|
samples_added = 0
|
|
86
198
|
for source_idx in self.source_indices:
|
|
87
|
-
if source_idx in self.
|
|
199
|
+
if source_idx in self.full_op_id_to_sources_processed[op.get_full_op_id()]:
|
|
88
200
|
continue
|
|
89
201
|
|
|
90
202
|
if samples_added == num_new_samples:
|
|
@@ -120,7 +232,7 @@ class OpFrontier:
|
|
|
120
232
|
for source_idx in unsampled_source_indices:
|
|
121
233
|
op_source_idx_pairs.append((max_quality_op, source_idx))
|
|
122
234
|
for op in self.frontier_ops:
|
|
123
|
-
if len(self.
|
|
235
|
+
if len(self.full_op_id_to_sources_processed[op.get_full_op_id()]) == 0 and op.get_full_op_id() != max_quality_op.get_full_op_id():
|
|
124
236
|
op_source_idx_pairs.append((op, source_idx))
|
|
125
237
|
|
|
126
238
|
# fetch the corresponding (op, input) pairs
|
|
@@ -143,10 +255,10 @@ class OpFrontier:
|
|
|
143
255
|
# NOTE: downstream operators may end up re-computing the same record_id with a diff. input as upstream
|
|
144
256
|
# upstream operators change; in this case, we de-duplicate record_op_stats with identical record_ids
|
|
145
257
|
# and keep the one with the maximum quality
|
|
146
|
-
# get a mapping from
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
for
|
|
258
|
+
# get a mapping from full_op_id --> list[RecordOpStats]
|
|
259
|
+
full_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(logical_op_id, {})
|
|
260
|
+
full_op_id_to_record_op_stats = {}
|
|
261
|
+
for full_op_id, op_stats in full_op_id_to_op_stats.items():
|
|
150
262
|
# skip over operators which have not been sampled
|
|
151
263
|
if len(op_stats.record_op_stats_lst) == 0:
|
|
152
264
|
continue
|
|
@@ -162,19 +274,19 @@ class OpFrontier:
|
|
|
162
274
|
record_id_to_max_quality_record_op_stats[record_id] = record_op_stats
|
|
163
275
|
|
|
164
276
|
# compute final list of record op stats
|
|
165
|
-
|
|
277
|
+
full_op_id_to_record_op_stats[full_op_id] = list(record_id_to_max_quality_record_op_stats.values())
|
|
166
278
|
|
|
167
279
|
# compute mapping of physical op to num samples and total samples drawn;
|
|
168
280
|
# also update the set of source indices which have been processed by each physical operator
|
|
169
|
-
|
|
170
|
-
for
|
|
171
|
-
# update
|
|
281
|
+
full_op_id_to_num_samples, total_num_samples = {}, 0
|
|
282
|
+
for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items():
|
|
283
|
+
# update the set of source indices processed
|
|
172
284
|
for record_op_stats in record_op_stats_lst:
|
|
173
|
-
self.
|
|
285
|
+
self.full_op_id_to_sources_processed[full_op_id].add(record_op_stats.record_source_idx)
|
|
174
286
|
|
|
175
287
|
# compute the number of samples as the number of source indices processed
|
|
176
|
-
num_samples = len(self.
|
|
177
|
-
|
|
288
|
+
num_samples = len(self.full_op_id_to_sources_processed[full_op_id])
|
|
289
|
+
full_op_id_to_num_samples[full_op_id] = num_samples
|
|
178
290
|
total_num_samples += num_samples
|
|
179
291
|
|
|
180
292
|
# compute avg. selectivity, cost, time, and quality for each physical operator
|
|
@@ -184,63 +296,63 @@ class OpFrontier:
|
|
|
184
296
|
def total_input(record_op_stats_lst):
|
|
185
297
|
return len(set([record_op_stats.record_parent_id for record_op_stats in record_op_stats_lst]))
|
|
186
298
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
for
|
|
299
|
+
full_op_id_to_mean_selectivity = {
|
|
300
|
+
full_op_id: total_output(record_op_stats_lst) / total_input(record_op_stats_lst)
|
|
301
|
+
for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items()
|
|
190
302
|
}
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
for
|
|
303
|
+
full_op_id_to_mean_cost = {
|
|
304
|
+
full_op_id: np.mean([record_op_stats.cost_per_record for record_op_stats in record_op_stats_lst])
|
|
305
|
+
for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items()
|
|
194
306
|
}
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
for
|
|
307
|
+
full_op_id_to_mean_time = {
|
|
308
|
+
full_op_id: np.mean([record_op_stats.time_per_record for record_op_stats in record_op_stats_lst])
|
|
309
|
+
for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items()
|
|
198
310
|
}
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
for
|
|
311
|
+
full_op_id_to_mean_quality = {
|
|
312
|
+
full_op_id: np.mean([record_op_stats.quality for record_op_stats in record_op_stats_lst])
|
|
313
|
+
for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items()
|
|
202
314
|
}
|
|
203
315
|
|
|
204
316
|
# # compute average, LCB, and UCB of each operator; the confidence bounds depend upon
|
|
205
317
|
# # the computation of the alpha parameter, which we scale to be 0.5 * the mean (of means)
|
|
206
318
|
# # of the metric across all operators in this operator set
|
|
207
|
-
# cost_alpha = 0.5 * np.mean([mean_cost for mean_cost in
|
|
208
|
-
# time_alpha = 0.5 * np.mean([mean_time for mean_time in
|
|
209
|
-
# quality_alpha = 0.5 * np.mean([mean_quality for mean_quality in
|
|
210
|
-
# selectivity_alpha = 0.5 * np.mean([mean_selectivity for mean_selectivity in
|
|
211
|
-
cost_alpha = 0.5 * (np.max(list(
|
|
212
|
-
time_alpha = 0.5 * (np.max(list(
|
|
213
|
-
quality_alpha = 0.5 * (np.max(list(
|
|
214
|
-
selectivity_alpha = 0.5 * (np.max(list(
|
|
319
|
+
# cost_alpha = 0.5 * np.mean([mean_cost for mean_cost in full_op_id_to_mean_cost.values()])
|
|
320
|
+
# time_alpha = 0.5 * np.mean([mean_time for mean_time in full_op_id_to_mean_time.values()])
|
|
321
|
+
# quality_alpha = 0.5 * np.mean([mean_quality for mean_quality in full_op_id_to_mean_quality.values()])
|
|
322
|
+
# selectivity_alpha = 0.5 * np.mean([mean_selectivity for mean_selectivity in full_op_id_to_mean_selectivity.values()])
|
|
323
|
+
cost_alpha = 0.5 * (np.max(list(full_op_id_to_mean_cost.values())) - np.min(list(full_op_id_to_mean_cost.values())))
|
|
324
|
+
time_alpha = 0.5 * (np.max(list(full_op_id_to_mean_time.values())) - np.min(list(full_op_id_to_mean_time.values())))
|
|
325
|
+
quality_alpha = 0.5 * (np.max(list(full_op_id_to_mean_quality.values())) - np.min(list(full_op_id_to_mean_quality.values())))
|
|
326
|
+
selectivity_alpha = 0.5 * (np.max(list(full_op_id_to_mean_selectivity.values())) - np.min(list(full_op_id_to_mean_selectivity.values())))
|
|
215
327
|
|
|
216
328
|
# compute metrics for each physical operator
|
|
217
329
|
op_metrics = {}
|
|
218
|
-
for
|
|
219
|
-
sample_ratio = np.sqrt(np.log(total_num_samples) /
|
|
330
|
+
for full_op_id in full_op_id_to_record_op_stats:
|
|
331
|
+
sample_ratio = np.sqrt(np.log(total_num_samples) / full_op_id_to_num_samples[full_op_id])
|
|
220
332
|
exploration_terms = np.array([cost_alpha * sample_ratio, time_alpha * sample_ratio, quality_alpha * sample_ratio, selectivity_alpha * sample_ratio])
|
|
221
|
-
mean_terms = (
|
|
333
|
+
mean_terms = (full_op_id_to_mean_cost[full_op_id], full_op_id_to_mean_time[full_op_id], full_op_id_to_mean_quality[full_op_id], full_op_id_to_mean_selectivity[full_op_id])
|
|
222
334
|
|
|
223
335
|
# NOTE: we could clip these; however I will not do so for now to allow for arbitrary quality metric(s)
|
|
224
336
|
lcb_terms = mean_terms - exploration_terms
|
|
225
337
|
ucb_terms = mean_terms + exploration_terms
|
|
226
|
-
op_metrics[
|
|
338
|
+
op_metrics[full_op_id] = {"mean": mean_terms, "lcb": lcb_terms, "ucb": ucb_terms}
|
|
227
339
|
|
|
228
340
|
# get the tuple representation of this policy
|
|
229
341
|
policy_dict = self.policy.get_dict()
|
|
230
342
|
|
|
231
343
|
# compute the pareto optimal set of operators
|
|
232
344
|
pareto_op_set = set()
|
|
233
|
-
for
|
|
345
|
+
for full_op_id, metrics in op_metrics.items():
|
|
234
346
|
cost, time, quality, selectivity = metrics["mean"]
|
|
235
347
|
pareto_frontier = True
|
|
236
348
|
|
|
237
|
-
# check if any other operator dominates
|
|
238
|
-
for
|
|
349
|
+
# check if any other operator dominates full_op_id
|
|
350
|
+
for other_full_op_id, other_metrics in op_metrics.items():
|
|
239
351
|
other_cost, other_time, other_quality, other_selectivity = other_metrics["mean"]
|
|
240
|
-
if
|
|
352
|
+
if full_op_id == other_full_op_id:
|
|
241
353
|
continue
|
|
242
354
|
|
|
243
|
-
# if
|
|
355
|
+
# if full_op_id is dominated by other_full_op_id, set pareto_frontier = False and break
|
|
244
356
|
# NOTE: here we use a strict inequality (instead of the usual <= or >=) because
|
|
245
357
|
# all ops which have equal cost / time / quality / sel. should not be
|
|
246
358
|
# filtered out from sampling by our logic in this function
|
|
@@ -252,21 +364,21 @@ class OpFrontier:
|
|
|
252
364
|
pareto_frontier = False
|
|
253
365
|
break
|
|
254
366
|
|
|
255
|
-
# add
|
|
367
|
+
# add full_op_id to pareto frontier if it's not dominated
|
|
256
368
|
if pareto_frontier:
|
|
257
|
-
pareto_op_set.add(
|
|
369
|
+
pareto_op_set.add(full_op_id)
|
|
258
370
|
|
|
259
371
|
# iterate over op metrics and compute the new frontier set of operators
|
|
260
|
-
|
|
261
|
-
for
|
|
372
|
+
new_frontier_full_op_ids = set()
|
|
373
|
+
for full_op_id, metrics in op_metrics.items():
|
|
262
374
|
|
|
263
375
|
# if this op is fully sampled, do not keep it on the frontier
|
|
264
|
-
if
|
|
376
|
+
if full_op_id_to_num_samples[full_op_id] == len(self.source_indices):
|
|
265
377
|
continue
|
|
266
378
|
|
|
267
379
|
# if this op is pareto optimal keep it in our frontier ops
|
|
268
|
-
if
|
|
269
|
-
|
|
380
|
+
if full_op_id in pareto_op_set:
|
|
381
|
+
new_frontier_full_op_ids.add(full_op_id)
|
|
270
382
|
continue
|
|
271
383
|
|
|
272
384
|
# otherwise, if this op overlaps with an op on the pareto frontier, keep it in our frontier ops
|
|
@@ -274,11 +386,11 @@ class OpFrontier:
|
|
|
274
386
|
pareto_frontier = True
|
|
275
387
|
op_cost, op_time, _, op_selectivity = metrics["lcb"]
|
|
276
388
|
op_quality = metrics["ucb"][2]
|
|
277
|
-
for
|
|
278
|
-
pareto_cost, pareto_time, _, pareto_selectivity = op_metrics[
|
|
279
|
-
pareto_quality = op_metrics[
|
|
389
|
+
for pareto_full_op_id in pareto_op_set:
|
|
390
|
+
pareto_cost, pareto_time, _, pareto_selectivity = op_metrics[pareto_full_op_id]["ucb"]
|
|
391
|
+
pareto_quality = op_metrics[pareto_full_op_id]["lcb"][2]
|
|
280
392
|
|
|
281
|
-
# if
|
|
393
|
+
# if full_op_id is dominated by pareto_full_op_id, set pareto_frontier = False and break
|
|
282
394
|
cost_dominated = True if policy_dict["cost"] == 0.0 else pareto_cost <= op_cost
|
|
283
395
|
time_dominated = True if policy_dict["time"] == 0.0 else pareto_time <= op_time
|
|
284
396
|
quality_dominated = True if policy_dict["quality"] == 0.0 else pareto_quality >= op_quality
|
|
@@ -287,15 +399,15 @@ class OpFrontier:
|
|
|
287
399
|
pareto_frontier = False
|
|
288
400
|
break
|
|
289
401
|
|
|
290
|
-
# add
|
|
402
|
+
# add full_op_id to pareto frontier if it's not dominated
|
|
291
403
|
if pareto_frontier:
|
|
292
|
-
|
|
404
|
+
new_frontier_full_op_ids.add(full_op_id)
|
|
293
405
|
|
|
294
406
|
# for operators that were in the frontier, keep them in the frontier if they
|
|
295
407
|
# are still pareto optimal, otherwise, move them to the end of the reservoir
|
|
296
408
|
new_frontier_ops = []
|
|
297
409
|
for op in self.frontier_ops:
|
|
298
|
-
if op.
|
|
410
|
+
if op.get_full_op_id() in new_frontier_full_op_ids:
|
|
299
411
|
new_frontier_ops.append(op)
|
|
300
412
|
else:
|
|
301
413
|
self.off_frontier_ops.append(op)
|
|
@@ -304,7 +416,7 @@ class OpFrontier:
|
|
|
304
416
|
# add them to the frontier, otherwise, put them back in the off_frontier_ops
|
|
305
417
|
new_off_frontier_ops = []
|
|
306
418
|
for op in self.off_frontier_ops:
|
|
307
|
-
if op.
|
|
419
|
+
if op.get_full_op_id() in new_frontier_full_op_ids:
|
|
308
420
|
new_frontier_ops.append(op)
|
|
309
421
|
else:
|
|
310
422
|
new_off_frontier_ops.append(op)
|
|
@@ -388,19 +500,19 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
|
|
|
388
500
|
# get the operators in the frontier set for this logical_op_id
|
|
389
501
|
frontier_ops = op_frontiers[logical_op_id].get_frontier_ops()
|
|
390
502
|
|
|
391
|
-
# get a mapping from
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
for
|
|
503
|
+
# get a mapping from full_op_id --> list[RecordOpStats]
|
|
504
|
+
full_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(logical_op_id, {})
|
|
505
|
+
full_op_id_to_record_op_stats = {
|
|
506
|
+
full_op_id: op_stats.record_op_stats_lst
|
|
507
|
+
for full_op_id, op_stats in full_op_id_to_op_stats.items()
|
|
396
508
|
}
|
|
397
509
|
|
|
398
510
|
# iterate over the frontier ops and return the one with the highest quality
|
|
399
511
|
max_quality_op, max_avg_quality = None, None
|
|
400
512
|
for op in frontier_ops:
|
|
401
513
|
op_quality_stats = []
|
|
402
|
-
if op.
|
|
403
|
-
op_quality_stats = [record_op_stats.quality for record_op_stats in
|
|
514
|
+
if op.get_full_op_id() in full_op_id_to_record_op_stats:
|
|
515
|
+
op_quality_stats = [record_op_stats.quality for record_op_stats in full_op_id_to_record_op_stats[op.get_full_op_id()]]
|
|
404
516
|
avg_op_quality = sum(op_quality_stats) / len(op_quality_stats) if len(op_quality_stats) > 0 else 0.0
|
|
405
517
|
if max_avg_quality is None or avg_op_quality > max_avg_quality:
|
|
406
518
|
max_quality_op = op
|
|
@@ -507,7 +619,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
|
|
|
507
619
|
|
|
508
620
|
# initialize frontier for each logical operator
|
|
509
621
|
op_frontiers = {
|
|
510
|
-
logical_op_id: OpFrontier(op_set, shuffled_source_indices, self.k, self.j, self.seed, self.policy)
|
|
622
|
+
logical_op_id: OpFrontier(op_set, shuffled_source_indices, self.k, self.j, self.seed, self.policy, self.priors)
|
|
511
623
|
for logical_op_id, op_set in plan
|
|
512
624
|
}
|
|
513
625
|
|