palimpzest 0.7.7__py3-none-any.whl → 0.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. palimpzest/constants.py +113 -75
  2. palimpzest/core/data/dataclasses.py +55 -38
  3. palimpzest/core/elements/index.py +5 -15
  4. palimpzest/core/elements/records.py +1 -1
  5. palimpzest/prompts/prompt_factory.py +1 -1
  6. palimpzest/query/execution/all_sample_execution_strategy.py +216 -0
  7. palimpzest/query/execution/execution_strategy.py +4 -4
  8. palimpzest/query/execution/execution_strategy_type.py +7 -1
  9. palimpzest/query/execution/mab_execution_strategy.py +184 -72
  10. palimpzest/query/execution/parallel_execution_strategy.py +182 -15
  11. palimpzest/query/execution/single_threaded_execution_strategy.py +21 -21
  12. palimpzest/query/generators/api_client_factory.py +6 -7
  13. palimpzest/query/generators/generators.py +5 -8
  14. palimpzest/query/operators/aggregate.py +4 -3
  15. palimpzest/query/operators/convert.py +1 -1
  16. palimpzest/query/operators/filter.py +1 -1
  17. palimpzest/query/operators/limit.py +1 -1
  18. palimpzest/query/operators/map.py +1 -1
  19. palimpzest/query/operators/physical.py +8 -4
  20. palimpzest/query/operators/project.py +1 -1
  21. palimpzest/query/operators/retrieve.py +7 -23
  22. palimpzest/query/operators/scan.py +1 -1
  23. palimpzest/query/optimizer/cost_model.py +54 -62
  24. palimpzest/query/optimizer/optimizer.py +2 -6
  25. palimpzest/query/optimizer/plan.py +4 -4
  26. palimpzest/query/optimizer/primitives.py +1 -1
  27. palimpzest/query/optimizer/rules.py +8 -26
  28. palimpzest/query/optimizer/tasks.py +3 -3
  29. palimpzest/query/processor/processing_strategy_type.py +2 -2
  30. palimpzest/query/processor/sentinel_processor.py +0 -2
  31. palimpzest/sets.py +2 -3
  32. palimpzest/utils/generation_helpers.py +1 -1
  33. palimpzest/utils/model_helpers.py +27 -9
  34. palimpzest/utils/progress.py +81 -72
  35. {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/METADATA +4 -2
  36. {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/RECORD +39 -38
  37. {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/WHEEL +1 -1
  38. {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/licenses/LICENSE +0 -0
  39. {palimpzest-0.7.7.dist-info → palimpzest-0.7.8.dist-info}/top_level.txt +0 -0
@@ -23,7 +23,7 @@ class OpFrontier:
23
23
  2. has been sampled fewer than j times
24
24
  """
25
25
 
26
- def __init__(self, op_set: list[PhysicalOperator], source_indices: list[int], k: int, j: int, seed: int, policy: Policy):
26
+ def __init__(self, op_set: list[PhysicalOperator], source_indices: list[int], k: int, j: int, seed: int, policy: Policy, priors: dict | None = None):
27
27
  # set k and j, which are the initial number of operators in the frontier and the
28
28
  # initial number of records to sample for each frontier operator
29
29
  self.k = min(k, len(op_set))
@@ -32,19 +32,22 @@ class OpFrontier:
32
32
  # store the policy that we are optimizing under
33
33
  self.policy = policy
34
34
 
35
+ # store the prior beliefs on operator performance (if provided)
36
+ self.priors = priors
37
+
35
38
  # get order in which we will sample physical operators for this logical operator
36
39
  sample_op_indices = self._get_op_index_order(op_set, seed)
37
40
 
38
41
  # construct the initial set of frontier and reservoir operators
39
42
  self.frontier_ops = [op_set[sample_idx] for sample_idx in sample_op_indices[:self.k]]
40
43
  self.reservoir_ops = [op_set[sample_idx] for sample_idx in sample_op_indices[self.k:]]
41
- self.off_frontier_ops = []
44
+ self.off_frontier_ops: list[PhysicalOperator] = []
42
45
 
43
46
  # store the order in which we will sample the source records
44
47
  self.source_indices = source_indices
45
48
 
46
49
  # keep track of the source ids processed by each physical operator
47
- self.phys_op_id_to_sources_processed = {op.get_op_id(): set() for op in op_set}
50
+ self.full_op_id_to_sources_processed = {op.get_full_op_id(): set() for op in op_set}
48
51
 
49
52
  # set the initial inputs for this logical operator
50
53
  is_scan_op = isinstance(op_set[0], ScanPhysicalOp)
@@ -59,13 +62,122 @@ class OpFrontier:
59
62
  """
60
63
  return self.frontier_ops
61
64
 
65
+ def _compute_op_id_to_pareto_distance(self, priors: dict[str, dict[str, float]]) -> dict[str, float]:
66
+ """
67
+ Return l2-distance for each operator from the pareto frontier.
68
+ """
69
+ # get the dictionary representation of this policy
70
+ policy_dict = self.policy.get_dict()
71
+
72
+ # compute the pareto optimal set of operators
73
+ pareto_op_set = set()
74
+ for op_id, metrics in priors.items():
75
+ cost, time, quality = metrics["cost"], metrics["time"], metrics["quality"]
76
+ pareto_frontier = True
77
+
78
+ # check if any other operator dominates op_id
79
+ for other_op_id, other_metrics in priors.items():
80
+ other_cost, other_time, other_quality = other_metrics["cost"], other_metrics["time"], other_metrics["quality"]
81
+ if op_id == other_op_id:
82
+ continue
83
+
84
+ # if op_id is dominated by other_op_id, set pareto_frontier = False and break
85
+ # NOTE: here we use a strict inequality (instead of the usual <= or >=) because
86
+ # all ops which have equal cost / time / quality / sel. should not be
87
+ # filtered out from sampling by our logic in this function
88
+ cost_dominated = True if policy_dict["cost"] == 0.0 else other_cost < cost
89
+ time_dominated = True if policy_dict["time"] == 0.0 else other_time < time
90
+ quality_dominated = True if policy_dict["quality"] == 0.0 else other_quality > quality
91
+ if cost_dominated and time_dominated and quality_dominated:
92
+ pareto_frontier = False
93
+ break
94
+
95
+ # add op_id to pareto frontier if it's not dominated
96
+ if pareto_frontier:
97
+ pareto_op_set.add(op_id)
98
+
99
+ # compute the shortest distance from each operator to the pareto frontier
100
+ op_id_to_pareto_distance = {}
101
+ for op_id, metrics in priors.items():
102
+ # set distance to 0.0 if this operator is on the pareto frontier
103
+ if op_id in pareto_op_set:
104
+ op_id_to_pareto_distance[op_id] = 0.0
105
+ continue
106
+
107
+ # otherwise, compute min_dist to pareto operators
108
+ min_dist = None
109
+ cost, time, quality = metrics["cost"], metrics["time"], metrics["quality"]
110
+ for pareto_op_id in pareto_op_set:
111
+ pareto_cost, pareto_time, pareto_quality = priors[pareto_op_id]["cost"], priors[pareto_op_id]["time"], priors[pareto_op_id]["quality"]
112
+
113
+ cost_dist_squared = 0.0 if policy_dict["cost"] == 0.0 else (cost - pareto_cost) ** 2
114
+ time_dist_squared = 0.0 if policy_dict["time"] == 0.0 else (time - pareto_time) ** 2
115
+ quality_dist_squared = 0.0 if policy_dict["quality"] == 0.0 else (quality - pareto_quality) ** 2
116
+ dist = np.sqrt(cost_dist_squared + time_dist_squared + quality_dist_squared)
117
+ if min_dist is None or dist < min_dist:
118
+ min_dist = dist
119
+
120
+ # set minimum distance for this operator
121
+ op_id_to_pareto_distance[op_id] = min_dist
122
+
123
+ return op_id_to_pareto_distance
124
+
62
125
  def _get_op_index_order(self, op_set: list[PhysicalOperator], seed: int) -> list[int]:
63
126
  """
64
127
  Returns a list of indices for the operators in the op_set.
65
128
  """
66
- rng = np.random.default_rng(seed=seed)
67
- op_indices = np.arange(len(op_set))
68
- rng.shuffle(op_indices)
129
+ if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
130
+ rng = np.random.default_rng(seed=seed)
131
+ op_indices = np.arange(len(op_set))
132
+ rng.shuffle(op_indices)
133
+ return op_indices
134
+
135
+ # NOTE: self.priors is a dictionary with format:
136
+ # {op_id: {"quality": quality, "cost": cost, "time": time}}
137
+
138
+ # compute mean and std. dev. for each field
139
+ qualities = [op_priors["quality"] for op_priors in self.priors.values()]
140
+ costs = [op_priors["cost"] for op_priors in self.priors.values()]
141
+ times = [op_priors["time"] for op_priors in self.priors.values()]
142
+ metric_to_mean = {"quality": np.mean(qualities), "cost": np.mean(costs), "time": np.mean(times)}
143
+ metric_to_std = {"quality": np.std(qualities), "cost": np.std(costs), "time": np.std(times)}
144
+
145
+ # normalize the scale of each field to be the same
146
+ for _, op_priors in self.priors.items():
147
+ for metric, value in op_priors.items():
148
+ if metric_to_std[metric] == 0.0:
149
+ op_priors[metric] = metric_to_mean[metric]
150
+ else:
151
+ op_priors[metric] = (value - metric_to_mean[metric]) / metric_to_std[metric]
152
+
153
+ # then, we compute the l2-distance from the pareto frontier for each operator
154
+ op_id_to_distance = self._compute_op_id_to_pareto_distance(self.priors)
155
+
156
+ # compute tuple for every operator, invert quality so ascending sort puts
157
+ # best operator first: (op_id, dist, -1 * quality, cost, time);
158
+ op_tuples = []
159
+ for op in op_set:
160
+ op_id = op.get_op_id()
161
+ op_priors = self.priors[op_id]
162
+ op_tuple = (op_id, op_id_to_distance[op_id], -1 * op_priors["quality"], op_priors["cost"], op_priors["time"])
163
+ op_tuples.append(op_tuple)
164
+
165
+ # sort tuples on distance, then second dim
166
+ second_dim_idx = None
167
+ if self.policy.get_primary_metric() == "quality":
168
+ second_dim_idx = 2
169
+ elif self.policy.get_primary_metric() == "cost":
170
+ second_dim_idx = 3
171
+ elif self.policy.get_primary_metric() == "time":
172
+ second_dim_idx = 4
173
+
174
+ # sort based on distance from pareto frontier; break ties with performance on max / min metric
175
+ op_tuples = sorted(op_tuples, key=lambda x: (x[1], x[second_dim_idx]))
176
+
177
+ # return final list of op indices in sample order
178
+ op_id_to_idx = {op.get_op_id(): idx for idx, op in enumerate(op_set)}
179
+ op_indices = [op_id_to_idx[op_tuple[0]] for op_tuple in op_tuples]
180
+
69
181
  return op_indices
70
182
 
71
183
  def _get_op_source_idx_pairs(self) -> list[tuple[PhysicalOperator, int]]:
@@ -76,7 +188,7 @@ class OpFrontier:
76
188
  op_source_idx_pairs = []
77
189
  for op in self.frontier_ops:
78
190
  # execute new operators on first j source indices, and previously sampled operators on one additional source_idx
79
- num_processed = len(self.phys_op_id_to_sources_processed[op.get_op_id()])
191
+ num_processed = len(self.full_op_id_to_sources_processed[op.get_full_op_id()])
80
192
  num_new_samples = 1 if num_processed > 0 else self.j
81
193
  num_new_samples = min(num_new_samples, len(self.source_indices) - num_processed)
82
194
  assert num_new_samples >= 0, "Number of new samples must be non-negative"
@@ -84,7 +196,7 @@ class OpFrontier:
84
196
  # construct list of inputs by looking up the input for the given source_idx
85
197
  samples_added = 0
86
198
  for source_idx in self.source_indices:
87
- if source_idx in self.phys_op_id_to_sources_processed[op.get_op_id()]:
199
+ if source_idx in self.full_op_id_to_sources_processed[op.get_full_op_id()]:
88
200
  continue
89
201
 
90
202
  if samples_added == num_new_samples:
@@ -120,7 +232,7 @@ class OpFrontier:
120
232
  for source_idx in unsampled_source_indices:
121
233
  op_source_idx_pairs.append((max_quality_op, source_idx))
122
234
  for op in self.frontier_ops:
123
- if len(self.phys_op_id_to_sources_processed[op.get_op_id()]) == 0 and op.get_op_id() != max_quality_op.get_op_id():
235
+ if len(self.full_op_id_to_sources_processed[op.get_full_op_id()]) == 0 and op.get_full_op_id() != max_quality_op.get_full_op_id():
124
236
  op_source_idx_pairs.append((op, source_idx))
125
237
 
126
238
  # fetch the corresponding (op, input) pairs
@@ -143,10 +255,10 @@ class OpFrontier:
143
255
  # NOTE: downstream operators may end up re-computing the same record_id with a diff. input as upstream
144
256
  # operators change; in this case, we de-duplicate record_op_stats with identical record_ids
145
257
  # and keep the one with the maximum quality
146
- # get a mapping from physical_op_id --> list[RecordOpStats]
147
- phys_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(logical_op_id, {})
148
- phys_op_id_to_record_op_stats = {}
149
- for phys_op_id, op_stats in phys_op_id_to_op_stats.items():
258
+ # get a mapping from full_op_id --> list[RecordOpStats]
259
+ full_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(logical_op_id, {})
260
+ full_op_id_to_record_op_stats = {}
261
+ for full_op_id, op_stats in full_op_id_to_op_stats.items():
150
262
  # skip over operators which have not been sampled
151
263
  if len(op_stats.record_op_stats_lst) == 0:
152
264
  continue
@@ -162,19 +274,19 @@ class OpFrontier:
162
274
  record_id_to_max_quality_record_op_stats[record_id] = record_op_stats
163
275
 
164
276
  # compute final list of record op stats
165
- phys_op_id_to_record_op_stats[phys_op_id] = list(record_id_to_max_quality_record_op_stats.values())
277
+ full_op_id_to_record_op_stats[full_op_id] = list(record_id_to_max_quality_record_op_stats.values())
166
278
 
167
279
  # compute mapping of physical op to num samples and total samples drawn;
168
280
  # also update the set of source indices which have been processed by each physical operator
169
- phys_op_id_to_num_samples, total_num_samples = {}, 0
170
- for phys_op_id, record_op_stats_lst in phys_op_id_to_record_op_stats.items():
171
- # update teh set of source indices processed
281
+ full_op_id_to_num_samples, total_num_samples = {}, 0
282
+ for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items():
283
+ # update the set of source indices processed
172
284
  for record_op_stats in record_op_stats_lst:
173
- self.phys_op_id_to_sources_processed[phys_op_id].add(record_op_stats.record_source_idx)
285
+ self.full_op_id_to_sources_processed[full_op_id].add(record_op_stats.record_source_idx)
174
286
 
175
287
  # compute the number of samples as the number of source indices processed
176
- num_samples = len(self.phys_op_id_to_sources_processed[phys_op_id])
177
- phys_op_id_to_num_samples[phys_op_id] = num_samples
288
+ num_samples = len(self.full_op_id_to_sources_processed[full_op_id])
289
+ full_op_id_to_num_samples[full_op_id] = num_samples
178
290
  total_num_samples += num_samples
179
291
 
180
292
  # compute avg. selectivity, cost, time, and quality for each physical operator
@@ -184,63 +296,63 @@ class OpFrontier:
184
296
  def total_input(record_op_stats_lst):
185
297
  return len(set([record_op_stats.record_parent_id for record_op_stats in record_op_stats_lst]))
186
298
 
187
- phys_op_to_mean_selectivity = {
188
- op_id: total_output(record_op_stats_lst) / total_input(record_op_stats_lst)
189
- for op_id, record_op_stats_lst in phys_op_id_to_record_op_stats.items()
299
+ full_op_id_to_mean_selectivity = {
300
+ full_op_id: total_output(record_op_stats_lst) / total_input(record_op_stats_lst)
301
+ for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items()
190
302
  }
191
- phys_op_to_mean_cost = {
192
- op_id: np.mean([record_op_stats.cost_per_record for record_op_stats in record_op_stats_lst])
193
- for op_id, record_op_stats_lst in phys_op_id_to_record_op_stats.items()
303
+ full_op_id_to_mean_cost = {
304
+ full_op_id: np.mean([record_op_stats.cost_per_record for record_op_stats in record_op_stats_lst])
305
+ for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items()
194
306
  }
195
- phys_op_to_mean_time = {
196
- op_id: np.mean([record_op_stats.time_per_record for record_op_stats in record_op_stats_lst])
197
- for op_id, record_op_stats_lst in phys_op_id_to_record_op_stats.items()
307
+ full_op_id_to_mean_time = {
308
+ full_op_id: np.mean([record_op_stats.time_per_record for record_op_stats in record_op_stats_lst])
309
+ for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items()
198
310
  }
199
- phys_op_to_mean_quality = {
200
- op_id: np.mean([record_op_stats.quality for record_op_stats in record_op_stats_lst])
201
- for op_id, record_op_stats_lst in phys_op_id_to_record_op_stats.items()
311
+ full_op_id_to_mean_quality = {
312
+ full_op_id: np.mean([record_op_stats.quality for record_op_stats in record_op_stats_lst])
313
+ for full_op_id, record_op_stats_lst in full_op_id_to_record_op_stats.items()
202
314
  }
203
315
 
204
316
  # # compute average, LCB, and UCB of each operator; the confidence bounds depend upon
205
317
  # # the computation of the alpha parameter, which we scale to be 0.5 * the mean (of means)
206
318
  # # of the metric across all operators in this operator set
207
- # cost_alpha = 0.5 * np.mean([mean_cost for mean_cost in phys_op_to_mean_cost.values()])
208
- # time_alpha = 0.5 * np.mean([mean_time for mean_time in phys_op_to_mean_time.values()])
209
- # quality_alpha = 0.5 * np.mean([mean_quality for mean_quality in phys_op_to_mean_quality.values()])
210
- # selectivity_alpha = 0.5 * np.mean([mean_selectivity for mean_selectivity in phys_op_to_mean_selectivity.values()])
211
- cost_alpha = 0.5 * (np.max(list(phys_op_to_mean_cost.values())) - np.min(list(phys_op_to_mean_cost.values())))
212
- time_alpha = 0.5 * (np.max(list(phys_op_to_mean_time.values())) - np.min(list(phys_op_to_mean_time.values())))
213
- quality_alpha = 0.5 * (np.max(list(phys_op_to_mean_quality.values())) - np.min(list(phys_op_to_mean_quality.values())))
214
- selectivity_alpha = 0.5 * (np.max(list(phys_op_to_mean_selectivity.values())) - np.min(list(phys_op_to_mean_selectivity.values())))
319
+ # cost_alpha = 0.5 * np.mean([mean_cost for mean_cost in full_op_id_to_mean_cost.values()])
320
+ # time_alpha = 0.5 * np.mean([mean_time for mean_time in full_op_id_to_mean_time.values()])
321
+ # quality_alpha = 0.5 * np.mean([mean_quality for mean_quality in full_op_id_to_mean_quality.values()])
322
+ # selectivity_alpha = 0.5 * np.mean([mean_selectivity for mean_selectivity in full_op_id_to_mean_selectivity.values()])
323
+ cost_alpha = 0.5 * (np.max(list(full_op_id_to_mean_cost.values())) - np.min(list(full_op_id_to_mean_cost.values())))
324
+ time_alpha = 0.5 * (np.max(list(full_op_id_to_mean_time.values())) - np.min(list(full_op_id_to_mean_time.values())))
325
+ quality_alpha = 0.5 * (np.max(list(full_op_id_to_mean_quality.values())) - np.min(list(full_op_id_to_mean_quality.values())))
326
+ selectivity_alpha = 0.5 * (np.max(list(full_op_id_to_mean_selectivity.values())) - np.min(list(full_op_id_to_mean_selectivity.values())))
215
327
 
216
328
  # compute metrics for each physical operator
217
329
  op_metrics = {}
218
- for op_id in phys_op_id_to_record_op_stats:
219
- sample_ratio = np.sqrt(np.log(total_num_samples) / phys_op_id_to_num_samples[op_id])
330
+ for full_op_id in full_op_id_to_record_op_stats:
331
+ sample_ratio = np.sqrt(np.log(total_num_samples) / full_op_id_to_num_samples[full_op_id])
220
332
  exploration_terms = np.array([cost_alpha * sample_ratio, time_alpha * sample_ratio, quality_alpha * sample_ratio, selectivity_alpha * sample_ratio])
221
- mean_terms = (phys_op_to_mean_cost[op_id], phys_op_to_mean_time[op_id], phys_op_to_mean_quality[op_id], phys_op_to_mean_selectivity[op_id])
333
+ mean_terms = (full_op_id_to_mean_cost[full_op_id], full_op_id_to_mean_time[full_op_id], full_op_id_to_mean_quality[full_op_id], full_op_id_to_mean_selectivity[full_op_id])
222
334
 
223
335
  # NOTE: we could clip these; however I will not do so for now to allow for arbitrary quality metric(s)
224
336
  lcb_terms = mean_terms - exploration_terms
225
337
  ucb_terms = mean_terms + exploration_terms
226
- op_metrics[op_id] = {"mean": mean_terms, "lcb": lcb_terms, "ucb": ucb_terms}
338
+ op_metrics[full_op_id] = {"mean": mean_terms, "lcb": lcb_terms, "ucb": ucb_terms}
227
339
 
228
340
  # get the dictionary representation of this policy
229
341
  policy_dict = self.policy.get_dict()
230
342
 
231
343
  # compute the pareto optimal set of operators
232
344
  pareto_op_set = set()
233
- for op_id, metrics in op_metrics.items():
345
+ for full_op_id, metrics in op_metrics.items():
234
346
  cost, time, quality, selectivity = metrics["mean"]
235
347
  pareto_frontier = True
236
348
 
237
- # check if any other operator dominates op_id
238
- for other_op_id, other_metrics in op_metrics.items():
349
+ # check if any other operator dominates full_op_id
350
+ for other_full_op_id, other_metrics in op_metrics.items():
239
351
  other_cost, other_time, other_quality, other_selectivity = other_metrics["mean"]
240
- if op_id == other_op_id:
352
+ if full_op_id == other_full_op_id:
241
353
  continue
242
354
 
243
- # if op_id is dominated by other_op_id, set pareto_frontier = False and break
355
+ # if full_op_id is dominated by other_full_op_id, set pareto_frontier = False and break
244
356
  # NOTE: here we use a strict inequality (instead of the usual <= or >=) because
245
357
  # all ops which have equal cost / time / quality / sel. should not be
246
358
  # filtered out from sampling by our logic in this function
@@ -252,21 +364,21 @@ class OpFrontier:
252
364
  pareto_frontier = False
253
365
  break
254
366
 
255
- # add op_id to pareto frontier if it's not dominated
367
+ # add full_op_id to pareto frontier if it's not dominated
256
368
  if pareto_frontier:
257
- pareto_op_set.add(op_id)
369
+ pareto_op_set.add(full_op_id)
258
370
 
259
371
  # iterate over op metrics and compute the new frontier set of operators
260
- new_frontier_op_ids = set()
261
- for op_id, metrics in op_metrics.items():
372
+ new_frontier_full_op_ids = set()
373
+ for full_op_id, metrics in op_metrics.items():
262
374
 
263
375
  # if this op is fully sampled, do not keep it on the frontier
264
- if phys_op_id_to_num_samples[op_id] == len(self.source_indices):
376
+ if full_op_id_to_num_samples[full_op_id] == len(self.source_indices):
265
377
  continue
266
378
 
267
379
  # if this op is pareto optimal keep it in our frontier ops
268
- if op_id in pareto_op_set:
269
- new_frontier_op_ids.add(op_id)
380
+ if full_op_id in pareto_op_set:
381
+ new_frontier_full_op_ids.add(full_op_id)
270
382
  continue
271
383
 
272
384
  # otherwise, if this op overlaps with an op on the pareto frontier, keep it in our frontier ops
@@ -274,11 +386,11 @@ class OpFrontier:
274
386
  pareto_frontier = True
275
387
  op_cost, op_time, _, op_selectivity = metrics["lcb"]
276
388
  op_quality = metrics["ucb"][2]
277
- for pareto_op_id in pareto_op_set:
278
- pareto_cost, pareto_time, _, pareto_selectivity = op_metrics[pareto_op_id]["ucb"]
279
- pareto_quality = op_metrics[pareto_op_id]["lcb"][2]
389
+ for pareto_full_op_id in pareto_op_set:
390
+ pareto_cost, pareto_time, _, pareto_selectivity = op_metrics[pareto_full_op_id]["ucb"]
391
+ pareto_quality = op_metrics[pareto_full_op_id]["lcb"][2]
280
392
 
281
- # if op_id is dominated by pareto_op_id, set pareto_frontier = False and break
393
+ # if full_op_id is dominated by pareto_full_op_id, set pareto_frontier = False and break
282
394
  cost_dominated = True if policy_dict["cost"] == 0.0 else pareto_cost <= op_cost
283
395
  time_dominated = True if policy_dict["time"] == 0.0 else pareto_time <= op_time
284
396
  quality_dominated = True if policy_dict["quality"] == 0.0 else pareto_quality >= op_quality
@@ -287,15 +399,15 @@ class OpFrontier:
287
399
  pareto_frontier = False
288
400
  break
289
401
 
290
- # add op_id to pareto frontier if it's not dominated
402
+ # add full_op_id to pareto frontier if it's not dominated
291
403
  if pareto_frontier:
292
- new_frontier_op_ids.add(op_id)
404
+ new_frontier_full_op_ids.add(full_op_id)
293
405
 
294
406
  # for operators that were in the frontier, keep them in the frontier if they
295
407
  # are still pareto optimal, otherwise, move them to the end of the reservoir
296
408
  new_frontier_ops = []
297
409
  for op in self.frontier_ops:
298
- if op.get_op_id() in new_frontier_op_ids:
410
+ if op.get_full_op_id() in new_frontier_full_op_ids:
299
411
  new_frontier_ops.append(op)
300
412
  else:
301
413
  self.off_frontier_ops.append(op)
@@ -304,7 +416,7 @@ class OpFrontier:
304
416
  # add them to the frontier, otherwise, put them back in the off_frontier_ops
305
417
  new_off_frontier_ops = []
306
418
  for op in self.off_frontier_ops:
307
- if op.get_op_id() in new_frontier_op_ids:
419
+ if op.get_full_op_id() in new_frontier_full_op_ids:
308
420
  new_frontier_ops.append(op)
309
421
  else:
310
422
  new_off_frontier_ops.append(op)
@@ -388,19 +500,19 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
388
500
  # get the operators in the frontier set for this logical_op_id
389
501
  frontier_ops = op_frontiers[logical_op_id].get_frontier_ops()
390
502
 
391
- # get a mapping from physical_op_id --> list[RecordOpStats]
392
- phys_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(logical_op_id, {})
393
- phys_op_id_to_record_op_stats = {
394
- phys_op_id: op_stats.record_op_stats_lst
395
- for phys_op_id, op_stats in phys_op_id_to_op_stats.items()
503
+ # get a mapping from full_op_id --> list[RecordOpStats]
504
+ full_op_id_to_op_stats: dict[str, OperatorStats] = plan_stats.operator_stats.get(logical_op_id, {})
505
+ full_op_id_to_record_op_stats = {
506
+ full_op_id: op_stats.record_op_stats_lst
507
+ for full_op_id, op_stats in full_op_id_to_op_stats.items()
396
508
  }
397
509
 
398
510
  # iterate over the frontier ops and return the one with the highest quality
399
511
  max_quality_op, max_avg_quality = None, None
400
512
  for op in frontier_ops:
401
513
  op_quality_stats = []
402
- if op.get_op_id() in phys_op_id_to_record_op_stats:
403
- op_quality_stats = [record_op_stats.quality for record_op_stats in phys_op_id_to_record_op_stats[op.get_op_id()]]
514
+ if op.get_full_op_id() in full_op_id_to_record_op_stats:
515
+ op_quality_stats = [record_op_stats.quality for record_op_stats in full_op_id_to_record_op_stats[op.get_full_op_id()]]
404
516
  avg_op_quality = sum(op_quality_stats) / len(op_quality_stats) if len(op_quality_stats) > 0 else 0.0
405
517
  if max_avg_quality is None or avg_op_quality > max_avg_quality:
406
518
  max_quality_op = op
@@ -507,7 +619,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
507
619
 
508
620
  # initialize frontier for each logical operator
509
621
  op_frontiers = {
510
- logical_op_id: OpFrontier(op_set, shuffled_source_indices, self.k, self.j, self.seed, self.policy)
622
+ logical_op_id: OpFrontier(op_set, shuffled_source_indices, self.k, self.j, self.seed, self.policy, self.priors)
511
623
  for logical_op_id, op_set in plan
512
624
  }
513
625