palimpzest 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/METADATA +28 -24
  57. palimpzest-0.7.0.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.3.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.3.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
palimpzest/query/processor/mab_sentinel_processor.py (deleted)
@@ -1,884 +0,0 @@
- import time
- from concurrent.futures import ThreadPoolExecutor, wait
- from copy import deepcopy
-
- import numpy as np
-
- from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
- from palimpzest.core.data.dataclasses import (
-     ExecutionStats,
-     OperatorCostEstimates,
-     OperatorStats,
-     PlanStats,
-     RecordOpStats,
- )
- from palimpzest.core.elements.records import DataRecordCollection, DataRecordSet
- from palimpzest.policy import Policy
- from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy
- from palimpzest.query.execution.single_threaded_execution_strategy import SequentialSingleThreadExecutionStrategy
- from palimpzest.query.operators.convert import ConvertOp, LLMConvert
- from palimpzest.query.operators.filter import FilterOp, LLMFilter
- from palimpzest.query.operators.physical import PhysicalOperator
- from palimpzest.query.operators.retrieve import RetrieveOp
- from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp
- from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
- from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
- from palimpzest.query.optimizer.plan import SentinelPlan
- from palimpzest.query.processor.query_processor import QueryProcessor
- from palimpzest.sets import Set
-
-
- class MABSentinelQueryProcessor(QueryProcessor):
-     """
-     Specialized query processor that implements the MAB (multi-armed bandit) sentinel strategy
-     for coordinating optimization and execution.
-     """
-     def __init__(
-         self,
-         k: int,
-         j: int,
-         sample_budget: int,
-         early_stop_iters: int = 3,
-         use_final_op_quality: bool = False,
-         seed: int = 42,
-         *args,
-         **kwargs,
-     ):
-         super().__init__(*args, **kwargs)
-         self.k = k
-         self.j = j
-         self.sample_budget = sample_budget
-         self.early_stop_iters = early_stop_iters
-         self.use_final_op_quality = use_final_op_quality
-         self.pick_output_fn = self.pick_champion_output
-         self.rng = np.random.default_rng(seed=seed)
-
-
-     def update_frontier_ops(
-         self,
-         frontier_ops,
-         reservoir_ops,
-         policy,
-         all_outputs,
-         logical_op_id_to_num_samples,
-         phys_op_id_to_num_samples,
-         is_filter_op_dict,
-     ):
-         """
-         Update the set of frontier operators, pulling in new ones from the reservoir as needed.
-         For each op_set, this function will:
-         1. Compute the mean, LCB, and UCB for the cost, time, quality, and selectivity of each operator
-         2. Compute the pareto optimal set of operators (using the mean values)
-         3. Update the frontier and reservoir sets of operators based on their LCB/UCB overlap with the pareto frontier
-         """
-         # compute metrics for each operator in all_outputs
-         logical_op_id_to_op_metrics = {}
-         for logical_op_id, source_idx_to_record_sets in all_outputs.items():
-             # compute selectivity for each physical operator
-             phys_op_to_num_inputs, phys_op_to_num_outputs = {}, {}
-             for _, record_sets in source_idx_to_record_sets.items():
-                 for record_set in record_sets:
-                     op_id = record_set.record_op_stats[0].op_id
-                     num_outputs = sum([record_op_stats.passed_operator for record_op_stats in record_set.record_op_stats])
-                     if op_id not in phys_op_to_num_inputs:
-                         phys_op_to_num_inputs[op_id] = 1
-                         phys_op_to_num_outputs[op_id] = num_outputs
-                     else:
-                         phys_op_to_num_inputs[op_id] += 1
-                         phys_op_to_num_outputs[op_id] += num_outputs
-
-             phys_op_to_mean_selectivity = {
-                 op_id: phys_op_to_num_outputs[op_id] / phys_op_to_num_inputs[op_id]
-                 for op_id in phys_op_to_num_inputs
-             }
-
-             # compute average cost, time, and quality
-             phys_op_to_costs, phys_op_to_times, phys_op_to_qualities = {}, {}, {}
-             for _, record_sets in source_idx_to_record_sets.items():
-                 for record_set in record_sets:
-                     for record_op_stats in record_set.record_op_stats:
-                         op_id = record_op_stats.op_id
-                         cost = record_op_stats.cost_per_record
-                         time = record_op_stats.time_per_record
-                         quality = record_op_stats.quality
-                         if op_id not in phys_op_to_costs:
-                             phys_op_to_costs[op_id] = [cost]
-                             phys_op_to_times[op_id] = [time]
-                             phys_op_to_qualities[op_id] = [quality]
-                         else:
-                             phys_op_to_costs[op_id].append(cost)
-                             phys_op_to_times[op_id].append(time)
-                             phys_op_to_qualities[op_id].append(quality)
-
-             phys_op_to_mean_cost = {op: np.mean(costs) for op, costs in phys_op_to_costs.items()}
-             phys_op_to_mean_time = {op: np.mean(times) for op, times in phys_op_to_times.items()}
-             phys_op_to_mean_quality = {op: np.mean(qualities) for op, qualities in phys_op_to_qualities.items()}
-
-             # compute average, LCB, and UCB of each operator; the confidence bounds depend upon
-             # the computation of the alpha parameter, which we scale to be 0.5 * the mean (of means)
-             # of the metric across all operators in this operator set
-             cost_alpha = 0.5 * np.mean([mean_cost for mean_cost in phys_op_to_mean_cost.values()])
-             time_alpha = 0.5 * np.mean([mean_time for mean_time in phys_op_to_mean_time.values()])
-             quality_alpha = 0.5 * np.mean([mean_quality for mean_quality in phys_op_to_mean_quality.values()])
-             selectivity_alpha = 0.5 * np.mean([mean_selectivity for mean_selectivity in phys_op_to_mean_selectivity.values()])
-
-             op_metrics = {}
-             for op_id in phys_op_to_costs:
-                 sample_ratio = np.sqrt(np.log(logical_op_id_to_num_samples[logical_op_id]) / phys_op_id_to_num_samples[op_id])
-                 exploration_terms = np.array([cost_alpha * sample_ratio, time_alpha * sample_ratio, quality_alpha * sample_ratio, selectivity_alpha * sample_ratio])
-                 mean_terms = (phys_op_to_mean_cost[op_id], phys_op_to_mean_time[op_id], phys_op_to_mean_quality[op_id], phys_op_to_mean_selectivity[op_id])
-
-                 # NOTE: we could clip these; we do not do so for now, to allow for arbitrary quality metric(s)
-                 lcb_terms = mean_terms - exploration_terms
-                 ucb_terms = mean_terms + exploration_terms
-                 op_metrics[op_id] = {"mean": mean_terms, "lcb": lcb_terms, "ucb": ucb_terms}
-
-             # store average metrics for each operator in the op_set
-             logical_op_id_to_op_metrics[logical_op_id] = op_metrics
-
-         # get the dict representation of this policy
-         policy_dict = policy.get_dict()
-
-         # compute the pareto optimal set of operators for each logical_op_id
-         pareto_op_sets = {}
-         for logical_op_id, op_metrics in logical_op_id_to_op_metrics.items():
-             pareto_op_sets[logical_op_id] = set()
-             for op_id, metrics in op_metrics.items():
-                 cost, time, quality, selectivity = metrics["mean"]
-                 pareto_frontier = True
-
-                 # check if any other operator dominates op_id
-                 for other_op_id, other_metrics in op_metrics.items():
-                     other_cost, other_time, other_quality, other_selectivity = other_metrics["mean"]
-                     if op_id == other_op_id:
-                         continue
-
-                     # if op_id is dominated by other_op_id, set pareto_frontier = False and break
-                     # NOTE: here we use a strict inequality (instead of the usual <= or >=) because
-                     #       all ops which have equal cost / time / quality / sel. should not be
-                     #       filtered out from sampling by our logic in this function
-                     cost_dominated = True if policy_dict["cost"] == 0.0 else other_cost < cost
-                     time_dominated = True if policy_dict["time"] == 0.0 else other_time < time
-                     quality_dominated = True if policy_dict["quality"] == 0.0 else other_quality > quality
-                     selectivity_dominated = True if not is_filter_op_dict[logical_op_id] else other_selectivity < selectivity
-                     if cost_dominated and time_dominated and quality_dominated and selectivity_dominated:
-                         pareto_frontier = False
-                         break
-
-                 # add op_id to the pareto frontier if it is not dominated
-                 if pareto_frontier:
-                     pareto_op_sets[logical_op_id].add(op_id)
-
-         # iterate over frontier ops and replace any which do not overlap with the pareto frontier
-         new_frontier_ops = {logical_op_id: [] for logical_op_id in frontier_ops}
-         new_reservoir_ops = {logical_op_id: [] for logical_op_id in reservoir_ops}
-         for logical_op_id, pareto_op_set in pareto_op_sets.items():
-             num_dropped_from_frontier = 0
-             for op, next_shuffled_sample_idx, new_operator, fully_sampled in frontier_ops[logical_op_id]:
-                 op_id = op.get_op_id()
-
-                 # if this op is fully sampled, remove it from the frontier
-                 if fully_sampled:
-                     num_dropped_from_frontier += 1
-                     continue
-
-                 # if this op is pareto optimal, keep it in our frontier ops
-                 if op_id in pareto_op_set:
-                     new_frontier_ops[logical_op_id].append((op, next_shuffled_sample_idx, new_operator, fully_sampled))
-                     continue
-
-                 # otherwise, if this op overlaps with an op on the pareto frontier, keep it in our frontier ops
-                 # NOTE: for now, we perform an optimistic comparison with the ucb/lcb
-                 pareto_frontier = True
-                 op_cost = logical_op_id_to_op_metrics[logical_op_id][op_id]["lcb"][0]
-                 op_time = logical_op_id_to_op_metrics[logical_op_id][op_id]["lcb"][1]
-                 op_quality = logical_op_id_to_op_metrics[logical_op_id][op_id]["ucb"][2]
-                 op_selectivity = logical_op_id_to_op_metrics[logical_op_id][op_id]["lcb"][3]
-                 for pareto_op_id in pareto_op_set:
-                     pareto_cost = logical_op_id_to_op_metrics[logical_op_id][pareto_op_id]["ucb"][0]
-                     pareto_time = logical_op_id_to_op_metrics[logical_op_id][pareto_op_id]["ucb"][1]
-                     pareto_quality = logical_op_id_to_op_metrics[logical_op_id][pareto_op_id]["lcb"][2]
-                     pareto_selectivity = logical_op_id_to_op_metrics[logical_op_id][pareto_op_id]["ucb"][3]
-
-                     # if op_id is dominated by pareto_op_id, set pareto_frontier = False and break
-                     cost_dominated = True if policy_dict["cost"] == 0.0 else pareto_cost <= op_cost
-                     time_dominated = True if policy_dict["time"] == 0.0 else pareto_time <= op_time
-                     quality_dominated = True if policy_dict["quality"] == 0.0 else pareto_quality >= op_quality
-                     selectivity_dominated = True if not is_filter_op_dict[logical_op_id] else pareto_selectivity <= op_selectivity
-                     if cost_dominated and time_dominated and quality_dominated and selectivity_dominated:
-                         pareto_frontier = False
-                         break
-
-                 # keep op_id on the frontier if it is not dominated
-                 if pareto_frontier:
-                     new_frontier_ops[logical_op_id].append((op, next_shuffled_sample_idx, new_operator, fully_sampled))
-                 else:
-                     num_dropped_from_frontier += 1
-
-             # replace the ops dropped from the frontier with new ops from the reservoir
-             num_dropped_from_frontier = min(num_dropped_from_frontier, len(reservoir_ops[logical_op_id]))
-             for idx in range(num_dropped_from_frontier):
-                 new_frontier_ops[logical_op_id].append((reservoir_ops[logical_op_id][idx], 0, True, False))
-
-             # update reservoir ops for this logical_op_id
-             new_reservoir_ops[logical_op_id] = reservoir_ops[logical_op_id][num_dropped_from_frontier:]
-
-         return new_frontier_ops, new_reservoir_ops
-
-
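The interplay of confidence bounds and Pareto dominance above is easier to see in isolation. Below is a minimal, self-contained sketch of the same rule with hypothetical operator names and metric values (plain NumPy, no palimpzest imports, and only three metrics instead of four): an operator that falls off the Pareto frontier on the means is still kept on the sampling frontier while its optimistic bounds overlap the frontier's pessimistic bounds.

import numpy as np

# hypothetical operators with sample-mean metrics (cost, time, quality);
# lower cost/time is better, higher quality is better
means = {
    "model-a": (0.020, 2.5, 0.95),
    "model-b": (0.002, 1.0, 0.85),
    "model-c": (0.003, 1.3, 0.80),
}
num_samples = {"model-a": 5, "model-b": 9, "model-c": 4}
total_samples = sum(num_samples.values())

# exploration term shaped like the deleted code: alpha * sqrt(log(N) / n_op),
# with alpha = 0.5 * the mean (of means) for each metric
alpha = 0.5 * np.mean(list(means.values()), axis=0)
bounds = {
    op: (
        np.array(m) - alpha * np.sqrt(np.log(total_samples) / num_samples[op]),  # LCB
        np.array(m) + alpha * np.sqrt(np.log(total_samples) / num_samples[op]),  # UCB
    )
    for op, m in means.items()
}

def dominates(a, b):
    # strict dominance on every metric, matching the strict inequalities above
    return a[0] < b[0] and a[1] < b[1] and a[2] > b[2]

# pareto frontier computed on the means
pareto = {op for op in means if not any(dominates(means[o], means[op]) for o in means if o != op)}

# an off-frontier op survives if its optimistic profile (cost/time LCB, quality UCB)
# is not dominated by some pareto op's pessimistic profile (cost/time UCB, quality LCB)
def survives(op):
    opt = (bounds[op][0][0], bounds[op][0][1], bounds[op][1][2])
    return not any(
        bounds[p][1][0] <= opt[0] and bounds[p][1][1] <= opt[1] and bounds[p][0][2] >= opt[2]
        for p in pareto if p != op
    )

frontier = [op for op in means if op in pareto or survives(op)]
# model-c is dominated on the means (it leaves the pareto set), but with these
# sample counts its confidence bounds still overlap, so it stays on the frontier
print(pareto, frontier)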
-     def compute_quality(
-         self,
-         record_set: DataRecordSet,
-         expected_output: dict | None = None,
-         champion_record_set: DataRecordSet | None = None,
-         is_filter_op: bool = False,
-         is_convert_op: bool = False,
-     ) -> DataRecordSet:
-         """
-         Compute the quality for the given `record_set` by comparing it to the `expected_output`.
-
-         Update the record_set by assigning the quality to each entry in its record_op_stats and
-         return the updated record_set.
-         """
-         # compute whether we can only use the champion
-         only_using_champion = expected_output is None
-
-         # if this operation is a failed convert
-         if is_convert_op and len(record_set) == 0:
-             record_set.record_op_stats[0].quality = 0.0
-
-         # if this operation is a filter:
-         # - we assign a quality of 1.0 if the record is in the expected outputs and it passes this filter
-         # - we assign a quality of 0.0 if the record is in the expected outputs and it does NOT pass this filter
-         # - we assign a quality relative to the champion / ensemble output if the record is not in the expected outputs
-         # we cannot know for certain what the correct behavior is for a given filter on a record which is not in the
-         # output (unless it is the only filter in the plan), thus we only evaluate the filter based on its performance
-         # on records which are in the output
-         elif is_filter_op:
-             # NOTE:
-             # - we know that record_set.record_op_stats will contain a single entry for a filter op
-             # - if we are using the champion, then champion_record_set will also contain a single entry for a filter op
-             record_op_stats = record_set.record_op_stats[0]
-             if only_using_champion:
-                 champion_record = champion_record_set[0]
-                 record_op_stats.quality = int(record_op_stats.passed_operator == champion_record.passed_operator)
-
-             # - if we are using validation data, we may have multiple expected records in the expected_output for this
-             #   source_idx; thus, if we can identify an exact match, we can use that to evaluate the filter's quality
-             # - if we are using validation data but we *cannot* find an exact match, then we will once again use the
-             #   champion record set
-             else:
-                 # compute number of matches between this record's computed fields and this expected record's outputs
-                 found_match_in_output = False
-                 labels_dict_lst = expected_output["labels"] if isinstance(expected_output["labels"], list) else [expected_output["labels"]]
-                 for labels_dict in labels_dict_lst:
-                     all_correct = True
-                     for field, value in record_op_stats.record_state.items():
-                         if value != labels_dict[field]:
-                             all_correct = False
-                             break
-
-                     if all_correct:
-                         found_match_in_output = True
-                         break
-
-                 if found_match_in_output:
-                     record_op_stats.quality = int(record_op_stats.passed_operator)
-                 else:
-                     champion_record = champion_record_set[0]
-                     record_op_stats.quality = int(record_op_stats.passed_operator == champion_record.passed_operator)
-
-         # if this is a successful convert operation
-         else:
-             # NOTE: the following computation assumes we do not project out computed values
-             #       (and that the validation examples provide all computed fields); even if
-             #       a user program does add projection, we can ignore the projection on the
-             #       validation dataset and use the champion model (as opposed to the validation
-             #       output) for scoring fields which have their values projected out
-
-             # create list of dictionaries of labels for each expected / champion output
-             labels_dict_lst = []
-             if only_using_champion:
-                 for champion_record in champion_record_set:
-                     labels_dict_lst.append(champion_record.to_dict())
-             else:
-                 labels_dict_lst = (
-                     expected_output["labels"]
-                     if isinstance(expected_output["labels"], list)
-                     else [expected_output["labels"]]
-                 )
-
-             # GREEDY ALGORITHM
-             # for each record in the expected output, we look for the computed record which maximizes the quality
-             # metric; once we've identified that computed record we remove it from consideration for the next
-             # expected output
-             field_to_score_fn = {} if only_using_champion else expected_output["score_fn"]
-             for labels_dict in labels_dict_lst:
-                 best_quality, best_record_op_stats = 0.0, None
-                 for record_op_stats in record_set.record_op_stats:
-                     # if we already assigned this record a quality, skip it
-                     if record_op_stats.quality is not None:
-                         continue
-
-                     # compute number of matches between this record's computed fields and this expected record's outputs
-                     total_quality = 0
-                     for field in record_op_stats.generated_fields:
-                         computed_value = record_op_stats.record_state.get(field, None)
-                         expected_value = labels_dict[field]
-
-                         # get the metric function for this field
-                         score_fn = field_to_score_fn.get(field, "exact")
-
-                         # compute exact match
-                         if score_fn == "exact":
-                             total_quality += int(computed_value == expected_value)
-
-                         # compute UDF metric
-                         elif callable(score_fn):
-                             total_quality += score_fn(computed_value, expected_value)
-
-                         # otherwise, throw an exception
-                         else:
-                             raise Exception(f"Unrecognized score_fn: {score_fn}")
-
-                     # compute the average field quality and update the best seen so far
-                     quality = total_quality / len(record_op_stats.generated_fields)
-                     if quality > best_quality:
-                         best_quality = quality
-                         best_record_op_stats = record_op_stats
-
-                 # set best_quality as quality for the best_record_op_stats
-                 if best_record_op_stats is not None:
-                     best_record_op_stats.quality = best_quality
-
-             # for any records which did not receive a quality, set it to 0.0 as these are unexpected extras
-             for record_op_stats in record_set.record_op_stats:
-                 if record_op_stats.quality is None:
-                     record_op_stats.quality = 0.0
-
-         return record_set
-
-
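The per-field scoring contract used by compute_quality (exact match by default, or a user-supplied callable per field) can be summarized in a few lines. A sketch with hypothetical field names and a hypothetical case-insensitive score function:

def field_quality(record_state: dict, labels: dict, generated_fields: list[str],
                  field_to_score_fn: dict) -> float:
    """Average per-field score of one computed record against one expected label dict."""
    total = 0.0
    for field in generated_fields:
        computed, expected = record_state.get(field), labels[field]
        score_fn = field_to_score_fn.get(field, "exact")
        if score_fn == "exact":
            total += int(computed == expected)     # exact match scores 0 or 1
        elif callable(score_fn):
            total += score_fn(computed, expected)  # user-defined metric, expected in [0, 1]
        else:
            raise Exception(f"Unrecognized score_fn: {score_fn}")
    return total / len(generated_fields)

# hypothetical usage: a case-insensitive match for the "breed" field
record = {"image": "file1.png", "breed": "golden retriever"}
label = {"image": "file1.png", "breed": "Golden Retriever"}
score = field_quality(
    record, label, ["breed"],
    {"breed": lambda c, e: float(c.lower() == e.lower())},
)
print(score)  # 1.0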
-     def score_quality(
-         self,
-         op_set: list[PhysicalOperator],
-         logical_op_id: str,
-         execution_data: dict[str, dict[str, list[DataRecordSet]]],
-         champion_outputs: dict[str, dict[str, DataRecordSet]],
-         expected_outputs: dict[str, dict],
-     ) -> dict[str, dict[str, list[DataRecordSet]]]:
-         """
-         NOTE: This approach to cost modeling does not work directly for aggregation queries;
-         for these queries, we would ask the user to provide validation data for the step immediately
-         before a final aggregation
-
-         NOTE: This function currently assumes that one-to-many converts do NOT create duplicate outputs.
-         This assumption would break if, for example, we extracted the breed of every dog in an image.
-         If there were two golden retrievers and a bernedoodle in an image and we extracted:
-
-             {"image": "file1.png", "breed": "Golden Retriever"}
-             {"image": "file1.png", "breed": "Golden Retriever"}
-             {"image": "file1.png", "breed": "Bernedoodle"}
-
-         This function would currently give perfect accuracy to the following output:
-
-             {"image": "file1.png", "breed": "Golden Retriever"}
-             {"image": "file1.png", "breed": "Bernedoodle"}
-
-         Even though it is missing one of the golden retrievers.
-         """
-         # extract information about the logical operation performed at this stage of the sentinel plan;
-         # NOTE: we can infer these fields from context clues, but in the long-term we should have a more
-         #       principled way of getting these directly from attributes either stored in the sentinel_plan
-         #       or in the PhysicalOperator
-         physical_op = op_set[0]
-         is_filter_op = isinstance(physical_op, FilterOp)
-         is_convert_op = isinstance(physical_op, ConvertOp)
-         is_perfect_quality_op = (
-             not isinstance(physical_op, LLMConvert)
-             and not isinstance(physical_op, LLMFilter)
-             and not isinstance(physical_op, RetrieveOp)
-         )
-
-         # pull out the execution data from this operator
-         this_op_execution_data = execution_data[logical_op_id]
-
-         # compute quality of each output computed by this operator
-         for source_idx, record_sets in this_op_execution_data.items():
-             # NOTE: source_idx is a particular input, for which we may have computed multiple output record_sets;
-             #       each of these record_sets may contain more than one record (b/c one-to-many) and we have one
-             #       record_set per operator in the op_set
-
-             # if this operation does not involve an LLM, every record_op_stats object gets perfect quality
-             if is_perfect_quality_op:
-                 for record_set in record_sets:
-                     for record_op_stats in record_set.record_op_stats:
-                         record_op_stats.quality = 1.0
-                 continue
-
-             # get the expected output for this source_idx if we have one
-             expected_output = (
-                 expected_outputs[source_idx]
-                 if expected_outputs is not None and source_idx in expected_outputs
-                 else None
-             )
-
-             # extract champion output for this record set
-             champion_record_set = champion_outputs[logical_op_id][source_idx]
-
-             # for each record_set produced by an operation, compute its quality
-             for record_set in record_sets:
-                 record_set = self.compute_quality(record_set, expected_output, champion_record_set, is_filter_op, is_convert_op)
-
-         # return the execution data with quality-annotated record op stats
-         return execution_data
-
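The duplicate-output caveat in the docstring above can be reproduced with plain Python. The sketch below mirrors the greedy matching inside compute_quality, with hypothetical label dicts and only the generated "breed" field being scored ("image" is the input, not a generated field):

# greedy matching of computed records to expected labels, as in compute_quality
expected = [
    {"breed": "Golden Retriever"},
    {"breed": "Golden Retriever"},
    {"breed": "Bernedoodle"},
]
computed = [
    {"breed": "Golden Retriever"},
    {"breed": "Bernedoodle"},
]
generated_fields = ["breed"]

qualities = [None] * len(computed)
for labels in expected:
    best_quality, best_idx = 0.0, None
    for idx, rec in enumerate(computed):
        if qualities[idx] is not None:
            continue  # already matched to an earlier expected record
        quality = sum(int(rec[f] == labels[f]) for f in generated_fields) / len(generated_fields)
        if quality > best_quality:
            best_quality, best_idx = quality, idx
    if best_idx is not None:
        qualities[best_idx] = best_quality

# both computed records score 1.0 even though one "Golden Retriever" is missing
print(qualities)  # [1.0, 1.0]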
-     def pick_champion_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
-         # if there's only one operator in the set, we return its record_set
-         if len(op_set_record_sets) == 1:
-             record_set, _ = op_set_record_sets[0]
-             return record_set
-
-         # find the operator with the highest average quality and return its record_set
-         base_op_cost_est = OperatorCostEstimates(cardinality=1.0, cost_per_record=0.0, time_per_record=0.0, quality=1.0)
-         champion_record_set, champion_quality = None, -1.0
-         for record_set, op in op_set_record_sets:
-             op_cost_estimates = op.naive_cost_estimates(base_op_cost_est)
-             if op_cost_estimates.quality > champion_quality:
-                 champion_record_set, champion_quality = record_set, op_cost_estimates.quality
-
-         return champion_record_set
-
-     def pick_ensemble_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
-         # if there's only one operator in the set, we return its record_set
-         if len(op_set_record_sets) == 1:
-             record_set, _ = op_set_record_sets[0]
-             return record_set
-
-         # NOTE: this assumes the models are consistent in how they order their record outputs
-         #       for one-to-many converts; eventually we can try out more robust schemes to
-         #       account for differences in ordering
-         # aggregate records at each index in the response
-         idx_to_records = {}
-         for record_set, _ in op_set_record_sets:
-             for idx, record in enumerate(record_set):
-                 if idx not in idx_to_records:
-                     idx_to_records[idx] = [record]
-                 else:
-                     idx_to_records[idx].append(record)
-
-         # compute most common answer at each index
-         out_records = []
-         for idx in range(len(idx_to_records)):
-             records = idx_to_records[idx]
-             most_common_record = max(set(records), key=records.count)
-             out_records.append(most_common_record)
-
-         # create and return final DataRecordSet
-         return DataRecordSet(out_records, [])
-
-
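pick_ensemble_output implements a positional majority vote: it groups the i-th record from every operator's output and keeps the most common value at each position. A self-contained sketch of that vote, with hypothetical string records standing in for DataRecords:

from collections import Counter

# hypothetical outputs of three operators for one input; index i holds each
# operator's i-th output record (the same ordering assumption the method makes)
outputs = [
    ["Golden Retriever", "Bernedoodle"],
    ["Golden Retriever", "Poodle"],
    ["Golden Retriever", "Bernedoodle"],
]

ensemble = []
for idx in range(max(len(o) for o in outputs)):
    votes = [o[idx] for o in outputs if idx < len(o)]
    ensemble.append(Counter(votes).most_common(1)[0][0])

print(ensemble)  # ['Golden Retriever', 'Bernedoodle']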
-     def pick_highest_quality_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
-         # if there's only one operator in the set, we return its record_set
-         if len(op_set_record_sets) == 1:
-             record_set, _ = op_set_record_sets[0]
-             return record_set
-
-         # NOTE: this assumes the models are consistent in how they order their record outputs
-         #       for one-to-many converts; eventually we can try out more robust schemes to
-         #       account for differences in ordering
-         # aggregate records at each index in the response
-         idx_to_records = {}
-         for record_set, _ in op_set_record_sets:
-             for idx in range(len(record_set)):
-                 record, record_op_stats = record_set[idx], record_set.record_op_stats[idx]
-                 if idx not in idx_to_records:
-                     idx_to_records[idx] = [(record, record_op_stats)]
-                 else:
-                     idx_to_records[idx].append((record, record_op_stats))
-
-         # compute highest quality answer at each index
-         out_records = []
-         out_record_op_stats = []
-         for idx in range(len(idx_to_records)):
-             records_lst, record_op_stats_lst = zip(*idx_to_records[idx])
-             max_quality_record, max_quality = records_lst[0], record_op_stats_lst[0].quality
-             max_quality_stats = record_op_stats_lst[0]
-             for record, record_op_stats in zip(records_lst[1:], record_op_stats_lst[1:]):
-                 record_quality = record_op_stats.quality
-                 if record_quality > max_quality:
-                     max_quality_record = record
-                     max_quality = record_quality
-                     max_quality_stats = record_op_stats
-             out_records.append(max_quality_record)
-             out_record_op_stats.append(max_quality_stats)
-
-         # create and return final DataRecordSet
-         return DataRecordSet(out_records, out_record_op_stats)
-
-
-     def execute_op_set(self, op_candidate_pairs):
-         # TODO: post-submission we will need to modify this to:
-         # - submit all candidates for aggregate operators
-         # - handle limits
-         # create thread pool w/ max workers and run futures over worker pool
-         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-             # create futures
-             futures = []
-             for operator, candidate in op_candidate_pairs:
-                 future = executor.submit(PhysicalOperator.execute_op_wrapper, operator, candidate)
-                 futures.append(future)
-
-             # compute output record_set for each (operator, candidate) pair
-             output_record_sets = []
-             while len(futures) > 0:
-                 # get the set of futures that have (and have not) finished in the last PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
-                 done_futures, not_done_futures = wait(futures, timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
-
-                 # cast not_done_futures from a set to a list so we can append to it
-                 not_done_futures = list(not_done_futures)
-
-                 # process finished futures
-                 for future in done_futures:
-                     # get the result and add it to the output records set
-                     record_set, operator, candidate = future.result()
-                     output_record_sets.append((record_set, operator, candidate))
-
-                 # update list of futures
-                 futures = not_done_futures
-
-         # compute mapping from source_idx to record sets for all operators and for the champion operator
-         all_record_sets, champion_record_sets = {}, {}
-         for _, candidate in op_candidate_pairs:
-             candidate_output_record_sets, source_idx = [], None
-             for record_set, operator, candidate_ in output_record_sets:
-                 if candidate == candidate_:
-                     candidate_output_record_sets.append((record_set, operator))
-
-                     # get the source_idx associated with this input record
-                     source_idx = candidate.source_idx
-
-             # select the champion (i.e. best) record_set from all the record sets computed for this candidate
-             champion_record_set = self.pick_output_fn(candidate_output_record_sets)
-
-             # add champion record_set to mapping from source_idx --> champion record_set
-             champion_record_sets[source_idx] = champion_record_set
-
-             # add all record_sets computed for this source_idx to mapping from source_idx --> record_sets
-             all_record_sets[source_idx] = [tup[0] for tup in candidate_output_record_sets]
-
-         return all_record_sets, champion_record_sets
-
-
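The loop in execute_op_set collects finished futures in batches instead of blocking on each future in submission order. The same pattern in isolation, with a hypothetical task function and an illustrative poll interval standing in for PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS:

import time
from concurrent.futures import ThreadPoolExecutor, wait

POLL_INTERVAL_SECS = 0.1  # illustrative stand-in for the palimpzest constant

def run_task(task_id: int) -> str:
    time.sleep(0.05 * task_id)  # simulate uneven per-task work
    return f"result-{task_id}"

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(run_task, i) for i in range(8)]
    results = []
    while futures:
        # collect whatever finished in the last interval; keep waiting on the rest
        done, not_done = wait(futures, timeout=POLL_INTERVAL_SECS)
        results.extend(f.result() for f in done)
        futures = list(not_done)

print(sorted(results))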
-     def execute_sentinel_plan(self, plan: SentinelPlan, expected_outputs: dict[str, dict], policy: Policy):
-         """
-         Execute the given sentinel plan, using the MAB strategy to allocate the sample budget
-         across the physical operators in each operator set.
-         """
-         if self.verbose:
-             print("----------------------")
-             print(f"PLAN[{plan.plan_id}] (sentinel):")
-             print(plan)
-             print("---")
-
-         plan_start_time = time.time()
-
-         # initialize plan stats and operator stats
-         plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
-         for logical_op_id, logical_op_name, op_set in plan:
-             op_set_details = {
-                 op.op_name(): {k: str(v) for k, v in op.get_id_params().items()}
-                 for op in op_set
-             }
-             plan_stats.operator_stats[logical_op_id] = OperatorStats(
-                 op_id=logical_op_id,
-                 op_name=logical_op_name,
-                 op_details=op_set_details,
-             )
-
-         # shuffle the indices of records to sample
-         total_num_samples = len(self.val_datasource)
-         shuffled_source_indices = [int(idx) for idx in np.arange(total_num_samples)]
-         self.rng.shuffle(shuffled_source_indices)
-
-         # sample k initial operators for each operator set; for each operator maintain a tuple of:
-         # (operator, next_shuffled_sample_idx, new_operator, fully_sampled); new_operator is True
-         # when an operator is added to the frontier
-         frontier_ops, reservoir_ops = {}, {}
-         for logical_op_id, _, op_set in plan:
-             op_set_copy = [op for op in op_set]
-             self.rng.shuffle(op_set_copy)
-             k = min(self.k, len(op_set_copy))
-             frontier_ops[logical_op_id] = [(op, 0, True, False) for op in op_set_copy[:k]]
-             reservoir_ops[logical_op_id] = [op for op in op_set_copy[k:]]
-
-         # create mapping from logical and physical op ids to the number of samples drawn
-         logical_op_id_to_num_samples = {logical_op_id: 0 for logical_op_id, _, _ in plan}
-         phys_op_id_to_num_samples = {op.get_op_id(): 0 for _, _, op_set in plan for op in op_set}
-         is_filter_op_dict = {
-             logical_op_id: isinstance(op_set[0], FilterOp)
-             for logical_op_id, _, op_set in plan
-         }
-
-         # NOTE: to maintain parity with our count of samples drawn in the random sampling execution,
-         #       for each logical_op_id, we count the number of (record, op) executions as the number of
-         #       samples within that op_set; the samples drawn is equal to the max of that number across
-         #       all operator sets
-         samples_drawn = 0
-         all_outputs, champion_outputs = {}, {}
-         while samples_drawn < self.sample_budget:
-             # execute operator sets in sequence
-             for op_idx, (logical_op_id, _, op_set) in enumerate(plan):
-                 prev_logical_op_id = plan.logical_op_ids[op_idx - 1] if op_idx > 0 else None
-                 prev_logical_op_is_filter = prev_logical_op_id is not None and is_filter_op_dict[prev_logical_op_id]
-
-                 # create list of (op, candidate) tuples which we should execute
-                 op_candidate_pairs = []
-                 updated_frontier_ops_lst = []
-                 for op, next_shuffled_sample_idx, new_operator, fully_sampled in frontier_ops[logical_op_id]:
-                     # execute new operators on the first j candidates, and previously sampled operators on one additional candidate
-                     j = min(self.j, len(shuffled_source_indices)) if new_operator else 1
-                     for j_idx in range(j):
-                         candidates = []
-                         if isinstance(op, (MarshalAndScanDataOp, CacheScanDataOp)):
-                             source_idx = shuffled_source_indices[(next_shuffled_sample_idx + j_idx) % len(shuffled_source_indices)]
-                             candidates = [source_idx]
-                             logical_op_id_to_num_samples[logical_op_id] += 1
-                             phys_op_id_to_num_samples[op.get_op_id()] += 1
-                         else:
-                             if next_shuffled_sample_idx + j_idx == len(shuffled_source_indices):
-                                 fully_sampled = True
-                                 break
-
-                             # pick the best output in all_outputs from the previous logical operator
-                             source_idx = shuffled_source_indices[next_shuffled_sample_idx + j_idx]
-                             record_sets = all_outputs[prev_logical_op_id][source_idx]
-                             all_source_record_sets = [(record_set, None) for record_set in record_sets]
-                             max_quality_record_set = self.pick_highest_quality_output(all_source_record_sets)
-                             if (
-                                 not prev_logical_op_is_filter
-                                 or (
-                                     prev_logical_op_is_filter
-                                     and max_quality_record_set.record_op_stats[0].passed_operator
-                                 )
-                             ):
-                                 candidates = [record for record in max_quality_record_set]
-
-                             # increment the number of samples drawn for this logical and physical op id; even if we get
-                             # multiple candidates from the previous stage in the pipeline, we only count this as one sample
-                             logical_op_id_to_num_samples[logical_op_id] += 1
-                             phys_op_id_to_num_samples[op.get_op_id()] += 1
-
-                         if len(candidates) > 0:
-                             op_candidate_pairs.extend([(op, candidate) for candidate in candidates])
-
-                     # set new_operator = False and update next_shuffled_sample_idx
-                     updated_frontier_ops_lst.append((op, next_shuffled_sample_idx + j, False, fully_sampled))
-
-                 frontier_ops[logical_op_id] = updated_frontier_ops_lst
-
-                 # continue if op_candidate_pairs is an empty list, as this means all records have been filtered out
-                 if len(op_candidate_pairs) == 0:
-                     continue
-
-                 # run sampled operators on sampled candidates
-                 source_idx_to_record_sets, source_idx_to_champion_record_set = self.execute_op_set(op_candidate_pairs)
-
-                 # update all_outputs and champion_outputs dictionaries
-                 if logical_op_id not in all_outputs:
-                     all_outputs[logical_op_id] = source_idx_to_record_sets
-                     champion_outputs[logical_op_id] = source_idx_to_champion_record_set
-                 else:
-                     for source_idx, record_sets in source_idx_to_record_sets.items():
-                         if source_idx not in all_outputs[logical_op_id]:
-                             all_outputs[logical_op_id][source_idx] = record_sets
-                             champion_outputs[logical_op_id][source_idx] = source_idx_to_champion_record_set[source_idx]
-                         else:
-                             all_outputs[logical_op_id][source_idx].extend(record_sets)
-                             # NOTE: short-term solution; in practice we can get multiple champion records from different
-                             #       sets of operators, so we should try to find a way to only take one
-                             champion_outputs[logical_op_id][source_idx] = source_idx_to_champion_record_set[source_idx]
-
-                 # flatten lists of records and record_op_stats
-                 all_records, all_record_op_stats = [], []
-                 for _, record_sets in source_idx_to_record_sets.items():
-                     for record_set in record_sets:
-                         all_records.extend(record_set.data_records)
-                         all_record_op_stats.extend(record_set.record_op_stats)
-
-                 # update plan stats
-                 plan_stats.operator_stats[logical_op_id].add_record_op_stats(
-                     all_record_op_stats,
-                     source_op_id=prev_logical_op_id,
-                     plan_id=plan.plan_id,
-                 )
-
-                 # add records (which are not filtered) to the cache, if allowed
-                 if not self.nocache:
-                     for record in all_records:
-                         if getattr(record, "passed_operator", True):
-                             # self.datadir.append_cache(logical_op_id, record)
-                             pass
-
-                 # compute quality for each operator
-                 all_outputs = self.score_quality(
-                     op_set,
-                     logical_op_id,
-                     all_outputs,
-                     champion_outputs,
-                     expected_outputs,
-                 )
-
-             # update the (pareto) frontier for each set of operators
-             frontier_ops, reservoir_ops = self.update_frontier_ops(
-                 frontier_ops,
-                 reservoir_ops,
-                 policy,
-                 all_outputs,
-                 logical_op_id_to_num_samples,
-                 phys_op_id_to_num_samples,
-                 is_filter_op_dict,
-             )
-
-             # update the number of samples drawn to be the max across all logical operators
-             samples_drawn = max(logical_op_id_to_num_samples.values())
-
-         # if caching was allowed, close the cache
-         if not self.nocache:
-             for _, _, _ in plan:
-                 # self.datadir.close_cache(logical_op_id)
-                 pass
-
-         # finalize plan stats
-         total_plan_time = time.time() - plan_start_time
-         plan_stats.finalize(total_plan_time)
-
-         return all_outputs, plan_stats
-
-
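The sampling loop above seeds each operator set with a frontier of at most k operators and keeps the rest in a reservoir, and it counts samples drawn as the max across logical operators. A compact sketch of that initialization and accounting, with hypothetical operator sets:

import numpy as np

rng = np.random.default_rng(seed=42)
k = 2

# hypothetical operator sets keyed by logical op id
op_sets = {"convert-1": ["m1", "m2", "m3", "m4"], "filter-1": ["f1", "f2"]}

frontier, reservoir = {}, {}
for logical_op_id, op_set in op_sets.items():
    ops = list(op_set)
    rng.shuffle(ops)
    k_eff = min(k, len(ops))
    # (op, next_sample_idx, new_operator, fully_sampled), as in the deleted code
    frontier[logical_op_id] = [(op, 0, True, False) for op in ops[:k_eff]]
    reservoir[logical_op_id] = ops[k_eff:]

# budget accounting: samples drawn is the max across logical operators
num_samples = {"convert-1": 12, "filter-1": 9}
samples_drawn = max(num_samples.values())
print(frontier, reservoir, samples_drawn)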
-     def generate_sample_observations(self, sentinel_plan: SentinelPlan, policy: Policy):
-         """
-         This function is responsible for generating sample observation data which can be
-         consumed by the CostModel.
-
-         To accomplish this, we construct a special sentinel plan using the Optimizer which is
-         capable of executing any valid physical implementation of a Filter or Convert operator
-         on each record.
-         """
-         # if we're using validation data, get the set of expected output records
-         expected_outputs = {}
-         for source_idx in range(len(self.val_datasource)):
-             # TODO: make sure execute_op_set uses self.val_datasource
-             expected_output = self.val_datasource[source_idx]
-             expected_outputs[source_idx] = expected_output
-
-         # run sentinel plan
-         execution_data, plan_stats = self.execute_sentinel_plan(sentinel_plan, expected_outputs, policy)
-
-         return execution_data, plan_stats
-
-
-     def create_sentinel_plan(self, dataset: Set, policy: Policy) -> SentinelPlan:
-         """
-         Generates and returns a SentinelPlan for the given dataset.
-         """
-         # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/ filters pulled up
-
-         # create a new optimizer and update its strategy to SENTINEL
-         optimizer = self.optimizer.deepcopy_clean()
-         optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
-
-         # create a copy of the dataset, but change its data source to the validation data source
-         dataset = deepcopy(dataset)
-         dataset._set_data_source(self.val_datasource)
-
-         # get the sentinel plan for the given dataset
-         sentinel_plans = optimizer.optimize(dataset, policy)
-         sentinel_plan = sentinel_plans[0]
-
-         return sentinel_plan
-
-
-     def execute(self) -> DataRecordCollection:
-         execution_start_time = time.time()
-
-         # for now, enforce that we are using validation data; we can relax this after paper submission
-         if self.val_datasource is None:
-             raise Exception("Make sure you are using validation data with MABSentinelExecutionEngine")
-
-         # if nocache is True, make sure we do not re-use codegen examples
-         if self.nocache:
-             # self.clear_cached_examples()
-             pass
-
-         # create sentinel plan
-         sentinel_plan = self.create_sentinel_plan(self.dataset, self.policy)
-
-         # generate sample execution data
-         all_execution_data, plan_stats = self.generate_sample_observations(sentinel_plan, self.policy)
-
-         # put sentinel plan execution stats into a list and prepare the list of output records
-         all_plan_stats = [plan_stats]
-         all_records = []
-
-         # (re-)initialize the optimizer
-         optimizer = self.optimizer.deepcopy_clean()
-
-         # construct the CostModel with any sample execution data we've gathered
-         cost_model = SampleBasedCostModel(sentinel_plan, all_execution_data, self.verbose)
-         optimizer.update_cost_model(cost_model)
-         total_optimization_time = time.time() - execution_start_time
-
-         # execute plan(s) according to the optimization strategy
-         records, plan_stats = self._execute_with_strategy(self.dataset, self.policy, optimizer)
-         all_records.extend(records)
-         all_plan_stats.extend(plan_stats)
-
-         # aggregate plan stats
-         aggregate_plan_stats = self.aggregate_plan_stats(all_plan_stats)
-
-         # add sentinel records and plan stats (if captured) to the plan execution data
-         execution_stats = ExecutionStats(
-             execution_id=self.execution_id(),
-             plan_stats=aggregate_plan_stats,
-             total_optimization_time=total_optimization_time,
-             total_execution_time=time.time() - execution_start_time,
-             total_execution_cost=sum(plan_stats.total_plan_cost for plan_stats in aggregate_plan_stats.values()),
-             plan_strs={plan_id: plan_stats.plan_str for plan_id, plan_stats in aggregate_plan_stats.items()},
-         )
-
-         return DataRecordCollection(all_records, execution_stats=execution_stats)
-
-
- class MABSentinelSequentialSingleThreadProcessor(MABSentinelQueryProcessor, SequentialSingleThreadExecutionStrategy):
-     """
-     This class performs sentinel execution while executing plans in a sequential, single-threaded fashion.
-     """
-     def __init__(self, *args, **kwargs):
-         MABSentinelQueryProcessor.__init__(self, *args, **kwargs)
-         SequentialSingleThreadExecutionStrategy.__init__(
-             self,
-             scan_start_idx=self.scan_start_idx,
-             max_workers=self.max_workers,
-             nocache=self.nocache,
-             verbose=self.verbose,
-         )
-         self.progress_manager = None
-
-
- class MABSentinelPipelinedParallelProcessor(MABSentinelQueryProcessor, PipelinedParallelExecutionStrategy):
-     """
-     This class performs sentinel execution while executing plans in a pipelined, parallel fashion.
-     """
-     def __init__(self, *args, **kwargs):
-         MABSentinelQueryProcessor.__init__(self, *args, **kwargs)
-         PipelinedParallelExecutionStrategy.__init__(
-             self,
-             scan_start_idx=self.scan_start_idx,
-             max_workers=self.max_workers,
-             nocache=self.nocache,
-             verbose=self.verbose,
-         )
-         self.progress_manager = None