palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.1.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
@@ -1,639 +0,0 @@
- import time
- from concurrent.futures import ThreadPoolExecutor, wait
- from copy import deepcopy
-
- import numpy as np
-
- from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
- from palimpzest.core.data.dataclasses import (
-     ExecutionStats,
-     OperatorCostEstimates,
-     OperatorStats,
-     PlanStats,
-     RecordOpStats,
- )
- from palimpzest.core.elements.records import DataRecordCollection, DataRecordSet
- from palimpzest.policy import Policy
- from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy
- from palimpzest.query.execution.single_threaded_execution_strategy import (
-     PipelinedSingleThreadExecutionStrategy,
-     SequentialSingleThreadExecutionStrategy,
- )
- from palimpzest.query.operators.convert import ConvertOp, LLMConvert
- from palimpzest.query.operators.filter import FilterOp, LLMFilter
- from palimpzest.query.operators.physical import PhysicalOperator
- from palimpzest.query.operators.retrieve import RetrieveOp
- from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp
- from palimpzest.query.optimizer.cost_model import SampleBasedCostModel
- from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType
- from palimpzest.query.optimizer.plan import SentinelPlan
- from palimpzest.query.processor.query_processor import QueryProcessor
- from palimpzest.sets import Set
-
-
- class RandomSamplingSentinelQueryProcessor(QueryProcessor):
-     """
-
-     """
-     def __init__(
-         self,
-         k: int,
-         sample_budget: int,
-         sample_all_ops: bool = False,
-         sample_all_records: bool = False,
-         sample_start_idx: int | None = None,
-         sample_end_idx: int | None = None,
-         use_final_op_quality: bool = False,
-         seed: int = 42,
-         exp_name: str | None = None,
-         *args,
-         **kwargs,
-     ):
-         super().__init__(*args, **kwargs)
-         # self.max_workers = self.get_parallel_max_workers()
-         # TODO: undo
-         # self.max_workers = 1
-         self.k = k
-         self.sample_budget = sample_budget
-         self.j = int(sample_budget / k)
-         self.sample_all_ops = sample_all_ops
-         self.sample_all_records = sample_all_records
-         self.sample_start_idx = sample_start_idx
-         self.sample_end_idx = sample_end_idx
-         self.use_final_op_quality = use_final_op_quality
-         self.pick_output_fn = self.pick_ensemble_output
-         self.rng = np.random.default_rng(seed=seed)
-         self.exp_name = exp_name
-
-
-     def compute_quality(
-         self,
-         record_set: DataRecordSet,
-         expected_output: dict | None = None,
-         champion_record_set: DataRecordSet | None = None,
-         is_filter_op: bool = False,
-         is_convert_op: bool = False,
-     ) -> DataRecordSet:
-         """
-         Compute the quality for the given `record_set` by comparing it to the `expected_output`.
-
-         Update the record_set by assigning the quality to each entry in its record_op_stats and
-         returning the updated record_set.
-         """
-         # compute whether we can only use the champion
-         only_using_champion = expected_output is None
-
-         # if this operation is a failed convert
-         if is_convert_op and len(record_set) == 0:
-             record_set.record_op_stats[0].quality = 0.0
-
-         # if this operation is a filter:
-         # - we assign a quality of 1.0 if the record is in the expected outputs and it passes this filter
-         # - we assign a quality of 0.0 if the record is in the expected outputs and it does NOT pass this filter
-         # - we assign a quality relative to the champion / ensemble output if the record is not in the expected outputs
-         # we cannot know for certain what the correct behavior is a given filter on a record which is not in the output
-         # (unless it is the only filter in the plan), thus we only evaluate the filter based on its performance on
-         # records which are in the output
-         elif is_filter_op:
-             # NOTE:
-             # - we know that record_set.record_op_stats will contain a single entry for a filter op
-             # - if we are using the champion, then champion_record_set will also contain a single entry for a filter op
-             record_op_stats = record_set.record_op_stats[0]
-             if only_using_champion:
-                 champion_record = champion_record_set[0]
-                 record_op_stats.quality = int(record_op_stats.passed_operator == champion_record.passed_operator)
-
-             # - if we are using validation data, we may have multiple expected records in the expected_output for this source_idx,
-             #   thus, if we can identify an exact match, we can use that to evaluate the filter's quality
-             # - if we are using validation data but we *cannot* find an exact match, then we will once again use the champion record set
-             else:
-                 # compute number of matches between this record's computed fields and this expected record's outputs
-                 found_match_in_output = False
-                 labels_dict_lst = expected_output["labels"] if isinstance(expected_output["labels"], list) else [expected_output["labels"]]
-                 for labels_dict in labels_dict_lst:
-                     all_correct = True
-                     for field, value in record_op_stats.record_state.items():
-                         if value != labels_dict[field]:
-                             all_correct = False
-                             break
-
-                     if all_correct:
-                         found_match_in_output = True
-                         break
-
-                 if found_match_in_output:
-                     record_op_stats.quality = int(record_op_stats.passed_operator)
-                 else:
-                     champion_record = champion_record_set[0]
-                     record_op_stats.quality = int(record_op_stats.passed_operator == champion_record.passed_operator)
-
-         # if this is a successful convert operation
-         else:
-             # NOTE: the following computation assumes we do not project out computed values
-             # (and that the validation examples provide all computed fields); even if
-             # a user program does add projection, we can ignore the projection on the
-             # validation dataset and use the champion model (as opposed to the validation
-             # output) for scoring fields which have their values projected out
-
-             # create list of dictionaries of labels for each expected / champion output
-             labels_dict_lst = []
-             if only_using_champion:
-                 for champion_record in champion_record_set:
-                     labels_dict_lst.append(champion_record.to_dict())
-             else:
-                 labels_dict_lst = (
-                     expected_output["labels"]
-                     if isinstance(expected_output["labels"], list)
-                     else [expected_output["labels"]]
-                 )
-
-             # GREEDY ALGORITHM
-             # for each record in the expected output, we look for the computed record which maximizes the quality metric;
-             # once we've identified that computed record we remove it from consideration for the next expected output
-             field_to_score_fn = {} if only_using_champion else expected_output["score_fn"]
-             for labels_dict in labels_dict_lst:
-                 best_quality, best_record_op_stats = 0.0, None
-                 for record_op_stats in record_set.record_op_stats:
-                     # if we already assigned this record a quality, skip it
-                     if record_op_stats.quality is not None:
-                         continue
-
-                     # compute number of matches between this record's computed fields and this expected record's outputs
-                     total_quality = 0
-                     for field in record_op_stats.generated_fields:
-                         computed_value = record_op_stats.record_state.get(field, None)
-                         expected_value = labels_dict[field]
-
-                         # get the metric function for this field
-                         score_fn = field_to_score_fn.get(field, "exact")
-
-                         # compute exact match
-                         if score_fn == "exact":
-                             total_quality += int(computed_value == expected_value)
-
-                         # compute UDF metric
-                         elif callable(score_fn):
-                             total_quality += score_fn(computed_value, expected_value)
-
-                         # otherwise, throw an exception
-                         else:
-                             raise Exception(f"Unrecognized score_fn: {score_fn}")
-
-                     # compute recall and update best seen so far
-                     quality = total_quality / len(record_op_stats.generated_fields)
-                     if quality > best_quality:
-                         best_quality = quality
-                         best_record_op_stats = record_op_stats
-
-                 # set best_quality as quality for the best_record_op_stats
-                 if best_record_op_stats is not None:
-                     best_record_op_stats.quality = best_quality
-
-             # for any records which did not receive a quality, set it to 0.0 as these are unexpected extras
-             for record_op_stats in record_set.record_op_stats:
-                 if record_op_stats.quality is None:
-                     record_op_stats.quality = 0.0
-
-         return record_set
-
-
-     def score_quality(
-         self,
-         operator_sets: list[list[PhysicalOperator]],
-         execution_data: dict[str, dict[str, list[DataRecordSet]]],
-         champion_outputs: dict[str, dict[str, DataRecordSet]],
-         expected_outputs: dict[str, dict],
-     ) -> list[RecordOpStats]:
-         """
-         NOTE: This approach to cost modeling does not work directly for aggregation queries;
-         for these queries, we would ask the user to provide validation data for the step immediately
-         before a final aggregation
-
-         NOTE: This function currently assumes that one-to-many converts do NOT create duplicate outputs.
-         This assumption would break if, for example, we extracted the breed of every dog in an image.
-         If there were two golden retrievers and a bernoodle in an image and we extracted:
-
-             {"image": "file1.png", "breed": "Golden Retriever"}
-             {"image": "file1.png", "breed": "Golden Retriever"}
-             {"image": "file1.png", "breed": "Bernedoodle"}
-
-         This function would currently give perfect accuracy to the following output:
-
-             {"image": "file1.png", "breed": "Golden Retriever"}
-             {"image": "file1.png", "breed": "Bernedoodle"}
-
-         Even though it is missing one of the golden retrievers.
-         """
-         # extract information about the logical operation performed at this stage of the sentinel plan;
-         # NOTE: we can infer these fields from context clues, but in the long-term we should have a more
-         # principled way of getting these directly from attributes either stored in the sentinel_plan
-         # or in the PhysicalOperator
-         op_set = operator_sets[-1]
-         physical_op = op_set[0]
-         is_source_op = isinstance(physical_op, (MarshalAndScanDataOp, CacheScanDataOp))
-         is_filter_op = isinstance(physical_op, FilterOp)
-         is_convert_op = isinstance(physical_op, ConvertOp)
-         is_perfect_quality_op = (
-             not isinstance(physical_op, LLMConvert)
-             and not isinstance(physical_op, LLMFilter)
-             and not isinstance(physical_op, RetrieveOp)
-         )
-         logical_op_id = physical_op.logical_op_id
-
-         # if this logical_op_id is not in the execution_data (because all upstream records were filtered), return
-         if logical_op_id not in execution_data:
-             return execution_data
-
-         # pull out the execution data from this operator; place the upstream execution data in a new list
-         this_op_execution_data = execution_data[logical_op_id]
-
-         # compute quality of each output computed by this operator
-         for source_idx, record_sets in this_op_execution_data.items():
-             # NOTE
-             # source_idx is a particular input, for which we may have computed multiple output record_sets;
-             # each of these record_sets may contain more than one record (b/c one-to-many) and we have one
-             # record_set per operator in the op_set
-
-             # if this operation does not involve an LLM, every record_op_stats object gets perfect quality
-             if is_perfect_quality_op:
-                 for record_set in record_sets:
-                     for record_op_stats in record_set.record_op_stats:
-                         record_op_stats.quality = 1.0
-                 continue
-
-             # get the expected output for this source_idx if we have one
-             expected_output = (
-                 expected_outputs[source_idx]
-                 if expected_outputs is not None and source_idx in expected_outputs
-                 else None
-             )
-
-             # extract champion output for this record set
-             champion_record_set = champion_outputs[logical_op_id][source_idx]
-
-             # for each record_set produced by an operation, compute its quality
-             for record_set in record_sets:
-                 record_set = self.compute_quality(record_set, expected_output, champion_record_set, is_filter_op, is_convert_op)
-
-         # if this operator is a source op (i.e. has no input logical operator), return the execution data
-         if is_source_op:
-             return execution_data
-
-         # recursively call the function on the next logical operator until you reach a scan
-         execution_data = self.score_quality(operator_sets[:-1], execution_data, champion_outputs, expected_outputs)
-
-         # return the quality annotated record op stats
-         return execution_data
-
-     def pick_champion_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
-         # if there's only one operator in the set, we return its record_set
-         if len(op_set_record_sets) == 1:
-             record_set, _ = op_set_record_sets[0]
-             return record_set
-
-         # find the operator with the highest average quality and return its record_set
-         base_op_cost_est = OperatorCostEstimates(cardinality=1.0, cost_per_record=0.0, time_per_record=0.0, quality=1.0)
-         champion_record_set, champion_quality = None, -1.0
-         for record_set, op in op_set_record_sets:
-             op_cost_estimates = op.naive_cost_estimates(base_op_cost_est)
-             if op_cost_estimates.quality > champion_quality:
-                 champion_record_set, champion_quality = record_set, op_cost_estimates.quality
-
-         return champion_record_set
-
-     def pick_ensemble_output(self, op_set_record_sets: list[tuple[DataRecordSet, PhysicalOperator]]) -> DataRecordSet:
-         # if there's only one operator in the set, we return its record_set
-         if len(op_set_record_sets) == 1:
-             record_set, _ = op_set_record_sets[0]
-             return record_set
-
-         # NOTE: I don't like that this assumes the models are consistent in
-         # how they order their record outputs for one-to-many converts;
-         # eventually we can try out more robust schemes to account for
-         # differences in ordering
-         # aggregate records at each index in the response
-         idx_to_records = {}
-         for record_set, _ in op_set_record_sets:
-             for idx, record in enumerate(record_set):
-                 if idx not in idx_to_records:
-                     idx_to_records[idx] = [record]
-                 else:
-                     idx_to_records[idx].append(record)
-
-         # compute most common answer at each index
-         out_records = []
-         for idx in range(len(idx_to_records)):
-             records = idx_to_records[idx]
-             most_common_record = max(set(records), key=records.count)
-             out_records.append(most_common_record)
-
-         # create and return final DataRecordSet
-         return DataRecordSet(out_records, [])
-
-
-     def execute_op_set(self, candidates, op_set):
-         # TODO: post-submission we will need to modify this to:
-         # - submit all candidates for aggregate operators
-         # - handle limits
-         # create thread pool w/max workers and run futures over worker pool
-         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-             # create futures
-             futures = []
-             for candidate in candidates:
-                 for operator in op_set:
-                     future = executor.submit(PhysicalOperator.execute_op_wrapper, operator, candidate)
-                     futures.append(future)
-
-             # compute output record_set for each (operator, candidate) pair
-             output_record_sets = []
-             while len(futures) > 0:
-                 # get the set of futures that have (and have not) finished in the last PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
-                 done_futures, not_done_futures = wait(futures, timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
-
-                 # cast not_done_futures from a set to a list so we can append to it
-                 not_done_futures = list(not_done_futures)
-
-                 # process finished futures
-                 for future in done_futures:
-                     # get the result and add it to the output records set
-                     record_set, operator, candidate = future.result()
-                     output_record_sets.append((record_set, operator, candidate))
-
-                 # update list of futures
-                 futures = not_done_futures
-
-         # compute mapping from source_idx to record sets for all operators and for champion operator
-         all_record_sets, champion_record_sets = {}, {}
-         for candidate in candidates:
-             candidate_output_record_sets = []
-             for record_set, operator, candidate_ in output_record_sets:
-                 if candidate == candidate_:
-                     candidate_output_record_sets.append((record_set, operator))
-
-             # select the champion (i.e. best) record_set from all the record sets computed for this operator
-             champion_record_set = self.pick_output_fn(candidate_output_record_sets)
-
-             # get the source_idx associated with this input record
-             source_idx = candidate.source_idx
-
-             # add champion record_set to mapping from source_idx --> champion record_set
-             champion_record_sets[source_idx] = champion_record_set
-
-             # add all record_sets computed for this source_idx to mapping from source_idx --> record_sets
-             all_record_sets[source_idx] = [tup[0] for tup in candidate_output_record_sets]
-
-         return all_record_sets, champion_record_sets
-
-
-     def execute_sentinel_plan(self, plan: SentinelPlan, expected_outputs: dict[str, dict], policy: Policy):
-         """
-         """
-         if self.verbose:
-             print("----------------------")
-             print(f"PLAN[{plan.plan_id}] (sentinel):")
-             print(plan)
-             print("---")
-
-         plan_start_time = time.time()
-
-         # initialize plan stats and operator stats
-         plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
-         for logical_op_id, logical_op_name, op_set in plan:
-             op_set_details = {
-                 op.op_name(): {k: str(v) for k, v in op.get_id_params().items()}
-                 for op in op_set
-             }
-             plan_stats.operator_stats[logical_op_id] = OperatorStats(
-                 op_id=logical_op_id,
-                 op_name=logical_op_name,
-                 op_details=op_set_details,
-             )
-
-         # sample validation records
-         total_num_samples = len(self.val_datasource)
-         source_indices = np.arange(total_num_samples)
-         if self.sample_start_idx is not None:
-             assert self.sample_end_idx is not None
-             source_indices = source_indices[self.sample_start_idx:self.sample_end_idx]
-         elif not self.sample_all_records:
-             self.rng.shuffle(source_indices)
-             j = min(self.j, len(source_indices))
-             source_indices = source_indices[:j]
-
-         # initialize output variables
-         all_outputs, champion_outputs = {}, {}
-
-         # create initial set of candidates for source scan operator
-         candidates = []
-         for source_idx in source_indices:
-             candidates.append(source_idx)
-
-         # NOTE: because we need to dynamically create sample matrices for each operator,
-         # sentinel execution must be executed one operator at a time (i.e. sequentially)
-         # execute operator sets in sequence
-         for op_idx, (logical_op_id, _, op_set) in enumerate(plan):
-             prev_logical_op_id = plan.logical_op_ids[op_idx - 1] if op_idx > 0 else None
-             next_logical_op_id = plan.logical_op_ids[op_idx + 1] if op_idx + 1 < len(plan) else None
-
-             # sample k optimizations
-             k = min(self.k, len(op_set)) if not self.sample_all_ops else len(op_set)
-             sampled_ops = self.rng.choice(op_set, size=k, replace=False)
-
-             # run sampled operators on sampled candidates
-             source_idx_to_record_sets, source_idx_to_champion_record_set = self.execute_op_set(candidates, sampled_ops)
-
-             # update all_outputs and champion_outputs dictionary
-             if logical_op_id not in all_outputs:
-                 all_outputs[logical_op_id] = source_idx_to_record_sets
-                 champion_outputs[logical_op_id] = source_idx_to_champion_record_set
-             else:
-                 for source_idx, record_sets in source_idx_to_record_sets.items():
-                     if source_idx not in all_outputs[logical_op_id]:
-                         all_outputs[logical_op_id][source_idx] = record_sets
-                         champion_outputs[logical_op_id][source_idx] = source_idx_to_champion_record_set[source_idx]
-                     else:
-                         all_outputs[logical_op_id][source_idx].extend(record_sets)
-                         champion_outputs[logical_op_id][source_idx].extend(source_idx_to_champion_record_set[source_idx])
-
-             # flatten lists of records and record_op_stats
-             all_records, all_record_op_stats = [], []
-             for _, record_sets in source_idx_to_record_sets.items():
-                 for record_set in record_sets:
-                     all_records.extend(record_set.data_records)
-                     all_record_op_stats.extend(record_set.record_op_stats)
-
-             # update plan stats
-             plan_stats.operator_stats[logical_op_id].add_record_op_stats(
-                 all_record_op_stats,
-                 source_op_id=prev_logical_op_id,
-                 plan_id=plan.plan_id,
-             )
-
-             # add records (which are not filtered) to the cache, if allowed
-             if not self.nocache:
-                 for record in all_records:
-                     if getattr(record, "passed_operator", True):
-                         # self.datadir.append_cache(logical_op_id, record)
-                         pass
-
-             # update candidates for next operator; we use champion outputs as input
-             candidates = []
-             if next_logical_op_id is not None:
-                 for _, record_set in source_idx_to_champion_record_set.items():
-                     for record in record_set:
-                         if isinstance(op_set[0], FilterOp) and not record.passed_operator:
-                             continue
-                         candidates.append(record)
-
-             # if we've filtered out all records, terminate early
-             if next_logical_op_id is not None and candidates == []:
-                 break
-
-         # compute quality for each operator
-         all_outputs = self.score_quality(plan.operator_sets, all_outputs, champion_outputs, expected_outputs)
-
-         # if caching was allowed, close the cache
-         if not self.nocache:
-             for _, _, _ in plan:
-                 # self.datadir.close_cache(logical_op_id)
-                 pass
-
-         # finalize plan stats
-         total_plan_time = time.time() - plan_start_time
-         plan_stats.finalize(total_plan_time)
-
-         return all_outputs, plan_stats
-
-
-     def generate_sample_observations(self, sentinel_plan: SentinelPlan, policy: Policy):
-         """
-         This function is responsible for generating sample observation data which can be
-         consumed by the CostModel.
-
-         To accomplish this, we construct a special sentinel plan using the Optimizer which is
-         capable of executing any valid physical implementation of a Filter or Convert operator
-         on each record.
-         """
-         # if we're using validation data, get the set of expected output records
-         expected_outputs = {}
-         for source_idx in range(len(self.val_datasource)):
-             # TODO: make sure execute_op_set uses self.val_datasource
-             expected_output = self.val_datasource[source_idx]
-             expected_outputs[source_idx] = expected_output
-
-         # run sentinel plan
-         execution_data, plan_stats = self.execute_sentinel_plan(sentinel_plan, expected_outputs, policy)
-
-         return execution_data, plan_stats
-
-
-     def create_sentinel_plan(self, dataset: Set, policy: Policy) -> SentinelPlan:
-         """
-         Generates and returns a SentinelPlan for the given dataset.
-         """
-         # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up
-
-         # create a new optimizer and update its strategy to SENTINEL
-         optimizer = self.optimizer.deepcopy_clean()
-         optimizer.update_strategy(OptimizationStrategyType.SENTINEL)
-
-         # create copy of dataset, but change its data source to the validation data source
-         dataset = deepcopy(dataset)
-         dataset._set_data_source(self.val_datasource)
-
-         # get the sentinel plan for the given dataset
-         sentinel_plans = optimizer.optimize(dataset, policy)
-         sentinel_plan = sentinel_plans[0]
-
-         return sentinel_plan
-
-
-     def execute(self) -> DataRecordCollection:
-         execution_start_time = time.time()
-
-         # for now, enforce that we are using validation data; we can relax this after paper submission
-         if self.val_datasource is None:
-             raise Exception("Make sure you are using validation data with MABSentinelExecutionEngine")
-
-         # if nocache is True, make sure we do not re-use codegen examples
-         if self.nocache:
-             # self.clear_cached_examples()
-             pass
-
-         # create sentinel plan
-         sentinel_plan = self.create_sentinel_plan(self.dataset, self.policy)
-
-         # generate sample execution data
-         all_execution_data, plan_stats = self.generate_sample_observations(sentinel_plan, self.policy)
-
-         # put sentinel plan execution stats into list and prepare list of output records
-         all_plan_stats = [plan_stats]
-         all_records = []
-
-         # construct the CostModel with any sample execution data we've gathered
-         cost_model = SampleBasedCostModel(sentinel_plan, all_execution_data, self.verbose, self.exp_name)
-         optimizer = self.optimizer.deepcopy_clean()
-         optimizer.update_cost_model(cost_model)
-         total_optimization_time = time.time() - execution_start_time
-
-         # execute plan(s) according to the optimization strategy
-         records, plan_stats = self._execute_with_strategy(self.dataset, self.policy, optimizer)
-         all_records.extend(records)
-         all_plan_stats.extend(plan_stats)
-
-         # aggregate plan stats
-         aggregate_plan_stats = self.aggregate_plan_stats(all_plan_stats)
-
-         # add sentinel records and plan stats (if captured) to plan execution data
-         execution_stats = ExecutionStats(
-             execution_id=self.execution_id(),
-             plan_stats=aggregate_plan_stats,
-             total_optimization_time=total_optimization_time,
-             total_execution_time=time.time() - execution_start_time,
-             total_execution_cost=sum(list(map(lambda plan_stats: plan_stats.total_plan_cost, aggregate_plan_stats.values()))),
-             plan_strs={plan_id: plan_stats.plan_str for plan_id, plan_stats in aggregate_plan_stats.items()},
-         )
-
-         return DataRecordCollection(all_records, execution_stats=execution_stats)
-
-
- class RandomSamplingSentinelSequentialSingleThreadProcessor(RandomSamplingSentinelQueryProcessor, SequentialSingleThreadExecutionStrategy):
-     """
-     This class performs sentinel execution while executing plans in a sequential, single-threaded fashion.
-     """
-     def __init__(self, *args, **kwargs):
-         RandomSamplingSentinelQueryProcessor.__init__(self, *args, **kwargs)
-         SequentialSingleThreadExecutionStrategy.__init__(
-             self,
-             scan_start_idx=self.scan_start_idx,
-             max_workers=self.max_workers,
-             verbose=self.verbose
-         )
-
-
- class RandomSamplingSentinelPipelinedParallelProcessor(RandomSamplingSentinelQueryProcessor, PipelinedParallelExecutionStrategy):
-     """
-     This class performs sentinel execution while executing plans in a pipelined, parallel fashion.
-     """
-     def __init__(self, *args, **kwargs):
-         RandomSamplingSentinelQueryProcessor.__init__(self, *args, **kwargs)
-         PipelinedParallelExecutionStrategy.__init__(
-             self,
-             scan_start_idx=self.scan_start_idx,
-             max_workers=self.max_workers,
-             verbose=self.verbose
-         )
-
-
- class RandomSamplingSentinelPipelinedSingleThreadProcessor(RandomSamplingSentinelQueryProcessor, PipelinedSingleThreadExecutionStrategy):
-     """
-     This class performs sentinel execution while executing plans in a pipelined, parallel fashion.
-     """
-     def __init__(self, *args, **kwargs):
-         RandomSamplingSentinelQueryProcessor.__init__(self, *args, **kwargs)
-         PipelinedSingleThreadExecutionStrategy.__init__(
-             self,
-             scan_start_idx=self.scan_start_idx,
-             max_workers=self.max_workers,
-             verbose=self.verbose
-         )
@@ -1,6 +0,0 @@
- import re
-
-
- def get_index_str(index):
-     regex = re.compile('<(.*?) object at.*?>')
-     return regex.match(str(index))[1]