palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.1.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
palimpzest/query/processor/nosentinel_processor.py (+16 -520)
@@ -1,537 +1,33 @@
1
- import time
1
+ import logging
2
2
 
3
- from palimpzest.core.data.dataclasses import ExecutionStats, OperatorStats, PlanStats
3
+ from palimpzest.core.data.dataclasses import ExecutionStats
4
4
  from palimpzest.core.elements.records import DataRecordCollection
5
- from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy
6
- from palimpzest.query.execution.single_threaded_execution_strategy import (
7
- PipelinedSingleThreadExecutionStrategy,
8
- SequentialSingleThreadExecutionStrategy,
9
- )
10
- from palimpzest.query.operators.aggregate import AggregateOp
11
- from palimpzest.query.operators.filter import FilterOp
12
- from palimpzest.query.operators.limit import LimitScanOp
13
- from palimpzest.query.operators.scan import ScanPhysicalOp
14
- from palimpzest.query.optimizer.plan import PhysicalPlan
15
5
  from palimpzest.query.processor.query_processor import QueryProcessor
16
- from palimpzest.utils.progress import create_progress_manager
17
6
 
7
+ logger = logging.getLogger(__name__)
18
8
 
19
9
  class NoSentinelQueryProcessor(QueryProcessor):
20
10
  """
21
- Specialized query processor that implements no sentinel strategy
22
- for coordinating optimization and execution.
11
+ Query processor that uses naive cost estimates to select the best plan.
23
12
  """
24
13
 
25
14
  # TODO: Consider to support dry_run.
26
15
  def execute(self) -> DataRecordCollection:
27
- execution_start_time = time.time()
16
+ logger.info("Executing NoSentinelQueryProcessor")
28
17
 
29
- # if nocache is True, make sure we do not re-use codegen examples
30
- if self.nocache:
31
- # self.clear_cached_examples()
32
- pass
18
+ # create execution stats
19
+ execution_stats = ExecutionStats(execution_id=self.execution_id())
20
+ execution_stats.start()
33
21
 
34
22
  # execute plan(s) according to the optimization strategy
35
- records, plan_stats = self._execute_with_strategy(self.dataset, self.policy, self.optimizer)
23
+ records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)
36
24
 
37
- # aggregate plan stats
38
- aggregate_plan_stats = self.aggregate_plan_stats(plan_stats)
25
+ # update the execution stats to account for the work to execute the final plan
26
+ execution_stats.add_plan_stats(plan_stats)
27
+ execution_stats.finish()
39
28
 
40
- # add sentinel records and plan stats (if captured) to plan execution data
41
- execution_stats = ExecutionStats(
42
- execution_id=self.execution_id(),
43
- plan_stats=aggregate_plan_stats,
44
- total_execution_time=time.time() - execution_start_time,
45
- total_execution_cost=sum(
46
- list(map(lambda plan_stats: plan_stats.total_plan_cost, aggregate_plan_stats.values()))
47
- ),
48
- plan_strs={plan_id: plan_stats.plan_str for plan_id, plan_stats in aggregate_plan_stats.items()},
49
- )
29
+ # construct and return the DataRecordCollection
30
+ result = DataRecordCollection(records, execution_stats=execution_stats)
31
+ logger.info("Done executing NoSentinelQueryProcessor")
50
32
 
51
- return DataRecordCollection(records, execution_stats=execution_stats)
52
-
53
-
54
- class NoSentinelSequentialSingleThreadProcessor(NoSentinelQueryProcessor, SequentialSingleThreadExecutionStrategy):
55
- """
56
- This class performs non-sample based execution while executing plans in a sequential, single-threaded fashion.
57
- """
58
- def __init__(self, *args, **kwargs):
59
- NoSentinelQueryProcessor.__init__(self, *args, **kwargs)
60
- SequentialSingleThreadExecutionStrategy.__init__(
61
- self,
62
- scan_start_idx=self.scan_start_idx,
63
- max_workers=self.max_workers,
64
- nocache=self.nocache,
65
- verbose=self.verbose
66
- )
67
- self.progress_manager = None
68
-
69
- def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1):
70
- """Initialize the stats and execute the plan with progress reporting."""
71
- if self.verbose:
72
- print("----------------------")
73
- print(f"PLAN[{plan.plan_id}] (n={num_samples}):")
74
- print(plan)
75
- print("---")
76
-
77
- plan_start_time = time.time()
78
-
79
- # Initialize progress manager
80
- self.progress_manager = create_progress_manager()
81
-
82
- # initialize plan stats and operator stats
83
- plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
84
- for op in plan.operators:
85
- op_id = op.get_op_id()
86
- op_name = op.op_name()
87
- op_details = {k: str(v) for k, v in op.get_id_params().items()}
88
- plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)
89
-
90
- # initialize list of output records and intermediate variables
91
- output_records = []
92
- current_scan_idx = self.scan_start_idx
93
-
94
- # get handle to scan operator and pre-compute its size
95
- source_operator = plan.operators[0]
96
- assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
97
- datareader_len = len(source_operator.datareader)
98
-
99
- # Calculate total work units - each record needs to go through each operator
100
- total_ops = len(plan.operators)
101
- total_items = min(num_samples, datareader_len) if num_samples != float("inf") else datareader_len
102
- total_work_units = total_items * total_ops
103
- self.progress_manager.start(total_work_units)
104
- work_units_completed = 0
105
-
106
- # initialize processing queues for each operation
107
- processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
108
-
109
- try:
110
- # execute the plan one operator at a time
111
- for op_idx, operator in enumerate(plan.operators):
112
- op_id = operator.get_op_id()
113
- prev_op_id = plan.operators[op_idx - 1].get_op_id() if op_idx > 1 else None
114
- next_op_id = plan.operators[op_idx + 1].get_op_id() if op_idx + 1 < len(plan.operators) else None
115
-
116
- # Update progress to show which operator is currently running
117
- op_name = operator.__class__.__name__
118
- self.progress_manager.update(work_units_completed, f"Running {op_name} ({op_idx + 1}/{total_ops})")
119
-
120
- # initialize output records and record_op_stats for this operator
121
- records, record_op_stats = [], []
122
-
123
- # invoke scan operator(s) until we run out of source records or hit the num_samples limit
124
- if isinstance(operator, ScanPhysicalOp):
125
- keep_scanning_source_records = True
126
- while keep_scanning_source_records:
127
- # run ScanPhysicalOp on current scan index
128
- record_set = operator(current_scan_idx)
129
- records.extend(record_set.data_records)
130
- record_op_stats.extend(record_set.record_op_stats)
131
-
132
- # Update progress for each processed record in data source
133
- work_units_completed += 1
134
- self.progress_manager.update(
135
- work_units_completed,
136
- f"Scanning data source: {current_scan_idx + 1}/{total_items}"
137
- )
138
-
139
- # update the current scan index
140
- current_scan_idx += 1
141
-
142
- # update whether to keep scanning source records
143
- keep_scanning_source_records = current_scan_idx < datareader_len and len(records) < num_samples
144
-
145
- # aggregate operators accept all input records at once
146
- elif isinstance(operator, AggregateOp):
147
- record_set = operator(candidates=processing_queues[op_id])
148
- records = record_set.data_records
149
- record_op_stats = record_set.record_op_stats
150
-
151
- # Update progress for aggregate operation - count all records being aggregated
152
- work_units_completed += len(processing_queues[op_id])
153
- self.progress_manager.update(
154
- work_units_completed,
155
- f"Aggregating {len(processing_queues[op_id])} records"
156
- )
157
-
158
- # otherwise, process the records in the processing queue for this operator one at a time
159
- elif len(processing_queues[op_id]) > 0:
160
- queue_size = len(processing_queues[op_id])
161
- for idx, input_record in enumerate(processing_queues[op_id]):
162
- record_set = operator(input_record)
163
- records.extend(record_set.data_records)
164
- record_op_stats.extend(record_set.record_op_stats)
165
-
166
- # Update progress for each processed record in the queue
167
- work_units_completed += 1
168
- self.progress_manager.update(
169
- work_units_completed,
170
- f"Processing records: {idx + 1}/{queue_size}"
171
- )
172
-
173
- if isinstance(operator, LimitScanOp) and len(records) == operator.limit:
174
- break
175
-
176
- # update plan stats
177
- plan_stats.operator_stats[op_id].add_record_op_stats(
178
- record_op_stats,
179
- source_op_id=prev_op_id,
180
- plan_id=plan.plan_id,
181
- )
182
-
183
- # add records (which are not filtered) to the cache, if allowed
184
- if not self.nocache:
185
- for record in records:
186
- if getattr(record, "passed_operator", True):
187
- # self.datadir.append_cache(operator.target_cache_id, record)
188
- pass
189
-
190
- # update processing_queues or output_records
191
- for record in records:
192
- if isinstance(operator, FilterOp) and not record.passed_operator:
193
- continue
194
- if next_op_id is not None:
195
- processing_queues[next_op_id].append(record)
196
- else:
197
- output_records.append(record)
198
-
199
- # if we've filtered out all records, terminate early
200
- if next_op_id is not None and processing_queues[next_op_id] == []:
201
- break
202
-
203
- # if caching was allowed, close the cache
204
- if not self.nocache:
205
- for _ in plan.operators:
206
- # self.datadir.close_cache(operator.target_cache_id)
207
- pass
208
-
209
- # finalize plan stats
210
- total_plan_time = time.time() - plan_start_time
211
- plan_stats.finalize(total_plan_time)
212
-
213
- finally:
214
- # Always finish progress tracking
215
- if self.progress_manager:
216
- self.progress_manager.finish()
217
-
218
- return output_records, plan_stats
219
-
220
-
221
- class NoSentinelPipelinedSingleThreadProcessor(NoSentinelQueryProcessor, PipelinedSingleThreadExecutionStrategy):
222
- """
223
- This class performs non-sample based execution while executing plans in a pipelined, parallel fashion.
224
- """
225
- def __init__(self, *args, **kwargs):
226
- NoSentinelQueryProcessor.__init__(self, *args, **kwargs)
227
- PipelinedSingleThreadExecutionStrategy.__init__(
228
- self,
229
- scan_start_idx=self.scan_start_idx,
230
- max_workers=self.max_workers,
231
- nocache=self.nocache,
232
- verbose=self.verbose
233
- )
234
- self.progress_manager = None
235
-
236
- def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1):
237
- """Initialize the stats and execute the plan with progress reporting."""
238
- if self.verbose:
239
- print("----------------------")
240
- print(f"PLAN[{plan.plan_id}] (n={num_samples}):")
241
- print(plan)
242
- print("---")
243
-
244
- plan_start_time = time.time()
245
-
246
- # Initialize progress manager
247
- self.progress_manager = create_progress_manager()
248
-
249
- # initialize plan stats and operator stats
250
- plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
251
- for op in plan.operators:
252
- op_id = op.get_op_id()
253
- op_name = op.op_name()
254
- op_details = {k: str(v) for k, v in op.get_id_params().items()}
255
- plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)
256
-
257
- # initialize list of output records and intermediate variables
258
- output_records = []
259
- source_records_scanned = 0
260
- current_scan_idx = self.scan_start_idx
261
-
262
- # get handle to scan operator and pre-compute its size
263
- source_operator = plan.operators[0]
264
- assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
265
- datareader_len = len(source_operator.datareader)
266
-
267
- # Calculate total work units - each record needs to go through each operator
268
- total_ops = len(plan.operators)
269
- total_items = min(num_samples, datareader_len) if num_samples != float("inf") else datareader_len
270
- total_work_units = total_items * total_ops
271
- self.progress_manager.start(total_work_units)
272
- work_units_completed = 0
273
-
274
- try:
275
- # initialize processing queues for each operation
276
- processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
277
-
278
- # execute the plan until either:
279
- # 1. all records have been processed, or
280
- # 2. the final limit operation has completed
281
- finished_executing, keep_scanning_source_records = False, True
282
- while not finished_executing:
283
- for op_idx, operator in enumerate(plan.operators):
284
- op_id = operator.get_op_id()
285
- prev_op_id = plan.operators[op_idx - 1].get_op_id() if op_idx > 1 else None
286
- next_op_id = plan.operators[op_idx + 1].get_op_id() if op_idx + 1 < len(plan.operators) else None
287
-
288
- # Update progress with current operator info
289
- op_name = operator.__class__.__name__
290
- self.progress_manager.update(work_units_completed, f"Running {op_name} ({op_idx + 1}/{total_ops})")
291
-
292
- # create empty lists for records and execution stats generated by executing this operator on its next input(s)
293
- records, record_op_stats = [], []
294
-
295
- # invoke scan operator(s) until we run out of source records or hit the num_samples limit
296
- if isinstance(operator, ScanPhysicalOp):
297
- if keep_scanning_source_records:
298
- # run ScanPhysicalOp on current scan index
299
- record_set = operator(current_scan_idx)
300
- records = record_set.data_records
301
- record_op_stats = record_set.record_op_stats
302
-
303
- # Update progress for each processed record
304
- work_units_completed += 1
305
- self.progress_manager.update(
306
- work_units_completed,
307
- f"Scanning data source: {current_scan_idx + 1}/{total_items}"
308
- )
309
-
310
- # update number of source records scanned and the current index
311
- source_records_scanned += len(records)
312
- current_scan_idx += 1
313
-
314
- # update whether to keep scanning source records
315
- keep_scanning_source_records = current_scan_idx < datareader_len and source_records_scanned < num_samples
316
-
317
- # only invoke aggregate operator(s) once there are no more source records and all
318
- # upstream operators' processing queues are empty
319
- elif isinstance(operator, AggregateOp):
320
- upstream_ops_are_finished = True
321
- for upstream_op_idx in range(op_idx):
322
- # scan operators do not have processing queues
323
- if isinstance(plan.operators[upstream_op_idx], ScanPhysicalOp):
324
- continue
325
-
326
- # check upstream ops which do have a processing queue
327
- upstream_op_id = plan.operators[upstream_op_idx].get_op_id()
328
- upstream_ops_are_finished = (
329
- upstream_ops_are_finished and len(processing_queues[upstream_op_id]) == 0
330
- )
331
-
332
- if not keep_scanning_source_records and upstream_ops_are_finished:
333
- record_set = operator(candidates=processing_queues[op_id])
334
- records = record_set.data_records
335
- record_op_stats = record_set.record_op_stats
336
- processing_queues[op_id] = []
337
-
338
- # Update progress for aggregate operation
339
- work_units_completed += len(processing_queues[op_id])
340
- self.progress_manager.update(
341
- work_units_completed,
342
- f"Aggregating {len(processing_queues[op_id])} records"
343
- )
344
-
345
- # otherwise, process the next record in the processing queue for this operator
346
- elif len(processing_queues[op_id]) > 0:
347
- input_record = processing_queues[op_id].pop(0)
348
- record_set = operator(input_record)
349
- records = record_set.data_records
350
- record_op_stats = record_set.record_op_stats
351
-
352
- # Update progress for processed record
353
- work_units_completed += 1
354
- self.progress_manager.update(
355
- work_units_completed,
356
- f"Processing record through {op_name}"
357
- )
358
-
359
- # if records were generated by this operator, process them
360
- if len(records) > 0:
361
- # update plan stats
362
- plan_stats.operator_stats[op_id].add_record_op_stats(
363
- record_op_stats,
364
- source_op_id=prev_op_id,
365
- plan_id=plan.plan_id,
366
- )
367
-
368
- # add records (which are not filtered) to the cache, if allowed
369
- if not self.nocache:
370
- for record in records:
371
- if getattr(record, "passed_operator", True):
372
- # self.datadir.append_cache(operator.target_cache_id, record)
373
- pass
374
-
375
- # update processing_queues or output_records
376
- for record in records:
377
- if isinstance(operator, FilterOp) and not record.passed_operator:
378
- continue
379
- if next_op_id is not None:
380
- processing_queues[next_op_id].append(record)
381
- else:
382
- output_records.append(record)
383
-
384
- # update finished_executing based on whether all records have been processed
385
- still_processing = any([len(queue) > 0 for queue in processing_queues.values()])
386
- finished_executing = not keep_scanning_source_records and not still_processing
387
-
388
- # update finished_executing based on limit
389
- if isinstance(operator, LimitScanOp):
390
- finished_executing = len(output_records) == operator.limit
391
-
392
- # if caching was allowed, close the cache
393
- if not self.nocache:
394
- for _ in plan.operators:
395
- # self.datadir.close_cache(operator.target_cache_id)
396
- pass
397
-
398
- # finalize plan stats
399
- total_plan_time = time.time() - plan_start_time
400
- plan_stats.finalize(total_plan_time)
401
-
402
- finally:
403
- # Always finish progress tracking
404
- if self.progress_manager:
405
- self.progress_manager.finish()
406
-
407
- return output_records, plan_stats
408
-
409
-
410
- class NoSentinelPipelinedParallelProcessor(NoSentinelQueryProcessor, PipelinedParallelExecutionStrategy):
411
- """
412
- This class performs non-sample based execution while executing plans in a pipelined, parallel fashion.
413
- """
414
- def __init__(self, *args, **kwargs):
415
- NoSentinelQueryProcessor.__init__(self, *args, **kwargs)
416
- PipelinedParallelExecutionStrategy.__init__(
417
- self,
418
- scan_start_idx=self.scan_start_idx,
419
- max_workers=self.max_workers,
420
- nocache=self.nocache,
421
- verbose=self.verbose
422
- )
423
- self.progress_manager = None
424
-
425
- # def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1):
426
- # """Initialize the stats and execute the plan with progress reporting."""
427
- # if self.verbose:
428
- # print("----------------------")
429
- # print(f"PLAN[{plan.plan_id}] (n={num_samples}):")
430
- # print(plan)
431
- # print("---")
432
-
433
- # plan_start_time = time.time()
434
-
435
- # # Initialize progress manager
436
- # self.progress_manager = create_progress_manager()
437
-
438
- # # initialize plan stats and operator stats
439
- # plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
440
- # for op in plan.operators:
441
- # op_id = op.get_op_id()
442
- # op_name = op.op_name()
443
- # op_details = {k: str(v) for k, v in op.get_id_params().items()}
444
- # plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)
445
-
446
- # # initialize list of output records and intermediate variables
447
- # output_records = []
448
- # source_records_scanned = 0
449
- # current_scan_idx = self.scan_start_idx
450
-
451
- # # get handle to scan operator and pre-compute its size
452
- # source_operator = plan.operators[0]
453
- # assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
454
- # datareader_len = len(source_operator.datareader)
455
-
456
- # # Calculate total work units - each record needs to go through each operator
457
- # total_ops = len(plan.operators)
458
- # total_items = min(num_samples, datareader_len) if num_samples != float("inf") else datareader_len
459
- # total_work_units = total_items * total_ops
460
- # self.progress_manager.start(total_work_units)
461
- # work_units_completed = 0
462
-
463
- # try:
464
- # with ThreadPoolExecutor(max_workers=plan_workers) as executor:
465
- # # initialize processing queues and futures for each operation
466
- # processing_queues = {op.get_op_id(): [] for op in plan.operators}
467
- # futures = []
468
-
469
- # # execute the plan until either:
470
- # # 1. all records have been processed, or
471
- # # 2. the final limit operation has completed
472
- # finished_executing, keep_scanning_source_records = False, True
473
- # last_work_units_completed = 0
474
- # while not finished_executing:
475
- # # Process completed futures
476
- # done_futures, not_done_futures = wait(futures, timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
477
- # futures = list(not_done_futures)
478
-
479
- # for future in done_futures:
480
- # record_set, operator, _ = future.result()
481
- # op_id = operator.get_op_id()
482
- # op_idx = next(i for i, op in enumerate(plan.operators) if op.get_op_id() == op_id)
483
- # next_op_id = plan.operators[op_idx + 1].get_op_id() if op_idx + 1 < len(plan.operators) else None
484
-
485
- # # Update progress for completed operation
486
- # work_units_completed += len(record_set.data_records)
487
- # if work_units_completed > last_work_units_completed:
488
- # self.progress_manager.update(
489
- # work_units_completed,
490
- # f"Completed {operator.__class__.__name__} on {len(record_set.data_records)} records"
491
- # )
492
- # last_work_units_completed = work_units_completed
493
-
494
- # # Process records
495
- # for record in record_set:
496
- # if isinstance(operator, FilterOp) and not record.passed_operator:
497
- # continue
498
- # if next_op_id is not None:
499
- # processing_queues[next_op_id].append(record)
500
- # else:
501
- # output_records.append(record)
502
-
503
- # # Submit new tasks
504
- # for _, operator in enumerate(plan.operators):
505
- # op_id = operator.get_op_id()
506
-
507
- # if isinstance(operator, ScanPhysicalOp) and keep_scanning_source_records:
508
- # # Submit source operator task
509
- # futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, operator, current_scan_idx))
510
- # current_scan_idx += 1
511
- # keep_scanning_source_records = current_scan_idx < datareader_len and source_records_scanned < num_samples
512
-
513
- # elif len(processing_queues[op_id]) > 0:
514
- # # Submit task for next record in queue
515
- # input_record = processing_queues[op_id].pop(0)
516
- # futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, operator, input_record))
517
-
518
- # # Check if we're done
519
- # still_processing = any([len(queue) > 0 for queue in processing_queues.values()])
520
- # finished_executing = not keep_scanning_source_records and not still_processing and len(futures) == 0
521
-
522
- # # if caching was allowed, close the cache
523
- # if not self.nocache:
524
- # for _ in plan.operators:
525
- # # self.datadir.close_cache(operator.target_cache_id)
526
- # pass
527
-
528
- # # finalize plan stats
529
- # total_plan_time = time.time() - plan_start_time
530
- # plan_stats.finalize(total_plan_time)
531
-
532
- # finally:
533
- # # Always finish progress tracking
534
- # if self.progress_manager:
535
- # self.progress_manager.finish()
536
-
537
- # return output_records, plan_stats
33
+ return result
palimpzest/query/processor/processing_strategy_type.py (+28 -0)
@@ -0,0 +1,28 @@
1
+ from enum import Enum
2
+
3
+ from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
4
+ from palimpzest.query.processor.nosentinel_processor import NoSentinelQueryProcessor
5
+ from palimpzest.query.processor.sentinel_processor import SentinelQueryProcessor
6
+ from palimpzest.query.processor.streaming_processor import StreamingQueryProcessor
7
+
8
+
9
+ class ProcessingStrategyType(Enum):
10
+ """How to generate and optimize query plans"""
11
+ SENTINEL = SentinelQueryProcessor
12
+ NO_SENTINEL = NoSentinelQueryProcessor
13
+ STREAMING = StreamingQueryProcessor
14
+
15
+ def valid_execution_strategies(self) -> list[ExecutionStrategyType]:
16
+ """
17
+ Returns a list of valid execution strategies for the given processing strategy.
18
+ """
19
+ if self == ProcessingStrategyType.SENTINEL or self == ProcessingStrategyType.NO_SENTINEL:
20
+ return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
21
+ elif self == ProcessingStrategyType.STREAMING:
22
+ return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
23
+
24
+ def is_sentinel_strategy(self) -> bool:
25
+ """
26
+ Returns True if the query processor associated with this strategy uses sentinel execution.
27
+ """
28
+ return self == ProcessingStrategyType.SENTINEL