palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
- palimpzest-0.7.1.dist-info/RECORD +96 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.4.dist-info/RECORD +0 -87
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
palimpzest/query/processor/nosentinel_processor.py
@@ -1,537 +1,33 @@
-import
+import logging
 
-from palimpzest.core.data.dataclasses import ExecutionStats
+from palimpzest.core.data.dataclasses import ExecutionStats
 from palimpzest.core.elements.records import DataRecordCollection
-from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy
-from palimpzest.query.execution.single_threaded_execution_strategy import (
-    PipelinedSingleThreadExecutionStrategy,
-    SequentialSingleThreadExecutionStrategy,
-)
-from palimpzest.query.operators.aggregate import AggregateOp
-from palimpzest.query.operators.filter import FilterOp
-from palimpzest.query.operators.limit import LimitScanOp
-from palimpzest.query.operators.scan import ScanPhysicalOp
-from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.query.processor.query_processor import QueryProcessor
-from palimpzest.utils.progress import create_progress_manager
 
+logger = logging.getLogger(__name__)
 
 class NoSentinelQueryProcessor(QueryProcessor):
     """
-
-    for coordinating optimization and execution.
+    Query processor that uses naive cost estimates to select the best plan.
     """
 
     # TODO: Consider to support dry_run.
     def execute(self) -> DataRecordCollection:
-
+        logger.info("Executing NoSentinelQueryProcessor")
 
-        #
-
-
-        pass
+        # create execution stats
+        execution_stats = ExecutionStats(execution_id=self.execution_id())
+        execution_stats.start()
 
         # execute plan(s) according to the optimization strategy
-        records, plan_stats = self.
+        records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)
 
-        #
-
+        # update the execution stats to account for the work to execute the final plan
+        execution_stats.add_plan_stats(plan_stats)
+        execution_stats.finish()
 
-        #
-
-
-            plan_stats=aggregate_plan_stats,
-            total_execution_time=time.time() - execution_start_time,
-            total_execution_cost=sum(
-                list(map(lambda plan_stats: plan_stats.total_plan_cost, aggregate_plan_stats.values()))
-            ),
-            plan_strs={plan_id: plan_stats.plan_str for plan_id, plan_stats in aggregate_plan_stats.items()},
-        )
+        # construct and return the DataRecordCollection
+        result = DataRecordCollection(records, execution_stats=execution_stats)
+        logger.info("Done executing NoSentinelQueryProcessor")
 
-        return
-
-
-class NoSentinelSequentialSingleThreadProcessor(NoSentinelQueryProcessor, SequentialSingleThreadExecutionStrategy):
-    """
-    This class performs non-sample based execution while executing plans in a sequential, single-threaded fashion.
-    """
-    def __init__(self, *args, **kwargs):
-        NoSentinelQueryProcessor.__init__(self, *args, **kwargs)
-        SequentialSingleThreadExecutionStrategy.__init__(
-            self,
-            scan_start_idx=self.scan_start_idx,
-            max_workers=self.max_workers,
-            nocache=self.nocache,
-            verbose=self.verbose
-        )
-        self.progress_manager = None
-
-    def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1):
-        """Initialize the stats and execute the plan with progress reporting."""
-        if self.verbose:
-            print("----------------------")
-            print(f"PLAN[{plan.plan_id}] (n={num_samples}):")
-            print(plan)
-            print("---")
-
-        plan_start_time = time.time()
-
-        # Initialize progress manager
-        self.progress_manager = create_progress_manager()
-
-        # initialize plan stats and operator stats
-        plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
-        for op in plan.operators:
-            op_id = op.get_op_id()
-            op_name = op.op_name()
-            op_details = {k: str(v) for k, v in op.get_id_params().items()}
-            plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)
-
-        # initialize list of output records and intermediate variables
-        output_records = []
-        current_scan_idx = self.scan_start_idx
-
-        # get handle to scan operator and pre-compute its size
-        source_operator = plan.operators[0]
-        assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
-        datareader_len = len(source_operator.datareader)
-
-        # Calculate total work units - each record needs to go through each operator
-        total_ops = len(plan.operators)
-        total_items = min(num_samples, datareader_len) if num_samples != float("inf") else datareader_len
-        total_work_units = total_items * total_ops
-        self.progress_manager.start(total_work_units)
-        work_units_completed = 0
-
-        # initialize processing queues for each operation
-        processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
-
-        try:
-            # execute the plan one operator at a time
-            for op_idx, operator in enumerate(plan.operators):
-                op_id = operator.get_op_id()
-                prev_op_id = plan.operators[op_idx - 1].get_op_id() if op_idx > 1 else None
-                next_op_id = plan.operators[op_idx + 1].get_op_id() if op_idx + 1 < len(plan.operators) else None
-
-                # Update progress to show which operator is currently running
-                op_name = operator.__class__.__name__
-                self.progress_manager.update(work_units_completed, f"Running {op_name} ({op_idx + 1}/{total_ops})")
-
-                # initialize output records and record_op_stats for this operator
-                records, record_op_stats = [], []
-
-                # invoke scan operator(s) until we run out of source records or hit the num_samples limit
-                if isinstance(operator, ScanPhysicalOp):
-                    keep_scanning_source_records = True
-                    while keep_scanning_source_records:
-                        # run ScanPhysicalOp on current scan index
-                        record_set = operator(current_scan_idx)
-                        records.extend(record_set.data_records)
-                        record_op_stats.extend(record_set.record_op_stats)
-
-                        # Update progress for each processed record in data source
-                        work_units_completed += 1
-                        self.progress_manager.update(
-                            work_units_completed,
-                            f"Scanning data source: {current_scan_idx + 1}/{total_items}"
-                        )
-
-                        # update the current scan index
-                        current_scan_idx += 1
-
-                        # update whether to keep scanning source records
-                        keep_scanning_source_records = current_scan_idx < datareader_len and len(records) < num_samples
-
-                # aggregate operators accept all input records at once
-                elif isinstance(operator, AggregateOp):
-                    record_set = operator(candidates=processing_queues[op_id])
-                    records = record_set.data_records
-                    record_op_stats = record_set.record_op_stats
-
-                    # Update progress for aggregate operation - count all records being aggregated
-                    work_units_completed += len(processing_queues[op_id])
-                    self.progress_manager.update(
-                        work_units_completed,
-                        f"Aggregating {len(processing_queues[op_id])} records"
-                    )
-
-                # otherwise, process the records in the processing queue for this operator one at a time
-                elif len(processing_queues[op_id]) > 0:
-                    queue_size = len(processing_queues[op_id])
-                    for idx, input_record in enumerate(processing_queues[op_id]):
-                        record_set = operator(input_record)
-                        records.extend(record_set.data_records)
-                        record_op_stats.extend(record_set.record_op_stats)
-
-                        # Update progress for each processed record in the queue
-                        work_units_completed += 1
-                        self.progress_manager.update(
-                            work_units_completed,
-                            f"Processing records: {idx + 1}/{queue_size}"
-                        )
-
-                        if isinstance(operator, LimitScanOp) and len(records) == operator.limit:
-                            break
-
-                # update plan stats
-                plan_stats.operator_stats[op_id].add_record_op_stats(
-                    record_op_stats,
-                    source_op_id=prev_op_id,
-                    plan_id=plan.plan_id,
-                )
-
-                # add records (which are not filtered) to the cache, if allowed
-                if not self.nocache:
-                    for record in records:
-                        if getattr(record, "passed_operator", True):
-                            # self.datadir.append_cache(operator.target_cache_id, record)
-                            pass
-
-                # update processing_queues or output_records
-                for record in records:
-                    if isinstance(operator, FilterOp) and not record.passed_operator:
-                        continue
-                    if next_op_id is not None:
-                        processing_queues[next_op_id].append(record)
-                    else:
-                        output_records.append(record)
-
-                # if we've filtered out all records, terminate early
-                if next_op_id is not None and processing_queues[next_op_id] == []:
-                    break
-
-            # if caching was allowed, close the cache
-            if not self.nocache:
-                for _ in plan.operators:
-                    # self.datadir.close_cache(operator.target_cache_id)
-                    pass
-
-            # finalize plan stats
-            total_plan_time = time.time() - plan_start_time
-            plan_stats.finalize(total_plan_time)
-
-        finally:
-            # Always finish progress tracking
-            if self.progress_manager:
-                self.progress_manager.finish()
-
-        return output_records, plan_stats
-
-
-class NoSentinelPipelinedSingleThreadProcessor(NoSentinelQueryProcessor, PipelinedSingleThreadExecutionStrategy):
-    """
-    This class performs non-sample based execution while executing plans in a pipelined, parallel fashion.
-    """
-    def __init__(self, *args, **kwargs):
-        NoSentinelQueryProcessor.__init__(self, *args, **kwargs)
-        PipelinedSingleThreadExecutionStrategy.__init__(
-            self,
-            scan_start_idx=self.scan_start_idx,
-            max_workers=self.max_workers,
-            nocache=self.nocache,
-            verbose=self.verbose
-        )
-        self.progress_manager = None
-
-    def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1):
-        """Initialize the stats and execute the plan with progress reporting."""
-        if self.verbose:
-            print("----------------------")
-            print(f"PLAN[{plan.plan_id}] (n={num_samples}):")
-            print(plan)
-            print("---")
-
-        plan_start_time = time.time()
-
-        # Initialize progress manager
-        self.progress_manager = create_progress_manager()
-
-        # initialize plan stats and operator stats
-        plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
-        for op in plan.operators:
-            op_id = op.get_op_id()
-            op_name = op.op_name()
-            op_details = {k: str(v) for k, v in op.get_id_params().items()}
-            plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)
-
-        # initialize list of output records and intermediate variables
-        output_records = []
-        source_records_scanned = 0
-        current_scan_idx = self.scan_start_idx
-
-        # get handle to scan operator and pre-compute its size
-        source_operator = plan.operators[0]
-        assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
-        datareader_len = len(source_operator.datareader)
-
-        # Calculate total work units - each record needs to go through each operator
-        total_ops = len(plan.operators)
-        total_items = min(num_samples, datareader_len) if num_samples != float("inf") else datareader_len
-        total_work_units = total_items * total_ops
-        self.progress_manager.start(total_work_units)
-        work_units_completed = 0
-
-        try:
-            # initialize processing queues for each operation
-            processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
-
-            # execute the plan until either:
-            # 1. all records have been processed, or
-            # 2. the final limit operation has completed
-            finished_executing, keep_scanning_source_records = False, True
-            while not finished_executing:
-                for op_idx, operator in enumerate(plan.operators):
-                    op_id = operator.get_op_id()
-                    prev_op_id = plan.operators[op_idx - 1].get_op_id() if op_idx > 1 else None
-                    next_op_id = plan.operators[op_idx + 1].get_op_id() if op_idx + 1 < len(plan.operators) else None
-
-                    # Update progress with current operator info
-                    op_name = operator.__class__.__name__
-                    self.progress_manager.update(work_units_completed, f"Running {op_name} ({op_idx + 1}/{total_ops})")
-
-                    # create empty lists for records and execution stats generated by executing this operator on its next input(s)
-                    records, record_op_stats = [], []
-
-                    # invoke scan operator(s) until we run out of source records or hit the num_samples limit
-                    if isinstance(operator, ScanPhysicalOp):
-                        if keep_scanning_source_records:
-                            # run ScanPhysicalOp on current scan index
-                            record_set = operator(current_scan_idx)
-                            records = record_set.data_records
-                            record_op_stats = record_set.record_op_stats
-
-                            # Update progress for each processed record
-                            work_units_completed += 1
-                            self.progress_manager.update(
-                                work_units_completed,
-                                f"Scanning data source: {current_scan_idx + 1}/{total_items}"
-                            )
-
-                            # update number of source records scanned and the current index
-                            source_records_scanned += len(records)
-                            current_scan_idx += 1
-
-                            # update whether to keep scanning source records
-                            keep_scanning_source_records = current_scan_idx < datareader_len and source_records_scanned < num_samples
-
-                    # only invoke aggregate operator(s) once there are no more source records and all
-                    # upstream operators' processing queues are empty
-                    elif isinstance(operator, AggregateOp):
-                        upstream_ops_are_finished = True
-                        for upstream_op_idx in range(op_idx):
-                            # scan operators do not have processing queues
-                            if isinstance(plan.operators[upstream_op_idx], ScanPhysicalOp):
-                                continue
-
-                            # check upstream ops which do have a processing queue
-                            upstream_op_id = plan.operators[upstream_op_idx].get_op_id()
-                            upstream_ops_are_finished = (
-                                upstream_ops_are_finished and len(processing_queues[upstream_op_id]) == 0
-                            )
-
-                        if not keep_scanning_source_records and upstream_ops_are_finished:
-                            record_set = operator(candidates=processing_queues[op_id])
-                            records = record_set.data_records
-                            record_op_stats = record_set.record_op_stats
-                            processing_queues[op_id] = []
-
-                            # Update progress for aggregate operation
-                            work_units_completed += len(processing_queues[op_id])
-                            self.progress_manager.update(
-                                work_units_completed,
-                                f"Aggregating {len(processing_queues[op_id])} records"
-                            )
-
-                    # otherwise, process the next record in the processing queue for this operator
-                    elif len(processing_queues[op_id]) > 0:
-                        input_record = processing_queues[op_id].pop(0)
-                        record_set = operator(input_record)
-                        records = record_set.data_records
-                        record_op_stats = record_set.record_op_stats
-
-                        # Update progress for processed record
-                        work_units_completed += 1
-                        self.progress_manager.update(
-                            work_units_completed,
-                            f"Processing record through {op_name}"
-                        )
-
-                    # if records were generated by this operator, process them
-                    if len(records) > 0:
-                        # update plan stats
-                        plan_stats.operator_stats[op_id].add_record_op_stats(
-                            record_op_stats,
-                            source_op_id=prev_op_id,
-                            plan_id=plan.plan_id,
-                        )
-
-                        # add records (which are not filtered) to the cache, if allowed
-                        if not self.nocache:
-                            for record in records:
-                                if getattr(record, "passed_operator", True):
-                                    # self.datadir.append_cache(operator.target_cache_id, record)
-                                    pass
-
-                        # update processing_queues or output_records
-                        for record in records:
-                            if isinstance(operator, FilterOp) and not record.passed_operator:
-                                continue
-                            if next_op_id is not None:
-                                processing_queues[next_op_id].append(record)
-                            else:
-                                output_records.append(record)
-
-                    # update finished_executing based on whether all records have been processed
-                    still_processing = any([len(queue) > 0 for queue in processing_queues.values()])
-                    finished_executing = not keep_scanning_source_records and not still_processing
-
-                    # update finished_executing based on limit
-                    if isinstance(operator, LimitScanOp):
-                        finished_executing = len(output_records) == operator.limit
-
-            # if caching was allowed, close the cache
-            if not self.nocache:
-                for _ in plan.operators:
-                    # self.datadir.close_cache(operator.target_cache_id)
-                    pass
-
-            # finalize plan stats
-            total_plan_time = time.time() - plan_start_time
-            plan_stats.finalize(total_plan_time)
-
-        finally:
-            # Always finish progress tracking
-            if self.progress_manager:
-                self.progress_manager.finish()
-
-        return output_records, plan_stats
-
-
-class NoSentinelPipelinedParallelProcessor(NoSentinelQueryProcessor, PipelinedParallelExecutionStrategy):
-    """
-    This class performs non-sample based execution while executing plans in a pipelined, parallel fashion.
-    """
-    def __init__(self, *args, **kwargs):
-        NoSentinelQueryProcessor.__init__(self, *args, **kwargs)
-        PipelinedParallelExecutionStrategy.__init__(
-            self,
-            scan_start_idx=self.scan_start_idx,
-            max_workers=self.max_workers,
-            nocache=self.nocache,
-            verbose=self.verbose
-        )
-        self.progress_manager = None
-
-    # def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1):
-    #     """Initialize the stats and execute the plan with progress reporting."""
-    #     if self.verbose:
-    #         print("----------------------")
-    #         print(f"PLAN[{plan.plan_id}] (n={num_samples}):")
-    #         print(plan)
-    #         print("---")
-
-    #     plan_start_time = time.time()
-
-    #     # Initialize progress manager
-    #     self.progress_manager = create_progress_manager()
-
-    #     # initialize plan stats and operator stats
-    #     plan_stats = PlanStats(plan_id=plan.plan_id, plan_str=str(plan))
-    #     for op in plan.operators:
-    #         op_id = op.get_op_id()
-    #         op_name = op.op_name()
-    #         op_details = {k: str(v) for k, v in op.get_id_params().items()}
-    #         plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)

-    #     # initialize list of output records and intermediate variables
-    #     output_records = []
-    #     source_records_scanned = 0
-    #     current_scan_idx = self.scan_start_idx

-    #     # get handle to scan operator and pre-compute its size
-    #     source_operator = plan.operators[0]
-    #     assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
-    #     datareader_len = len(source_operator.datareader)

-    #     # Calculate total work units - each record needs to go through each operator
-    #     total_ops = len(plan.operators)
-    #     total_items = min(num_samples, datareader_len) if num_samples != float("inf") else datareader_len
-    #     total_work_units = total_items * total_ops
-    #     self.progress_manager.start(total_work_units)
-    #     work_units_completed = 0

-    #     try:
-    #         with ThreadPoolExecutor(max_workers=plan_workers) as executor:
-    #             # initialize processing queues and futures for each operation
-    #             processing_queues = {op.get_op_id(): [] for op in plan.operators}
-    #             futures = []

-    #             # execute the plan until either:
-    #             # 1. all records have been processed, or
-    #             # 2. the final limit operation has completed
-    #             finished_executing, keep_scanning_source_records = False, True
-    #             last_work_units_completed = 0
-    #             while not finished_executing:
-    #                 # Process completed futures
-    #                 done_futures, not_done_futures = wait(futures, timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
-    #                 futures = list(not_done_futures)

-    #                 for future in done_futures:
-    #                     record_set, operator, _ = future.result()
-    #                     op_id = operator.get_op_id()
-    #                     op_idx = next(i for i, op in enumerate(plan.operators) if op.get_op_id() == op_id)
-    #                     next_op_id = plan.operators[op_idx + 1].get_op_id() if op_idx + 1 < len(plan.operators) else None

-    #                     # Update progress for completed operation
-    #                     work_units_completed += len(record_set.data_records)
-    #                     if work_units_completed > last_work_units_completed:
-    #                         self.progress_manager.update(
-    #                             work_units_completed,
-    #                             f"Completed {operator.__class__.__name__} on {len(record_set.data_records)} records"
-    #                         )
-    #                         last_work_units_completed = work_units_completed

-    #                     # Process records
-    #                     for record in record_set:
-    #                         if isinstance(operator, FilterOp) and not record.passed_operator:
-    #                             continue
-    #                         if next_op_id is not None:
-    #                             processing_queues[next_op_id].append(record)
-    #                         else:
-    #                             output_records.append(record)

-    #                 # Submit new tasks
-    #                 for _, operator in enumerate(plan.operators):
-    #                     op_id = operator.get_op_id()

-    #                     if isinstance(operator, ScanPhysicalOp) and keep_scanning_source_records:
-    #                         # Submit source operator task
-    #                         futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, operator, current_scan_idx))
-    #                         current_scan_idx += 1
-    #                         keep_scanning_source_records = current_scan_idx < datareader_len and source_records_scanned < num_samples

-    #                     elif len(processing_queues[op_id]) > 0:
-    #                         # Submit task for next record in queue
-    #                         input_record = processing_queues[op_id].pop(0)
-    #                         futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, operator, input_record))

-    #                 # Check if we're done
-    #                 still_processing = any([len(queue) > 0 for queue in processing_queues.values()])
-    #                 finished_executing = not keep_scanning_source_records and not still_processing and len(futures) == 0

-    #         # if caching was allowed, close the cache
-    #         if not self.nocache:
-    #             for _ in plan.operators:
-    #                 # self.datadir.close_cache(operator.target_cache_id)
-    #                 pass

-    #         # finalize plan stats
-    #         total_plan_time = time.time() - plan_start_time
-    #         plan_stats.finalize(total_plan_time)

-    #     finally:
-    #         # Always finish progress tracking
-    #         if self.progress_manager:
-    #             self.progress_manager.finish()

-    #         return output_records, plan_stats
+        return result
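Reassembled from the added ("+") lines of the hunk above, the new NoSentinelQueryProcessor.execute() in 0.7.1 reads roughly as follows. This is a reconstruction from the diff, not the verified released file; in particular, execution_id() and _execute_best_plan() are presumed to be inherited from QueryProcessor, since their definitions do not appear in this hunk.

    import logging

    from palimpzest.core.data.dataclasses import ExecutionStats
    from palimpzest.core.elements.records import DataRecordCollection
    from palimpzest.query.processor.query_processor import QueryProcessor

    logger = logging.getLogger(__name__)

    class NoSentinelQueryProcessor(QueryProcessor):
        """Query processor that uses naive cost estimates to select the best plan."""

        def execute(self) -> DataRecordCollection:
            logger.info("Executing NoSentinelQueryProcessor")

            # create execution stats and start the timer
            execution_stats = ExecutionStats(execution_id=self.execution_id())
            execution_stats.start()

            # execute the plan chosen via naive cost estimates
            # (assumed to be provided by the parent QueryProcessor)
            records, plan_stats = self._execute_best_plan(self.dataset, self.optimizer)

            # fold the final plan's stats into the overall execution stats
            execution_stats.add_plan_stats(plan_stats)
            execution_stats.finish()

            # wrap the output records and stats in a DataRecordCollection
            result = DataRecordCollection(records, execution_stats=execution_stats)
            logger.info("Done executing NoSentinelQueryProcessor")

            return result

The net effect of the hunk: per-strategy bookkeeping (PlanStats/OperatorStats setup, progress reporting, caching hooks) moves out of this module and behind _execute_best_plan and the new ExecutionStats lifecycle (start/add_plan_stats/finish), which is why the three NoSentinel*Processor subclasses are deleted outright.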
palimpzest/query/processor/processing_strategy_type.py
@@ -0,0 +1,28 @@
+from enum import Enum
+
+from palimpzest.query.execution.execution_strategy_type import ExecutionStrategyType
+from palimpzest.query.processor.nosentinel_processor import NoSentinelQueryProcessor
+from palimpzest.query.processor.sentinel_processor import SentinelQueryProcessor
+from palimpzest.query.processor.streaming_processor import StreamingQueryProcessor
+
+
+class ProcessingStrategyType(Enum):
+    """How to generate and optimize query plans"""
+    SENTINEL = SentinelQueryProcessor
+    NO_SENTINEL = NoSentinelQueryProcessor
+    STREAMING = StreamingQueryProcessor
+
+    def valid_execution_strategies(self) -> list[ExecutionStrategyType]:
+        """
+        Returns a list of valid execution strategies for the given processing strategy.
+        """
+        if self == ProcessingStrategyType.SENTINEL or self == ProcessingStrategyType.NO_SENTINEL:
+            return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
+        elif self == ProcessingStrategyType.STREAMING:
+            return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
+
+    def is_sentinel_strategy(self) -> bool:
+        """
+        Returns True if the query processor associated with this strategy uses sentinel execution.
+        """
+        return self == ProcessingStrategyType.SENTINEL