palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +343 -209
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +639 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +62 -6
- palimpzest/prompts/filter_prompts.py +51 -6
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
- palimpzest/prompts/prompt_factory.py +375 -47
- palimpzest/prompts/split_proposer_prompts.py +1 -1
- palimpzest/prompts/util_phrases.py +5 -0
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +160 -331
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +33 -19
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +26 -16
- palimpzest/query/operators/join.py +403 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +205 -77
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +42 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +32 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
- palimpzest-0.8.1.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.21.dist-info/RECORD +0 -95
- palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
palimpzest/utils/progress.py
CHANGED
|
@@ -21,6 +21,7 @@ from rich.table import Table
|
|
|
21
21
|
from palimpzest.query.operators.aggregate import AggregateOp
|
|
22
22
|
from palimpzest.query.operators.convert import LLMConvert
|
|
23
23
|
from palimpzest.query.operators.filter import LLMFilter
|
|
24
|
+
from palimpzest.query.operators.join import JoinOp
|
|
24
25
|
from palimpzest.query.operators.limit import LimitScanOp
|
|
25
26
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
26
27
|
from palimpzest.query.operators.retrieve import RetrieveOp
|
|
@@ -57,7 +58,7 @@ class ProgressManager(ABC):
|
|
|
57
58
|
Initialize the progress manager for the given plan. This function takes in a plan,
|
|
58
59
|
the number of samples to process (if specified).
|
|
59
60
|
|
|
60
|
-
If `num_samples` is None, then the entire
|
|
61
|
+
If `num_samples` is None, then the entire Dataset will be scanned.
|
|
61
62
|
|
|
62
63
|
For each operator which is not an `AggregateOp` or `LimitScanOp`, we set its task `total`
|
|
63
64
|
to the number of inputs to be processed by the plan. As intermediate operators process
|
|
@@ -81,51 +82,50 @@ class ProgressManager(ABC):
|
|
|
81
82
|
expand=True, # Use full width
|
|
82
83
|
)
|
|
83
84
|
|
|
84
|
-
# initialize mapping from
|
|
85
|
-
self.
|
|
85
|
+
# initialize mapping from unique_full_op_id --> ProgressStats
|
|
86
|
+
self.unique_full_op_id_to_stats: dict[str, ProgressStats] = {}
|
|
86
87
|
|
|
87
|
-
# initialize mapping from
|
|
88
|
-
self.
|
|
88
|
+
# initialize mapping from unique_full_op_id --> task
|
|
89
|
+
self.unique_full_op_id_to_task = {}
|
|
89
90
|
|
|
90
91
|
# initialize start time
|
|
91
92
|
self.start_time = None
|
|
92
93
|
|
|
93
|
-
#
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
94
|
+
# TODO: store plan and use its methods within incr()
|
|
95
|
+
# create mapping from unique_full_op_id --> input unique_full_op_ids
|
|
96
|
+
self.unique_full_op_id_to_input_unique_full_op_ids: dict[str, list[str]] = {}
|
|
97
|
+
for topo_idx, op in enumerate(plan):
|
|
98
|
+
unique_full_op_id = f"{topo_idx}-{op.get_full_op_id()}"
|
|
99
|
+
input_unique_full_op_ids = plan.get_source_unique_full_op_ids(topo_idx, op)
|
|
100
|
+
self.unique_full_op_id_to_input_unique_full_op_ids[unique_full_op_id] = input_unique_full_op_ids
|
|
101
|
+
|
|
102
|
+
# create mapping from unique_full_op_id --> next_op
|
|
103
|
+
self.unique_full_op_id_to_next_op_and_id: dict[str, tuple[PhysicalOperator, str]] = {}
|
|
104
|
+
for topo_idx, op in enumerate(plan):
|
|
105
|
+
unique_full_op_id = f"{topo_idx}-{op.get_full_op_id()}"
|
|
106
|
+
next_op, next_unique_full_op_id = plan.get_next_unique_full_op_and_id(topo_idx, op)
|
|
107
|
+
self.unique_full_op_id_to_next_op_and_id[unique_full_op_id] = (next_op, next_unique_full_op_id)
|
|
103
108
|
|
|
104
109
|
# add a task to the progress manager for each operator in the plan
|
|
105
|
-
|
|
110
|
+
est_total_outputs, _ = plan.get_est_total_outputs(num_samples)
|
|
111
|
+
for topo_idx, op in enumerate(plan):
|
|
106
112
|
# get the op id and a short string representation of the op; (str(op) is too long)
|
|
107
113
|
op_str = f"{op.op_name()} ({op.get_op_id()})"
|
|
114
|
+
unique_full_op_id = f"{topo_idx}-{op.get_full_op_id()}"
|
|
115
|
+
self.add_task(unique_full_op_id, op_str, est_total_outputs[unique_full_op_id])
|
|
108
116
|
|
|
109
|
-
|
|
110
|
-
if isinstance(op, AggregateOp):
|
|
111
|
-
total = 1
|
|
112
|
-
elif isinstance(op, LimitScanOp):
|
|
113
|
-
total = op.limit
|
|
114
|
-
|
|
115
|
-
self.add_task(op.get_full_op_id(), op_str, total)
|
|
116
|
-
|
|
117
|
-
def get_task_total(self, full_op_id: str) -> int:
|
|
117
|
+
def get_task_total(self, unique_full_op_id: str) -> int:
|
|
118
118
|
"""Return the current total value for the given task."""
|
|
119
|
-
task = self.
|
|
119
|
+
task = self.unique_full_op_id_to_task[unique_full_op_id]
|
|
120
120
|
return self.progress._tasks[task].total
|
|
121
121
|
|
|
122
|
-
def get_task_description(self,
|
|
122
|
+
def get_task_description(self, unique_full_op_id: str) -> str:
|
|
123
123
|
"""Return the current description for the given task."""
|
|
124
|
-
task = self.
|
|
124
|
+
task = self.unique_full_op_id_to_task[unique_full_op_id]
|
|
125
125
|
return self.progress._tasks[task].description
|
|
126
126
|
|
|
127
127
|
@abstractmethod
|
|
128
|
-
def add_task(self,
|
|
128
|
+
def add_task(self, unique_full_op_id: str, op_str: str, total: int):
|
|
129
129
|
"""Initialize progress tracking for operator execution with total items"""
|
|
130
130
|
pass
|
|
131
131
|
|
|
@@ -135,18 +135,16 @@ class ProgressManager(ABC):
|
|
|
135
135
|
pass
|
|
136
136
|
|
|
137
137
|
@abstractmethod
|
|
138
|
-
def incr(self,
|
|
138
|
+
def incr(self, unique_full_op_id: str, num_inputs: int = 1, num_outputs: int = 1, display_text: str | None = None, **kwargs):
|
|
139
139
|
"""
|
|
140
|
-
Advance the progress bar for the given operator
|
|
140
|
+
Advance the progress bar for the given operator. Modify the downstream operators'
|
|
141
141
|
progress bar `total` to reflect the number of outputs produced by this operator.
|
|
142
142
|
|
|
143
|
-
NOTE:
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
have 1 fewer inputs to process. Alternatively, a convert which generates 3 `num_outputs` will
|
|
149
|
-
increase the inputs for the next operator by `delta = num_outputs - 1 = 2`.
|
|
143
|
+
NOTE: `num_outputs` specifies how many outputs were generated by the operator when processing
|
|
144
|
+
the `num_inputs` inputs for which `incr()` was called. E.g. a filter which filters one input record
|
|
145
|
+
will advance its progress bar by 1, but the next operator will now have 1 fewer inputs to process.
|
|
146
|
+
Alternatively, a convert which generates 3 `num_outputs` for 2 `num_inputs` will increase the inputs
|
|
147
|
+
for the next operator by `delta = num_outputs - num_inputs = 3 - 2 = 1`.
|
|
150
148
|
"""
|
|
151
149
|
pass
|
|
152
150
|
|
|
@@ -162,13 +160,13 @@ class MockProgressManager(ProgressManager):
|
|
|
162
160
|
def __init__(self, plan: PhysicalPlan | SentinelPlan, num_samples: int | None = None):
|
|
163
161
|
pass
|
|
164
162
|
|
|
165
|
-
def add_task(self,
|
|
163
|
+
def add_task(self, unique_full_op_id: str, op_str: str, total: int):
|
|
166
164
|
pass
|
|
167
165
|
|
|
168
166
|
def start(self):
|
|
169
167
|
pass
|
|
170
168
|
|
|
171
|
-
def incr(self,
|
|
169
|
+
def incr(self, unique_full_op_id: str, num_inputs: int = 1, num_outputs: int = 1, display_text: str | None = None, **kwargs):
|
|
172
170
|
pass
|
|
173
171
|
|
|
174
172
|
def finish(self):
|
|
@@ -181,7 +179,7 @@ class PZProgressManager(ProgressManager):
|
|
|
181
179
|
super().__init__(plan, num_samples)
|
|
182
180
|
self.console = Console()
|
|
183
181
|
|
|
184
|
-
def add_task(self,
|
|
182
|
+
def add_task(self, unique_full_op_id: str, op_str: str, total: int):
|
|
185
183
|
"""Add a new task to the progress bar"""
|
|
186
184
|
task = self.progress.add_task(
|
|
187
185
|
f"[blue]{op_str}",
|
|
@@ -194,10 +192,10 @@ class PZProgressManager(ProgressManager):
|
|
|
194
192
|
)
|
|
195
193
|
|
|
196
194
|
# store the mapping of operator ID to task ID
|
|
197
|
-
self.
|
|
195
|
+
self.unique_full_op_id_to_task[unique_full_op_id] = task
|
|
198
196
|
|
|
199
197
|
# initialize the stats for this operation
|
|
200
|
-
self.
|
|
198
|
+
self.unique_full_op_id_to_stats[unique_full_op_id] = ProgressStats(start_time=time.time())
|
|
201
199
|
|
|
202
200
|
def start(self):
|
|
203
201
|
# print a newline before starting to separate from previous output
|
|
@@ -209,41 +207,53 @@ class PZProgressManager(ProgressManager):
|
|
|
209
207
|
# start progress bar
|
|
210
208
|
self.progress.start()
|
|
211
209
|
|
|
212
|
-
def incr(self,
|
|
210
|
+
def incr(self, unique_full_op_id: str, num_inputs: int = 1, num_outputs: int = 1, display_text: str | None = None, **kwargs):
|
|
213
211
|
# get the task for the given operation
|
|
214
|
-
task = self.
|
|
212
|
+
task = self.unique_full_op_id_to_task.get(unique_full_op_id)
|
|
215
213
|
|
|
216
214
|
# update statistics with any additional keyword arguments
|
|
217
215
|
if kwargs != {}:
|
|
218
|
-
self.update_stats(
|
|
216
|
+
self.update_stats(unique_full_op_id, **kwargs)
|
|
219
217
|
|
|
220
218
|
# update progress bar and recent text in one update
|
|
221
219
|
if display_text is not None:
|
|
222
|
-
self.
|
|
220
|
+
self.unique_full_op_id_to_stats[unique_full_op_id].recent_text = display_text
|
|
223
221
|
|
|
224
|
-
#
|
|
225
|
-
|
|
226
|
-
delta = num_outputs - 1
|
|
222
|
+
# update the downstream operators' progress bar total for any operator which is not an AggregateOp or LimitScanOp
|
|
223
|
+
delta = num_outputs - num_inputs
|
|
227
224
|
if delta != 0:
|
|
228
|
-
|
|
225
|
+
current_unique_full_op_id = unique_full_op_id
|
|
226
|
+
next_op, next_unique_full_op_id = self.unique_full_op_id_to_next_op_and_id[unique_full_op_id]
|
|
229
227
|
while next_op is not None:
|
|
230
228
|
if not isinstance(next_op, (AggregateOp, LimitScanOp)):
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
229
|
+
next_task = self.unique_full_op_id_to_task[next_unique_full_op_id]
|
|
230
|
+
multiplier = 1
|
|
231
|
+
if isinstance(next_op, JoinOp):
|
|
232
|
+
# for joins, scale the delta by the number of inputs from the other side of the join
|
|
233
|
+
left_input_unique_full_op_id, right_input_unique_input_op_id = self.unique_full_op_id_to_input_unique_full_op_ids[next_unique_full_op_id]
|
|
234
|
+
if current_unique_full_op_id == left_input_unique_full_op_id:
|
|
235
|
+
multiplier = self.get_task_total(right_input_unique_input_op_id)
|
|
236
|
+
elif current_unique_full_op_id == right_input_unique_input_op_id:
|
|
237
|
+
multiplier = self.get_task_total(left_input_unique_full_op_id)
|
|
238
|
+
else:
|
|
239
|
+
raise ValueError(f"Current op ID {current_unique_full_op_id} not found in join inputs {left_input_unique_full_op_id}, {right_input_unique_input_op_id}")
|
|
240
|
+
delta_adjusted = delta * multiplier
|
|
241
|
+
self.progress.update(next_task, total=self.get_task_total(next_unique_full_op_id) + delta_adjusted)
|
|
242
|
+
|
|
243
|
+
# move to the next operator in the plan
|
|
244
|
+
current_unique_full_op_id = next_unique_full_op_id
|
|
245
|
+
next_op, next_unique_full_op_id = self.unique_full_op_id_to_next_op_and_id[next_unique_full_op_id]
|
|
236
246
|
|
|
237
247
|
# advance the progress bar for this task
|
|
238
248
|
self.progress.update(
|
|
239
249
|
task,
|
|
240
|
-
advance=
|
|
241
|
-
description=f"[bold blue]{self.get_task_description(
|
|
242
|
-
cost=self.
|
|
243
|
-
success=self.
|
|
244
|
-
failed=self.
|
|
250
|
+
advance=num_inputs,
|
|
251
|
+
description=f"[bold blue]{self.get_task_description(unique_full_op_id)}",
|
|
252
|
+
cost=self.unique_full_op_id_to_stats[unique_full_op_id].total_cost,
|
|
253
|
+
success=self.unique_full_op_id_to_stats[unique_full_op_id].success_count,
|
|
254
|
+
failed=self.unique_full_op_id_to_stats[unique_full_op_id].failure_count,
|
|
245
255
|
memory=get_memory_usage(),
|
|
246
|
-
recent=f"{self.
|
|
256
|
+
recent=f"{self.unique_full_op_id_to_stats[unique_full_op_id].recent_text}" if display_text is not None else "",
|
|
247
257
|
refresh=True,
|
|
248
258
|
)
|
|
249
259
|
|
|
@@ -251,24 +261,24 @@ class PZProgressManager(ProgressManager):
|
|
|
251
261
|
self.progress.stop()
|
|
252
262
|
|
|
253
263
|
# compute total cost, success, and failure
|
|
254
|
-
total_cost = sum(stats.total_cost for stats in self.
|
|
255
|
-
# success_count = sum(stats.success_count for stats in self.
|
|
256
|
-
# failure_count = sum(stats.failure_count for stats in self.
|
|
264
|
+
total_cost = sum(stats.total_cost for stats in self.unique_full_op_id_to_stats.values())
|
|
265
|
+
# success_count = sum(stats.success_count for stats in self.unique_full_op_id_to_stats.values())
|
|
266
|
+
# failure_count = sum(stats.failure_count for stats in self.unique_full_op_id_to_stats.values())
|
|
257
267
|
|
|
258
268
|
# Print final stats on new lines after progress display
|
|
259
269
|
print(f"Total time: {time.time() - self.start_time:.2f}s")
|
|
260
270
|
print(f"Total cost: ${total_cost:.4f}")
|
|
261
271
|
# print(f"Success rate: {success_count}/{success_count + failure_count}")
|
|
262
272
|
|
|
263
|
-
def update_stats(self,
|
|
273
|
+
def update_stats(self, unique_full_op_id: str, **kwargs):
|
|
264
274
|
"""Update progress statistics"""
|
|
265
275
|
for key, value in kwargs.items():
|
|
266
|
-
if hasattr(self.
|
|
276
|
+
if hasattr(self.unique_full_op_id_to_stats[unique_full_op_id], key):
|
|
267
277
|
if key != "total_cost":
|
|
268
|
-
setattr(self.
|
|
278
|
+
setattr(self.unique_full_op_id_to_stats[unique_full_op_id], key, value)
|
|
269
279
|
else:
|
|
270
|
-
self.
|
|
271
|
-
self.
|
|
280
|
+
self.unique_full_op_id_to_stats[unique_full_op_id].total_cost += value
|
|
281
|
+
self.unique_full_op_id_to_stats[unique_full_op_id].memory_usage_mb = get_memory_usage()
|
|
272
282
|
|
|
273
283
|
class PZSentinelProgressManager(ProgressManager):
|
|
274
284
|
def __init__(self, plan: SentinelPlan, sample_budget: int):
|
|
@@ -313,24 +323,25 @@ class PZSentinelProgressManager(ProgressManager):
|
|
|
313
323
|
)
|
|
314
324
|
self.live_display = Live(self.progress_table, refresh_per_second=10)
|
|
315
325
|
|
|
316
|
-
# initialize mapping from
|
|
317
|
-
self.
|
|
326
|
+
# initialize mapping from unique_logical_op_id --> ProgressStats
|
|
327
|
+
self.unique_logical_op_id_to_stats: dict[str, ProgressStats] = {}
|
|
318
328
|
|
|
319
|
-
# initialize mapping from
|
|
320
|
-
self.
|
|
329
|
+
# initialize mapping from unique_logical_op_id --> task
|
|
330
|
+
self.unique_logical_op_id_to_task = {}
|
|
321
331
|
|
|
322
332
|
# initialize start time
|
|
323
333
|
self.start_time = None
|
|
324
334
|
|
|
325
335
|
# add a task to the progress manager for each operator in the plan
|
|
326
|
-
for logical_op_id, op_set in plan:
|
|
336
|
+
for topo_idx, (logical_op_id, op_set) in enumerate(plan):
|
|
337
|
+
unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
|
|
327
338
|
physical_op = op_set[0]
|
|
328
339
|
is_llm_convert = isinstance(physical_op, LLMConvert)
|
|
329
340
|
is_llm_filter = isinstance(physical_op, LLMFilter)
|
|
330
341
|
op_name = "LLMConvert" if is_llm_convert else "LLMFilter" if is_llm_filter else physical_op.op_name()
|
|
331
|
-
op_str = f"{op_name} ({
|
|
342
|
+
op_str = f"{op_name} ({unique_logical_op_id})"
|
|
332
343
|
total = sample_budget if self._is_llm_op(op_set[0]) else 0
|
|
333
|
-
self.add_task(
|
|
344
|
+
self.add_task(unique_logical_op_id, op_str, total)
|
|
334
345
|
|
|
335
346
|
self.console = Console()
|
|
336
347
|
|
|
@@ -338,14 +349,15 @@ class PZSentinelProgressManager(ProgressManager):
|
|
|
338
349
|
is_llm_convert = isinstance(physical_op, LLMConvert)
|
|
339
350
|
is_llm_filter = isinstance(physical_op, LLMFilter)
|
|
340
351
|
is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
|
|
341
|
-
|
|
352
|
+
is_llm_join = isinstance(physical_op, JoinOp)
|
|
353
|
+
return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join
|
|
342
354
|
|
|
343
|
-
def get_task_description(self,
|
|
355
|
+
def get_task_description(self, unique_logical_op_id: str) -> str:
|
|
344
356
|
"""Return the current description for the given task."""
|
|
345
|
-
task = self.
|
|
357
|
+
task = self.unique_logical_op_id_to_task[unique_logical_op_id]
|
|
346
358
|
return self.op_progress._tasks[task].description
|
|
347
359
|
|
|
348
|
-
def add_task(self,
|
|
360
|
+
def add_task(self, unique_logical_op_id: str, op_str: str, total: int):
|
|
349
361
|
"""Add a new task to the op progress bars"""
|
|
350
362
|
task = self.op_progress.add_task(
|
|
351
363
|
f"[blue]{op_str}",
|
|
@@ -358,10 +370,10 @@ class PZSentinelProgressManager(ProgressManager):
|
|
|
358
370
|
)
|
|
359
371
|
|
|
360
372
|
# store the mapping of operator ID to task ID
|
|
361
|
-
self.
|
|
373
|
+
self.unique_logical_op_id_to_task[unique_logical_op_id] = task
|
|
362
374
|
|
|
363
375
|
# initialize the stats for this operation
|
|
364
|
-
self.
|
|
376
|
+
self.unique_logical_op_id_to_stats[unique_logical_op_id] = ProgressStats(start_time=time.time())
|
|
365
377
|
|
|
366
378
|
def start(self):
|
|
367
379
|
# print a newline before starting to separate from previous output
|
|
@@ -373,29 +385,29 @@ class PZSentinelProgressManager(ProgressManager):
|
|
|
373
385
|
# start progress bars
|
|
374
386
|
self.live_display.start()
|
|
375
387
|
|
|
376
|
-
def incr(self,
|
|
388
|
+
def incr(self, unique_logical_op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
|
|
377
389
|
# TODO: (above) organize progress bars into a Live / Table / Panel or something
|
|
378
390
|
# get the task for the given operation
|
|
379
|
-
task = self.
|
|
391
|
+
task = self.unique_logical_op_id_to_task.get(unique_logical_op_id)
|
|
380
392
|
|
|
381
393
|
# update statistics with any additional keyword arguments
|
|
382
394
|
if kwargs != {}:
|
|
383
|
-
self.update_stats(
|
|
395
|
+
self.update_stats(unique_logical_op_id, **kwargs)
|
|
384
396
|
|
|
385
397
|
# update progress bar and recent text in one update
|
|
386
398
|
if display_text is not None:
|
|
387
|
-
self.
|
|
399
|
+
self.unique_logical_op_id_to_stats[unique_logical_op_id].recent_text = display_text
|
|
388
400
|
|
|
389
|
-
# advance the op progress bar for this
|
|
401
|
+
# advance the op progress bar for this unique_logical_op_id
|
|
390
402
|
self.op_progress.update(
|
|
391
403
|
task,
|
|
392
404
|
advance=num_samples,
|
|
393
|
-
description=f"[bold blue]{self.get_task_description(
|
|
394
|
-
cost=self.
|
|
395
|
-
success=self.
|
|
396
|
-
failed=self.
|
|
405
|
+
description=f"[bold blue]{self.get_task_description(unique_logical_op_id)}",
|
|
406
|
+
cost=self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost,
|
|
407
|
+
success=self.unique_logical_op_id_to_stats[unique_logical_op_id].success_count,
|
|
408
|
+
failed=self.unique_logical_op_id_to_stats[unique_logical_op_id].failure_count,
|
|
397
409
|
memory=get_memory_usage(),
|
|
398
|
-
recent=f"{self.
|
|
410
|
+
recent=f"{self.unique_logical_op_id_to_stats[unique_logical_op_id].recent_text}" if display_text is not None else "",
|
|
399
411
|
refresh=True,
|
|
400
412
|
)
|
|
401
413
|
|
|
@@ -403,7 +415,7 @@ class PZSentinelProgressManager(ProgressManager):
|
|
|
403
415
|
self.overall_progress.update(
|
|
404
416
|
self.overall_task_id,
|
|
405
417
|
advance=num_samples,
|
|
406
|
-
cost=sum(stats.total_cost for _, stats in self.
|
|
418
|
+
cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()),
|
|
407
419
|
refresh=True,
|
|
408
420
|
)
|
|
409
421
|
|
|
@@ -414,24 +426,24 @@ class PZSentinelProgressManager(ProgressManager):
|
|
|
414
426
|
self.live_display.stop()
|
|
415
427
|
|
|
416
428
|
# compute total cost, success, and failure
|
|
417
|
-
total_cost = sum(stats.total_cost for stats in self.
|
|
418
|
-
# success_count = sum(stats.success_count for stats in self.
|
|
419
|
-
# failure_count = sum(stats.failure_count for stats in self.
|
|
429
|
+
total_cost = sum(stats.total_cost for stats in self.unique_logical_op_id_to_stats.values())
|
|
430
|
+
# success_count = sum(stats.success_count for stats in self.unique_logical_op_id_to_stats.values())
|
|
431
|
+
# failure_count = sum(stats.failure_count for stats in self.unique_logical_op_id_to_stats.values())
|
|
420
432
|
|
|
421
433
|
# Print final stats on new lines after progress display
|
|
422
434
|
print(f"Total opt. time: {time.time() - self.start_time:.2f}s")
|
|
423
435
|
print(f"Total opt. cost: ${total_cost:.4f}")
|
|
424
436
|
# print(f"Success rate: {success_count}/{success_count + failure_count}")
|
|
425
437
|
|
|
426
|
-
def update_stats(self,
|
|
438
|
+
def update_stats(self, unique_logical_op_id: str, **kwargs):
|
|
427
439
|
"""Update progress statistics"""
|
|
428
440
|
for key, value in kwargs.items():
|
|
429
|
-
if hasattr(self.
|
|
441
|
+
if hasattr(self.unique_logical_op_id_to_stats[unique_logical_op_id], key):
|
|
430
442
|
if key != "total_cost":
|
|
431
|
-
setattr(self.
|
|
443
|
+
setattr(self.unique_logical_op_id_to_stats[unique_logical_op_id], key, value)
|
|
432
444
|
else:
|
|
433
|
-
self.
|
|
434
|
-
self.
|
|
445
|
+
self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost += value
|
|
446
|
+
self.unique_logical_op_id_to_stats[unique_logical_op_id].memory_usage_mb = get_memory_usage()
|
|
435
447
|
|
|
436
448
|
def create_progress_manager(
|
|
437
449
|
plan: PhysicalPlan | SentinelPlan,
|
|
File without changes
|