pixeltable 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +370 -93
- pixeltable/catalog/column.py +6 -4
- pixeltable/catalog/dir.py +5 -5
- pixeltable/catalog/globals.py +14 -16
- pixeltable/catalog/insertable_table.py +6 -8
- pixeltable/catalog/path.py +14 -7
- pixeltable/catalog/table.py +72 -62
- pixeltable/catalog/table_version.py +137 -107
- pixeltable/catalog/table_version_handle.py +3 -0
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/view.py +10 -14
- pixeltable/dataframe.py +5 -3
- pixeltable/env.py +108 -42
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/aggregation_node.py +6 -8
- pixeltable/exec/cache_prefetch_node.py +4 -7
- pixeltable/exec/component_iteration_node.py +1 -3
- pixeltable/exec/data_row_batch.py +1 -2
- pixeltable/exec/exec_context.py +1 -1
- pixeltable/exec/exec_node.py +1 -2
- pixeltable/exec/expr_eval/__init__.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +137 -20
- pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
- pixeltable/exec/expr_eval/globals.py +68 -7
- pixeltable/exec/expr_eval/schedulers.py +25 -23
- pixeltable/exec/in_memory_data_node.py +8 -6
- pixeltable/exec/row_update_node.py +3 -4
- pixeltable/exec/sql_node.py +16 -18
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +3 -3
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/data_row.py +17 -1
- pixeltable/exprs/expr.py +12 -12
- pixeltable/exprs/function_call.py +34 -2
- pixeltable/exprs/json_mapper.py +95 -48
- pixeltable/exprs/json_path.py +4 -9
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +33 -6
- pixeltable/exprs/similarity_expr.py +1 -1
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/ext/__init__.py +1 -1
- pixeltable/ext/functions/__init__.py +1 -1
- pixeltable/ext/functions/whisperx.py +1 -1
- pixeltable/ext/functions/yolox.py +1 -1
- pixeltable/func/__init__.py +1 -1
- pixeltable/func/aggregate_function.py +2 -2
- pixeltable/func/callable_function.py +3 -6
- pixeltable/func/expr_template_function.py +24 -4
- pixeltable/func/function.py +7 -9
- pixeltable/func/function_registry.py +1 -1
- pixeltable/func/query_template_function.py +87 -4
- pixeltable/func/signature.py +1 -1
- pixeltable/func/tools.py +1 -1
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/anthropic.py +2 -2
- pixeltable/functions/audio.py +1 -1
- pixeltable/functions/deepseek.py +1 -1
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/globals.py +6 -6
- pixeltable/functions/huggingface.py +1 -1
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +2 -2
- pixeltable/functions/replicate.py +1 -1
- pixeltable/functions/string.py +1 -1
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +7 -2
- pixeltable/index/embedding_index.py +12 -1
- pixeltable/io/__init__.py +5 -3
- pixeltable/io/fiftyone.py +6 -7
- pixeltable/io/label_studio.py +21 -20
- pixeltable/io/pandas.py +6 -5
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/metadata/__init__.py +6 -4
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_29.py +1 -1
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +26 -1
- pixeltable/plan.py +2 -3
- pixeltable/share/packager.py +8 -24
- pixeltable/share/publish.py +20 -9
- pixeltable/store.py +9 -6
- pixeltable/type_system.py +19 -7
- pixeltable/utils/console_output.py +3 -2
- pixeltable/utils/coroutine.py +3 -3
- pixeltable/utils/dbms.py +66 -0
- pixeltable/utils/documents.py +61 -67
- pixeltable/utils/exception_handler.py +59 -0
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +3 -2
- pixeltable/utils/pytorch.py +1 -1
- pixeltable/utils/sql.py +1 -1
- pixeltable-0.3.12.dist-info/METADATA +436 -0
- pixeltable-0.3.12.dist-info/RECORD +183 -0
- pixeltable/catalog/path_dict.py +0 -169
- pixeltable-0.3.10.dist-info/METADATA +0 -382
- pixeltable-0.3.10.dist-info/RECORD +0 -179
- {pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/entry_points.txt +0 -0
|
@@ -9,7 +9,7 @@ from typing import Any, Callable, Iterator, Optional, cast
|
|
|
9
9
|
|
|
10
10
|
from pixeltable import exprs, func
|
|
11
11
|
|
|
12
|
-
from .globals import Dispatcher, Evaluator, FnCallArgs
|
|
12
|
+
from .globals import Dispatcher, Evaluator, ExecCtx, FnCallArgs
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
15
15
|
|
|
@@ -26,8 +26,8 @@ class DefaultExprEvaluator(Evaluator):
|
|
|
26
26
|
|
|
27
27
|
e: exprs.Expr
|
|
28
28
|
|
|
29
|
-
def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
|
|
30
|
-
super().__init__(dispatcher)
|
|
29
|
+
def __init__(self, e: exprs.Expr, dispatcher: Dispatcher, exec_ctx: ExecCtx):
|
|
30
|
+
super().__init__(dispatcher, exec_ctx)
|
|
31
31
|
self.e = e
|
|
32
32
|
|
|
33
33
|
def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
|
|
@@ -47,8 +47,8 @@ class DefaultExprEvaluator(Evaluator):
|
|
|
47
47
|
_, _, exc_tb = sys.exc_info()
|
|
48
48
|
row.set_exc(self.e.slot_idx, exc)
|
|
49
49
|
rows_with_excs.add(idx)
|
|
50
|
-
self.dispatcher.dispatch_exc([row], self.e.slot_idx, exc_tb)
|
|
51
|
-
self.dispatcher.dispatch([rows[i] for i in range(len(rows)) if i not in rows_with_excs])
|
|
50
|
+
self.dispatcher.dispatch_exc([row], self.e.slot_idx, exc_tb, self.exec_ctx)
|
|
51
|
+
self.dispatcher.dispatch([rows[i] for i in range(len(rows)) if i not in rows_with_excs], self.exec_ctx)
|
|
52
52
|
|
|
53
53
|
|
|
54
54
|
class FnCallEvaluator(Evaluator):
|
|
@@ -70,8 +70,8 @@ class FnCallEvaluator(Evaluator):
|
|
|
70
70
|
call_args_queue: Optional[asyncio.Queue[FnCallArgs]] # FnCallArgs waiting for execution
|
|
71
71
|
batch_size: Optional[int]
|
|
72
72
|
|
|
73
|
-
def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher):
|
|
74
|
-
super().__init__(dispatcher)
|
|
73
|
+
def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
|
|
74
|
+
super().__init__(dispatcher, exec_ctx)
|
|
75
75
|
self.fn_call = fn_call
|
|
76
76
|
self.fn = cast(func.CallableFunction, fn_call.fn)
|
|
77
77
|
if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
|
|
@@ -104,7 +104,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
104
104
|
rows_call_args.append(FnCallArgs(self.fn_call, [row], args=args, kwargs=kwargs))
|
|
105
105
|
|
|
106
106
|
if len(skip_rows) > 0:
|
|
107
|
-
self.dispatcher.dispatch(skip_rows)
|
|
107
|
+
self.dispatcher.dispatch(skip_rows, self.exec_ctx)
|
|
108
108
|
|
|
109
109
|
if self.batch_size is not None:
|
|
110
110
|
if not self.is_closed and (len(rows_call_args) + self.call_args_queue.qsize() < self.batch_size):
|
|
@@ -132,7 +132,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
132
132
|
if self.fn_call.resource_pool is not None:
|
|
133
133
|
# hand the call off to the resource pool's scheduler
|
|
134
134
|
scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
|
|
135
|
-
scheduler.submit(batched_call_args)
|
|
135
|
+
scheduler.submit(batched_call_args, self.exec_ctx)
|
|
136
136
|
else:
|
|
137
137
|
task = asyncio.create_task(self.eval_batch(batched_call_args))
|
|
138
138
|
self.dispatcher.register_task(task)
|
|
@@ -142,7 +142,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
142
142
|
# hand the call off to the resource pool's scheduler
|
|
143
143
|
scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
|
|
144
144
|
for item in rows_call_args:
|
|
145
|
-
scheduler.submit(item)
|
|
145
|
+
scheduler.submit(item, self.exec_ctx)
|
|
146
146
|
else:
|
|
147
147
|
# create one task per call
|
|
148
148
|
for item in rows_call_args:
|
|
@@ -161,14 +161,12 @@ class FnCallEvaluator(Evaluator):
|
|
|
161
161
|
def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
|
|
162
162
|
"""Roll call_args into a single batched FnCallArgs"""
|
|
163
163
|
batch_args: list[list[Optional[Any]]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
|
|
164
|
-
batch_kwargs: dict[str, list[Optional[Any]]] = {
|
|
165
|
-
k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs.keys()
|
|
166
|
-
}
|
|
164
|
+
batch_kwargs: dict[str, list[Optional[Any]]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
|
|
167
165
|
assert isinstance(self.fn, func.CallableFunction)
|
|
168
166
|
for i, item in enumerate(call_args):
|
|
169
167
|
for j in range(len(item.args)):
|
|
170
168
|
batch_args[j][i] = item.args[j]
|
|
171
|
-
for k in item.kwargs
|
|
169
|
+
for k in item.kwargs:
|
|
172
170
|
batch_kwargs[k][i] = item.kwargs[k]
|
|
173
171
|
return FnCallArgs(
|
|
174
172
|
self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
|
|
@@ -190,12 +188,12 @@ class FnCallEvaluator(Evaluator):
|
|
|
190
188
|
_, _, exc_tb = sys.exc_info()
|
|
191
189
|
for row in batched_call_args.rows:
|
|
192
190
|
row.set_exc(self.fn_call.slot_idx, exc)
|
|
193
|
-
self.dispatcher.dispatch_exc(batched_call_args.rows, self.fn_call.slot_idx, exc_tb)
|
|
191
|
+
self.dispatcher.dispatch_exc(batched_call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
|
|
194
192
|
return
|
|
195
193
|
|
|
196
194
|
for i, row in enumerate(batched_call_args.rows):
|
|
197
195
|
row[self.fn_call.slot_idx] = result_batch[i]
|
|
198
|
-
self.dispatcher.dispatch(batched_call_args.rows)
|
|
196
|
+
self.dispatcher.dispatch(batched_call_args.rows, self.exec_ctx)
|
|
199
197
|
|
|
200
198
|
async def eval_async(self, call_args: FnCallArgs) -> None:
|
|
201
199
|
assert len(call_args.rows) == 1
|
|
@@ -208,7 +206,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
208
206
|
call_args.row[self.fn_call.slot_idx] = await self.fn.aexec(*call_args.args, **call_args.kwargs)
|
|
209
207
|
end_ts = datetime.datetime.now()
|
|
210
208
|
_logger.debug(f'Evaluated slot {self.fn_call.slot_idx} in {end_ts - start_ts}')
|
|
211
|
-
self.dispatcher.dispatch([call_args.row])
|
|
209
|
+
self.dispatcher.dispatch([call_args.row], self.exec_ctx)
|
|
212
210
|
except Exception as exc:
|
|
213
211
|
import anthropic
|
|
214
212
|
|
|
@@ -216,7 +214,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
216
214
|
_logger.debug(f'RateLimitError: {exc}')
|
|
217
215
|
_, _, exc_tb = sys.exc_info()
|
|
218
216
|
call_args.row.set_exc(self.fn_call.slot_idx, exc)
|
|
219
|
-
self.dispatcher.dispatch_exc(call_args.rows, self.fn_call.slot_idx, exc_tb)
|
|
217
|
+
self.dispatcher.dispatch_exc(call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
|
|
220
218
|
|
|
221
219
|
async def eval(self, call_args_batch: list[FnCallArgs]) -> None:
|
|
222
220
|
rows_with_excs: set[int] = set() # records idxs into 'rows'
|
|
@@ -233,9 +231,9 @@ class FnCallEvaluator(Evaluator):
|
|
|
233
231
|
_, _, exc_tb = sys.exc_info()
|
|
234
232
|
item.row.set_exc(self.fn_call.slot_idx, exc)
|
|
235
233
|
rows_with_excs.add(idx)
|
|
236
|
-
self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
|
|
234
|
+
self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
|
|
237
235
|
self.dispatcher.dispatch(
|
|
238
|
-
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
|
|
236
|
+
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs], self.exec_ctx
|
|
239
237
|
)
|
|
240
238
|
|
|
241
239
|
def _close(self) -> None:
|
|
@@ -246,3 +244,122 @@ class FnCallEvaluator(Evaluator):
|
|
|
246
244
|
batched_call_args = self._create_batch_call_args(list(self._queued_call_args_iter()))
|
|
247
245
|
task = asyncio.create_task(self.eval_batch(batched_call_args))
|
|
248
246
|
self.dispatcher.register_task(task)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class NestedRowList:
|
|
250
|
+
"""
|
|
251
|
+
A list of nested rows, used by JsonMapperDispatcher to store the rows corresponding to the elements of the
|
|
252
|
+
JsonMapper source list and make completion awaitable.
|
|
253
|
+
"""
|
|
254
|
+
|
|
255
|
+
rows: list[exprs.DataRow]
|
|
256
|
+
num_completed: int
|
|
257
|
+
completion: asyncio.Event
|
|
258
|
+
|
|
259
|
+
def __init__(self, rows: list[exprs.DataRow]):
|
|
260
|
+
self.num_completed = 0
|
|
261
|
+
self.rows = rows
|
|
262
|
+
self.completion = asyncio.Event()
|
|
263
|
+
|
|
264
|
+
def complete_row(self) -> None:
|
|
265
|
+
self.num_completed += 1
|
|
266
|
+
if self.num_completed == len(self.rows):
|
|
267
|
+
self.completion.set()
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
class JsonMapperDispatcher(Evaluator):
|
|
271
|
+
"""
|
|
272
|
+
The execution logic for materializing the nested DataRows of a JsonMapper/JsonMapperDispatch.
|
|
273
|
+
|
|
274
|
+
The rows are stored in a NestedRowList, which itself is stored in the JsonMapperDispatch instance's slot.
|
|
275
|
+
"""
|
|
276
|
+
|
|
277
|
+
e: exprs.JsonMapperDispatch
|
|
278
|
+
target_expr: exprs.Expr
|
|
279
|
+
scope_anchor: exprs.ObjectRef
|
|
280
|
+
nested_exec_ctx: ExecCtx # ExecCtx needed to evaluate the nested rows
|
|
281
|
+
external_slot_map: dict[int, int] # slot idx in parent row -> slot idx in nested row
|
|
282
|
+
has_async_calls: bool # True if target_expr contains any async FunctionCalls
|
|
283
|
+
|
|
284
|
+
def __init__(self, e: exprs.JsonMapperDispatch, dispatcher: Dispatcher, exec_ctx: ExecCtx):
|
|
285
|
+
super().__init__(dispatcher, exec_ctx)
|
|
286
|
+
self.e = e
|
|
287
|
+
self.target_expr = e.target_expr.copy() # we need new slot idxs
|
|
288
|
+
self.scope_anchor = e.scope_anchor.copy()
|
|
289
|
+
nested_row_builder = exprs.RowBuilder(output_exprs=[self.target_expr], columns=[], input_exprs=[])
|
|
290
|
+
nested_row_builder.set_slot_idxs([self.target_expr, self.scope_anchor])
|
|
291
|
+
target_expr_ctx = nested_row_builder.create_eval_ctx([self.target_expr], limit_scope=True)
|
|
292
|
+
self.has_async_calls = any(isinstance(e, exprs.FunctionCall) and e.is_async for e in target_expr_ctx.exprs)
|
|
293
|
+
target_scope = self.target_expr.scope()
|
|
294
|
+
# we need to pre-populate nested rows with slot values that are produced in an outer scope (literals excluded)
|
|
295
|
+
parent_exprs = [
|
|
296
|
+
e for e in target_expr_ctx.exprs if e.scope() != target_scope and not isinstance(e, exprs.Literal)
|
|
297
|
+
]
|
|
298
|
+
self.external_slot_map = {exec_ctx.row_builder.unique_exprs[e].slot_idx: e.slot_idx for e in parent_exprs}
|
|
299
|
+
self.nested_exec_ctx = ExecCtx(dispatcher, nested_row_builder, [self.target_expr], parent_exprs)
|
|
300
|
+
|
|
301
|
+
def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
|
|
302
|
+
"""Create nested rows for all source list elements and dispatch them"""
|
|
303
|
+
assert self.e.slot_idx >= 0
|
|
304
|
+
all_nested_rows: list[exprs.DataRow] = []
|
|
305
|
+
for row in rows:
|
|
306
|
+
src = row[self.e.src_expr.slot_idx]
|
|
307
|
+
if not isinstance(src, list):
|
|
308
|
+
# invalid/non-list src path
|
|
309
|
+
row[self.e.slot_idx] = None
|
|
310
|
+
continue
|
|
311
|
+
|
|
312
|
+
nested_rows = [
|
|
313
|
+
exprs.DataRow(
|
|
314
|
+
size=self.nested_exec_ctx.row_builder.num_materialized,
|
|
315
|
+
img_slot_idxs=[],
|
|
316
|
+
media_slot_idxs=[],
|
|
317
|
+
array_slot_idxs=[],
|
|
318
|
+
parent_row=row,
|
|
319
|
+
parent_slot_idx=self.e.slot_idx,
|
|
320
|
+
)
|
|
321
|
+
for _ in src
|
|
322
|
+
]
|
|
323
|
+
for nested_row, anchor_val in zip(nested_rows, src):
|
|
324
|
+
nested_row[self.scope_anchor.slot_idx] = anchor_val
|
|
325
|
+
for slot_idx_, nested_slot_idx in self.external_slot_map.items():
|
|
326
|
+
nested_row[nested_slot_idx] = row[slot_idx_]
|
|
327
|
+
self.nested_exec_ctx.init_rows(nested_rows)
|
|
328
|
+
|
|
329
|
+
# we modify DataRow.vals here directly, rather than going through __getitem__(), because we don't have
|
|
330
|
+
# an official "value" yet (the nested rows are not yet materialized)
|
|
331
|
+
row.vals[self.e.slot_idx] = NestedRowList(nested_rows)
|
|
332
|
+
all_nested_rows.extend(nested_rows)
|
|
333
|
+
|
|
334
|
+
self.dispatcher.dispatch(all_nested_rows, self.nested_exec_ctx)
|
|
335
|
+
task = asyncio.create_task(self.gather(rows))
|
|
336
|
+
self.dispatcher.register_task(task)
|
|
337
|
+
|
|
338
|
+
async def gather(self, rows: list[exprs.DataRow]) -> None:
|
|
339
|
+
"""Wait for nested rows to complete, then signal completion to the parent rows"""
|
|
340
|
+
if self.has_async_calls:
|
|
341
|
+
# if our target expr contains async FunctionCalls, they typically get completed out-of-order, and it's
|
|
342
|
+
# more effective to dispatch them as they complete
|
|
343
|
+
remaining = {
|
|
344
|
+
asyncio.create_task(row.vals[self.e.slot_idx].completion.wait()): row
|
|
345
|
+
for row in rows
|
|
346
|
+
if not row.has_val[self.e.slot_idx]
|
|
347
|
+
}
|
|
348
|
+
while len(remaining) > 0:
|
|
349
|
+
done, _ = await asyncio.wait(remaining.keys(), return_when=asyncio.FIRST_COMPLETED)
|
|
350
|
+
done_rows = [remaining.pop(task) for task in done]
|
|
351
|
+
for row in done_rows:
|
|
352
|
+
row.has_val[self.e.slot_idx] = True
|
|
353
|
+
self.dispatcher.dispatch(done_rows, self.exec_ctx)
|
|
354
|
+
|
|
355
|
+
else:
|
|
356
|
+
# our target expr doesn't contain async FunctionCalls, which means they will get completed in-order
|
|
357
|
+
for row in rows:
|
|
358
|
+
if row.has_val[self.e.slot_idx]:
|
|
359
|
+
# the source_expr's value is not a list
|
|
360
|
+
assert row.vals[self.e.slot_idx] is None
|
|
361
|
+
continue
|
|
362
|
+
assert row.vals[self.e.slot_idx] is not None and isinstance(row.vals[self.e.slot_idx], NestedRowList)
|
|
363
|
+
await row.vals[self.e.slot_idx].completion.wait()
|
|
364
|
+
row.has_val[self.e.slot_idx] = True
|
|
365
|
+
self.dispatcher.dispatch(rows, self.exec_ctx)
|
|
@@ -9,12 +9,12 @@ from typing import AsyncIterator, Iterable, Optional, Union
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
|
|
11
11
|
import pixeltable.exceptions as excs
|
|
12
|
-
from pixeltable import exprs
|
|
12
|
+
from pixeltable import exprs
|
|
13
13
|
|
|
14
14
|
from ..data_row_batch import DataRowBatch
|
|
15
15
|
from ..exec_node import ExecNode
|
|
16
|
-
from .evaluators import
|
|
17
|
-
from .globals import
|
|
16
|
+
from .evaluators import FnCallEvaluator, NestedRowList
|
|
17
|
+
from .globals import ExecCtx, Scheduler
|
|
18
18
|
from .row_buffer import RowBuffer
|
|
19
19
|
from .schedulers import SCHEDULERS
|
|
20
20
|
|
|
@@ -42,12 +42,9 @@ class ExprEvalNode(ExecNode):
|
|
|
42
42
|
"""
|
|
43
43
|
|
|
44
44
|
maintain_input_order: bool # True if we're returning rows in the order we received them from our input
|
|
45
|
-
num_dependencies: np.ndarray # number of dependencies for our output slots; indexed by slot idx
|
|
46
45
|
outputs: np.ndarray # bool per slot; True if this slot is part of our output
|
|
47
|
-
slot_evaluators: dict[int, Evaluator] # key: slot idx
|
|
48
46
|
schedulers: dict[str, Scheduler] # key: resource pool name
|
|
49
|
-
|
|
50
|
-
eval_ctx: np.ndarray # bool per slot; EvalCtx.slot_idxs as a mask
|
|
47
|
+
exec_ctx: ExecCtx # for input/output rows
|
|
51
48
|
|
|
52
49
|
# execution state
|
|
53
50
|
tasks: set[asyncio.Task] # collects all running tasks to prevent them from getting gc'd
|
|
@@ -82,19 +79,10 @@ class ExprEvalNode(ExecNode):
|
|
|
82
79
|
):
|
|
83
80
|
super().__init__(row_builder, output_exprs, input_exprs, input)
|
|
84
81
|
self.maintain_input_order = maintain_input_order
|
|
85
|
-
self.num_dependencies = np.sum(row_builder.dependencies, axis=1)
|
|
86
82
|
self.outputs = np.zeros(row_builder.num_materialized, dtype=bool)
|
|
87
83
|
output_slot_idxs = [e.slot_idx for e in output_exprs]
|
|
88
84
|
self.outputs[output_slot_idxs] = True
|
|
89
85
|
self.tasks = set()
|
|
90
|
-
|
|
91
|
-
self.gc_targets = np.ones(row_builder.num_materialized, dtype=bool)
|
|
92
|
-
# we need to retain all slots that are part of the output
|
|
93
|
-
self.gc_targets[[e.slot_idx for e in row_builder.output_exprs]] = False
|
|
94
|
-
|
|
95
|
-
output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
|
|
96
|
-
self.eval_ctx = np.zeros(row_builder.num_materialized, dtype=bool)
|
|
97
|
-
self.eval_ctx[output_ctx.slot_idxs] = True
|
|
98
86
|
self.error = None
|
|
99
87
|
|
|
100
88
|
self.input_iter = self.input.__aiter__()
|
|
@@ -110,30 +98,14 @@ class ExprEvalNode(ExecNode):
|
|
|
110
98
|
self.num_input_rows = 0
|
|
111
99
|
self.num_output_rows = 0
|
|
112
100
|
|
|
113
|
-
self.slot_evaluators = {}
|
|
101
|
+
# self.slot_evaluators = {}
|
|
114
102
|
self.schedulers = {}
|
|
115
|
-
self._init_slot_evaluators()
|
|
103
|
+
# self._init_slot_evaluators()
|
|
104
|
+
self.exec_ctx = ExecCtx(self, self.row_builder, output_exprs, input_exprs)
|
|
116
105
|
|
|
117
106
|
def set_input_order(self, maintain_input_order: bool) -> None:
|
|
118
107
|
self.maintain_input_order = maintain_input_order
|
|
119
108
|
|
|
120
|
-
def _init_slot_evaluators(self) -> None:
|
|
121
|
-
"""Create slot evaluators and resource pool schedulers"""
|
|
122
|
-
resource_pools: set[str] = set()
|
|
123
|
-
for slot_idx in range(self.row_builder.num_materialized):
|
|
124
|
-
expr = self.row_builder.unique_exprs[slot_idx]
|
|
125
|
-
if (
|
|
126
|
-
isinstance(expr, exprs.FunctionCall)
|
|
127
|
-
# ExprTemplateFunction and AggregateFunction calls are best handled by FunctionCall.eval()
|
|
128
|
-
and not isinstance(expr.fn, func.ExprTemplateFunction)
|
|
129
|
-
and not isinstance(expr.fn, func.AggregateFunction)
|
|
130
|
-
):
|
|
131
|
-
if expr.resource_pool is not None:
|
|
132
|
-
resource_pools.add(expr.resource_pool)
|
|
133
|
-
self.slot_evaluators[slot_idx] = FnCallEvaluator(expr, self)
|
|
134
|
-
else:
|
|
135
|
-
self.slot_evaluators[slot_idx] = DefaultExprEvaluator(expr, self)
|
|
136
|
-
|
|
137
109
|
async def _fetch_input_batch(self) -> None:
|
|
138
110
|
"""
|
|
139
111
|
Fetches another batch from our input or sets input_complete to True if there are no more batches.
|
|
@@ -155,7 +127,8 @@ class ExprEvalNode(ExecNode):
|
|
|
155
127
|
self.num_input_rows += len(batch)
|
|
156
128
|
self.avail_input_rows += len(batch)
|
|
157
129
|
_logger.debug(
|
|
158
|
-
f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows}
|
|
130
|
+
f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} '
|
|
131
|
+
f'#avail={self.avail_input_rows}'
|
|
159
132
|
)
|
|
160
133
|
except StopAsyncIteration:
|
|
161
134
|
self.input_complete = True
|
|
@@ -199,8 +172,8 @@ class ExprEvalNode(ExecNode):
|
|
|
199
172
|
self.num_in_flight += num_rows
|
|
200
173
|
self._log_state(f'dispatch input ({num_rows})')
|
|
201
174
|
|
|
202
|
-
self.
|
|
203
|
-
self.dispatch(rows)
|
|
175
|
+
self.exec_ctx.init_rows(rows)
|
|
176
|
+
self.dispatch(rows, self.exec_ctx)
|
|
204
177
|
|
|
205
178
|
def _log_state(self, prefix: str) -> None:
|
|
206
179
|
_logger.debug(
|
|
@@ -212,7 +185,9 @@ class ExprEvalNode(ExecNode):
|
|
|
212
185
|
|
|
213
186
|
def _init_schedulers(self) -> None:
|
|
214
187
|
resource_pools = {
|
|
215
|
-
eval.fn_call.resource_pool
|
|
188
|
+
eval.fn_call.resource_pool
|
|
189
|
+
for eval in self.exec_ctx.slot_evaluators.values()
|
|
190
|
+
if isinstance(eval, FnCallEvaluator)
|
|
216
191
|
}
|
|
217
192
|
resource_pools = {pool for pool in resource_pools if pool is not None}
|
|
218
193
|
for pool_name in resource_pools:
|
|
@@ -287,7 +262,7 @@ class ExprEvalNode(ExecNode):
|
|
|
287
262
|
if self.input_complete and self.avail_input_rows == 0 and not closed_evaluators:
|
|
288
263
|
# no more input rows to dispatch, but we're still waiting for rows to finish:
|
|
289
264
|
# close all slot evaluators to flush queued rows
|
|
290
|
-
for evaluator in self.slot_evaluators.values():
|
|
265
|
+
for evaluator in self.exec_ctx.slot_evaluators.values():
|
|
291
266
|
evaluator.close()
|
|
292
267
|
closed_evaluators = True
|
|
293
268
|
|
|
@@ -303,7 +278,7 @@ class ExprEvalNode(ExecNode):
|
|
|
303
278
|
if completed_aw is None:
|
|
304
279
|
completed_aw = asyncio.create_task(self.completed_event.wait(), name='completed.wait()')
|
|
305
280
|
aws.add(completed_aw)
|
|
306
|
-
done,
|
|
281
|
+
done, _ = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
|
|
307
282
|
|
|
308
283
|
if self.exc_event.is_set():
|
|
309
284
|
# we got an exception that we need to propagate through __iter__()
|
|
@@ -332,22 +307,18 @@ class ExprEvalNode(ExecNode):
|
|
|
332
307
|
task.cancel()
|
|
333
308
|
_ = await asyncio.gather(*active_tasks, return_exceptions=True)
|
|
334
309
|
|
|
335
|
-
def
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
row.missing_dependents = np.sum(self.row_builder.dependencies[row.has_val == False], axis=0)
|
|
339
|
-
row.missing_slots = self.eval_ctx & (row.has_val == False)
|
|
340
|
-
|
|
341
|
-
def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
|
|
310
|
+
def dispatch_exc(
|
|
311
|
+
self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
|
|
312
|
+
) -> None:
|
|
342
313
|
"""Propagate exception to main event loop or to dependent slots, depending on ignore_errors"""
|
|
343
314
|
if len(rows) == 0 or self.exc_event.is_set():
|
|
344
315
|
return
|
|
345
316
|
|
|
346
317
|
if not self.ctx.ignore_errors:
|
|
347
|
-
dependency_idxs = [e.slot_idx for e in
|
|
318
|
+
dependency_idxs = [e.slot_idx for e in exec_ctx.row_builder.unique_exprs[slot_with_exc].dependencies()]
|
|
348
319
|
first_row = rows[0]
|
|
349
320
|
input_vals = [first_row[idx] for idx in dependency_idxs]
|
|
350
|
-
e =
|
|
321
|
+
e = exec_ctx.row_builder.unique_exprs[slot_with_exc]
|
|
351
322
|
self.error = excs.ExprEvalError(e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
|
|
352
323
|
self.exc_event.set()
|
|
353
324
|
return
|
|
@@ -356,17 +327,17 @@ class ExprEvalNode(ExecNode):
|
|
|
356
327
|
assert row.has_exc(slot_with_exc)
|
|
357
328
|
exc = row.get_exc(slot_with_exc)
|
|
358
329
|
# propagate exception
|
|
359
|
-
for slot_idx in np.nonzero(
|
|
330
|
+
for slot_idx in np.nonzero(exec_ctx.row_builder.transitive_dependents[slot_with_exc])[0].tolist():
|
|
360
331
|
row.set_exc(slot_idx, exc)
|
|
361
|
-
self.dispatch(rows)
|
|
332
|
+
self.dispatch(rows, exec_ctx)
|
|
362
333
|
|
|
363
|
-
def dispatch(self, rows: list[exprs.DataRow]) -> None:
|
|
334
|
+
def dispatch(self, rows: list[exprs.DataRow], exec_ctx: ExecCtx) -> None:
|
|
364
335
|
"""Dispatch rows to slot evaluators, based on materialized dependencies"""
|
|
365
336
|
if len(rows) == 0 or self.exc_event.is_set():
|
|
366
337
|
return
|
|
367
338
|
|
|
368
339
|
# slots ready for evaluation; rows x slots
|
|
369
|
-
ready_slots = np.zeros((len(rows),
|
|
340
|
+
ready_slots = np.zeros((len(rows), exec_ctx.row_builder.num_materialized), dtype=bool)
|
|
370
341
|
completed_rows = np.zeros(len(rows), dtype=bool)
|
|
371
342
|
for i, row in enumerate(rows):
|
|
372
343
|
row.missing_slots &= row.has_val == False
|
|
@@ -375,25 +346,33 @@ class ExprEvalNode(ExecNode):
|
|
|
375
346
|
completed_rows[i] = True
|
|
376
347
|
else:
|
|
377
348
|
# dependencies of missing slots
|
|
378
|
-
missing_dependencies =
|
|
349
|
+
missing_dependencies = exec_ctx.row_builder.num_dependencies * row.missing_slots
|
|
379
350
|
# determine ready slots that are not yet materialized and not yet scheduled
|
|
380
|
-
num_mat_dependencies = np.sum(
|
|
351
|
+
num_mat_dependencies = np.sum(exec_ctx.row_builder.dependencies * row.has_val, axis=1)
|
|
381
352
|
num_missing = missing_dependencies - num_mat_dependencies
|
|
382
353
|
ready_slots[i] = (num_missing == 0) & (row.is_scheduled == False) & row.missing_slots
|
|
383
|
-
row.is_scheduled
|
|
354
|
+
row.is_scheduled |= ready_slots[i]
|
|
384
355
|
|
|
385
356
|
# clear intermediate values that are no longer needed (ie, all dependents are materialized)
|
|
386
|
-
missing_dependents = np.sum(
|
|
387
|
-
gc_targets = (missing_dependents == 0) & (row.missing_dependents > 0) &
|
|
357
|
+
missing_dependents = np.sum(exec_ctx.row_builder.dependencies[row.has_val == False], axis=0)
|
|
358
|
+
gc_targets = (missing_dependents == 0) & (row.missing_dependents > 0) & exec_ctx.gc_targets
|
|
388
359
|
row.clear(gc_targets)
|
|
389
360
|
row.missing_dependents = missing_dependents
|
|
390
361
|
|
|
391
362
|
if np.any(completed_rows):
|
|
392
363
|
completed_idxs = list(completed_rows.nonzero()[0])
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
364
|
+
if rows[i].parent_row is not None:
|
|
365
|
+
# these are nested rows
|
|
366
|
+
for i in completed_idxs:
|
|
367
|
+
row = rows[i]
|
|
368
|
+
assert row.parent_row is not None and row.parent_slot_idx is not None
|
|
369
|
+
assert isinstance(row.parent_row.vals[row.parent_slot_idx], NestedRowList)
|
|
370
|
+
row.parent_row.vals[row.parent_slot_idx].complete_row()
|
|
371
|
+
else:
|
|
372
|
+
for i in completed_idxs:
|
|
373
|
+
self.completed_rows.put_nowait(rows[i])
|
|
374
|
+
self.completed_event.set()
|
|
375
|
+
self.num_in_flight -= len(completed_idxs)
|
|
397
376
|
|
|
398
377
|
# schedule all ready slots
|
|
399
378
|
for slot_idx in np.sum(ready_slots, axis=0).nonzero()[0]:
|
|
@@ -401,7 +380,7 @@ class ExprEvalNode(ExecNode):
|
|
|
401
380
|
_ = ready_rows_v.nonzero()
|
|
402
381
|
ready_rows = [rows[i] for i in ready_rows_v.nonzero()[0]]
|
|
403
382
|
_logger.debug(f'Scheduling {len(ready_rows)} rows for slot {slot_idx}')
|
|
404
|
-
|
|
383
|
+
exec_ctx.slot_evaluators[slot_idx].schedule(ready_rows, slot_idx)
|
|
405
384
|
|
|
406
385
|
def register_task(self, t: asyncio.Task) -> None:
|
|
407
386
|
self.tasks.add(t)
|
|
@@ -4,7 +4,9 @@ import abc
|
|
|
4
4
|
import asyncio
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from types import TracebackType
|
|
7
|
-
from typing import Any, Optional, Protocol
|
|
7
|
+
from typing import Any, Iterable, Optional, Protocol
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
8
10
|
|
|
9
11
|
from pixeltable import exprs, func
|
|
10
12
|
|
|
@@ -53,6 +55,7 @@ class Scheduler(abc.ABC):
|
|
|
53
55
|
|
|
54
56
|
request: FnCallArgs
|
|
55
57
|
num_retries: int
|
|
58
|
+
exec_ctx: ExecCtx
|
|
56
59
|
|
|
57
60
|
def __lt__(self, other: Scheduler.QueueItem) -> bool:
|
|
58
61
|
# prioritize by number of retries (more retries = higher priority)
|
|
@@ -67,8 +70,8 @@ class Scheduler(abc.ABC):
|
|
|
67
70
|
self.queue = asyncio.PriorityQueue()
|
|
68
71
|
self.dispatcher = dispatcher
|
|
69
72
|
|
|
70
|
-
def submit(self, item: FnCallArgs) -> None:
|
|
71
|
-
self.queue.put_nowait(self.QueueItem(item, 0))
|
|
73
|
+
def submit(self, item: FnCallArgs, exec_ctx: ExecCtx) -> None:
|
|
74
|
+
self.queue.put_nowait(self.QueueItem(item, 0, exec_ctx))
|
|
72
75
|
|
|
73
76
|
@classmethod
|
|
74
77
|
@abc.abstractmethod
|
|
@@ -90,11 +93,11 @@ class Dispatcher(Protocol):
|
|
|
90
93
|
exc_event: asyncio.Event
|
|
91
94
|
schedulers: dict[str, Scheduler] # key: resource pool id
|
|
92
95
|
|
|
93
|
-
def dispatch(self, rows: list[exprs.DataRow]) -> None:
|
|
96
|
+
def dispatch(self, rows: list[exprs.DataRow], exec_ctx: Any) -> None:
|
|
94
97
|
"""Dispatches row slots to the appropriate schedulers; does not block"""
|
|
95
98
|
...
|
|
96
99
|
|
|
97
|
-
def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
|
|
100
|
+
def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: Any) -> None:
|
|
98
101
|
"""Propagates exception in slot_with_exc to all dependent slots and dispatches the rest; does not block"""
|
|
99
102
|
...
|
|
100
103
|
|
|
@@ -116,15 +119,16 @@ class Evaluator(abc.ABC):
|
|
|
116
119
|
|
|
117
120
|
dispatcher: Dispatcher
|
|
118
121
|
is_closed: bool
|
|
122
|
+
exec_ctx: 'ExecCtx'
|
|
119
123
|
|
|
120
|
-
def __init__(self, dispatcher: Dispatcher):
|
|
124
|
+
def __init__(self, dispatcher: Dispatcher, exec_ctx: 'ExecCtx') -> None:
|
|
121
125
|
self.dispatcher = dispatcher
|
|
122
126
|
self.is_closed = False
|
|
127
|
+
self.exec_ctx = exec_ctx
|
|
123
128
|
|
|
124
129
|
@abc.abstractmethod
|
|
125
130
|
def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
|
|
126
131
|
"""Create tasks to evaluate the expression in the given slot for the given rows; must not block."""
|
|
127
|
-
...
|
|
128
132
|
|
|
129
133
|
def _close(self) -> None:
|
|
130
134
|
"""Close the evaluator; must not block"""
|
|
@@ -134,3 +138,60 @@ class Evaluator(abc.ABC):
|
|
|
134
138
|
"""Indicates that there may not be any more rows getting scheduled"""
|
|
135
139
|
self.is_closed = True
|
|
136
140
|
self._close()
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class ExecCtx:
    """Holds the DataRow-related state that ExprEvalNode needs while evaluating expressions.

    The state consists of the row builder, one evaluator per non-literal slot, two per-slot
    boolean masks (gc_targets, eval_ctx) and the literal values used to pre-populate rows.
    """

    row_builder: exprs.RowBuilder
    slot_evaluators: dict[int, Evaluator]  # slot idx -> evaluator responsible for that slot
    gc_targets: np.ndarray  # bool per slot; True for intermediate exprs (ie, not part of our output)
    eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs expressed as a mask
    literals: dict[int, Any]  # slot idx -> literal value for that slot; used to pre-populate rows

    def __init__(
        self,
        dispatcher: Dispatcher,
        row_builder: exprs.RowBuilder,
        output_exprs: Iterable[exprs.Expr],
        input_exprs: Iterable[exprs.Expr],
    ):
        """Build the masks, the literal table and the per-slot evaluators.

        Args:
            dispatcher: handed to every evaluator created for a slot
            row_builder: source of slot counts, unique exprs and the eval ctx
            output_exprs: exprs passed to row_builder.create_eval_ctx()
            input_exprs: exprs excluded from that eval ctx
        """
        self.row_builder = row_builder
        self.slot_evaluators = {}
        num_slots = row_builder.num_materialized

        # TODO: only include output_exprs dependencies
        # start with every slot marked as a gc target, then clear the mark for all slots
        # that are part of the row builder's output, because those need to be retained
        gc_mask = np.ones(num_slots, dtype=bool)
        retained_slot_idxs = [e.slot_idx for e in row_builder.output_exprs]
        gc_mask[retained_slot_idxs] = False
        self.gc_targets = gc_mask

        output_ctx = row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)

        # split the exprs of the eval ctx into literals (recorded by value) and everything
        # else (flagged in eval_ctx and assigned an evaluator)
        literal_vals: dict[int, Any] = {}
        non_literal_slot_idxs: list[int] = []
        for e in output_ctx.exprs:
            if isinstance(e, exprs.Literal):
                literal_vals[e.slot_idx] = e.val
            else:
                non_literal_slot_idxs.append(e.slot_idx)
        self.literals = literal_vals

        eval_mask = np.zeros(num_slots, dtype=bool)
        eval_mask[non_literal_slot_idxs] = True
        self.eval_ctx = eval_mask

        self._init_slot_evaluators(dispatcher, non_literal_slot_idxs)

    def _init_slot_evaluators(self, dispatcher: Dispatcher, target_slot_idxs: list[int]) -> None:
        """Create one evaluator for each slot in target_slot_idxs and store it in slot_evaluators."""
        from .evaluators import DefaultExprEvaluator, FnCallEvaluator, JsonMapperDispatcher

        for slot_idx in target_slot_idxs:
            expr = self.row_builder.unique_exprs[slot_idx]
            # ExprTemplateFunction and AggregateFunction calls are best handled by FunctionCall.eval(),
            # which is why they don't get a FnCallEvaluator
            uses_fn_call_evaluator = isinstance(expr, exprs.FunctionCall) and not isinstance(
                expr.fn, (func.ExprTemplateFunction, func.AggregateFunction)
            )
            if uses_fn_call_evaluator:
                evaluator = FnCallEvaluator(expr, dispatcher, self)
            elif isinstance(expr, exprs.JsonMapperDispatch):
                evaluator = JsonMapperDispatcher(expr, dispatcher, self)
            else:
                evaluator = DefaultExprEvaluator(expr, dispatcher, self)
            self.slot_evaluators[slot_idx] = evaluator

    def init_rows(self, rows: list[exprs.DataRow]) -> None:
        """Pre-populate rows with literals and initialize execution state"""
        for row in rows:
            # literals have to be assigned first: the masks below are derived from has_val
            for slot_idx, literal_val in self.literals.items():
                row[slot_idx] = literal_val
            # elementwise comparison; has_val is presumably a per-slot bool array -- TODO confirm
            unset = row.has_val == False  # noqa: E712
            row.missing_dependents = np.sum(self.row_builder.dependencies[unset], axis=0)
            row.missing_slots = self.eval_ctx & unset
|