pixeltable 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (106)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -1
  4. pixeltable/catalog/catalog.py +63 -36
  5. pixeltable/catalog/column.py +6 -4
  6. pixeltable/catalog/dir.py +5 -5
  7. pixeltable/catalog/globals.py +12 -14
  8. pixeltable/catalog/insertable_table.py +4 -7
  9. pixeltable/catalog/path.py +2 -2
  10. pixeltable/catalog/table.py +64 -56
  11. pixeltable/catalog/table_version.py +42 -40
  12. pixeltable/catalog/table_version_handle.py +3 -0
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/view.py +8 -7
  15. pixeltable/dataframe.py +5 -3
  16. pixeltable/env.py +108 -42
  17. pixeltable/exec/__init__.py +2 -0
  18. pixeltable/exec/aggregation_node.py +6 -8
  19. pixeltable/exec/cache_prefetch_node.py +4 -7
  20. pixeltable/exec/component_iteration_node.py +1 -3
  21. pixeltable/exec/data_row_batch.py +1 -2
  22. pixeltable/exec/exec_context.py +1 -1
  23. pixeltable/exec/exec_node.py +1 -2
  24. pixeltable/exec/expr_eval/__init__.py +2 -0
  25. pixeltable/exec/expr_eval/evaluators.py +137 -20
  26. pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
  27. pixeltable/exec/expr_eval/globals.py +68 -7
  28. pixeltable/exec/expr_eval/schedulers.py +25 -23
  29. pixeltable/exec/in_memory_data_node.py +8 -6
  30. pixeltable/exec/row_update_node.py +3 -4
  31. pixeltable/exec/sql_node.py +16 -17
  32. pixeltable/exprs/__init__.py +1 -1
  33. pixeltable/exprs/column_property_ref.py +1 -1
  34. pixeltable/exprs/column_ref.py +3 -3
  35. pixeltable/exprs/compound_predicate.py +1 -1
  36. pixeltable/exprs/data_row.py +17 -1
  37. pixeltable/exprs/expr.py +12 -12
  38. pixeltable/exprs/function_call.py +34 -2
  39. pixeltable/exprs/json_mapper.py +95 -48
  40. pixeltable/exprs/json_path.py +3 -4
  41. pixeltable/exprs/method_ref.py +2 -2
  42. pixeltable/exprs/object_ref.py +2 -2
  43. pixeltable/exprs/row_builder.py +33 -6
  44. pixeltable/exprs/similarity_expr.py +1 -1
  45. pixeltable/exprs/sql_element_cache.py +1 -1
  46. pixeltable/exprs/string_op.py +2 -2
  47. pixeltable/ext/__init__.py +1 -1
  48. pixeltable/ext/functions/__init__.py +1 -1
  49. pixeltable/ext/functions/whisperx.py +1 -1
  50. pixeltable/ext/functions/yolox.py +1 -1
  51. pixeltable/func/aggregate_function.py +1 -1
  52. pixeltable/func/callable_function.py +2 -5
  53. pixeltable/func/expr_template_function.py +22 -2
  54. pixeltable/func/function.py +4 -5
  55. pixeltable/func/function_registry.py +1 -1
  56. pixeltable/func/signature.py +1 -1
  57. pixeltable/func/udf.py +2 -2
  58. pixeltable/functions/__init__.py +1 -1
  59. pixeltable/functions/anthropic.py +2 -2
  60. pixeltable/functions/audio.py +1 -1
  61. pixeltable/functions/deepseek.py +1 -1
  62. pixeltable/functions/fireworks.py +1 -1
  63. pixeltable/functions/globals.py +6 -6
  64. pixeltable/functions/huggingface.py +1 -1
  65. pixeltable/functions/image.py +1 -1
  66. pixeltable/functions/json.py +1 -1
  67. pixeltable/functions/llama_cpp.py +1 -1
  68. pixeltable/functions/math.py +1 -1
  69. pixeltable/functions/mistralai.py +1 -1
  70. pixeltable/functions/ollama.py +1 -1
  71. pixeltable/functions/openai.py +2 -2
  72. pixeltable/functions/replicate.py +1 -1
  73. pixeltable/functions/string.py +1 -1
  74. pixeltable/functions/timestamp.py +1 -1
  75. pixeltable/functions/together.py +1 -1
  76. pixeltable/functions/util.py +1 -1
  77. pixeltable/functions/video.py +2 -2
  78. pixeltable/functions/vision.py +2 -2
  79. pixeltable/index/embedding_index.py +12 -1
  80. pixeltable/io/__init__.py +5 -3
  81. pixeltable/io/fiftyone.py +6 -7
  82. pixeltable/io/label_studio.py +21 -20
  83. pixeltable/io/pandas.py +6 -5
  84. pixeltable/iterators/__init__.py +1 -1
  85. pixeltable/metadata/__init__.py +5 -3
  86. pixeltable/metadata/converters/convert_24.py +3 -3
  87. pixeltable/metadata/converters/convert_25.py +1 -1
  88. pixeltable/metadata/converters/convert_29.py +1 -1
  89. pixeltable/store.py +2 -2
  90. pixeltable/type_system.py +19 -7
  91. pixeltable/utils/console_output.py +3 -2
  92. pixeltable/utils/coroutine.py +3 -3
  93. pixeltable/utils/dbms.py +66 -0
  94. pixeltable/utils/documents.py +61 -67
  95. pixeltable/utils/filecache.py +1 -1
  96. pixeltable/utils/http_server.py +3 -2
  97. pixeltable/utils/pytorch.py +1 -1
  98. pixeltable/utils/sql.py +1 -1
  99. pixeltable-0.3.11.dist-info/METADATA +436 -0
  100. pixeltable-0.3.11.dist-info/RECORD +179 -0
  101. pixeltable/catalog/path_dict.py +0 -169
  102. pixeltable-0.3.10.dist-info/METADATA +0 -382
  103. pixeltable-0.3.10.dist-info/RECORD +0 -179
  104. {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
  105. {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +0 -0
  106. {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
@@ -9,7 +9,7 @@ from typing import Any, Callable, Iterator, Optional, cast
9
9
 
10
10
  from pixeltable import exprs, func
11
11
 
12
- from .globals import Dispatcher, Evaluator, FnCallArgs
12
+ from .globals import Dispatcher, Evaluator, ExecCtx, FnCallArgs
13
13
 
14
14
  _logger = logging.getLogger('pixeltable')
15
15
 
@@ -26,8 +26,8 @@ class DefaultExprEvaluator(Evaluator):
26
26
 
27
27
  e: exprs.Expr
28
28
 
29
- def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
30
- super().__init__(dispatcher)
29
+ def __init__(self, e: exprs.Expr, dispatcher: Dispatcher, exec_ctx: ExecCtx):
30
+ super().__init__(dispatcher, exec_ctx)
31
31
  self.e = e
32
32
 
33
33
  def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
@@ -47,8 +47,8 @@ class DefaultExprEvaluator(Evaluator):
47
47
  _, _, exc_tb = sys.exc_info()
48
48
  row.set_exc(self.e.slot_idx, exc)
49
49
  rows_with_excs.add(idx)
50
- self.dispatcher.dispatch_exc([row], self.e.slot_idx, exc_tb)
51
- self.dispatcher.dispatch([rows[i] for i in range(len(rows)) if i not in rows_with_excs])
50
+ self.dispatcher.dispatch_exc([row], self.e.slot_idx, exc_tb, self.exec_ctx)
51
+ self.dispatcher.dispatch([rows[i] for i in range(len(rows)) if i not in rows_with_excs], self.exec_ctx)
52
52
 
53
53
 
54
54
  class FnCallEvaluator(Evaluator):
@@ -70,8 +70,8 @@ class FnCallEvaluator(Evaluator):
70
70
  call_args_queue: Optional[asyncio.Queue[FnCallArgs]] # FnCallArgs waiting for execution
71
71
  batch_size: Optional[int]
72
72
 
73
- def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher):
74
- super().__init__(dispatcher)
73
+ def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
74
+ super().__init__(dispatcher, exec_ctx)
75
75
  self.fn_call = fn_call
76
76
  self.fn = cast(func.CallableFunction, fn_call.fn)
77
77
  if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
@@ -104,7 +104,7 @@ class FnCallEvaluator(Evaluator):
104
104
  rows_call_args.append(FnCallArgs(self.fn_call, [row], args=args, kwargs=kwargs))
105
105
 
106
106
  if len(skip_rows) > 0:
107
- self.dispatcher.dispatch(skip_rows)
107
+ self.dispatcher.dispatch(skip_rows, self.exec_ctx)
108
108
 
109
109
  if self.batch_size is not None:
110
110
  if not self.is_closed and (len(rows_call_args) + self.call_args_queue.qsize() < self.batch_size):
@@ -132,7 +132,7 @@ class FnCallEvaluator(Evaluator):
132
132
  if self.fn_call.resource_pool is not None:
133
133
  # hand the call off to the resource pool's scheduler
134
134
  scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
135
- scheduler.submit(batched_call_args)
135
+ scheduler.submit(batched_call_args, self.exec_ctx)
136
136
  else:
137
137
  task = asyncio.create_task(self.eval_batch(batched_call_args))
138
138
  self.dispatcher.register_task(task)
@@ -142,7 +142,7 @@ class FnCallEvaluator(Evaluator):
142
142
  # hand the call off to the resource pool's scheduler
143
143
  scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
144
144
  for item in rows_call_args:
145
- scheduler.submit(item)
145
+ scheduler.submit(item, self.exec_ctx)
146
146
  else:
147
147
  # create one task per call
148
148
  for item in rows_call_args:
@@ -161,14 +161,12 @@ class FnCallEvaluator(Evaluator):
161
161
  def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
162
162
  """Roll call_args into a single batched FnCallArgs"""
163
163
  batch_args: list[list[Optional[Any]]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
164
- batch_kwargs: dict[str, list[Optional[Any]]] = {
165
- k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs.keys()
166
- }
164
+ batch_kwargs: dict[str, list[Optional[Any]]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
167
165
  assert isinstance(self.fn, func.CallableFunction)
168
166
  for i, item in enumerate(call_args):
169
167
  for j in range(len(item.args)):
170
168
  batch_args[j][i] = item.args[j]
171
- for k in item.kwargs.keys():
169
+ for k in item.kwargs:
172
170
  batch_kwargs[k][i] = item.kwargs[k]
173
171
  return FnCallArgs(
174
172
  self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
@@ -190,12 +188,12 @@ class FnCallEvaluator(Evaluator):
190
188
  _, _, exc_tb = sys.exc_info()
191
189
  for row in batched_call_args.rows:
192
190
  row.set_exc(self.fn_call.slot_idx, exc)
193
- self.dispatcher.dispatch_exc(batched_call_args.rows, self.fn_call.slot_idx, exc_tb)
191
+ self.dispatcher.dispatch_exc(batched_call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
194
192
  return
195
193
 
196
194
  for i, row in enumerate(batched_call_args.rows):
197
195
  row[self.fn_call.slot_idx] = result_batch[i]
198
- self.dispatcher.dispatch(batched_call_args.rows)
196
+ self.dispatcher.dispatch(batched_call_args.rows, self.exec_ctx)
199
197
 
200
198
  async def eval_async(self, call_args: FnCallArgs) -> None:
201
199
  assert len(call_args.rows) == 1
@@ -208,7 +206,7 @@ class FnCallEvaluator(Evaluator):
208
206
  call_args.row[self.fn_call.slot_idx] = await self.fn.aexec(*call_args.args, **call_args.kwargs)
209
207
  end_ts = datetime.datetime.now()
210
208
  _logger.debug(f'Evaluated slot {self.fn_call.slot_idx} in {end_ts - start_ts}')
211
- self.dispatcher.dispatch([call_args.row])
209
+ self.dispatcher.dispatch([call_args.row], self.exec_ctx)
212
210
  except Exception as exc:
213
211
  import anthropic
214
212
 
@@ -216,7 +214,7 @@ class FnCallEvaluator(Evaluator):
216
214
  _logger.debug(f'RateLimitError: {exc}')
217
215
  _, _, exc_tb = sys.exc_info()
218
216
  call_args.row.set_exc(self.fn_call.slot_idx, exc)
219
- self.dispatcher.dispatch_exc(call_args.rows, self.fn_call.slot_idx, exc_tb)
217
+ self.dispatcher.dispatch_exc(call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
220
218
 
221
219
  async def eval(self, call_args_batch: list[FnCallArgs]) -> None:
222
220
  rows_with_excs: set[int] = set() # records idxs into 'rows'
@@ -233,9 +231,9 @@ class FnCallEvaluator(Evaluator):
233
231
  _, _, exc_tb = sys.exc_info()
234
232
  item.row.set_exc(self.fn_call.slot_idx, exc)
235
233
  rows_with_excs.add(idx)
236
- self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
234
+ self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
237
235
  self.dispatcher.dispatch(
238
- [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
236
+ [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs], self.exec_ctx
239
237
  )
240
238
 
241
239
  def _close(self) -> None:
@@ -246,3 +244,122 @@ class FnCallEvaluator(Evaluator):
246
244
  batched_call_args = self._create_batch_call_args(list(self._queued_call_args_iter()))
247
245
  task = asyncio.create_task(self.eval_batch(batched_call_args))
248
246
  self.dispatcher.register_task(task)
247
+
248
+
249
+ class NestedRowList:
250
+ """
251
+ A list of nested rows, used by JsonMapperDispatcher to store the rows corresponding to the elements of the
252
+ JsonMapper source list and make completion awaitable.
253
+ """
254
+
255
+ rows: list[exprs.DataRow]
256
+ num_completed: int
257
+ completion: asyncio.Event
258
+
259
+ def __init__(self, rows: list[exprs.DataRow]):
260
+ self.num_completed = 0
261
+ self.rows = rows
262
+ self.completion = asyncio.Event()
263
+
264
+ def complete_row(self) -> None:
265
+ self.num_completed += 1
266
+ if self.num_completed == len(self.rows):
267
+ self.completion.set()
268
+
269
+
270
+ class JsonMapperDispatcher(Evaluator):
271
+ """
272
+ The execution logic for materializing the nested DataRows of a JsonMapper/JsonMapperDispatch.
273
+
274
+ The rows are stored in a NestedRowList, which itself is stored in the JsonMapperDispatch instance's slot.
275
+ """
276
+
277
+ e: exprs.JsonMapperDispatch
278
+ target_expr: exprs.Expr
279
+ scope_anchor: exprs.ObjectRef
280
+ nested_exec_ctx: ExecCtx # ExecCtx needed to evaluate the nested rows
281
+ external_slot_map: dict[int, int] # slot idx in parent row -> slot idx in nested row
282
+ has_async_calls: bool # True if target_expr contains any async FunctionCalls
283
+
284
+ def __init__(self, e: exprs.JsonMapperDispatch, dispatcher: Dispatcher, exec_ctx: ExecCtx):
285
+ super().__init__(dispatcher, exec_ctx)
286
+ self.e = e
287
+ self.target_expr = e.target_expr.copy() # we need new slot idxs
288
+ self.scope_anchor = e.scope_anchor.copy()
289
+ nested_row_builder = exprs.RowBuilder(output_exprs=[self.target_expr], columns=[], input_exprs=[])
290
+ nested_row_builder.set_slot_idxs([self.target_expr, self.scope_anchor])
291
+ target_expr_ctx = nested_row_builder.create_eval_ctx([self.target_expr], limit_scope=True)
292
+ self.has_async_calls = any(isinstance(e, exprs.FunctionCall) and e.is_async for e in target_expr_ctx.exprs)
293
+ target_scope = self.target_expr.scope()
294
+ # we need to pre-populated nested rows with slot values that are produced in an outer scope (literals excluded)
295
+ parent_exprs = [
296
+ e for e in target_expr_ctx.exprs if e.scope() != target_scope and not isinstance(e, exprs.Literal)
297
+ ]
298
+ self.external_slot_map = {exec_ctx.row_builder.unique_exprs[e].slot_idx: e.slot_idx for e in parent_exprs}
299
+ self.nested_exec_ctx = ExecCtx(dispatcher, nested_row_builder, [self.target_expr], parent_exprs)
300
+
301
+ def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
302
+ """Create nested rows for all source list elements and dispatch them"""
303
+ assert self.e.slot_idx >= 0
304
+ all_nested_rows: list[exprs.DataRow] = []
305
+ for row in rows:
306
+ src = row[self.e.src_expr.slot_idx]
307
+ if not isinstance(src, list):
308
+ # invalid/non-list src path
309
+ row[self.e.slot_idx] = None
310
+ continue
311
+
312
+ nested_rows = [
313
+ exprs.DataRow(
314
+ size=self.nested_exec_ctx.row_builder.num_materialized,
315
+ img_slot_idxs=[],
316
+ media_slot_idxs=[],
317
+ array_slot_idxs=[],
318
+ parent_row=row,
319
+ parent_slot_idx=self.e.slot_idx,
320
+ )
321
+ for _ in src
322
+ ]
323
+ for nested_row, anchor_val in zip(nested_rows, src):
324
+ nested_row[self.scope_anchor.slot_idx] = anchor_val
325
+ for slot_idx_, nested_slot_idx in self.external_slot_map.items():
326
+ nested_row[nested_slot_idx] = row[slot_idx_]
327
+ self.nested_exec_ctx.init_rows(nested_rows)
328
+
329
+ # we modify DataRow.vals here directly, rather than going through __getitem__(), because we don't have
330
+ # an official "value" yet (the nested rows are not yet materialized)
331
+ row.vals[self.e.slot_idx] = NestedRowList(nested_rows)
332
+ all_nested_rows.extend(nested_rows)
333
+
334
+ self.dispatcher.dispatch(all_nested_rows, self.nested_exec_ctx)
335
+ task = asyncio.create_task(self.gather(rows))
336
+ self.dispatcher.register_task(task)
337
+
338
+ async def gather(self, rows: list[exprs.DataRow]) -> None:
339
+ """Wait for nested rows to complete, then signal completion to the parent rows"""
340
+ if self.has_async_calls:
341
+ # if our target expr contains async FunctionCalls, they typically get completed out-of-order, and it's
342
+ # more effective to dispatch them as they complete
343
+ remaining = {
344
+ asyncio.create_task(row.vals[self.e.slot_idx].completion.wait()): row
345
+ for row in rows
346
+ if not row.has_val[self.e.slot_idx]
347
+ }
348
+ while len(remaining) > 0:
349
+ done, _ = await asyncio.wait(remaining.keys(), return_when=asyncio.FIRST_COMPLETED)
350
+ done_rows = [remaining.pop(task) for task in done]
351
+ for row in done_rows:
352
+ row.has_val[self.e.slot_idx] = True
353
+ self.dispatcher.dispatch(done_rows, self.exec_ctx)
354
+
355
+ else:
356
+ # our target expr doesn't contain async FunctionCalls, which means they will get completed in-order
357
+ for row in rows:
358
+ if row.has_val[self.e.slot_idx]:
359
+ # the source_expr's value is not a list
360
+ assert row.vals[self.e.slot_idx] is None
361
+ continue
362
+ assert row.vals[self.e.slot_idx] is not None and isinstance(row.vals[self.e.slot_idx], NestedRowList)
363
+ await row.vals[self.e.slot_idx].completion.wait()
364
+ row.has_val[self.e.slot_idx] = True
365
+ self.dispatcher.dispatch(rows, self.exec_ctx)
@@ -9,12 +9,12 @@ from typing import AsyncIterator, Iterable, Optional, Union
9
9
  import numpy as np
10
10
 
11
11
  import pixeltable.exceptions as excs
12
- from pixeltable import exprs, func
12
+ from pixeltable import exprs
13
13
 
14
14
  from ..data_row_batch import DataRowBatch
15
15
  from ..exec_node import ExecNode
16
- from .evaluators import DefaultExprEvaluator, FnCallEvaluator
17
- from .globals import Evaluator, Scheduler
16
+ from .evaluators import FnCallEvaluator, NestedRowList
17
+ from .globals import ExecCtx, Scheduler
18
18
  from .row_buffer import RowBuffer
19
19
  from .schedulers import SCHEDULERS
20
20
 
@@ -42,12 +42,9 @@ class ExprEvalNode(ExecNode):
42
42
  """
43
43
 
44
44
  maintain_input_order: bool # True if we're returning rows in the order we received them from our input
45
- num_dependencies: np.ndarray # number of dependencies for our output slots; indexed by slot idx
46
45
  outputs: np.ndarray # bool per slot; True if this slot is part of our output
47
- slot_evaluators: dict[int, Evaluator] # key: slot idx
48
46
  schedulers: dict[str, Scheduler] # key: resource pool name
49
- gc_targets: np.ndarray # bool per slot; True if this is an intermediate expr (ie, not part of our output)
50
- eval_ctx: np.ndarray # bool per slot; EvalCtx.slot_idxs as a mask
47
+ exec_ctx: ExecCtx # for input/output rows
51
48
 
52
49
  # execution state
53
50
  tasks: set[asyncio.Task] # collects all running tasks to prevent them from getting gc'd
@@ -82,19 +79,10 @@ class ExprEvalNode(ExecNode):
82
79
  ):
83
80
  super().__init__(row_builder, output_exprs, input_exprs, input)
84
81
  self.maintain_input_order = maintain_input_order
85
- self.num_dependencies = np.sum(row_builder.dependencies, axis=1)
86
82
  self.outputs = np.zeros(row_builder.num_materialized, dtype=bool)
87
83
  output_slot_idxs = [e.slot_idx for e in output_exprs]
88
84
  self.outputs[output_slot_idxs] = True
89
85
  self.tasks = set()
90
-
91
- self.gc_targets = np.ones(row_builder.num_materialized, dtype=bool)
92
- # we need to retain all slots that are part of the output
93
- self.gc_targets[[e.slot_idx for e in row_builder.output_exprs]] = False
94
-
95
- output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
96
- self.eval_ctx = np.zeros(row_builder.num_materialized, dtype=bool)
97
- self.eval_ctx[output_ctx.slot_idxs] = True
98
86
  self.error = None
99
87
 
100
88
  self.input_iter = self.input.__aiter__()
@@ -110,30 +98,14 @@ class ExprEvalNode(ExecNode):
110
98
  self.num_input_rows = 0
111
99
  self.num_output_rows = 0
112
100
 
113
- self.slot_evaluators = {}
101
+ # self.slot_evaluators = {}
114
102
  self.schedulers = {}
115
- self._init_slot_evaluators()
103
+ # self._init_slot_evaluators()
104
+ self.exec_ctx = ExecCtx(self, self.row_builder, output_exprs, input_exprs)
116
105
 
117
106
  def set_input_order(self, maintain_input_order: bool) -> None:
118
107
  self.maintain_input_order = maintain_input_order
119
108
 
120
- def _init_slot_evaluators(self) -> None:
121
- """Create slot evaluators and resource pool schedulers"""
122
- resource_pools: set[str] = set()
123
- for slot_idx in range(self.row_builder.num_materialized):
124
- expr = self.row_builder.unique_exprs[slot_idx]
125
- if (
126
- isinstance(expr, exprs.FunctionCall)
127
- # ExprTemplateFunction and AggregateFunction calls are best handled by FunctionCall.eval()
128
- and not isinstance(expr.fn, func.ExprTemplateFunction)
129
- and not isinstance(expr.fn, func.AggregateFunction)
130
- ):
131
- if expr.resource_pool is not None:
132
- resource_pools.add(expr.resource_pool)
133
- self.slot_evaluators[slot_idx] = FnCallEvaluator(expr, self)
134
- else:
135
- self.slot_evaluators[slot_idx] = DefaultExprEvaluator(expr, self)
136
-
137
109
  async def _fetch_input_batch(self) -> None:
138
110
  """
139
111
  Fetches another batch from our input or sets input_complete to True if there are no more batches.
@@ -155,7 +127,8 @@ class ExprEvalNode(ExecNode):
155
127
  self.num_input_rows += len(batch)
156
128
  self.avail_input_rows += len(batch)
157
129
  _logger.debug(
158
- f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} #avail={self.avail_input_rows}'
130
+ f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} '
131
+ f'#avail={self.avail_input_rows}'
159
132
  )
160
133
  except StopAsyncIteration:
161
134
  self.input_complete = True
@@ -199,8 +172,8 @@ class ExprEvalNode(ExecNode):
199
172
  self.num_in_flight += num_rows
200
173
  self._log_state(f'dispatch input ({num_rows})')
201
174
 
202
- self._init_input_rows(rows)
203
- self.dispatch(rows)
175
+ self.exec_ctx.init_rows(rows)
176
+ self.dispatch(rows, self.exec_ctx)
204
177
 
205
178
  def _log_state(self, prefix: str) -> None:
206
179
  _logger.debug(
@@ -212,7 +185,9 @@ class ExprEvalNode(ExecNode):
212
185
 
213
186
  def _init_schedulers(self) -> None:
214
187
  resource_pools = {
215
- eval.fn_call.resource_pool for eval in self.slot_evaluators.values() if isinstance(eval, FnCallEvaluator)
188
+ eval.fn_call.resource_pool
189
+ for eval in self.exec_ctx.slot_evaluators.values()
190
+ if isinstance(eval, FnCallEvaluator)
216
191
  }
217
192
  resource_pools = {pool for pool in resource_pools if pool is not None}
218
193
  for pool_name in resource_pools:
@@ -287,7 +262,7 @@ class ExprEvalNode(ExecNode):
287
262
  if self.input_complete and self.avail_input_rows == 0 and not closed_evaluators:
288
263
  # no more input rows to dispatch, but we're still waiting for rows to finish:
289
264
  # close all slot evaluators to flush queued rows
290
- for evaluator in self.slot_evaluators.values():
265
+ for evaluator in self.exec_ctx.slot_evaluators.values():
291
266
  evaluator.close()
292
267
  closed_evaluators = True
293
268
 
@@ -303,7 +278,7 @@ class ExprEvalNode(ExecNode):
303
278
  if completed_aw is None:
304
279
  completed_aw = asyncio.create_task(self.completed_event.wait(), name='completed.wait()')
305
280
  aws.add(completed_aw)
306
- done, pending = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
281
+ done, _ = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
307
282
 
308
283
  if self.exc_event.is_set():
309
284
  # we got an exception that we need to propagate through __iter__()
@@ -332,22 +307,18 @@ class ExprEvalNode(ExecNode):
332
307
  task.cancel()
333
308
  _ = await asyncio.gather(*active_tasks, return_exceptions=True)
334
309
 
335
- def _init_input_rows(self, rows: list[exprs.DataRow]) -> None:
336
- """Set execution state in DataRow"""
337
- for row in rows:
338
- row.missing_dependents = np.sum(self.row_builder.dependencies[row.has_val == False], axis=0)
339
- row.missing_slots = self.eval_ctx & (row.has_val == False)
340
-
341
- def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
310
+ def dispatch_exc(
311
+ self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
312
+ ) -> None:
342
313
  """Propagate exception to main event loop or to dependent slots, depending on ignore_errors"""
343
314
  if len(rows) == 0 or self.exc_event.is_set():
344
315
  return
345
316
 
346
317
  if not self.ctx.ignore_errors:
347
- dependency_idxs = [e.slot_idx for e in self.row_builder.unique_exprs[slot_with_exc].dependencies()]
318
+ dependency_idxs = [e.slot_idx for e in exec_ctx.row_builder.unique_exprs[slot_with_exc].dependencies()]
348
319
  first_row = rows[0]
349
320
  input_vals = [first_row[idx] for idx in dependency_idxs]
350
- e = self.row_builder.unique_exprs[slot_with_exc]
321
+ e = exec_ctx.row_builder.unique_exprs[slot_with_exc]
351
322
  self.error = excs.ExprEvalError(e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
352
323
  self.exc_event.set()
353
324
  return
@@ -356,17 +327,17 @@ class ExprEvalNode(ExecNode):
356
327
  assert row.has_exc(slot_with_exc)
357
328
  exc = row.get_exc(slot_with_exc)
358
329
  # propagate exception
359
- for slot_idx in np.nonzero(self.row_builder.transitive_dependents[slot_with_exc])[0].tolist():
330
+ for slot_idx in np.nonzero(exec_ctx.row_builder.transitive_dependents[slot_with_exc])[0].tolist():
360
331
  row.set_exc(slot_idx, exc)
361
- self.dispatch(rows)
332
+ self.dispatch(rows, exec_ctx)
362
333
 
363
- def dispatch(self, rows: list[exprs.DataRow]) -> None:
334
+ def dispatch(self, rows: list[exprs.DataRow], exec_ctx: ExecCtx) -> None:
364
335
  """Dispatch rows to slot evaluators, based on materialized dependencies"""
365
336
  if len(rows) == 0 or self.exc_event.is_set():
366
337
  return
367
338
 
368
339
  # slots ready for evaluation; rows x slots
369
- ready_slots = np.zeros((len(rows), self.row_builder.num_materialized), dtype=bool)
340
+ ready_slots = np.zeros((len(rows), exec_ctx.row_builder.num_materialized), dtype=bool)
370
341
  completed_rows = np.zeros(len(rows), dtype=bool)
371
342
  for i, row in enumerate(rows):
372
343
  row.missing_slots &= row.has_val == False
@@ -375,25 +346,33 @@ class ExprEvalNode(ExecNode):
375
346
  completed_rows[i] = True
376
347
  else:
377
348
  # dependencies of missing slots
378
- missing_dependencies = self.num_dependencies * row.missing_slots
349
+ missing_dependencies = exec_ctx.row_builder.num_dependencies * row.missing_slots
379
350
  # determine ready slots that are not yet materialized and not yet scheduled
380
- num_mat_dependencies = np.sum(self.row_builder.dependencies * row.has_val, axis=1)
351
+ num_mat_dependencies = np.sum(exec_ctx.row_builder.dependencies * row.has_val, axis=1)
381
352
  num_missing = missing_dependencies - num_mat_dependencies
382
353
  ready_slots[i] = (num_missing == 0) & (row.is_scheduled == False) & row.missing_slots
383
- row.is_scheduled = row.is_scheduled | ready_slots[i]
354
+ row.is_scheduled |= ready_slots[i]
384
355
 
385
356
  # clear intermediate values that are no longer needed (ie, all dependents are materialized)
386
- missing_dependents = np.sum(self.row_builder.dependencies[row.has_val == False], axis=0)
387
- gc_targets = (missing_dependents == 0) & (row.missing_dependents > 0) & self.gc_targets
357
+ missing_dependents = np.sum(exec_ctx.row_builder.dependencies[row.has_val == False], axis=0)
358
+ gc_targets = (missing_dependents == 0) & (row.missing_dependents > 0) & exec_ctx.gc_targets
388
359
  row.clear(gc_targets)
389
360
  row.missing_dependents = missing_dependents
390
361
 
391
362
  if np.any(completed_rows):
392
363
  completed_idxs = list(completed_rows.nonzero()[0])
393
- for i in completed_idxs:
394
- self.completed_rows.put_nowait(rows[i])
395
- self.completed_event.set()
396
- self.num_in_flight -= len(completed_idxs)
364
+ if rows[i].parent_row is not None:
365
+ # these are nested rows
366
+ for i in completed_idxs:
367
+ row = rows[i]
368
+ assert row.parent_row is not None and row.parent_slot_idx is not None
369
+ assert isinstance(row.parent_row.vals[row.parent_slot_idx], NestedRowList)
370
+ row.parent_row.vals[row.parent_slot_idx].complete_row()
371
+ else:
372
+ for i in completed_idxs:
373
+ self.completed_rows.put_nowait(rows[i])
374
+ self.completed_event.set()
375
+ self.num_in_flight -= len(completed_idxs)
397
376
 
398
377
  # schedule all ready slots
399
378
  for slot_idx in np.sum(ready_slots, axis=0).nonzero()[0]:
@@ -401,7 +380,7 @@ class ExprEvalNode(ExecNode):
401
380
  _ = ready_rows_v.nonzero()
402
381
  ready_rows = [rows[i] for i in ready_rows_v.nonzero()[0]]
403
382
  _logger.debug(f'Scheduling {len(ready_rows)} rows for slot {slot_idx}')
404
- self.slot_evaluators[slot_idx].schedule(ready_rows, slot_idx)
383
+ exec_ctx.slot_evaluators[slot_idx].schedule(ready_rows, slot_idx)
405
384
 
406
385
  def register_task(self, t: asyncio.Task) -> None:
407
386
  self.tasks.add(t)
@@ -4,7 +4,9 @@ import abc
4
4
  import asyncio
5
5
  from dataclasses import dataclass
6
6
  from types import TracebackType
7
- from typing import Any, Optional, Protocol
7
+ from typing import Any, Iterable, Optional, Protocol
8
+
9
+ import numpy as np
8
10
 
9
11
  from pixeltable import exprs, func
10
12
 
@@ -53,6 +55,7 @@ class Scheduler(abc.ABC):
53
55
 
54
56
  request: FnCallArgs
55
57
  num_retries: int
58
+ exec_ctx: ExecCtx
56
59
 
57
60
  def __lt__(self, other: Scheduler.QueueItem) -> bool:
58
61
  # prioritize by number of retries (more retries = higher priority)
@@ -67,8 +70,8 @@ class Scheduler(abc.ABC):
67
70
  self.queue = asyncio.PriorityQueue()
68
71
  self.dispatcher = dispatcher
69
72
 
70
- def submit(self, item: FnCallArgs) -> None:
71
- self.queue.put_nowait(self.QueueItem(item, 0))
73
+ def submit(self, item: FnCallArgs, exec_ctx: ExecCtx) -> None:
74
+ self.queue.put_nowait(self.QueueItem(item, 0, exec_ctx))
72
75
 
73
76
  @classmethod
74
77
  @abc.abstractmethod
@@ -90,11 +93,11 @@ class Dispatcher(Protocol):
90
93
  exc_event: asyncio.Event
91
94
  schedulers: dict[str, Scheduler] # key: resource pool id
92
95
 
93
- def dispatch(self, rows: list[exprs.DataRow]) -> None:
96
+ def dispatch(self, rows: list[exprs.DataRow], exec_ctx: Any) -> None:
94
97
  """Dispatches row slots to the appropriate schedulers; does not block"""
95
98
  ...
96
99
 
97
- def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
100
+ def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: Any) -> None:
98
101
  """Propagates exception in slot_with_exc to all dependent slots and dispatches the rest; does not block"""
99
102
  ...
100
103
 
@@ -116,15 +119,16 @@ class Evaluator(abc.ABC):
116
119
 
117
120
  dispatcher: Dispatcher
118
121
  is_closed: bool
122
+ exec_ctx: 'ExecCtx'
119
123
 
120
- def __init__(self, dispatcher: Dispatcher):
124
+ def __init__(self, dispatcher: Dispatcher, exec_ctx: 'ExecCtx') -> None:
121
125
  self.dispatcher = dispatcher
122
126
  self.is_closed = False
127
+ self.exec_ctx = exec_ctx
123
128
 
124
129
  @abc.abstractmethod
125
130
  def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
126
131
  """Create tasks to evaluate the expression in the given slot for the given rows; must not block."""
127
- ...
128
132
 
129
133
  def _close(self) -> None:
130
134
  """Close the evaluator; must not block"""
@@ -134,3 +138,60 @@ class Evaluator(abc.ABC):
134
138
  """Indicates that there may not be any more rows getting scheduled"""
135
139
  self.is_closed = True
136
140
  self._close()
141
+
142
+
143
+ class ExecCtx:
144
+ """DataRow-specific state needed by ExprEvalNode"""
145
+
146
+ row_builder: exprs.RowBuilder
147
+ slot_evaluators: dict[int, Evaluator] # key: slot idx
148
+ gc_targets: np.ndarray # bool per slot; True if this is an intermediate expr (ie, not part of our output)
149
+ eval_ctx: np.ndarray # bool per slot; EvalCtx.slot_idxs as a mask
150
+ literals: dict[int, Any] # key: slot idx; value: literal value for this slot; used to pre-populate rows
151
+
152
+ def __init__(
153
+ self,
154
+ dispatcher: Dispatcher,
155
+ row_builder: exprs.RowBuilder,
156
+ output_exprs: Iterable[exprs.Expr],
157
+ input_exprs: Iterable[exprs.Expr],
158
+ ):
159
+ self.row_builder = row_builder
160
+ self.slot_evaluators = {}
161
+ # TODO: only include output_exprs dependencies
162
+ self.gc_targets = np.ones(self.row_builder.num_materialized, dtype=bool)
163
+ # we need to retain all slots that are part of the output
164
+ self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False
165
+
166
+ output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
167
+ self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
168
+ self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
169
+ non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]
170
+ self.eval_ctx[non_literal_slot_idxs] = True
171
+ self._init_slot_evaluators(dispatcher, non_literal_slot_idxs)
172
+
173
+ def _init_slot_evaluators(self, dispatcher: Dispatcher, target_slot_idxs: list[int]) -> None:
174
+ from .evaluators import DefaultExprEvaluator, FnCallEvaluator, JsonMapperDispatcher
175
+
176
+ for slot_idx in target_slot_idxs:
177
+ expr = self.row_builder.unique_exprs[slot_idx]
178
+ if (
179
+ isinstance(expr, exprs.FunctionCall)
180
+ # ExprTemplateFunction and AggregateFunction calls are best handled by FunctionCall.eval()
181
+ and not isinstance(expr.fn, func.ExprTemplateFunction)
182
+ and not isinstance(expr.fn, func.AggregateFunction)
183
+ ):
184
+ self.slot_evaluators[slot_idx] = FnCallEvaluator(expr, dispatcher, self)
185
+ elif isinstance(expr, exprs.JsonMapperDispatch):
186
+ self.slot_evaluators[slot_idx] = JsonMapperDispatcher(expr, dispatcher, self)
187
+ else:
188
+ self.slot_evaluators[slot_idx] = DefaultExprEvaluator(expr, dispatcher, self)
189
+
190
+ def init_rows(self, rows: list[exprs.DataRow]) -> None:
191
+ """Pre-populate rows with literals and initialize execution state"""
192
+ for row in rows:
193
+ # set literals before missing_dependents/slots
194
+ for slot_idx, val in self.literals.items():
195
+ row[slot_idx] = val
196
+ row.missing_dependents = np.sum(self.row_builder.dependencies[row.has_val == False], axis=0)
197
+ row.missing_slots = self.eval_ctx & (row.has_val == False)