pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,27 +1,42 @@
1
- from typing import Optional
1
+ import random
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
5
- import pixeltable.exprs as exprs
5
+ from pixeltable import exprs
6
+
6
7
 
7
8
  class ExecContext:
8
9
  """Class for execution runtime constants"""
10
+
11
+ row_builder: exprs.RowBuilder
12
+ profile: exprs.ExecProfile
13
+ show_pbar: bool
14
+ batch_size: int
15
+ num_rows: int | None
16
+ conn: sql.engine.Connection | None
17
+ pk_clause: list[sql.ClauseElement] | None
18
+ num_computed_exprs: int
19
+ ignore_errors: bool
20
+ random_seed: int # general-purpose source of randomness with execution scope
21
+
9
22
  def __init__(
10
- self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
11
- pk_clause: Optional[list[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
12
- ignore_errors: bool = False
23
+ self,
24
+ row_builder: exprs.RowBuilder,
25
+ *,
26
+ show_pbar: bool = False,
27
+ batch_size: int = 0,
28
+ pk_clause: list[sql.ClauseElement] | None = None,
29
+ num_computed_exprs: int = 0,
30
+ ignore_errors: bool = False,
13
31
  ):
14
32
  self.show_pbar = show_pbar
15
33
  self.batch_size = batch_size
16
34
  self.row_builder = row_builder
17
35
  self.profile = exprs.ExecProfile(row_builder)
18
36
  # num_rows is used to compute the total number of computed cells used for the progress bar
19
- self.num_rows: Optional[int] = None
20
- self.conn: Optional[sql.engine.Connection] = None # if present, use this to execute SQL queries
37
+ self.num_rows = None
38
+ self.conn = None # if present, use this to execute SQL queries
21
39
  self.pk_clause = pk_clause
22
40
  self.num_computed_exprs = num_computed_exprs
23
41
  self.ignore_errors = ignore_errors
24
-
25
- def set_conn(self, conn: sql.engine.Connection) -> None:
26
- self.conn = conn
27
- self.row_builder.set_conn(conn)
42
+ self.random_seed = random.randint(0, 1 << 63)
@@ -1,27 +1,35 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import abc
4
- from typing import TYPE_CHECKING, Iterable, Iterator, Optional, TypeVar
4
+ import logging
5
+ from typing import AsyncIterator, Iterable, Iterator, TypeVar
5
6
 
6
- import pixeltable.exprs as exprs
7
+ from pixeltable import exprs
8
+ from pixeltable.env import Env
7
9
 
8
10
  from .data_row_batch import DataRowBatch
9
11
  from .exec_context import ExecContext
10
12
 
13
+ _logger = logging.getLogger('pixeltable')
14
+
11
15
 
12
16
  class ExecNode(abc.ABC):
13
17
  """Base class of all execution nodes"""
18
+
14
19
  output_exprs: Iterable[exprs.Expr]
15
20
  row_builder: exprs.RowBuilder
16
- input: Optional[ExecNode]
21
+ input: ExecNode | None
17
22
  flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
18
- stored_img_cols: list[exprs.ColumnSlotIdx]
19
- ctx: Optional[ExecContext]
20
- __iter: Optional[Iterator[DataRowBatch]]
23
+ ctx: ExecContext | None
21
24
 
22
25
  def __init__(
23
- self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
24
- input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
26
+ self,
27
+ row_builder: exprs.RowBuilder,
28
+ output_exprs: Iterable[exprs.Expr],
29
+ input_exprs: Iterable[exprs.Expr],
30
+ input: ExecNode | None = None,
31
+ ):
32
+ assert all(expr.is_valid for expr in output_exprs)
25
33
  self.output_exprs = output_exprs
26
34
  self.row_builder = row_builder
27
35
  self.input = input
@@ -29,33 +37,33 @@ class ExecNode(abc.ABC):
29
37
  output_slot_idxs = {e.slot_idx for e in output_exprs}
30
38
  output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
31
39
  self.flushed_img_slots = [
32
- e.slot_idx for e in output_dependencies
33
- if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
40
+ e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
34
41
  ]
35
- self.stored_img_cols = []
36
- self.ctx = None # all nodes of a tree share the same context
37
- self.__iter = None
42
+ self.ctx = input.ctx if input is not None else None
38
43
 
39
44
  def set_ctx(self, ctx: ExecContext) -> None:
40
45
  self.ctx = ctx
41
46
  if self.input is not None:
42
47
  self.input.set_ctx(ctx)
43
48
 
44
- def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
45
- self.stored_img_cols = stored_img_cols
46
- # propagate batch size to the source
47
- if self.input is not None:
48
- self.input.set_stored_img_cols(stored_img_cols)
49
+ @abc.abstractmethod
50
+ def __aiter__(self) -> AsyncIterator[DataRowBatch]:
51
+ pass
49
52
 
50
- # TODO: make this an abstractmethod when __next__() is removed
51
53
  def __iter__(self) -> Iterator[DataRowBatch]:
52
- return self
53
-
54
- # TODO: remove this and switch every subclass over to implementing __iter__
55
- def __next__(self) -> DataRowBatch:
56
- if self.__iter is None:
57
- self.__iter = iter(self)
58
- return next(self.__iter)
54
+ loop = Env.get().event_loop
55
+ aiter = self.__aiter__()
56
+ try:
57
+ while True:
58
+ batch: DataRowBatch = loop.run_until_complete(aiter.__anext__())
59
+ yield batch
60
+ except StopAsyncIteration:
61
+ pass
62
+ # TODO:
63
+ # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
64
+ # we end up here
65
+ # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
66
+ # creates tasks on its own
59
67
 
60
68
  def open(self) -> None:
61
69
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
@@ -77,7 +85,7 @@ class ExecNode(abc.ABC):
77
85
 
78
86
  T = TypeVar('T', bound='ExecNode')
79
87
 
80
- def get_node(self, node_class: type[T]) -> Optional[T]:
88
+ def get_node(self, node_class: type[T]) -> T | None:
81
89
  if isinstance(self, node_class):
82
90
  return self
83
91
  if self.input is not None:
@@ -0,0 +1,3 @@
1
+ # ruff: noqa: F401
2
+
3
+ from .expr_eval_node import ExprEvalNode
@@ -0,0 +1,365 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import datetime
5
+ import itertools
6
+ import logging
7
+ import sys
8
+ from typing import Any, Callable, Iterator, cast
9
+
10
+ from pixeltable import exprs, func
11
+
12
+ from .globals import Dispatcher, Evaluator, ExecCtx, FnCallArgs
13
+
14
+ _logger = logging.getLogger('pixeltable')
15
+
16
+
17
+ class DefaultExprEvaluator(Evaluator):
18
+ """
19
+ Standard expression evaluation using Expr.eval().
20
+
21
+ Creates one task per set of rows handed to schedule().
22
+
23
+ TODO:
24
+ - parallelize via Ray
25
+ """
26
+
27
+ e: exprs.Expr
28
+
29
+ def __init__(self, e: exprs.Expr, dispatcher: Dispatcher, exec_ctx: ExecCtx):
30
+ super().__init__(dispatcher, exec_ctx)
31
+ self.e = e
32
+
33
+ def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
34
+ assert self.e.slot_idx >= 0
35
+ task = asyncio.create_task(self.eval(rows))
36
+ self.dispatcher.register_task(task)
37
+
38
+ async def eval(self, rows: list[exprs.DataRow]) -> None:
39
+ rows_with_excs: set[int] = set() # records idxs into rows
40
+ for idx, row in enumerate(rows):
41
+ assert not row.has_val[self.e.slot_idx] and not row.has_exc(self.e.slot_idx)
42
+ if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
43
+ return
44
+ try:
45
+ self.e.eval(row, self.dispatcher.row_builder)
46
+ except Exception as exc:
47
+ _, _, exc_tb = sys.exc_info()
48
+ row.set_exc(self.e.slot_idx, exc)
49
+ rows_with_excs.add(idx)
50
+ self.dispatcher.dispatch_exc([row], self.e.slot_idx, exc_tb, self.exec_ctx)
51
+ self.dispatcher.dispatch([rows[i] for i in range(len(rows)) if i not in rows_with_excs], self.exec_ctx)
52
+
53
+
54
+ class FnCallEvaluator(Evaluator):
55
+ """
56
+ Evaluates function calls:
57
+ - batched functions (sync and async): one task per batch
58
+ - async functions: one task per row
59
+ - the rest: one task per set of rows handed to schedule()
60
+
61
+ TODO:
62
+ - adaptive batching: finding the optimal batch size based on observed execution times
63
+ """
64
+
65
+ fn_call: exprs.FunctionCall
66
+ fn: func.CallableFunction
67
+ scalar_py_fn: Callable | None # only set for non-batching CallableFunctions
68
+
69
+ # only set if fn.is_batched
70
+ call_args_queue: asyncio.Queue[FnCallArgs] | None # FnCallArgs waiting for execution
71
+ batch_size: int | None
72
+
73
+ def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
74
+ super().__init__(dispatcher, exec_ctx)
75
+ self.fn_call = fn_call
76
+ self.fn = cast(func.CallableFunction, fn_call.fn)
77
+ if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
78
+ self.call_args_queue = asyncio.Queue[FnCallArgs]()
79
+ # we're not supplying sample arguments there, they're ignored anyway
80
+ self.batch_size = self.fn.get_batch_size()
81
+ self.scalar_py_fn = None
82
+ else:
83
+ self.call_args_queue = None
84
+ self.batch_size = None
85
+ if isinstance(self.fn, func.CallableFunction):
86
+ self.scalar_py_fn = self.fn.py_fn
87
+ else:
88
+ self.scalar_py_fn = None
89
+
90
+ def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
91
+ assert self.fn_call.slot_idx >= 0
92
+
93
+ # create FnCallArgs for incoming rows
94
+ skip_rows: list[exprs.DataRow] = [] # skip rows with Nones in non-nullable parameters
95
+ rows_call_args: list[FnCallArgs] = []
96
+ for row in rows:
97
+ args_kwargs = self.fn_call.make_args(row)
98
+ if args_kwargs is None:
99
+ # nothing to do here
100
+ row[self.fn_call.slot_idx] = None
101
+ skip_rows.append(row)
102
+ else:
103
+ args, kwargs = args_kwargs
104
+ rows_call_args.append(FnCallArgs(self.fn_call, [row], args=args, kwargs=kwargs))
105
+
106
+ if len(skip_rows) > 0:
107
+ self.dispatcher.dispatch(skip_rows, self.exec_ctx)
108
+
109
+ if self.batch_size is not None:
110
+ if not self.is_closed and (len(rows_call_args) + self.call_args_queue.qsize() < self.batch_size):
111
+ # we don't have enough FnCallArgs for a batch, so add them to the queue
112
+ for item in rows_call_args:
113
+ self.call_args_queue.put_nowait(item)
114
+ return
115
+
116
+ # create one task per batch
117
+ combined_call_args = itertools.chain(self._queued_call_args_iter(), rows_call_args)
118
+ while True:
119
+ call_args_batch = list(itertools.islice(combined_call_args, self.batch_size))
120
+ if len(call_args_batch) == 0:
121
+ break
122
+ if len(call_args_batch) < self.batch_size and not self.is_closed:
123
+ # we don't have a full batch left: return the rest to the queue
124
+ assert self.call_args_queue.empty() # we saw all queued items
125
+ for item in call_args_batch:
126
+ self.call_args_queue.put_nowait(item)
127
+ return
128
+
129
+ # turn call_args_batch into a single batched FnCallArgs
130
+ _logger.debug(f'Creating batch of size {len(call_args_batch)} for slot {slot_idx}')
131
+ batched_call_args = self._create_batch_call_args(call_args_batch)
132
+ if self.fn_call.resource_pool is not None:
133
+ # hand the call off to the resource pool's scheduler
134
+ scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
135
+ scheduler.submit(batched_call_args, self.exec_ctx)
136
+ else:
137
+ task = asyncio.create_task(self.eval_batch(batched_call_args))
138
+ self.dispatcher.register_task(task)
139
+
140
+ elif self.fn.is_async:
141
+ if self.fn_call.resource_pool is not None:
142
+ # hand the call off to the resource pool's scheduler
143
+ scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
144
+ for item in rows_call_args:
145
+ scheduler.submit(item, self.exec_ctx)
146
+ else:
147
+ # create one task per call
148
+ for item in rows_call_args:
149
+ task = asyncio.create_task(self.eval_async(item))
150
+ self.dispatcher.register_task(task)
151
+
152
+ else:
153
+ # create a single task for all rows
154
+ task = asyncio.create_task(self.eval(rows_call_args))
155
+ self.dispatcher.register_task(task)
156
+
157
+ def _queued_call_args_iter(self) -> Iterator[FnCallArgs]:
158
+ while not self.call_args_queue.empty():
159
+ yield self.call_args_queue.get_nowait()
160
+
161
+ def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
162
+ """Roll call_args into a single batched FnCallArgs"""
163
+ batch_args: list[list[Any | None]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
164
+ batch_kwargs: dict[str, list[Any | None]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
165
+ assert isinstance(self.fn, func.CallableFunction)
166
+ for i, item in enumerate(call_args):
167
+ for j in range(len(item.args)):
168
+ batch_args[j][i] = item.args[j]
169
+ for k in item.kwargs:
170
+ batch_kwargs[k][i] = item.kwargs[k]
171
+ return FnCallArgs(
172
+ self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
173
+ )
174
+
175
+ async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
176
+ result_batch: list[Any]
177
+ try:
178
+ if self.fn.is_async:
179
+ result_batch = await self.fn.aexec_batch(
180
+ *batched_call_args.batch_args, **batched_call_args.batch_kwargs
181
+ )
182
+ else:
183
+ # check for cancellation before starting something potentially long-running
184
+ if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
185
+ return
186
+ result_batch = self.fn.exec_batch(batched_call_args.batch_args, batched_call_args.batch_kwargs)
187
+ except Exception as exc:
188
+ _, _, exc_tb = sys.exc_info()
189
+ for row in batched_call_args.rows:
190
+ row.set_exc(self.fn_call.slot_idx, exc)
191
+ self.dispatcher.dispatch_exc(batched_call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
192
+ return
193
+
194
+ for i, row in enumerate(batched_call_args.rows):
195
+ row[self.fn_call.slot_idx] = result_batch[i]
196
+ self.dispatcher.dispatch(batched_call_args.rows, self.exec_ctx)
197
+
198
+ async def eval_async(self, call_args: FnCallArgs) -> None:
199
+ assert len(call_args.rows) == 1
200
+ assert not call_args.row.has_val[self.fn_call.slot_idx]
201
+ assert not call_args.row.has_exc(self.fn_call.slot_idx)
202
+
203
+ try:
204
+ start_ts = datetime.datetime.now()
205
+ _logger.debug(f'Start evaluating slot {self.fn_call.slot_idx}')
206
+ call_args.row[self.fn_call.slot_idx] = await self.fn.aexec(*call_args.args, **call_args.kwargs)
207
+ end_ts = datetime.datetime.now()
208
+ _logger.debug(f'Evaluated slot {self.fn_call.slot_idx} in {end_ts - start_ts}')
209
+ self.dispatcher.dispatch([call_args.row], self.exec_ctx)
210
+ except Exception as exc:
211
+ _, _, exc_tb = sys.exc_info()
212
+ call_args.row.set_exc(self.fn_call.slot_idx, exc)
213
+ self.dispatcher.dispatch_exc(call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
214
+
215
+ async def eval(self, call_args_batch: list[FnCallArgs]) -> None:
216
+ rows_with_excs: set[int] = set() # records idxs into call_args_batch
217
+ for idx, item in enumerate(call_args_batch):
218
+ assert len(item.rows) == 1
219
+ assert not item.row.has_val[self.fn_call.slot_idx]
220
+ assert not item.row.has_exc(self.fn_call.slot_idx)
221
+ # check for cancellation before starting something potentially long-running
222
+ if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
223
+ return
224
+ try:
225
+ item.row[self.fn_call.slot_idx] = self.scalar_py_fn(*item.args, **item.kwargs)
226
+ except Exception as exc:
227
+ _, _, exc_tb = sys.exc_info()
228
+ item.row.set_exc(self.fn_call.slot_idx, exc)
229
+ rows_with_excs.add(idx)
230
+ self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
231
+ self.dispatcher.dispatch(
232
+ [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs], self.exec_ctx
233
+ )
234
+
235
+ def _close(self) -> None:
236
+ """Create a task for the incomplete batch of queued FnCallArgs, if any"""
237
+ _logger.debug(f'FnCallEvaluator.close(): slot_idx={self.fn_call.slot_idx}')
238
+ if self.call_args_queue is None or self.call_args_queue.empty():
239
+ return
240
+ batched_call_args = self._create_batch_call_args(list(self._queued_call_args_iter()))
241
+ task = asyncio.create_task(self.eval_batch(batched_call_args))
242
+ self.dispatcher.register_task(task)
243
+
244
+
245
+ class NestedRowList:
246
+ """
247
+ A list of nested rows, used by JsonMapperDispatcher to store the rows corresponding to the elements of the
248
+ JsonMapper source list and make completion awaitable.
249
+ """
250
+
251
+ rows: list[exprs.DataRow]
252
+ num_completed: int
253
+ completion: asyncio.Event
254
+
255
+ def __init__(self, rows: list[exprs.DataRow]):
256
+ self.num_completed = 0
257
+ self.rows = rows
258
+ self.completion = asyncio.Event()
259
+
260
+ def complete_row(self) -> None:
261
+ self.num_completed += 1
262
+ if self.num_completed == len(self.rows):
263
+ self.completion.set()
264
+
265
+
266
+ class JsonMapperDispatcher(Evaluator):
267
+ """
268
+ The execution logic for materializing the nested DataRows of a JsonMapper/JsonMapperDispatch.
269
+
270
+ The rows are stored in a NestedRowList, which itself is stored in the JsonMapperDispatch instance's slot.
271
+ """
272
+
273
+ e: exprs.JsonMapperDispatch
274
+ target_expr: exprs.Expr
275
+ scope_anchor: exprs.ObjectRef
276
+ nested_exec_ctx: ExecCtx # ExecCtx needed to evaluate the nested rows
277
+ external_slot_map: dict[int, int] # slot idx in parent row -> slot idx in nested row
278
+ has_async_calls: bool # True if target_expr contains any async FunctionCalls
279
+
280
+ def __init__(self, e: exprs.JsonMapperDispatch, dispatcher: Dispatcher, exec_ctx: ExecCtx):
281
+ super().__init__(dispatcher, exec_ctx)
282
+ self.e = e
283
+ self.target_expr = e.target_expr.copy() # we need new slot idxs
284
+ self.scope_anchor = e.scope_anchor.copy()
285
+ nested_row_builder = exprs.RowBuilder(output_exprs=[self.target_expr], columns=[], input_exprs=[])
286
+ nested_row_builder.set_slot_idxs([self.target_expr, self.scope_anchor])
287
+ target_expr_ctx = nested_row_builder.create_eval_ctx([self.target_expr], limit_scope=True)
288
+ self.has_async_calls = any(isinstance(e, exprs.FunctionCall) and e.is_async for e in target_expr_ctx.exprs)
289
+ target_scope = self.target_expr.scope()
290
+ # we need to pre-populate nested rows with slot values that are produced in an outer scope (literals excluded)
291
+ parent_exprs = [
292
+ e for e in target_expr_ctx.exprs if e.scope() != target_scope and not isinstance(e, exprs.Literal)
293
+ ]
294
+ self.external_slot_map = {exec_ctx.row_builder.unique_exprs[e].slot_idx: e.slot_idx for e in parent_exprs}
295
+ self.nested_exec_ctx = ExecCtx(dispatcher, nested_row_builder, [self.target_expr], parent_exprs)
296
+
297
+ def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
298
+ """Create nested rows for all source list elements and dispatch them"""
299
+ assert self.e.slot_idx >= 0
300
+ all_nested_rows: list[exprs.DataRow] = []
301
+ for row in rows:
302
+ src = row[self.e.src_expr.slot_idx]
303
+ if not isinstance(src, list):
304
+ # invalid/non-list src path
305
+ row[self.e.slot_idx] = None
306
+ continue
307
+
308
+ nested_rows = [
309
+ exprs.DataRow(
310
+ size=self.nested_exec_ctx.row_builder.num_materialized,
311
+ img_slot_idxs=[],
312
+ media_slot_idxs=[],
313
+ array_slot_idxs=[],
314
+ json_slot_idxs=[],
315
+ parent_row=row,
316
+ parent_slot_idx=self.e.slot_idx,
317
+ )
318
+ for _ in src
319
+ ]
320
+ for nested_row, anchor_val in zip(nested_rows, src):
321
+ # It's possible that self.scope_anchor.slot_idx is None; this corresponds to the case where the
322
+ # mapper expression doesn't actually contain references to RELATIVE_PATH_ROOT.
323
+ if self.scope_anchor.slot_idx is not None:
324
+ nested_row[self.scope_anchor.slot_idx] = anchor_val
325
+ for slot_idx_, nested_slot_idx in self.external_slot_map.items():
326
+ nested_row[nested_slot_idx] = row[slot_idx_]
327
+ self.nested_exec_ctx.init_rows(nested_rows)
328
+
329
+ # we modify DataRow.vals here directly, rather than going through __getitem__(), because we don't have
330
+ # an official "value" yet (the nested rows are not yet materialized)
331
+ row.vals[self.e.slot_idx] = NestedRowList(nested_rows)
332
+ all_nested_rows.extend(nested_rows)
333
+
334
+ self.dispatcher.dispatch(all_nested_rows, self.nested_exec_ctx)
335
+ task = asyncio.create_task(self.gather(rows))
336
+ self.dispatcher.register_task(task)
337
+
338
+ async def gather(self, rows: list[exprs.DataRow]) -> None:
339
+ """Wait for nested rows to complete, then signal completion to the parent rows"""
340
+ if self.has_async_calls:
341
+ # if our target expr contains async FunctionCalls, they typically get completed out-of-order, and it's
342
+ # more effective to dispatch them as they complete
343
+ remaining = {
344
+ asyncio.create_task(row.vals[self.e.slot_idx].completion.wait()): row
345
+ for row in rows
346
+ if not row.has_val[self.e.slot_idx]
347
+ }
348
+ while len(remaining) > 0:
349
+ done, _ = await asyncio.wait(remaining.keys(), return_when=asyncio.FIRST_COMPLETED)
350
+ done_rows = [remaining.pop(task) for task in done]
351
+ for row in done_rows:
352
+ row.has_val[self.e.slot_idx] = True
353
+ self.dispatcher.dispatch(done_rows, self.exec_ctx)
354
+
355
+ else:
356
+ # our target expr doesn't contain async FunctionCalls, which means they will get completed in-order
357
+ for row in rows:
358
+ if row.has_val[self.e.slot_idx]:
359
+ # the source_expr's value is not a list
360
+ assert row.vals[self.e.slot_idx] is None
361
+ continue
362
+ assert row.vals[self.e.slot_idx] is not None and isinstance(row.vals[self.e.slot_idx], NestedRowList)
363
+ await row.vals[self.e.slot_idx].completion.wait()
364
+ row.has_val[self.e.slot_idx] = True
365
+ self.dispatcher.dispatch(rows, self.exec_ctx)