pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,413 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import traceback
6
+ from types import TracebackType
7
+ from typing import AsyncIterator, Iterable
8
+
9
+ import numpy as np
10
+
11
+ import pixeltable.exceptions as excs
12
+ from pixeltable import exprs
13
+
14
+ from ..data_row_batch import DataRowBatch
15
+ from ..exec_node import ExecNode
16
+ from .evaluators import FnCallEvaluator, NestedRowList
17
+ from .globals import ExecCtx, Scheduler
18
+ from .row_buffer import RowBuffer
19
+ from .schedulers import SCHEDULERS
20
+
21
+ _logger = logging.getLogger('pixeltable')
22
+
23
+
24
class ExprEvalNode(ExecNode):
    """
    Expression evaluation

    Resource management:
    - the execution system tries to limit total memory consumption by limiting the number of rows that are in
      circulation
    - during execution, slots that aren't part of the output are garbage collected as soon as their direct dependents
      are materialized

    TODO:
    - Literal handling: currently, Literal values are copied into slots via the normal evaluation mechanism, which is
      needless overhead; instead: pre-populate Literal slots in _init_row()
    - dynamically determine MAX_BUFFERED_ROWS, based on the avg memory consumption of a row and our configured memory
      limit
    - local model inference on gpu: currently, no attempt is made to ensure that models can fit onto the gpu
      simultaneously, which will cause errors; instead, the execution should be divided into sequential phases, each
      of which only contains a subset of the models which is known to fit onto the gpu simultaneously
    """

    maintain_input_order: bool  # True if we're returning rows in the order we received them from our input
    outputs: np.ndarray  # bool per slot; True if this slot is part of our output
    schedulers: dict[str, Scheduler]  # key: resource pool name
    exec_ctx: ExecCtx  # for input/output rows

    # execution state
    tasks: set[asyncio.Task]  # collects all running tasks to prevent them from getting gc'd
    exc_event: asyncio.Event  # set if an exception needs to be propagated
    error: Exception | None  # exception that needs to be propagated
    completed_rows: asyncio.Queue[exprs.DataRow]  # rows that have completed evaluation
    completed_event: asyncio.Event  # set when completed_rows is non-empty
    input_iter: AsyncIterator[DataRowBatch]
    current_input_batch: DataRowBatch | None  # batch from which we're currently consuming rows
    input_row_idx: int  # next row to consume from current_input_batch
    next_input_batch: DataRowBatch | None  # read-ahead input batch
    avail_input_rows: int  # total number across both current_/next_input_batch
    input_complete: bool  # True if we've received all input batches
    num_in_flight: int  # number of dispatched rows that haven't completed
    row_pos_map: dict[int, int] | None  # id(row) -> position of row in input; only set if maintain_input_order
    output_buffer: RowBuffer  # holds rows that are ready to be returned, in order

    # debugging
    num_input_rows: int
    num_output_rows: int

    BATCH_SIZE = 64
    MAX_BUFFERED_ROWS = 2048  # maximum number of rows that have been dispatched but not yet returned

    def __init__(
        self,
        row_builder: exprs.RowBuilder,
        output_exprs: Iterable[exprs.Expr],
        input_exprs: Iterable[exprs.Expr],
        input: ExecNode,
        maintain_input_order: bool = True,
    ):
        """
        Args:
            row_builder: provides the slot layout (num_materialized) for the rows being evaluated
            output_exprs: exprs whose slots make up this node's output
            input_exprs: exprs that are supplied by the input node
            input: the node we pull input batches from
            maintain_input_order: if True, rows are returned in the order in which they were received
        """
        # both expr collections are handed to several consumers below (base class, slot index computation,
        # ExecCtx); materialize them so that a one-shot iterable isn't exhausted by the first consumer
        output_exprs = list(output_exprs)
        input_exprs = list(input_exprs)
        super().__init__(row_builder, output_exprs, input_exprs, input)
        self.maintain_input_order = maintain_input_order
        self.outputs = np.zeros(row_builder.num_materialized, dtype=bool)
        output_slot_idxs = [e.slot_idx for e in output_exprs]
        self.outputs[output_slot_idxs] = True
        self.tasks = set()
        self.error = None

        self.input_iter = self.input.__aiter__()
        self.current_input_batch = None
        self.next_input_batch = None
        self.input_row_idx = 0
        self.avail_input_rows = 0
        self.input_complete = False
        self.num_in_flight = 0
        self.row_pos_map = None
        self.output_buffer = RowBuffer(self.MAX_BUFFERED_ROWS)

        self.num_input_rows = 0
        self.num_output_rows = 0

        self.schedulers = {}
        self.exec_ctx = ExecCtx(self, self.row_builder, output_exprs, input_exprs)

    def set_input_order(self, maintain_input_order: bool) -> None:
        """Override the maintain_input_order setting chosen at construction time"""
        self.maintain_input_order = maintain_input_order

    async def _fetch_input_batch(self) -> None:
        """
        Fetches another batch from our input or sets input_complete to True if there are no more batches.

        - stores the batch in current_input_batch, if not already set, or next_input_batch
        - updates row_pos_map, if needed
        - any other exception is recorded in self.error and signalled via exc_event instead of being raised
        """
        assert not self.input_complete
        try:
            batch = await anext(self.input_iter)
            assert self.next_input_batch is None
            if self.current_input_batch is None:
                self.current_input_batch = batch
            else:
                self.next_input_batch = batch
            if self.maintain_input_order:
                for idx, row in enumerate(batch.rows):
                    self.row_pos_map[id(row)] = self.num_input_rows + idx
            self.num_input_rows += len(batch)
            self.avail_input_rows += len(batch)
            _logger.debug(
                f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} '
                f'#avail={self.avail_input_rows}'
            )
        except StopAsyncIteration:
            self.input_complete = True
            _logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
        except Exception as exc:
            # record the exception unwrapped (eg, a DBAPIError): the main loop re-raises self.error as-is, so the
            # transaction handling logic gets to see the original exception
            self.error = exc
            self.exc_event.set()

    @property
    def total_buffered(self) -> int:
        """Number of rows that are in flight, completed but not yet buffered, or sitting in the output buffer"""
        return self.num_in_flight + self.completed_rows.qsize() + self.output_buffer.num_rows

    def _dispatch_input_rows(self) -> None:
        """Dispatch the maximum number of input rows, given total_buffered; does not block"""
        if self.avail_input_rows == 0:
            return
        num_rows = min(self.MAX_BUFFERED_ROWS - self.total_buffered, self.avail_input_rows)
        assert num_rows >= 0
        if num_rows == 0:
            return
        assert self.current_input_batch is not None
        avail_current_batch_rows = len(self.current_input_batch) - self.input_row_idx

        rows: list[exprs.DataRow]
        if avail_current_batch_rows > num_rows:
            # we only need rows from current_input_batch
            rows = self.current_input_batch.rows[self.input_row_idx : self.input_row_idx + num_rows]
            self.input_row_idx += num_rows
        else:
            # we use up current_input_batch and may need rows from next_input_batch as well
            rows = self.current_input_batch.rows[self.input_row_idx :]
            self.current_input_batch = self.next_input_batch
            self.next_input_batch = None
            self.input_row_idx = 0
            num_remaining = num_rows - len(rows)
            if num_remaining > 0:
                # avail_input_rows covers both batches, so the read-ahead batch must exist at this point
                assert self.current_input_batch is not None
                rows.extend(self.current_input_batch.rows[:num_remaining])
                self.input_row_idx = num_remaining
        self.avail_input_rows -= num_rows
        self.num_in_flight += num_rows
        self._log_state(f'dispatch input ({num_rows})')

        self.exec_ctx.init_rows(rows)
        self.dispatch(rows, self.exec_ctx)

    def _log_state(self, prefix: str) -> None:
        """Emit a debug log line with the current row counters, prefixed with `prefix`"""
        _logger.debug(
            f'{prefix}: #in-flight={self.num_in_flight} #complete={self.completed_rows.qsize()} '
            f'#output-buffer={self.output_buffer.num_rows} #ready={self.output_buffer.num_ready} '
            f'total-buffered={self.total_buffered} #avail={self.avail_input_rows} '
            f'#input={self.num_input_rows} #output={self.num_output_rows}'
        )

    def _init_schedulers(self) -> None:
        """
        Create one scheduler per resource pool referenced by our FnCallEvaluators.

        Raises:
            RuntimeError: if no class in SCHEDULERS matches a resource pool
        """
        resource_pools = {
            evaluator.fn_call.resource_pool
            for evaluator in self.exec_ctx.slot_evaluators.values()
            if isinstance(evaluator, FnCallEvaluator)
        }
        # function calls without a resource pool don't need a scheduler
        resource_pools.discard(None)
        for pool_name in resource_pools:
            for scheduler in SCHEDULERS:
                if scheduler.matches(pool_name):
                    self.schedulers[pool_name] = scheduler(pool_name, self)
                    break
            if pool_name not in self.schedulers:
                raise RuntimeError(f'No scheduler found for resource pool {pool_name}')

    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
        """
        Main event loop

        Goals:
        - return completed DataRowBatches as soon as they become available
        - maximize the number of rows in flight in order to maximize parallelism, up to the given limit

        Raises:
            whatever was recorded in self.error once exc_event is set; an ExprEvalError is chained to its
            underlying exception
        """
        # initialize completed_rows and events, now that we have the correct event loop
        self.completed_rows = asyncio.Queue[exprs.DataRow]()
        self.exc_event = asyncio.Event()
        self.completed_event = asyncio.Event()
        self._init_schedulers()
        if self.maintain_input_order:
            self.row_pos_map = {}
            self.output_buffer.set_row_pos_map(self.row_pos_map)

        row: exprs.DataRow
        exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
        input_batch_aw: asyncio.Task | None = None
        completed_aw: asyncio.Task | None = None
        closed_evaluators = False  # True after calling Evaluator.close()
        exprs.Expr.prepare_list(self.exec_ctx.all_exprs)

        try:
            while True:
                # process completed rows before doing anything else
                while not self.completed_rows.empty():
                    # move completed rows to output buffer
                    while not self.completed_rows.empty():
                        row = self.completed_rows.get_nowait()
                        self.output_buffer.add_row(row)
                        if self.row_pos_map is not None:
                            self.row_pos_map.pop(id(row))

                    self._log_state('processed completed')
                    # return as many batches as we have available
                    while self.output_buffer.num_ready >= self.BATCH_SIZE:
                        batch_rows = self.output_buffer.get_rows(self.BATCH_SIZE)
                        self.num_output_rows += len(batch_rows)
                        # make sure we top up our in-flight rows before yielding
                        self._dispatch_input_rows()
                        self._log_state(f'yielding {len(batch_rows)} rows')
                        yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
                        # at this point, we may have more completed rows

                assert self.completed_rows.empty()  # all completed rows should be sitting in output_buffer
                self.completed_event.clear()
                if self.input_complete and self.num_in_flight == 0:
                    # there is no more input and nothing left to wait for
                    assert self.avail_input_rows == 0
                    if self.output_buffer.num_ready > 0:
                        assert self.output_buffer.num_rows == self.output_buffer.num_ready
                        # yield the leftover rows
                        batch_rows = self.output_buffer.get_rows(self.output_buffer.num_ready)
                        self.num_output_rows += len(batch_rows)
                        self._log_state(f'yielding {len(batch_rows)} rows')
                        yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)

                    assert self.output_buffer.num_rows == 0
                    return

                if self.input_complete and self.avail_input_rows == 0 and not closed_evaluators:
                    # no more input rows to dispatch, but we're still waiting for rows to finish:
                    # close all slot evaluators to flush queued rows
                    for evaluator in self.exec_ctx.slot_evaluators.values():
                        evaluator.close()
                    closed_evaluators = True

                # we don't have a full batch of rows at this point and need to wait
                aws = {exc_event_aw}  # always wait for an exception
                if self.next_input_batch is None and not self.input_complete:
                    # also wait for another batch if we don't have a read-ahead batch yet
                    if input_batch_aw is None:
                        input_batch_aw = asyncio.create_task(self._fetch_input_batch(), name='_fetch_input_batch()')
                    aws.add(input_batch_aw)
                if self.num_in_flight > 0:
                    # also wait for more rows to complete
                    if completed_aw is None:
                        completed_aw = asyncio.create_task(self.completed_event.wait(), name='completed.wait()')
                    aws.add(completed_aw)
                done, _ = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)

                if self.exc_event.is_set():
                    # we got an exception that we need to propagate through __aiter__()
                    if isinstance(self.error, excs.ExprEvalError):
                        raise self.error from self.error.exc
                    else:
                        raise self.error
                if completed_aw in done:
                    self._log_state('completed_aw done')
                    completed_aw = None
                if input_batch_aw in done:
                    self._dispatch_input_rows()
                    input_batch_aw = None

        finally:
            # task cleanup: cancel whatever is still pending and wait for it to wind down
            active_tasks = {exc_event_aw}
            if input_batch_aw is not None:
                active_tasks.add(input_batch_aw)
            if completed_aw is not None:
                active_tasks.add(completed_aw)
            active_tasks.update(self.tasks)
            for task in active_tasks:
                if not task.done():
                    task.cancel()
            _ = await asyncio.gather(*active_tasks, return_exceptions=True)

            # expr cleanup
            exprs.Expr.release_list(self.exec_ctx.all_exprs)

    def dispatch_exc(
        self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
    ) -> None:
        """
        Propagate exception to main event loop or to dependent slots, depending on ignore_errors

        Args:
            rows: rows for which evaluation of slot_with_exc raised
            slot_with_exc: index of the slot whose evaluation failed
            exc_tb: traceback of the failure; passed on to ExprEvalError
            exec_ctx: context the rows belong to
        """
        if len(rows) == 0 or self.exc_event.is_set():
            return

        failed_expr = exec_ctx.row_builder.unique_exprs[slot_with_exc]
        if not self.ctx.ignore_errors:
            # report the failure for the first row only and let the main loop raise it
            dependency_idxs = [e.slot_idx for e in failed_expr.dependencies()]
            first_row = rows[0]
            input_vals = [first_row[idx] for idx in dependency_idxs]
            self.error = excs.ExprEvalError(
                failed_expr,
                f'expression {failed_expr}',
                first_row.get_exc(failed_expr.slot_idx),
                exc_tb,
                input_vals,
                0,
            )
            self.exc_event.set()
            return

        # errors are ignored: mark every transitive dependent of the failed slot with the same exception and
        # continue evaluating the remaining slots; the set of dependents doesn't vary by row
        dependent_idxs = np.nonzero(exec_ctx.row_builder.transitive_dependents[slot_with_exc])[0].tolist()
        for row in rows:
            assert row.has_exc(slot_with_exc)
            exc = row.get_exc(slot_with_exc)
            for slot_idx in dependent_idxs:
                row.set_exc(slot_idx, exc)
        self.dispatch(rows, exec_ctx)

    def dispatch(self, rows: list[exprs.DataRow], exec_ctx: ExecCtx) -> None:
        """
        Dispatch rows to slot evaluators, based on materialized dependencies

        Rows without missing slots are completed: nested rows notify the NestedRowList in their parent row,
        all other rows are handed to the main loop via completed_rows.
        """
        if len(rows) == 0 or self.exc_event.is_set():
            return

        # slots ready for evaluation; rows x slots
        ready_slots = np.zeros((len(rows), exec_ctx.row_builder.num_materialized), dtype=bool)
        completed_rows = np.zeros(len(rows), dtype=bool)
        for i, row in enumerate(rows):
            # '== False' is an elementwise comparison here, not an identity check
            row.missing_slots &= row.has_val == False  # noqa: E712
            if row.missing_slots.sum() == 0:
                # all output slots have been materialized
                completed_rows[i] = True
            else:
                # dependencies of missing slots
                missing_dependencies = exec_ctx.row_builder.num_dependencies * row.missing_slots
                # determine ready slots that are not yet materialized and not yet scheduled
                num_mat_dependencies = np.sum(exec_ctx.row_builder.dependencies * row.has_val, axis=1)
                num_missing = missing_dependencies - num_mat_dependencies
                ready_slots[i] = (num_missing == 0) & (row.is_scheduled == False) & row.missing_slots  # noqa: E712
                row.is_scheduled |= ready_slots[i]

            # clear intermediate values that are no longer needed (ie, all dependents are materialized)
            missing_dependents = np.sum(exec_ctx.row_builder.dependencies[row.has_val == False], axis=0)  # noqa: E712
            gc_targets = (missing_dependents == 0) & (row.missing_dependents > 0) & exec_ctx.gc_targets
            row.clear(gc_targets)
            row.missing_dependents = missing_dependents

        if np.any(completed_rows):
            completed_idxs = completed_rows.nonzero()[0].tolist()
            # decide nested vs. top-level based on a row that actually completed, rather than on whatever index
            # the loop above happened to end on
            if rows[completed_idxs[0]].parent_row is not None:
                # these are nested rows
                for idx in completed_idxs:
                    row = rows[idx]
                    assert row.parent_row is not None and row.parent_slot_idx is not None
                    assert isinstance(row.parent_row.vals[row.parent_slot_idx], NestedRowList)
                    row.parent_row.vals[row.parent_slot_idx].complete_row()
            else:
                for idx in completed_idxs:
                    self.completed_rows.put_nowait(rows[idx])
                self.completed_event.set()
                self.num_in_flight -= len(completed_idxs)

        # schedule all ready slots
        for slot_idx in np.sum(ready_slots, axis=0).nonzero()[0]:
            ready_rows = [rows[i] for i in np.flatnonzero(ready_slots[:, slot_idx])]
            _logger.debug(f'Scheduling {len(ready_rows)} rows for slot {slot_idx}')
            exec_ctx.slot_evaluators[slot_idx].schedule(ready_rows, slot_idx)

    def register_task(self, t: asyncio.Task) -> None:
        """Keep a reference to t until it is done; its outcome is inspected in _done_cb()"""
        self.tasks.add(t)
        t.add_done_callback(self._done_cb)

    def _done_cb(self, t: asyncio.Task) -> None:
        """Done-callback for registered tasks: drops the task and forwards unhandled exceptions to the main loop"""
        self.tasks.discard(t)
        # end the main loop if we had an unhandled exception
        try:
            t.result()
        except KeyboardInterrupt:
            # ExprEvalNode instances are long-running and reused across multiple operations.
            # When a user interrupts an operation (Ctrl+C), the main evaluation loop properly
            # handles the KeyboardInterrupt and terminates the current operation. However,
            # background tasks spawned by evaluators may complete asynchronously after the
            # operation has ended, and their done callbacks will fire during subsequent
            # operations. These "phantom" KeyboardInterrupt exceptions from previous
            # operations' background tasks should not interfere with new operations, so we
            # absorb them here rather than propagating them via self.error/self.exc_event.
            _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
        except asyncio.CancelledError:
            # cancellation is part of regular cleanup and not an error
            pass
        except Exception as exc:
            stack_trace = traceback.format_exc()
            self.error = excs.Error(f'Exception in task: {exc}\n{stack_trace}')
            self.exc_event.set()
@@ -0,0 +1,200 @@
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ import asyncio
5
+ from dataclasses import dataclass
6
+ from types import TracebackType
7
+ from typing import Any, Iterable, Protocol
8
+
9
+ import numpy as np
10
+
11
+ from pixeltable import exprs, func
12
+
13
+
14
@dataclass
class FnCallArgs:
    """Bundles a FunctionCall with the DataRows it targets and the arguments to invoke it with."""

    fn_call: exprs.FunctionCall
    rows: list[exprs.DataRow]

    # arguments for a single (non-batched) call
    args: list[Any] | None = None
    kwargs: dict[str, Any] | None = None

    # arguments for a batched call
    batch_args: list[list[Any | None]] | None = None
    batch_kwargs: dict[str, list[Any | None]] | None = None

    @property
    def pxt_fn(self) -> func.CallableFunction:
        """The function behind fn_call; asserted to be a CallableFunction."""
        fn = self.fn_call.fn
        assert isinstance(fn, func.CallableFunction)
        return fn

    @property
    def is_batched(self) -> bool:
        """True if batch_args has been set, ie, this describes a batched call."""
        return self.batch_args is not None

    @property
    def row(self) -> exprs.DataRow:
        """The one and only row of this call; asserts that there is exactly one."""
        assert len(self.rows) == 1
        return self.rows[0]
40
+
41
+
42
class Scheduler(abc.ABC):
    """
    Abstract base for queue-based schedulers, which run FunctionCalls against a limited resource pool.

    Contract for subclasses:
    - every task they create needs to be made known to the dispatcher (Dispatcher.register_task())
    - they have to abort execution a) when their task gets cancelled or b) when an exception occurred somewhere
      else, which is signalled through dispatcher.exc_event
    """

    @dataclass(frozen=True)
    class QueueItem:
        """A unit of work sitting in a scheduler's queue"""

        request: FnCallArgs
        num_retries: int
        exec_ctx: ExecCtx
        retry_after: float | None = None  # time.monotonic()

        def __lt__(self, other: Scheduler.QueueItem) -> bool:
            # items that have been retried more often sort first, ie, get dequeued earlier
            return other.num_retries < self.num_retries

    resource_pool: str
    queue: asyncio.PriorityQueue[QueueItem]  # ordering favors retries
    dispatcher: Dispatcher

    def __init__(self, resource_pool: str, dispatcher: Dispatcher):
        self.dispatcher = dispatcher
        self.resource_pool = resource_pool
        self.queue = asyncio.PriorityQueue()

    def submit(self, item: FnCallArgs, exec_ctx: ExecCtx) -> None:
        """Enqueue a fresh request (retry count 0); does not block"""
        queue_item = self.QueueItem(item, 0, exec_ctx)
        self.queue.put_nowait(queue_item)

    @classmethod
    @abc.abstractmethod
    def matches(cls, resource_pool: str) -> bool:
        """Returns True if the scheduler can handle the given resource pool"""
82
+
83
+
84
class Dispatcher(Protocol):
    """
    Interface through which Evaluators/Schedulers hand rows back for post-processing once a slot has been
    materialized, and through which they manage their tasks.

    Task management: every task has to be registered by calling register_task()
    Exceptions: before kicking off a long-running (non-interruptible) computation, evaluators/schedulers have to
        check exc_event
    """

    row_builder: exprs.RowBuilder
    exc_event: asyncio.Event
    schedulers: dict[str, Scheduler]  # keyed by resource pool id

    def dispatch(self, rows: list[exprs.DataRow], exec_ctx: Any) -> None:
        """Hands the rows' slots to the appropriate schedulers; non-blocking"""
        ...

    def dispatch_exc(
        self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: Any
    ) -> None:
        """Marks all slots depending on slot_with_exc with its exception and dispatches the rest; non-blocking"""
        ...

    def register_task(self, f: asyncio.Task) -> None:
        """Makes the task known to the dispatcher so it can be cleaned up later; non-blocking"""
        ...
108
+
109
+
110
class Evaluator(abc.ABC):
    """
    Abstract base for expression evaluators. Every DataRow slot gets an evaluator assigned to it; the evaluator
    carries out the expression evaluation logic and decides how that work is scheduled/broken down into tasks.

    Contract for subclasses:
    - every task they create needs to be made known to the dispatcher (Dispatcher.register_task())
    - they have to abort execution a) when their task gets cancelled or b) when an exception occurred somewhere
      else, which is signalled through dispatcher.exc_event
    """

    dispatcher: Dispatcher
    is_closed: bool
    exec_ctx: ExecCtx

    def __init__(self, dispatcher: Dispatcher, exec_ctx: ExecCtx) -> None:
        self.dispatcher = dispatcher
        self.exec_ctx = exec_ctx
        self.is_closed = False

    @abc.abstractmethod
    def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
        """Create tasks to evaluate the expression in the given slot for the given rows; must not block."""

    def _close(self) -> None:
        """Subclass hook invoked by close(); the default does nothing. Must not block"""

    def close(self) -> None:
        """Signals that no further rows are going to be scheduled"""
        # flip the flag before running the subclass hook, so the hook already observes the closed state
        self.is_closed = True
        self._close()
142
+
143
+
144
class ExecCtx:
    """State that ExprEvalNode needs in order to evaluate a set of exprs against DataRows"""

    row_builder: exprs.RowBuilder
    slot_evaluators: dict[int, Evaluator]  # slot idx -> evaluator responsible for that slot
    gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
    eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
    literals: dict[int, Any]  # slot idx -> literal value for that slot; used to pre-populate rows
    all_exprs: list[exprs.Expr]  # all evaluated exprs; needed for cleanup

    def __init__(
        self,
        dispatcher: Dispatcher,
        row_builder: exprs.RowBuilder,
        output_exprs: Iterable[exprs.Expr],
        input_exprs: Iterable[exprs.Expr],
    ):
        """
        Builds the eval context for output_exprs (excluding input_exprs) and creates one evaluator for every
        non-literal slot in it.
        """
        self.row_builder = row_builder
        self.slot_evaluators = {}

        # TODO: only include output_exprs dependencies
        # every slot starts out as a gc target; slots belonging to the row builder's output need to be retained
        retained_slot_idxs = [e.slot_idx for e in self.row_builder.output_exprs]
        self.gc_targets = np.ones(self.row_builder.num_materialized, dtype=bool)
        self.gc_targets[retained_slot_idxs] = False

        output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
        self.all_exprs = output_ctx.exprs

        # split the exprs into literals (values are known up front) and slots that need to be evaluated
        self.literals = {}
        non_literal_slot_idxs: list[int] = []
        for e in output_ctx.exprs:
            if isinstance(e, exprs.Literal):
                self.literals[e.slot_idx] = e.val
            else:
                non_literal_slot_idxs.append(e.slot_idx)

        self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
        self.eval_ctx[non_literal_slot_idxs] = True
        self._init_slot_evaluators(dispatcher, non_literal_slot_idxs)

    def _init_slot_evaluators(self, dispatcher: Dispatcher, target_slot_idxs: list[int]) -> None:
        """Instantiate the evaluator matching the expr of each target slot and record it in slot_evaluators"""
        from .evaluators import DefaultExprEvaluator, FnCallEvaluator, JsonMapperDispatcher

        for slot_idx in target_slot_idxs:
            expr = self.row_builder.unique_exprs[slot_idx]
            # ExprTemplateFunction and AggregateFunction calls are best handled by FunctionCall.eval(), which is
            # why they don't get a FnCallEvaluator
            uses_fn_call_evaluator = isinstance(expr, exprs.FunctionCall) and not isinstance(
                expr.fn, (func.ExprTemplateFunction, func.AggregateFunction)
            )
            if uses_fn_call_evaluator:
                evaluator_cls = FnCallEvaluator
            elif isinstance(expr, exprs.JsonMapperDispatch):
                evaluator_cls = JsonMapperDispatcher
            else:
                evaluator_cls = DefaultExprEvaluator
            self.slot_evaluators[slot_idx] = evaluator_cls(expr, dispatcher, self)

    def init_rows(self, rows: list[exprs.DataRow]) -> None:
        """Writes the literal values into each row and sets up its missing_dependents/missing_slots"""
        for row in rows:
            # literals have to be in place before we look at has_val below
            for slot_idx, val in self.literals.items():
                row[slot_idx] = val
            # elementwise comparison on purpose: this produces a per-slot mask of the slots without a value
            is_missing = row.has_val == False
            row.missing_dependents = np.sum(self.row_builder.dependencies[is_missing], axis=0)
            row.missing_slots = self.eval_ctx & is_missing
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ import numpy as np
6
+
7
+ from pixeltable import exprs
8
+
9
+ _logger = logging.getLogger('pixeltable')
10
+
11
+
12
class RowBuffer:
    """
    Fixed-length circular buffer of DataRows; knows how to maintain input order.

    Without a row position map, rows are handed out in the order in which they were added. With a row position
    map (see set_row_pos_map()), each row is stored at the slot given by its output position, and only the
    gap-free run of rows at the head of the buffer is considered ready.
    """

    size: int
    row_pos_map: dict[int, int] | None  # id(row) -> position of row in output; None if not maintaining order
    num_rows: int  # number of rows in the buffer
    num_ready: int  # number of consecutive non-None rows at head
    buffer: np.ndarray  # of object
    head_idx: int  # index of beginning of the buffer
    head_pos: int  # row position of the beginning of the buffer

    def __init__(self, size: int):
        self.size = size
        self.row_pos_map = None
        self.num_rows = 0
        self.num_ready = 0
        self.buffer = np.full(size, None, dtype=object)
        self.head_pos = 0
        self.head_idx = 0

    def set_row_pos_map(self, row_pos_map: dict[int, int]) -> None:
        """Switch to order-preserving mode; row_pos_map maps id(row) to the row's output position"""
        self.row_pos_map = row_pos_map

    def add_row(self, row: exprs.DataRow) -> None:
        """
        Store row in the buffer and update the ready count.

        In order-preserving mode, the row's position must lie within [head_pos, head_pos + size); in both modes
        the target slot must be empty. Violations are internal errors and trip an assertion.
        """
        ordered = self.row_pos_map is not None
        offset: int  # of new row from head
        if ordered:
            pos = self.row_pos_map.get(id(row))
            # the lower bound matters: a negative offset would wrap around via the modulo below and silently
            # park the row in an unrelated slot, where it would never become part of the ready run
            assert pos is not None and 0 <= pos - self.head_pos < self.size, (
                f'{pos} {self.head_pos} {self.size}'
            )
            offset = pos - self.head_pos
        else:
            offset = self.num_rows
        idx = (self.head_idx + offset) % self.size
        assert self.buffer[idx] is None

        self.buffer[idx] = row
        self.num_rows += 1
        if not ordered:
            # rows are stored back-to-back, so every row is ready as soon as it is added
            self.num_ready += 1
            return
        if offset == self.num_ready:
            # the new row extends the ready run at the head; find out how far the run reaches now
            while offset < self.size and self.buffer[(self.head_idx + offset) % self.size] is not None:
                offset += 1
            self.num_ready = offset

    def get_rows(self, n: int) -> list[exprs.DataRow]:
        """Remove and return up to n ready rows from head; returns [] if n <= 0 or nothing is ready"""
        n = min(n, self.num_ready)
        if n <= 0:
            # a negative n must not reach the bookkeeping below, it would move head_pos/head_idx backwards
            return []
        rows: list[exprs.DataRow]
        end_idx = self.head_idx + n
        if end_idx <= self.size:
            rows = self.buffer[self.head_idx : end_idx].tolist()
            self.buffer[self.head_idx : end_idx] = None
        else:
            # the requested run wraps around the end of the buffer: take the tail, then the front
            num_wrapped = end_idx - self.size
            rows = np.concatenate([self.buffer[self.head_idx :], self.buffer[:num_wrapped]]).tolist()
            self.buffer[self.head_idx :] = None
            self.buffer[:num_wrapped] = None
        self.head_pos += n
        self.head_idx = end_idx % self.size
        self.num_rows -= n
        self.num_ready -= n
        return rows