pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic (see the registry advisory for details).

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0

pixeltable/exec/cell_reconstruction_node.py (new file)
@@ -0,0 +1,135 @@
+from __future__ import annotations
+
+import io
+import logging
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import PIL.Image
+
+import pixeltable.type_system as ts
+from pixeltable import exprs
+from pixeltable.utils import parse_local_file_path
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+def json_has_inlined_objs(element: Any) -> bool:
+    """Returns True if element contains inlined objects produced by CellMaterializationNode."""
+    if isinstance(element, list):
+        return any(json_has_inlined_objs(v) for v in element)
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            return True
+        return any(json_has_inlined_objs(v) for v in element.values())
+    return False
+
+
+def reconstruct_json(element: Any, urls: list[str], file_handles: dict[Path, io.BufferedReader]) -> Any:
+    """Recursively reconstructs inlined objects in a json structure."""
+    if isinstance(element, list):
+        return [reconstruct_json(v, urls, file_handles) for v in element]
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            obj_md = InlinedObjectMd.from_dict(element[INLINED_OBJECT_MD_KEY])
+            url = urls[obj_md.url_idx]
+            local_path = parse_local_file_path(url)
+            if local_path not in file_handles:
+                file_handles[local_path] = open(local_path, 'rb')  # noqa: SIM115
+            fp = file_handles[local_path]
+
+            if obj_md.type == ts.ColumnType.Type.ARRAY.name:
+                fp.seek(obj_md.array_md.start)
+                ar = load_array(
+                    fp, obj_md.array_md.start, obj_md.array_md.end, obj_md.array_md.is_bool, obj_md.array_md.shape
+                )
+                return ar
+            else:
+                fp.seek(obj_md.img_start)
+                bytesio = io.BytesIO(fp.read(obj_md.img_end - obj_md.img_start))
+                img = PIL.Image.open(bytesio)
+                img.load()
+                assert fp.tell() == obj_md.img_end, f'{fp.tell()} != {obj_md.img_end} / {obj_md.img_start}'
+                return img
+        else:
+            return {k: reconstruct_json(v, urls, file_handles) for k, v in element.items()}
+    return element
+
+
+def load_array(
+    fh: io.BufferedReader, start: int, end: int, is_bool_array: bool, shape: tuple[int, ...] | None
+) -> np.ndarray:
+    """Loads an array from a section of a file."""
+    fh.seek(start)
+    ar = np.load(fh, allow_pickle=False)
+    assert fh.tell() == end
+    if is_bool_array:
+        assert shape is not None
+        ar = np.unpackbits(ar, count=np.prod(shape)).reshape(shape).astype(bool)
+    return ar
+
+
+class CellReconstructionNode(ExecNode):
+    """
+    Reconstruction of stored json and array cells that were produced by CellMaterializationNode.
+    """
+
+    json_refs: list[exprs.ColumnRef]
+    array_refs: list[exprs.ColumnRef]
+    file_handles: dict[Path, io.BufferedReader]  # key: file path
+
+    def __init__(
+        self,
+        json_refs: list[exprs.ColumnRef],
+        array_refs: list[exprs.ColumnRef],
+        row_builder: exprs.RowBuilder,
+        input: ExecNode | None = None,
+    ):
+        super().__init__(row_builder, [], [], input)
+        self.json_refs = json_refs
+        self.array_refs = array_refs
+        self.file_handles = {}
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col_ref in self.json_refs:
+                    val = row[col_ref.slot_idx]
+                    if val is None:
+                        continue
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(row[col_ref.slot_idx]):
+                        continue
+                    row[col_ref.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
+
+                for col_ref in self.array_refs:
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is not None and cell_md.array_md is not None:
+                        assert row[col_ref.slot_idx] is None
+                        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
+                        row[col_ref.slot_idx] = self._reconstruct_array(cell_md)
+                    else:
+                        assert row[col_ref.slot_idx] is None or isinstance(row[col_ref.slot_idx], np.ndarray)
+
+            yield batch
+
+    def close(self) -> None:
+        for fp in self.file_handles.values():
+            fp.close()
+
+    def _reconstruct_array(self, cell_md: exprs.CellMd) -> np.ndarray:
+        assert cell_md.array_md is not None
+        local_path = parse_local_file_path(cell_md.file_urls[0])
+        assert local_path is not None
+        if local_path not in self.file_handles:
+            self.file_handles[local_path] = open(str(local_path), 'rb')  # noqa: SIM115
+        fp = self.file_handles[local_path]
+        ar = load_array(
+            fp, cell_md.array_md.start, cell_md.array_md.end, bool(cell_md.array_md.is_bool), cell_md.array_md.shape
+        )
+        return ar
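
The bool-array branch of load_array above implies that boolean arrays are stored bit-packed (np.packbits) and serialized with np.save. A minimal stand-alone sketch of that round trip (editorial illustration, not part of the package; variable names are illustrative):

    import io
    import numpy as np

    shape = (3, 5)
    original = np.random.rand(*shape) > 0.5  # example bool array

    buf = io.BytesIO()
    np.save(buf, np.packbits(original))      # writer side: 1 bit per element, as the stored format implies
    buf.seek(0)

    packed = np.load(buf, allow_pickle=False)  # reader side, mirroring load_array()
    restored = np.unpackbits(packed, count=int(np.prod(shape))).reshape(shape).astype(bool)
    assert np.array_equal(original, restored)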

pixeltable/exec/component_iteration_node.py
@@ -40,7 +40,7 @@ class ComponentIterationNode(ExecNode):
         }
 
     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        output_batch = DataRowBatch(self.view, self.row_builder)
+        output_batch = DataRowBatch(self.row_builder)
         async for input_batch in self.input:
             for input_row in input_batch:
                 self.row_builder.eval(input_row, self.iterator_args_ctx)
@@ -52,13 +52,14 @@ class ComponentIterationNode(ExecNode):
                 if self.__non_nullable_args_specified(iterator_args):
                     iterator = self.view.get().iterator_cls(**iterator_args)
                     for pos, component_dict in enumerate(iterator):
-                        output_row = output_batch.add_row()
+                        output_row = self.row_builder.make_row()
                         input_row.copy(output_row)
                         # we're expanding the input and need to add the iterator position to the pk
                         self.__populate_output_row(output_row, pos, component_dict)
+                        output_batch.add_row(output_row)
                        if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
                             yield output_batch
-                            output_batch = DataRowBatch(self.view, self.row_builder)
+                            output_batch = DataRowBatch(self.row_builder)
 
         if len(output_batch) > 0:
             yield output_batch

pixeltable/exec/data_row_batch.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
 import logging
-from typing import Iterator, Optional
+from typing import Iterator
 
-from pixeltable import catalog, exprs
-from pixeltable.utils.media_store import MediaStore
+from pixeltable import exprs
 
 _logger = logging.getLogger('pixeltable')
 
@@ -13,53 +12,20 @@ class DataRowBatch:
     """Set of DataRows, indexed by rowid.
 
     Contains the metadata needed to initialize DataRows.
+
+    Requires either num_rows or rows to be specified, but not both.
     """
 
-    tbl: Optional[catalog.TableVersionHandle]
     row_builder: exprs.RowBuilder
-    img_slot_idxs: list[int]
-    media_slot_idxs: list[int]  # non-image media slots
-    array_slot_idxs: list[int]
     rows: list[exprs.DataRow]
 
-    def __init__(
-        self,
-        tbl: Optional[catalog.TableVersionHandle],
-        row_builder: exprs.RowBuilder,
-        num_rows: Optional[int] = None,
-        rows: Optional[list[exprs.DataRow]] = None,
-    ):
-        """
-        Requires either num_rows or rows to be specified, but not both.
-        """
-        assert num_rows is None or rows is None
-        self.tbl = tbl
+    def __init__(self, row_builder: exprs.RowBuilder, rows: list[exprs.DataRow] | None = None):
         self.row_builder = row_builder
-        self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
-        # non-image media slots
-        self.media_slot_idxs = [
-            e.slot_idx
-            for e in row_builder.unique_exprs
-            if e.col_type.is_media_type() and not e.col_type.is_image_type()
-        ]
-        self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
-        if rows is not None:
-            self.rows = rows
-        else:
-            if num_rows is None:
-                num_rows = 0
-            self.rows = [
-                exprs.DataRow(
-                    row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
-                )
-                for _ in range(num_rows)
-            ]
+        self.rows = [] if rows is None else rows
 
-    def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
+    def add_row(self, row: exprs.DataRow | None) -> exprs.DataRow:
         if row is None:
-            row = exprs.DataRow(
-                self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
-            )
+            row = self.row_builder.make_row()
         self.rows.append(row)
         return row
 
@@ -72,28 +38,5 @@ class DataRowBatch:
     def __getitem__(self, index: int) -> exprs.DataRow:
         return self.rows[index]
 
-    def flush_imgs(
-        self,
-        idx_range: Optional[slice] = None,
-        stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
-        flushed_slot_idxs: Optional[list[int]] = None,
-    ) -> None:
-        """Flushes images in the given range of rows."""
-        assert self.tbl is not None
-        if stored_img_info is None:
-            stored_img_info = []
-        if flushed_slot_idxs is None:
-            flushed_slot_idxs = []
-        if len(stored_img_info) == 0 and len(flushed_slot_idxs) == 0:
-            return
-        if idx_range is None:
-            idx_range = slice(0, len(self.rows))
-        for row in self.rows[idx_range]:
-            for info in stored_img_info:
-                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.get().version))
-                row.flush_img(info.slot_idx, filepath)
-            for slot_idx in flushed_slot_idxs:
-                row.flush_img(slot_idx)
-
     def __iter__(self) -> Iterator[exprs.DataRow]:
         return iter(self.rows)

pixeltable/exec/exec_context.py
@@ -1,4 +1,4 @@
-from typing import Optional
+import random
 
 import sqlalchemy as sql
 
@@ -8,13 +8,24 @@ from pixeltable import exprs
 class ExecContext:
     """Class for execution runtime constants"""
 
+    row_builder: exprs.RowBuilder
+    profile: exprs.ExecProfile
+    show_pbar: bool
+    batch_size: int
+    num_rows: int | None
+    conn: sql.engine.Connection | None
+    pk_clause: list[sql.ClauseElement] | None
+    num_computed_exprs: int
+    ignore_errors: bool
+    random_seed: int  # general-purpose source of randomness with execution scope
+
     def __init__(
         self,
         row_builder: exprs.RowBuilder,
         *,
         show_pbar: bool = False,
         batch_size: int = 0,
-        pk_clause: Optional[list[sql.ClauseElement]] = None,
+        pk_clause: list[sql.ClauseElement] | None = None,
         num_computed_exprs: int = 0,
         ignore_errors: bool = False,
     ):
@@ -23,8 +34,9 @@ class ExecContext:
         self.row_builder = row_builder
         self.profile = exprs.ExecProfile(row_builder)
         # num_rows is used to compute the total number of computed cells used for the progress bar
-        self.num_rows: Optional[int] = None
-        self.conn: Optional[sql.engine.Connection] = None  # if present, use this to execute SQL queries
+        self.num_rows = None
+        self.conn = None  # if present, use this to execute SQL queries
         self.pk_clause = pk_clause
         self.num_computed_exprs = num_computed_exprs
         self.ignore_errors = ignore_errors
+        self.random_seed = random.randint(0, 1 << 63)

pixeltable/exec/exec_node.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import abc
-import asyncio
 import logging
-from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
+from typing import AsyncIterator, Iterable, Iterator, TypeVar
 
 from pixeltable import exprs
+from pixeltable.env import Env
 
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
@@ -18,17 +18,16 @@ class ExecNode(abc.ABC):
 
     output_exprs: Iterable[exprs.Expr]
     row_builder: exprs.RowBuilder
-    input: Optional[ExecNode]
+    input: ExecNode | None
     flushed_img_slots: list[int]  # idxs of image slots of our output_exprs dependencies
-    stored_img_cols: list[exprs.ColumnSlotIdx]
-    ctx: Optional[ExecContext]
+    ctx: ExecContext | None
 
     def __init__(
         self,
         row_builder: exprs.RowBuilder,
        output_exprs: Iterable[exprs.Expr],
         input_exprs: Iterable[exprs.Expr],
-        input: Optional[ExecNode] = None,
+        input: ExecNode | None = None,
     ):
         assert all(expr.is_valid for expr in output_exprs)
         self.output_exprs = output_exprs
@@ -40,43 +39,19 @@ class ExecNode(abc.ABC):
         self.flushed_img_slots = [
             e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
         ]
-        self.stored_img_cols = []
-        self.ctx = None  # all nodes of a tree share the same context
+        self.ctx = input.ctx if input is not None else None
 
     def set_ctx(self, ctx: ExecContext) -> None:
         self.ctx = ctx
         if self.input is not None:
             self.input.set_ctx(ctx)
 
-    def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
-        self.stored_img_cols = stored_img_cols
-        # propagate batch size to the source
-        if self.input is not None:
-            self.input.set_stored_img_cols(stored_img_cols)
-
     @abc.abstractmethod
     def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         pass
 
     def __iter__(self) -> Iterator[DataRowBatch]:
-        running_loop: Optional[asyncio.AbstractEventLoop] = None
-        loop: asyncio.AbstractEventLoop
-        try:
-            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
-            # multiple run_until_complete()
-            running_loop = asyncio.get_running_loop()
-            import nest_asyncio  # type: ignore[import-untyped]
-
-            nest_asyncio.apply()
-            loop = running_loop
-            _logger.debug('Patched running loop')
-        except RuntimeError:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-
-        if _logger.isEnabledFor(logging.DEBUG):
-            loop.set_debug(True)
-
+        loop = Env.get().event_loop
         aiter = self.__aiter__()
         try:
             while True:
@@ -84,9 +59,11 @@ class ExecNode(abc.ABC):
                 yield batch
         except StopAsyncIteration:
             pass
-        finally:
-            if loop != running_loop:
-                loop.close()
+        # TODO:
+        # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
+        #   we end up here
+        # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
+        #   creates tasks on its own
 
     def open(self) -> None:
         """Bottom-up initialization of nodes for execution. Must be called before __next__."""
@@ -108,7 +85,7 @@ class ExecNode(abc.ABC):
 
     T = TypeVar('T', bound='ExecNode')
 
-    def get_node(self, node_class: type[T]) -> Optional[T]:
+    def get_node(self, node_class: type[T]) -> T | None:
         if isinstance(self, node_class):
             return self
         if self.input is not None:
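
ExecNode.__iter__ above now drives the async iterator on a single long-lived event loop (obtained from Env.get().event_loop) instead of creating or patching a loop per call. A minimal sketch of that pattern (editorial illustration, not from the package; a locally created loop and a plain list stand in for pixeltable's Env and DataRowBatch):

    import asyncio
    from typing import AsyncIterator, Iterator

    async def produce() -> AsyncIterator[list[int]]:
        for i in range(3):
            await asyncio.sleep(0)  # stands in for real async work
            yield [i]

    def iterate(loop: asyncio.AbstractEventLoop) -> Iterator[list[int]]:
        aiter = produce()
        try:
            while True:
                # each batch is produced by running __anext__() to completion on the shared loop
                yield loop.run_until_complete(aiter.__anext__())
        except StopAsyncIteration:
            pass

    loop = asyncio.new_event_loop()  # pixeltable reuses one loop for all nodes
    print(list(iterate(loop)))       # [[0], [1], [2]]
    loop.close()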

pixeltable/exec/expr_eval/evaluators.py
@@ -5,7 +5,7 @@ import datetime
 import itertools
 import logging
 import sys
-from typing import Any, Callable, Iterator, Optional, cast
+from typing import Any, Callable, Iterator, cast
 
 from pixeltable import exprs, func
 
@@ -64,11 +64,11 @@ class FnCallEvaluator(Evaluator):
 
     fn_call: exprs.FunctionCall
     fn: func.CallableFunction
-    scalar_py_fn: Optional[Callable]  # only set for non-batching CallableFunctions
+    scalar_py_fn: Callable | None  # only set for non-batching CallableFunctions
 
     # only set if fn.is_batched
-    call_args_queue: Optional[asyncio.Queue[FnCallArgs]]  # FnCallArgs waiting for execution
-    batch_size: Optional[int]
+    call_args_queue: asyncio.Queue[FnCallArgs] | None  # FnCallArgs waiting for execution
+    batch_size: int | None
 
     def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
         super().__init__(dispatcher, exec_ctx)
@@ -160,8 +160,8 @@ class FnCallEvaluator(Evaluator):
 
     def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
         """Roll call_args into a single batched FnCallArgs"""
-        batch_args: list[list[Optional[Any]]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
-        batch_kwargs: dict[str, list[Optional[Any]]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
+        batch_args: list[list[Any | None]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
+        batch_kwargs: dict[str, list[Any | None]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
         assert isinstance(self.fn, func.CallableFunction)
         for i, item in enumerate(call_args):
             for j in range(len(item.args)):
@@ -311,6 +311,7 @@ class JsonMapperDispatcher(Evaluator):
             img_slot_idxs=[],
             media_slot_idxs=[],
             array_slot_idxs=[],
+            json_slot_idxs=[],
             parent_row=row,
             parent_slot_idx=self.e.slot_idx,
         )

pixeltable/exec/expr_eval/expr_eval_node.py
@@ -4,7 +4,7 @@ import asyncio
 import logging
 import traceback
 from types import TracebackType
-from typing import AsyncIterator, Iterable, Optional, Union
+from typing import AsyncIterator, Iterable
 
 import numpy as np
 
@@ -49,17 +49,17 @@ class ExprEvalNode(ExecNode):
     # execution state
     tasks: set[asyncio.Task]  # collects all running tasks to prevent them from getting gc'd
     exc_event: asyncio.Event  # set if an exception needs to be propagated
-    error: Optional[Union[excs.Error, excs.ExprEvalError]]  # exception that needs to be propagated
+    error: Exception | None  # exception that needs to be propagated
     completed_rows: asyncio.Queue[exprs.DataRow]  # rows that have completed evaluation
     completed_event: asyncio.Event  # set when completed_rows is non-empty
     input_iter: AsyncIterator[DataRowBatch]
-    current_input_batch: Optional[DataRowBatch]  # batch from which we're currently consuming rows
+    current_input_batch: DataRowBatch | None  # batch from which we're currently consuming rows
     input_row_idx: int  # next row to consume from current_input_batch
-    next_input_batch: Optional[DataRowBatch]  # read-ahead input batch
+    next_input_batch: DataRowBatch | None  # read-ahead input batch
     avail_input_rows: int  # total number across both current_/next_input_batch
     input_complete: bool  # True if we've received all input batches
     num_in_flight: int  # number of dispatched rows that haven't completed
-    row_pos_map: Optional[dict[int, int]]  # id(row) -> position of row in input; only set if maintain_input_order
+    row_pos_map: dict[int, int] | None  # id(row) -> position of row in input; only set if maintain_input_order
     output_buffer: RowBuffer  # holds rows that are ready to be returned, in order
 
     # debugging
@@ -133,10 +133,10 @@ class ExprEvalNode(ExecNode):
         except StopAsyncIteration:
             self.input_complete = True
             _logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
-        except excs.Error as err:
-            self.error = err
+        # make sure to pass DBAPIError through, so the transaction handling logic sees it
+        except Exception as exc:
+            self.error = exc
             self.exc_event.set()
-            # TODO: should we also handle Exception here and create an excs.Error from it?
 
     @property
     def total_buffered(self) -> int:
@@ -217,9 +217,10 @@ class ExprEvalNode(ExecNode):
 
         row: exprs.DataRow
        exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
-        input_batch_aw: Optional[asyncio.Task] = None
-        completed_aw: Optional[asyncio.Task] = None
+        input_batch_aw: asyncio.Task | None = None
+        completed_aw: asyncio.Task | None = None
         closed_evaluators = False  # True after calling Evaluator.close()
+        exprs.Expr.prepare_list(self.exec_ctx.all_exprs)
 
         try:
             while True:
@@ -240,7 +241,7 @@ class ExprEvalNode(ExecNode):
                         # make sure we top up our in-flight rows before yielding
                         self._dispatch_input_rows()
                         self._log_state(f'yielding {len(batch_rows)} rows')
-                        yield DataRowBatch(tbl=None, row_builder=self.row_builder, rows=batch_rows)
+                        yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
                        # at this point, we may have more completed rows
 
                 assert self.completed_rows.empty()  # all completed rows should be sitting in output_buffer
@@ -254,7 +255,7 @@ class ExprEvalNode(ExecNode):
                 batch_rows = self.output_buffer.get_rows(self.output_buffer.num_ready)
                 self.num_output_rows += len(batch_rows)
                 self._log_state(f'yielding {len(batch_rows)} rows')
-                yield DataRowBatch(tbl=None, row_builder=self.row_builder, rows=batch_rows)
+                yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
 
                 assert self.output_buffer.num_rows == 0
                 return
@@ -306,6 +307,9 @@ class ExprEvalNode(ExecNode):
                 task.cancel()
            _ = await asyncio.gather(*active_tasks, return_exceptions=True)
 
+        # expr cleanup
+        exprs.Expr.release_list(self.exec_ctx.all_exprs)
+
     def dispatch_exc(
         self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
     ) -> None:
@@ -390,6 +394,17 @@ class ExprEvalNode(ExecNode):
         # end the main loop if we had an unhandled exception
         try:
             t.result()
+        except KeyboardInterrupt:
+            # ExprEvalNode instances are long-running and reused across multiple operations.
+            # When a user interrupts an operation (Ctrl+C), the main evaluation loop properly
+            # handles the KeyboardInterrupt and terminates the current operation. However,
+            # background tasks spawned by evaluators may complete asynchronously after the
+            # operation has ended, and their done callbacks will fire during subsequent
+            # operations. These "phantom" KeyboardInterrupt exceptions from previous
+            # operations' background tasks should not interfere with new operations, so we
+            # absorb them here rather than propagating them via self.error/self.exc_event.
+            _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
+            pass
         except asyncio.CancelledError:
             pass
         except Exception as exc:
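
The KeyboardInterrupt handling added above lives in a task done callback. A minimal sketch of that pattern (editorial illustration, not from the package; function and variable names are illustrative): the callback inspects the finished task and absorbs cancellations or stray interrupts instead of letting them leak into later operations.

    import asyncio

    def on_done(t: asyncio.Task) -> None:
        try:
            t.result()
        except (KeyboardInterrupt, asyncio.CancelledError):
            pass  # absorbed: the operation that spawned the task is already over
        except Exception as exc:
            print(f'task failed: {exc!r}')  # real code records exc and signals an event

    async def main() -> None:
        async def work() -> None:
            raise RuntimeError('boom')
        task = asyncio.create_task(work())
        task.add_done_callback(on_done)
        await asyncio.gather(task, return_exceptions=True)

    asyncio.run(main())  # prints: task failed: RuntimeError('boom')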

pixeltable/exec/expr_eval/globals.py
@@ -4,7 +4,7 @@ import abc
 import asyncio
 from dataclasses import dataclass
 from types import TracebackType
-from typing import Any, Iterable, Optional, Protocol
+from typing import Any, Iterable, Protocol
 
 import numpy as np
 
@@ -18,11 +18,11 @@ class FnCallArgs:
     fn_call: exprs.FunctionCall
     rows: list[exprs.DataRow]
     # single call
-    args: Optional[list[Any]] = None
-    kwargs: Optional[dict[str, Any]] = None
+    args: list[Any] | None = None
+    kwargs: dict[str, Any] | None = None
     # batch call
-    batch_args: Optional[list[list[Optional[Any]]]] = None
-    batch_kwargs: Optional[dict[str, list[Optional[Any]]]] = None
+    batch_args: list[list[Any | None]] | None = None
+    batch_kwargs: dict[str, list[Any | None]] | None = None
 
     @property
     def pxt_fn(self) -> func.CallableFunction:
@@ -56,6 +56,7 @@ class Scheduler(abc.ABC):
         request: FnCallArgs
         num_retries: int
         exec_ctx: ExecCtx
+        retry_after: float | None = None  # time.monotonic()
 
         def __lt__(self, other: Scheduler.QueueItem) -> bool:
             # prioritize by number of retries (more retries = higher priority)
@@ -148,6 +149,7 @@ class ExecCtx:
     gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
     eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
     literals: dict[int, Any]  # key: slot idx; value: literal value for this slot; used to pre-populate rows
+    all_exprs: list[exprs.Expr]  # all evaluated exprs; needed for cleanup
 
     def __init__(
         self,
@@ -164,6 +166,7 @@ class ExecCtx:
         self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False
 
         output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
+        self.all_exprs = output_ctx.exprs
         self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
         self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
         non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]

pixeltable/exec/expr_eval/row_buffer.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import logging
-from typing import Optional
 
 import numpy as np
 
@@ -14,7 +13,7 @@ class RowBuffer:
     """Fixed-length circular buffer of DataRows; knows how to maintain input order"""
 
     size: int
-    row_pos_map: Optional[dict[int, int]]  # id(row) -> position of row in output; None if not maintaining order
+    row_pos_map: dict[int, int] | None  # id(row) -> position of row in output; None if not maintaining order
     num_rows: int  # number of rows in the buffer
     num_ready: int  # number of consecutive non-None rows at head
     buffer: np.ndarray  # of object