pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,168 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import logging
5
+ from pathlib import Path
6
+ from types import NoneType
7
+ from typing import Any, AsyncIterator
8
+
9
+ import numpy as np
10
+ import PIL.Image
11
+
12
+ import pixeltable.type_system as ts
13
+ from pixeltable import exprs
14
+ from pixeltable.utils import parse_local_file_path
15
+
16
+ from .data_row_batch import DataRowBatch
17
+ from .exec_node import ExecNode
18
+ from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
19
+
20
+ _logger = logging.getLogger('pixeltable')
21
+
22
+
23
def json_has_inlined_objs(element: Any) -> bool:
    """Report whether a json value contains an inlined-object marker.

    Lists and dicts are searched recursively. A dict is a hit as soon as it carries the
    INLINED_OBJECT_MD_KEY key itself; any value that is neither a list nor a dict never matches.
    """
    if isinstance(element, dict):
        if INLINED_OBJECT_MD_KEY in element:
            return True
        children = element.values()
    elif isinstance(element, list):
        children = element
    else:
        # leaf value: nothing to descend into
        return False
    return any(json_has_inlined_objs(child) for child in children)
32
+
33
+
34
def reconstruct_json(element: Any, urls: list[str], file_handles: dict[Path, io.BufferedReader]) -> Any:
    """Recursively reconstructs inlined objects in a json structure.

    Args:
        element: json value to process; lists and dicts are traversed recursively, every other value is
            returned unchanged.
        urls: file urls that the inlined-object metadata refers to through its `url_idx`.
        file_handles: cache of open binary file handles, keyed by local path. Handles opened here are added
            to the cache and left open; this function never closes them.

    Returns:
        A new structure in which every dict carrying INLINED_OBJECT_MD_KEY has been replaced by the object
        read from the referenced file section (an array, an image, or raw bytes).
    """
    if isinstance(element, list):
        return [reconstruct_json(v, urls, file_handles) for v in element]
    if not isinstance(element, dict):
        return element
    if INLINED_OBJECT_MD_KEY not in element:
        return {k: reconstruct_json(v, urls, file_handles) for k, v in element.items()}

    obj_md = InlinedObjectMd.from_dict(element[INLINED_OBJECT_MD_KEY])
    local_path = parse_local_file_path(urls[obj_md.url_idx])
    # same precondition that CellReconstructionNode checks for its own file lookups
    assert local_path is not None
    if local_path not in file_handles:
        file_handles[local_path] = open(local_path, 'rb')  # noqa: SIM115
    fp = file_handles[local_path]

    if obj_md.type == ts.ColumnType.Type.ARRAY.name:
        array_md = obj_md.array_md
        assert array_md is not None
        # no explicit seek needed: load_array() positions the file at `start` itself
        return load_array(fp, array_md.start, array_md.end, array_md.is_bool, array_md.shape)

    if obj_md.type == ts.ColumnType.Type.IMAGE.name:
        fp.seek(obj_md.img_start)
        bytesio = io.BytesIO(fp.read(obj_md.img_end - obj_md.img_start))
        img = PIL.Image.open(bytesio)
        img.load()
        # a short read would leave the file position before img_end
        assert fp.tell() == obj_md.img_end, f'{fp.tell()} != {obj_md.img_end} ({obj_md.img_start})'
        return img

    assert obj_md.type == ts.ColumnType.Type.BINARY.name
    assert obj_md.binary_md is not None
    fp.seek(obj_md.binary_md.start)
    data = fp.read(obj_md.binary_md.end - obj_md.binary_md.start)
    assert fp.tell() == obj_md.binary_md.end, (
        f'{fp.tell()} != {obj_md.binary_md.end} ({obj_md.binary_md.start})'
    )
    return data
72
+
73
+
74
def load_array(
    fh: io.BufferedReader, start: int, end: int, is_bool_array: bool, shape: tuple[int, ...] | None
) -> np.ndarray:
    """Loads an array from a section of a file.

    Args:
        fh: open binary file; it is repositioned to `start` before reading.
        start: offset at which the serialized (np.save format) array begins.
        end: offset expected right after the serialized array; checked with an assertion.
        is_bool_array: if True, the stored data is a bit-packed array that is unpacked into a bool array
            of the given `shape`.
        shape: shape of the unpacked bool array; required when `is_bool_array` is True.

    Returns:
        The loaded array, unpacked and reshaped if it is a bool array.
    """
    fh.seek(start)
    ar = np.load(fh, allow_pickle=False)
    assert fh.tell() == end
    if is_bool_array:
        assert shape is not None
        # np.prod() yields a float for an empty shape (0-d array); unpackbits() needs an integer count
        num_bits = int(np.prod(shape))
        ar = np.unpackbits(ar, count=num_bits).reshape(shape).astype(bool)
    return ar
85
+
86
+
87
class CellReconstructionNode(ExecNode):
    """
    Reconstruction of stored json and array cells that were produced by CellMaterializationNode.

    For every row of every input batch:
    - json slots whose cell metadata lists file urls and whose value contains inlined-object markers are
      replaced by the result of reconstruct_json()
    - array slots whose cell metadata carries array_md are loaded from the referenced file
    - binary slots whose cell metadata carries binary_md are read from the referenced file

    Files are opened lazily and kept open in `file_handles` until close() is called.
    """

    json_refs: list[exprs.ColumnRef]
    array_refs: list[exprs.ColumnRef]
    binary_refs: list[exprs.ColumnRef]
    file_handles: dict[Path, io.BufferedReader]  # key: file path

    def __init__(
        self,
        json_refs: list[exprs.ColumnRef],
        array_refs: list[exprs.ColumnRef],
        binary_refs: list[exprs.ColumnRef],
        row_builder: exprs.RowBuilder,
        input: ExecNode | None = None,
    ):
        # this node declares no output or input exprs of its own; it rewrites slots in place
        super().__init__(row_builder, [], [], input)
        self.json_refs = json_refs
        self.array_refs = array_refs
        self.binary_refs = binary_refs
        self.file_handles = {}

    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
        """Yields the input batches after reconstructing their json, array and binary cells in place."""
        async for batch in self.input:
            for row in batch:
                for col_ref in self.json_refs:
                    val = row[col_ref.slot_idx]
                    if val is None:
                        continue
                    cell_md = row.slot_md.get(col_ref.slot_idx)
                    if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(val):
                        continue
                    row[col_ref.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)

                for col_ref in self.array_refs:
                    cell_md = row.slot_md.get(col_ref.slot_idx)
                    if cell_md is not None and cell_md.array_md is not None:
                        # the array lives in a file, so the slot itself must still be empty
                        assert row[col_ref.slot_idx] is None
                        row[col_ref.slot_idx] = self._reconstruct_array(cell_md)
                    else:
                        assert isinstance(row[col_ref.slot_idx], (NoneType, np.ndarray))

                for col_ref in self.binary_refs:
                    cell_md = row.slot_md.get(col_ref.slot_idx)
                    if cell_md is not None and cell_md.binary_md is not None:
                        # the data lives in a file, so the slot itself must still be empty
                        assert row[col_ref.slot_idx] is None
                        row[col_ref.slot_idx] = self._reconstruct_binary(cell_md)
                    else:
                        assert isinstance(row[col_ref.slot_idx], (NoneType, bytes))

            yield batch

    def close(self) -> None:
        """Closes all cached file handles."""
        # NOTE(review): this overrides ExecNode.close() without calling it - confirm the base class has no
        # cleanup of its own (e.g. closing the input node) that is skipped here
        for fp in self.file_handles.values():
            fp.close()
        # drop the closed handles so that a later lookup reopens the file instead of reusing a dead handle
        self.file_handles.clear()

    def _reconstruct_array(self, cell_md: exprs.CellMd) -> np.ndarray:
        """Loads the array described by cell_md.array_md from the single file listed in cell_md.file_urls."""
        assert cell_md.array_md is not None
        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
        fp = self.__get_file_pointer(cell_md.file_urls[0])
        ar = load_array(
            fp, cell_md.array_md.start, cell_md.array_md.end, bool(cell_md.array_md.is_bool), cell_md.array_md.shape
        )
        return ar

    def _reconstruct_binary(self, cell_md: exprs.CellMd) -> bytes:
        """Reads the byte range described by cell_md.binary_md from the single file in cell_md.file_urls."""
        assert cell_md.binary_md is not None
        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
        fp = self.__get_file_pointer(cell_md.file_urls[0])
        fp.seek(cell_md.binary_md.start)
        data = fp.read(cell_md.binary_md.end - cell_md.binary_md.start)
        # a short read would leave the file position before `end`
        assert fp.tell() == cell_md.binary_md.end
        return data

    def __get_file_pointer(self, file_url: str) -> io.BufferedReader:
        """Returns the cached handle for file_url, opening the file on first use."""
        local_path = parse_local_file_path(file_url)
        assert local_path is not None
        if local_path not in self.file_handles:
            self.file_handles[local_path] = open(str(local_path), 'rb')  # noqa: SIM115
        return self.file_handles[local_path]
@@ -40,7 +40,7 @@ class ComponentIterationNode(ExecNode):
40
40
  }
41
41
 
42
42
  async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
43
- output_batch = DataRowBatch(self.view, self.row_builder)
43
+ output_batch = DataRowBatch(self.row_builder)
44
44
  async for input_batch in self.input:
45
45
  for input_row in input_batch:
46
46
  self.row_builder.eval(input_row, self.iterator_args_ctx)
@@ -52,13 +52,14 @@ class ComponentIterationNode(ExecNode):
52
52
  if self.__non_nullable_args_specified(iterator_args):
53
53
  iterator = self.view.get().iterator_cls(**iterator_args)
54
54
  for pos, component_dict in enumerate(iterator):
55
- output_row = output_batch.add_row()
55
+ output_row = self.row_builder.make_row()
56
56
  input_row.copy(output_row)
57
57
  # we're expanding the input and need to add the iterator position to the pk
58
58
  self.__populate_output_row(output_row, pos, component_dict)
59
+ output_batch.add_row(output_row)
59
60
  if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
60
61
  yield output_batch
61
- output_batch = DataRowBatch(self.view, self.row_builder)
62
+ output_batch = DataRowBatch(self.row_builder)
62
63
 
63
64
  if len(output_batch) > 0:
64
65
  yield output_batch
@@ -1,10 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
- from typing import Iterator, Optional
4
+ from typing import Iterator
5
5
 
6
- from pixeltable import catalog, exprs
7
- from pixeltable.utils.media_store import MediaStore
6
+ from pixeltable import exprs
8
7
 
9
8
  _logger = logging.getLogger('pixeltable')
10
9
 
@@ -13,53 +12,20 @@ class DataRowBatch:
13
12
  """Set of DataRows, indexed by rowid.
14
13
 
15
14
  Contains the metadata needed to initialize DataRows.
15
+
16
+ Requires either num_rows or rows to be specified, but not both.
16
17
  """
17
18
 
18
- tbl: Optional[catalog.TableVersionHandle]
19
19
  row_builder: exprs.RowBuilder
20
- img_slot_idxs: list[int]
21
- media_slot_idxs: list[int] # non-image media slots
22
- array_slot_idxs: list[int]
23
20
  rows: list[exprs.DataRow]
24
21
 
25
- def __init__(
26
- self,
27
- tbl: Optional[catalog.TableVersionHandle],
28
- row_builder: exprs.RowBuilder,
29
- num_rows: Optional[int] = None,
30
- rows: Optional[list[exprs.DataRow]] = None,
31
- ):
32
- """
33
- Requires either num_rows or rows to be specified, but not both.
34
- """
35
- assert num_rows is None or rows is None
36
- self.tbl = tbl
22
+ def __init__(self, row_builder: exprs.RowBuilder, rows: list[exprs.DataRow] | None = None):
37
23
  self.row_builder = row_builder
38
- self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
39
- # non-image media slots
40
- self.media_slot_idxs = [
41
- e.slot_idx
42
- for e in row_builder.unique_exprs
43
- if e.col_type.is_media_type() and not e.col_type.is_image_type()
44
- ]
45
- self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
46
- if rows is not None:
47
- self.rows = rows
48
- else:
49
- if num_rows is None:
50
- num_rows = 0
51
- self.rows = [
52
- exprs.DataRow(
53
- row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
54
- )
55
- for _ in range(num_rows)
56
- ]
24
+ self.rows = [] if rows is None else rows
57
25
 
58
- def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
26
+ def add_row(self, row: exprs.DataRow | None) -> exprs.DataRow:
59
27
  if row is None:
60
- row = exprs.DataRow(
61
- self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
62
- )
28
+ row = self.row_builder.make_row()
63
29
  self.rows.append(row)
64
30
  return row
65
31
 
@@ -72,28 +38,5 @@ class DataRowBatch:
72
38
  def __getitem__(self, index: int) -> exprs.DataRow:
73
39
  return self.rows[index]
74
40
 
75
- def flush_imgs(
76
- self,
77
- idx_range: Optional[slice] = None,
78
- stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
79
- flushed_slot_idxs: Optional[list[int]] = None,
80
- ) -> None:
81
- """Flushes images in the given range of rows."""
82
- assert self.tbl is not None
83
- if stored_img_info is None:
84
- stored_img_info = []
85
- if flushed_slot_idxs is None:
86
- flushed_slot_idxs = []
87
- if len(stored_img_info) == 0 and len(flushed_slot_idxs) == 0:
88
- return
89
- if idx_range is None:
90
- idx_range = slice(0, len(self.rows))
91
- for row in self.rows[idx_range]:
92
- for info in stored_img_info:
93
- filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.get().version))
94
- row.flush_img(info.slot_idx, filepath)
95
- for slot_idx in flushed_slot_idxs:
96
- row.flush_img(slot_idx)
97
-
98
41
  def __iter__(self) -> Iterator[exprs.DataRow]:
99
42
  return iter(self.rows)
@@ -1,4 +1,4 @@
1
- from typing import Optional
1
+ import random
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -8,13 +8,24 @@ from pixeltable import exprs
8
8
  class ExecContext:
9
9
  """Class for execution runtime constants"""
10
10
 
11
+ row_builder: exprs.RowBuilder
12
+ profile: exprs.ExecProfile
13
+ show_pbar: bool
14
+ batch_size: int
15
+ num_rows: int | None
16
+ conn: sql.engine.Connection | None
17
+ pk_clause: list[sql.ClauseElement] | None
18
+ num_computed_exprs: int
19
+ ignore_errors: bool
20
+ random_seed: int # general-purpose source of randomness with execution scope
21
+
11
22
  def __init__(
12
23
  self,
13
24
  row_builder: exprs.RowBuilder,
14
25
  *,
15
26
  show_pbar: bool = False,
16
27
  batch_size: int = 0,
17
- pk_clause: Optional[list[sql.ClauseElement]] = None,
28
+ pk_clause: list[sql.ClauseElement] | None = None,
18
29
  num_computed_exprs: int = 0,
19
30
  ignore_errors: bool = False,
20
31
  ):
@@ -23,8 +34,9 @@ class ExecContext:
23
34
  self.row_builder = row_builder
24
35
  self.profile = exprs.ExecProfile(row_builder)
25
36
  # num_rows is used to compute the total number of computed cells used for the progress bar
26
- self.num_rows: Optional[int] = None
27
- self.conn: Optional[sql.engine.Connection] = None # if present, use this to execute SQL queries
37
+ self.num_rows = None
38
+ self.conn = None # if present, use this to execute SQL queries
28
39
  self.pk_clause = pk_clause
29
40
  self.num_computed_exprs = num_computed_exprs
30
41
  self.ignore_errors = ignore_errors
42
+ self.random_seed = random.randint(0, 1 << 63)
@@ -1,11 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import abc
4
- import asyncio
5
4
  import logging
6
- from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
5
+ from typing import AsyncIterator, Iterable, Iterator, TypeVar
7
6
 
8
7
  from pixeltable import exprs
8
+ from pixeltable.env import Env
9
9
 
10
10
  from .data_row_batch import DataRowBatch
11
11
  from .exec_context import ExecContext
@@ -18,17 +18,16 @@ class ExecNode(abc.ABC):
18
18
 
19
19
  output_exprs: Iterable[exprs.Expr]
20
20
  row_builder: exprs.RowBuilder
21
- input: Optional[ExecNode]
21
+ input: ExecNode | None
22
22
  flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
23
- stored_img_cols: list[exprs.ColumnSlotIdx]
24
- ctx: Optional[ExecContext]
23
+ ctx: ExecContext | None
25
24
 
26
25
  def __init__(
27
26
  self,
28
27
  row_builder: exprs.RowBuilder,
29
28
  output_exprs: Iterable[exprs.Expr],
30
29
  input_exprs: Iterable[exprs.Expr],
31
- input: Optional[ExecNode] = None,
30
+ input: ExecNode | None = None,
32
31
  ):
33
32
  assert all(expr.is_valid for expr in output_exprs)
34
33
  self.output_exprs = output_exprs
@@ -40,43 +39,19 @@ class ExecNode(abc.ABC):
40
39
  self.flushed_img_slots = [
41
40
  e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
42
41
  ]
43
- self.stored_img_cols = []
44
- self.ctx = None # all nodes of a tree share the same context
42
+ self.ctx = input.ctx if input is not None else None
45
43
 
46
44
  def set_ctx(self, ctx: ExecContext) -> None:
47
45
  self.ctx = ctx
48
46
  if self.input is not None:
49
47
  self.input.set_ctx(ctx)
50
48
 
51
- def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
52
- self.stored_img_cols = stored_img_cols
53
- # propagate batch size to the source
54
- if self.input is not None:
55
- self.input.set_stored_img_cols(stored_img_cols)
56
-
57
49
  @abc.abstractmethod
58
50
  def __aiter__(self) -> AsyncIterator[DataRowBatch]:
59
51
  pass
60
52
 
61
53
  def __iter__(self) -> Iterator[DataRowBatch]:
62
- running_loop: Optional[asyncio.AbstractEventLoop] = None
63
- loop: asyncio.AbstractEventLoop
64
- try:
65
- # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
66
- # multiple run_until_complete()
67
- running_loop = asyncio.get_running_loop()
68
- import nest_asyncio # type: ignore[import-untyped]
69
-
70
- nest_asyncio.apply()
71
- loop = running_loop
72
- _logger.debug('Patched running loop')
73
- except RuntimeError:
74
- loop = asyncio.new_event_loop()
75
- asyncio.set_event_loop(loop)
76
-
77
- if _logger.isEnabledFor(logging.DEBUG):
78
- loop.set_debug(True)
79
-
54
+ loop = Env.get().event_loop
80
55
  aiter = self.__aiter__()
81
56
  try:
82
57
  while True:
@@ -84,9 +59,11 @@ class ExecNode(abc.ABC):
84
59
  yield batch
85
60
  except StopAsyncIteration:
86
61
  pass
87
- finally:
88
- if loop != running_loop:
89
- loop.close()
62
+ # TODO:
63
+ # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
64
+ # we end up here
65
+ # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
66
+ # creates tasks on its own
90
67
 
91
68
  def open(self) -> None:
92
69
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
@@ -108,7 +85,7 @@ class ExecNode(abc.ABC):
108
85
 
109
86
  T = TypeVar('T', bound='ExecNode')
110
87
 
111
- def get_node(self, node_class: type[T]) -> Optional[T]:
88
+ def get_node(self, node_class: type[T]) -> T | None:
112
89
  if isinstance(self, node_class):
113
90
  return self
114
91
  if self.input is not None:
@@ -5,7 +5,7 @@ import datetime
5
5
  import itertools
6
6
  import logging
7
7
  import sys
8
- from typing import Any, Callable, Iterator, Optional, cast
8
+ from typing import Any, Callable, Iterator, cast
9
9
 
10
10
  from pixeltable import exprs, func
11
11
 
@@ -64,11 +64,11 @@ class FnCallEvaluator(Evaluator):
64
64
 
65
65
  fn_call: exprs.FunctionCall
66
66
  fn: func.CallableFunction
67
- scalar_py_fn: Optional[Callable] # only set for non-batching CallableFunctions
67
+ scalar_py_fn: Callable | None # only set for non-batching CallableFunctions
68
68
 
69
69
  # only set if fn.is_batched
70
- call_args_queue: Optional[asyncio.Queue[FnCallArgs]] # FnCallArgs waiting for execution
71
- batch_size: Optional[int]
70
+ call_args_queue: asyncio.Queue[FnCallArgs] | None # FnCallArgs waiting for execution
71
+ batch_size: int | None
72
72
 
73
73
  def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
74
74
  super().__init__(dispatcher, exec_ctx)
@@ -160,8 +160,8 @@ class FnCallEvaluator(Evaluator):
160
160
 
161
161
  def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
162
162
  """Roll call_args into a single batched FnCallArgs"""
163
- batch_args: list[list[Optional[Any]]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
164
- batch_kwargs: dict[str, list[Optional[Any]]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
163
+ batch_args: list[list[Any | None]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
164
+ batch_kwargs: dict[str, list[Any | None]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
165
165
  assert isinstance(self.fn, func.CallableFunction)
166
166
  for i, item in enumerate(call_args):
167
167
  for j in range(len(item.args)):
@@ -311,13 +311,17 @@ class JsonMapperDispatcher(Evaluator):
311
311
  img_slot_idxs=[],
312
312
  media_slot_idxs=[],
313
313
  array_slot_idxs=[],
314
+ json_slot_idxs=[],
314
315
  parent_row=row,
315
316
  parent_slot_idx=self.e.slot_idx,
316
317
  )
317
318
  for _ in src
318
319
  ]
319
320
  for nested_row, anchor_val in zip(nested_rows, src):
320
- nested_row[self.scope_anchor.slot_idx] = anchor_val
321
+ # It's possible that self.scope_anchor.slot_idx is None; this corresponds to the case where the
322
+ # mapper expression doesn't actually contain references to RELATIVE_PATH_ROOT.
323
+ if self.scope_anchor.slot_idx is not None:
324
+ nested_row[self.scope_anchor.slot_idx] = anchor_val
321
325
  for slot_idx_, nested_slot_idx in self.external_slot_map.items():
322
326
  nested_row[nested_slot_idx] = row[slot_idx_]
323
327
  self.nested_exec_ctx.init_rows(nested_rows)
@@ -4,7 +4,7 @@ import asyncio
4
4
  import logging
5
5
  import traceback
6
6
  from types import TracebackType
7
- from typing import AsyncIterator, Iterable, Optional, Union
7
+ from typing import AsyncIterator, Iterable
8
8
 
9
9
  import numpy as np
10
10
 
@@ -49,17 +49,17 @@ class ExprEvalNode(ExecNode):
49
49
  # execution state
50
50
  tasks: set[asyncio.Task] # collects all running tasks to prevent them from getting gc'd
51
51
  exc_event: asyncio.Event # set if an exception needs to be propagated
52
- error: Optional[Union[excs.Error, excs.ExprEvalError]] # exception that needs to be propagated
52
+ error: Exception | None # exception that needs to be propagated
53
53
  completed_rows: asyncio.Queue[exprs.DataRow] # rows that have completed evaluation
54
54
  completed_event: asyncio.Event # set when completed_rows is non-empty
55
55
  input_iter: AsyncIterator[DataRowBatch]
56
- current_input_batch: Optional[DataRowBatch] # batch from which we're currently consuming rows
56
+ current_input_batch: DataRowBatch | None # batch from which we're currently consuming rows
57
57
  input_row_idx: int # next row to consume from current_input_batch
58
- next_input_batch: Optional[DataRowBatch] # read-ahead input batch
58
+ next_input_batch: DataRowBatch | None # read-ahead input batch
59
59
  avail_input_rows: int # total number across both current_/next_input_batch
60
60
  input_complete: bool # True if we've received all input batches
61
61
  num_in_flight: int # number of dispatched rows that haven't completed
62
- row_pos_map: Optional[dict[int, int]] # id(row) -> position of row in input; only set if maintain_input_order
62
+ row_pos_map: dict[int, int] | None # id(row) -> position of row in input; only set if maintain_input_order
63
63
  output_buffer: RowBuffer # holds rows that are ready to be returned, in order
64
64
 
65
65
  # debugging
@@ -133,10 +133,10 @@ class ExprEvalNode(ExecNode):
133
133
  except StopAsyncIteration:
134
134
  self.input_complete = True
135
135
  _logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
136
- except excs.Error as err:
137
- self.error = err
136
+ # make sure to pass DBAPIError through, so the transaction handling logic sees it
137
+ except Exception as exc:
138
+ self.error = exc
138
139
  self.exc_event.set()
139
- # TODO: should we also handle Exception here and create an excs.Error from it?
140
140
 
141
141
  @property
142
142
  def total_buffered(self) -> int:
@@ -217,9 +217,10 @@ class ExprEvalNode(ExecNode):
217
217
 
218
218
  row: exprs.DataRow
219
219
  exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
220
- input_batch_aw: Optional[asyncio.Task] = None
221
- completed_aw: Optional[asyncio.Task] = None
220
+ input_batch_aw: asyncio.Task | None = None
221
+ completed_aw: asyncio.Task | None = None
222
222
  closed_evaluators = False # True after calling Evaluator.close()
223
+ exprs.Expr.prepare_list(self.exec_ctx.all_exprs)
223
224
 
224
225
  try:
225
226
  while True:
@@ -240,7 +241,7 @@ class ExprEvalNode(ExecNode):
240
241
  # make sure we top up our in-flight rows before yielding
241
242
  self._dispatch_input_rows()
242
243
  self._log_state(f'yielding {len(batch_rows)} rows')
243
- yield DataRowBatch(tbl=None, row_builder=self.row_builder, rows=batch_rows)
244
+ yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
244
245
  # at this point, we may have more completed rows
245
246
 
246
247
  assert self.completed_rows.empty() # all completed rows should be sitting in output_buffer
@@ -254,7 +255,7 @@ class ExprEvalNode(ExecNode):
254
255
  batch_rows = self.output_buffer.get_rows(self.output_buffer.num_ready)
255
256
  self.num_output_rows += len(batch_rows)
256
257
  self._log_state(f'yielding {len(batch_rows)} rows')
257
- yield DataRowBatch(tbl=None, row_builder=self.row_builder, rows=batch_rows)
258
+ yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
258
259
 
259
260
  assert self.output_buffer.num_rows == 0
260
261
  return
@@ -306,6 +307,9 @@ class ExprEvalNode(ExecNode):
306
307
  task.cancel()
307
308
  _ = await asyncio.gather(*active_tasks, return_exceptions=True)
308
309
 
310
+ # expr cleanup
311
+ exprs.Expr.release_list(self.exec_ctx.all_exprs)
312
+
309
313
  def dispatch_exc(
310
314
  self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
311
315
  ) -> None:
@@ -390,6 +394,17 @@ class ExprEvalNode(ExecNode):
390
394
  # end the main loop if we had an unhandled exception
391
395
  try:
392
396
  t.result()
397
+ except KeyboardInterrupt:
398
+ # ExprEvalNode instances are long-running and reused across multiple operations.
399
+ # When a user interrupts an operation (Ctrl+C), the main evaluation loop properly
400
+ # handles the KeyboardInterrupt and terminates the current operation. However,
401
+ # background tasks spawned by evaluators may complete asynchronously after the
402
+ # operation has ended, and their done callbacks will fire during subsequent
403
+ # operations. These "phantom" KeyboardInterrupt exceptions from previous
404
+ # operations' background tasks should not interfere with new operations, so we
405
+ # absorb them here rather than propagating them via self.error/self.exc_event.
406
+ _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
407
+ pass
393
408
  except asyncio.CancelledError:
394
409
  pass
395
410
  except Exception as exc:
@@ -4,7 +4,7 @@ import abc
4
4
  import asyncio
5
5
  from dataclasses import dataclass
6
6
  from types import TracebackType
7
- from typing import Any, Iterable, Optional, Protocol
7
+ from typing import Any, Iterable, Protocol
8
8
 
9
9
  import numpy as np
10
10
 
@@ -18,11 +18,11 @@ class FnCallArgs:
18
18
  fn_call: exprs.FunctionCall
19
19
  rows: list[exprs.DataRow]
20
20
  # single call
21
- args: Optional[list[Any]] = None
22
- kwargs: Optional[dict[str, Any]] = None
21
+ args: list[Any] | None = None
22
+ kwargs: dict[str, Any] | None = None
23
23
  # batch call
24
- batch_args: Optional[list[list[Optional[Any]]]] = None
25
- batch_kwargs: Optional[dict[str, list[Optional[Any]]]] = None
24
+ batch_args: list[list[Any | None]] | None = None
25
+ batch_kwargs: dict[str, list[Any | None]] | None = None
26
26
 
27
27
  @property
28
28
  def pxt_fn(self) -> func.CallableFunction:
@@ -56,6 +56,7 @@ class Scheduler(abc.ABC):
56
56
  request: FnCallArgs
57
57
  num_retries: int
58
58
  exec_ctx: ExecCtx
59
+ retry_after: float | None = None # time.monotonic()
59
60
 
60
61
  def __lt__(self, other: Scheduler.QueueItem) -> bool:
61
62
  # prioritize by number of retries (more retries = higher priority)
@@ -148,6 +149,7 @@ class ExecCtx:
148
149
  gc_targets: np.ndarray # bool per slot; True if this is an intermediate expr (ie, not part of our output)
149
150
  eval_ctx: np.ndarray # bool per slot; EvalCtx.slot_idxs as a mask
150
151
  literals: dict[int, Any] # key: slot idx; value: literal value for this slot; used to pre-populate rows
152
+ all_exprs: list[exprs.Expr] # all evaluated exprs; needed for cleanup
151
153
 
152
154
  def __init__(
153
155
  self,
@@ -164,6 +166,7 @@ class ExecCtx:
164
166
  self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False
165
167
 
166
168
  output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
169
+ self.all_exprs = output_ctx.exprs
167
170
  self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
168
171
  self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
169
172
  non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]
@@ -1,7 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
- from typing import Optional
5
4
 
6
5
  import numpy as np
7
6
 
@@ -14,7 +13,7 @@ class RowBuffer:
14
13
  """Fixed-length circular buffer of DataRows; knows how to maintain input order"""
15
14
 
16
15
  size: int
17
- row_pos_map: Optional[dict[int, int]] # id(row) -> position of row in output; None if not maintaining order
16
+ row_pos_map: dict[int, int] | None # id(row) -> position of row in output; None if not maintaining order
18
17
  num_rows: int # number of rows in the buffer
19
18
  num_ready: int # number of consecutive non-None rows at head
20
19
  buffer: np.ndarray # of object