pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,268 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import logging
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any, AsyncIterator
8
+
9
+ import numpy as np
10
+ import pgvector.sqlalchemy # type: ignore[import-untyped]
11
+ import PIL.Image
12
+ import sqlalchemy as sql
13
+
14
+ import pixeltable.type_system as ts
15
+ import pixeltable.utils.image as image_utils
16
+ from pixeltable import catalog, exprs
17
+ from pixeltable.env import Env
18
+ from pixeltable.utils.local_store import LocalStore
19
+
20
+ from .data_row_batch import DataRowBatch
21
+ from .exec_node import ExecNode
22
+ from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
23
+
24
+ _logger = logging.getLogger('pixeltable')
25
+
26
+
27
class CellMaterializationNode(ExecNode):
    """
    Node to populate DataRow.cell_vals/cell_md.

    For now, the scope is limited to populating DataRow.cell_vals for json, array and binary columns
    (ie, the columns whose type reports supports_file_offloading()).

    Array values:
    - Arrays of at most MAX_DB_BINARY_SIZE bytes are serialized with np.save() and stored inline in the db column
    - Larger arrays are written to inlined_obj_files
    - Bool arrays that are written to a file are stored as packed bits (uint8)
    - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
      (this allows us to query cell_md to get the total external storage size of an array column)
    - Arrays of vector columns (pgvector) are stored as-is

    Binary values:
    - `bytes` of at most MAX_DB_BINARY_SIZE bytes are stored inline in the db column
    - Larger values are written to inlined_obj_files; cell_md holds the url of the file plus start and end offsets

    Json values:
    - Inlined images, arrays and `bytes` are written to inlined_obj_files and replaced with a dict containing the
      object location
    - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
    - cell_md contains the list of urls for the inlined objects.

    TODO:
    - execute file IO via asyncio Tasks in a thread pool?
      (we already seem to be getting 90% of hardware IO throughput)
    - subsume all cell materialization
    """

    output_col_info: dict[catalog.Column, int]  # value: slot idx

    # execution state
    inlined_obj_files: list[Path]  # only [-1] is open for writing
    buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]

    MIN_FILE_SIZE = 8 * 2**20  # 8MB
    MAX_DB_BINARY_SIZE = 512  # max size of binary data stored in table column; in bytes

    def __init__(self, input: ExecNode):
        super().__init__(input.row_builder, [], [], input)
        # only columns that have a slot and whose type supports file offloading are handled by this node
        self.output_col_info = {
            col: slot_idx
            for col, slot_idx in input.row_builder.table_columns.items()
            if slot_idx is not None and col.col_type.supports_file_offloading()
        }
        self.inlined_obj_files = []
        self.buffered_writer = None

    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
        """Populate cell_vals/cell_md of every row of every input batch and pass the batch on."""
        async for batch in self.input:
            for row in batch:
                for col, slot_idx in self.output_col_info.items():
                    if row.has_exc(slot_idx):
                        # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
                        exc = row.get_exc(slot_idx)
                        row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
                        continue

                    val = row[slot_idx]
                    if val is None:
                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
                        row.cell_md[col.id] = None
                        continue

                    if col.col_type.is_json_type():
                        self._materialize_json_cell(row, col, val)
                    elif col.col_type.is_array_type():
                        assert isinstance(val, np.ndarray)
                        self._materialize_array_cell(row, col, val)
                    else:
                        assert col.col_type.is_binary_type()
                        assert isinstance(val, bytes)
                        self._materialize_binary_cell(row, col, val)

                # continue with only the currently open file
                # (the url_idx values recorded for json cells index into inlined_obj_files, so the list is only
                # trimmed once the cells that reference the older entries have been materialized)
                self.inlined_obj_files = self.inlined_obj_files[-1:]

            yield batch

        self._flush_buffer(finalize=True)

    def init_writer(self) -> None:
        """Make sure buffered_writer is open, creating a new file if necessary."""
        if self.buffered_writer is None:
            self._reset_buffer()
        assert self.buffered_writer is not None

    def close(self) -> None:
        """Close the currently open file, if any."""
        if self.buffered_writer is not None:
            # there must have been an error, otherwise _flush_buffer(finalize=True) would have set this to None
            self.buffered_writer.close()
            self.buffered_writer = None

    def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
        """Set cell_vals/cell_md for a json value, moving inlined arrays/images/bytes into inlined_obj_files."""
        if self._json_has_inlined_objs(val):
            row.cell_vals[col.id] = self._rewrite_json(val)
            # the url_idx of each rewritten object is an index into this list
            row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
        else:
            row.cell_vals[col.id] = val
            row.cell_md[col.id] = None

    def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
        """Set cell_vals/cell_md for an array value; arrays above MAX_DB_BINARY_SIZE bytes are written to a file."""
        if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
            # this is a vector column (ie, used for a vector index): store the array itself
            row.cell_vals[col.id] = val
            row.cell_md[col.id] = None
        elif val.nbytes <= self.MAX_DB_BINARY_SIZE:
            # this array is small enough to store in the db column (type: binary) directly
            buffer = io.BytesIO()
            np.save(buffer, val, allow_pickle=False)
            row.cell_vals[col.id] = buffer.getvalue()
            row.cell_md[col.id] = None
        else:
            # append this array to the buffer and store its location in the cell md
            is_bool = np.issubdtype(val.dtype, np.bool_)
            # for bool arrays, store as packed bits, otherwise it's 1 byte per element
            ar: np.ndarray = np.packbits(val) if is_bool else val
            self.init_writer()
            start = self.buffered_writer.tell()
            np.save(self.buffered_writer, ar, allow_pickle=False)
            end = self.buffered_writer.tell()
            row.cell_vals[col.id] = None
            # the url needs to be recorded before _flush_buffer(), which may start a new file
            cell_md = exprs.CellMd(
                file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
            )
            if is_bool:
                # packing discards the original shape; record it so that the array can be restored
                cell_md.array_md.is_bool = True
                cell_md.array_md.shape = val.shape
            row.cell_md[col.id] = cell_md
            self._flush_buffer()

        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None

    def _materialize_binary_cell(self, row: exprs.DataRow, col: catalog.Column, val: bytes) -> None:
        """Set cell_vals/cell_md for a `bytes` value; values above MAX_DB_BINARY_SIZE bytes are written to a file."""
        if len(val) <= self.MAX_DB_BINARY_SIZE:
            # this `bytes` object is small enough to store in the db column (type: binary) directly
            row.cell_vals[col.id] = val
            row.cell_md[col.id] = None
        else:
            self.init_writer()
            start = self.buffered_writer.tell()
            self.buffered_writer.write(val)
            end = self.buffered_writer.tell()
            row.cell_vals[col.id] = None
            # the url needs to be recorded before _flush_buffer(), which may start a new file
            cell_md = exprs.CellMd(
                file_urls=[self.inlined_obj_files[-1].as_uri()], binary_md=exprs.BinaryMd(start=start, end=end)
            )
            row.cell_md[col.id] = cell_md
            self._flush_buffer()

        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None

    def _json_has_inlined_objs(self, element: Any) -> bool:
        """Returns True if the json structure contains an ndarray, PIL image or `bytes` object at any depth."""
        if isinstance(element, list):
            return any(self._json_has_inlined_objs(v) for v in element)
        if isinstance(element, dict):
            return any(self._json_has_inlined_objs(v) for v in element.values())
        return isinstance(element, (np.ndarray, PIL.Image.Image, bytes))

    def _rewrite_json(self, element: Any) -> Any:
        """
        Recursively rewrites a JSON structure by writing any inlined arrays, images or `bytes` to
        self.buffered_writer and replacing them with a dict that holds the object's location.
        """
        if isinstance(element, list):
            return [self._rewrite_json(v) for v in element]
        if isinstance(element, dict):
            return {k: self._rewrite_json(v) for k, v in element.items()}
        if isinstance(element, np.ndarray):
            obj_md = self._write_inlined_array(element)
            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
        if isinstance(element, PIL.Image.Image):
            obj_md = self._write_inlined_image(element)
            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
        if isinstance(element, bytes):
            obj_md = self._write_inlined_bytes(element)
            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
        return element

    def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
        """Write an ndarray to buffered_writer and return its metadata."""
        self.init_writer()
        # the index needs to be recorded before _flush_buffer(), which may append a new file
        url_idx = len(self.inlined_obj_files) - 1
        start = self.buffered_writer.tell()
        shape: tuple[int, ...] | None
        is_bool_array: bool
        if np.issubdtype(ar.dtype, np.bool_):
            # bool arrays are stored as packed bits; the shape is needed to unpack them
            shape = ar.shape
            ar = np.packbits(ar)
            is_bool_array = True
        else:
            shape = None
            is_bool_array = False
        np.save(self.buffered_writer, ar, allow_pickle=False)
        end = self.buffered_writer.tell()
        self._flush_buffer()
        return InlinedObjectMd(
            type=ts.ColumnType.Type.ARRAY.name,
            url_idx=url_idx,
            array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
        )

    def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
        """Write a PIL image to buffered_writer and return its metadata (file index, start and end offsets)."""
        self.init_writer()
        # the index needs to be recorded before _flush_buffer(), which may append a new file
        url_idx = len(self.inlined_obj_files) - 1
        start = self.buffered_writer.tell()
        img.save(self.buffered_writer, format=image_utils.default_format(img))
        end = self.buffered_writer.tell()
        self._flush_buffer()
        return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)

    def _write_inlined_bytes(self, data: bytes) -> InlinedObjectMd:
        """Write raw bytes to buffered_writer and return their metadata (file index, start and end offsets)."""
        self.init_writer()
        # the index needs to be recorded before _flush_buffer(), which may append a new file
        url_idx = len(self.inlined_obj_files) - 1
        start = self.buffered_writer.tell()
        self.buffered_writer.write(data)
        end = self.buffered_writer.tell()
        self._flush_buffer()
        return InlinedObjectMd(
            type=ts.ColumnType.Type.BINARY.name,
            url_idx=url_idx,
            binary_md=exprs.BinaryMd(start=start, end=end),
        )

    def _reset_buffer(self) -> None:
        """Create a new file, append it to inlined_obj_files and make it the target of buffered_writer."""
        local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
            self.row_builder.tbl.id, 0, self.row_builder.tbl.version
        )
        self.inlined_obj_files.append(local_path)
        fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
        assert isinstance(fh, io.BufferedWriter)
        self.buffered_writer = fh

    def _flush_buffer(self, finalize: bool = False) -> None:
        """
        Flush buffered_writer to storage if it exceeds its minimum size or finalize is True.

        After a flush, the current file is closed; unless finalize is True, a new file is opened in its place.
        """
        if self.buffered_writer is None:
            return
        if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
            return
        self.buffered_writer.flush()
        os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
        self.buffered_writer.close()
        if finalize:
            self.buffered_writer = None
        else:
            self._reset_buffer()
@@ -0,0 +1,168 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import logging
5
+ from pathlib import Path
6
+ from types import NoneType
7
+ from typing import Any, AsyncIterator
8
+
9
+ import numpy as np
10
+ import PIL.Image
11
+
12
+ import pixeltable.type_system as ts
13
+ from pixeltable import exprs
14
+ from pixeltable.utils import parse_local_file_path
15
+
16
+ from .data_row_batch import DataRowBatch
17
+ from .exec_node import ExecNode
18
+ from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
19
+
20
+ _logger = logging.getLogger('pixeltable')
21
+
22
+
23
def json_has_inlined_objs(element: Any) -> bool:
    """Returns True if element contains inlined objects produced by CellMaterializationNode.

    Lists and dicts are searched recursively; a dict that carries INLINED_OBJECT_MD_KEY marks an inlined object.
    """
    if isinstance(element, dict):
        # a marker dict counts on its own; otherwise look through the dict's values
        return INLINED_OBJECT_MD_KEY in element or any(json_has_inlined_objs(item) for item in element.values())
    if isinstance(element, list):
        return any(json_has_inlined_objs(item) for item in element)
    # scalars never contain inlined objects
    return False
32
+
33
+
34
def reconstruct_json(element: Any, urls: list[str], file_handles: dict[Path, io.BufferedReader]) -> Any:
    """Recursively reconstructs inlined objects in a json structure.

    Args:
        element: json structure (or a part of it); dicts that contain INLINED_OBJECT_MD_KEY are replaced with the
            object they reference
        urls: file urls of the cell; the url_idx of an inlined object is an index into this list
        file_handles: cache of open files, keyed by local path; files opened here are added to it and are not
            closed by this function

    Returns:
        A copy of the structure in which every inlined-object dict is replaced with an ndarray, PIL image or
        `bytes` object.
    """
    if isinstance(element, list):
        return [reconstruct_json(v, urls, file_handles) for v in element]
    if not isinstance(element, dict):
        return element
    if INLINED_OBJECT_MD_KEY not in element:
        return {k: reconstruct_json(v, urls, file_handles) for k, v in element.items()}

    obj_md = InlinedObjectMd.from_dict(element[INLINED_OBJECT_MD_KEY])
    local_path = parse_local_file_path(urls[obj_md.url_idx])
    # inlined objects are expected to live in local files; same check as in CellReconstructionNode
    assert local_path is not None
    fp = file_handles.get(local_path)
    if fp is None:
        fp = open(local_path, 'rb')  # noqa: SIM115
        file_handles[local_path] = fp

    if obj_md.type == ts.ColumnType.Type.ARRAY.name:
        assert obj_md.array_md is not None
        # load_array() positions the file itself
        return load_array(
            fp, obj_md.array_md.start, obj_md.array_md.end, obj_md.array_md.is_bool, obj_md.array_md.shape
        )

    if obj_md.type == ts.ColumnType.Type.IMAGE.name:
        fp.seek(obj_md.img_start)
        bytesio = io.BytesIO(fp.read(obj_md.img_end - obj_md.img_start))
        img = PIL.Image.open(bytesio)
        # force decoding now, while the image bytes are at hand
        img.load()
        assert fp.tell() == obj_md.img_end, f'{fp.tell()} != {obj_md.img_end} ({obj_md.img_start})'
        return img

    assert obj_md.type == ts.ColumnType.Type.BINARY.name
    assert obj_md.binary_md is not None
    fp.seek(obj_md.binary_md.start)
    data = fp.read(obj_md.binary_md.end - obj_md.binary_md.start)
    assert fp.tell() == obj_md.binary_md.end, (
        f'{fp.tell()} != {obj_md.binary_md.end} ({obj_md.binary_md.start})'
    )
    return data
72
+
73
+
74
def load_array(
    fh: io.BufferedReader, start: int, end: int, is_bool_array: bool, shape: tuple[int, ...] | None
) -> np.ndarray:
    """Loads an array from a section of a file.

    Args:
        fh: file to read from
        start: offset at which the np.save() output of the array starts
        end: offset right after the np.save() output of the array
        is_bool_array: if True, the stored array holds the packed bits (np.packbits()) of a bool array
        shape: shape of the original bool array; required if is_bool_array is True

    Returns:
        The stored array; for bool arrays, the unpacked array with its original shape.
    """
    fh.seek(start)
    ar = np.load(fh, allow_pickle=False)
    assert fh.tell() == end
    if is_bool_array:
        assert shape is not None
        # np.prod() of an empty shape (0-dim array) is the float 1.0, which unpackbits() rejects as a count;
        # force an integer element count
        num_elements = int(np.prod(shape, dtype=np.int64))
        ar = np.unpackbits(ar, count=num_elements).reshape(shape).astype(bool)
    return ar
85
+
86
+
87
class CellReconstructionNode(ExecNode):
    """
    Reconstruction of stored json, array and binary cells that were produced by CellMaterializationNode.

    Files that hold inlined objects are opened on demand and kept open until close().
    """

    json_refs: list[exprs.ColumnRef]
    array_refs: list[exprs.ColumnRef]
    binary_refs: list[exprs.ColumnRef]
    file_handles: dict[Path, io.BufferedReader]  # key: file path

    def __init__(
        self,
        json_refs: list[exprs.ColumnRef],
        array_refs: list[exprs.ColumnRef],
        binary_refs: list[exprs.ColumnRef],
        row_builder: exprs.RowBuilder,
        input: ExecNode | None = None,
    ):
        super().__init__(row_builder, [], [], input)
        self.json_refs = json_refs
        self.array_refs = array_refs
        self.binary_refs = binary_refs
        self.file_handles = {}

    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
        """Replace the stored representation of json/array/binary cells with the reconstructed values."""
        async for batch in self.input:
            for row in batch:
                for col_ref in self.json_refs:
                    val = row[col_ref.slot_idx]
                    if val is None:
                        continue
                    cell_md = row.slot_md.get(col_ref.slot_idx)
                    # without file urls or inlined-object markers there is nothing to reconstruct
                    if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(val):
                        continue
                    row[col_ref.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)

                for col_ref in self.array_refs:
                    cell_md = row.slot_md.get(col_ref.slot_idx)
                    if cell_md is not None and cell_md.array_md is not None:
                        # the array was written to a file
                        assert row[col_ref.slot_idx] is None
                        row[col_ref.slot_idx] = self._reconstruct_array(cell_md)
                    else:
                        assert isinstance(row[col_ref.slot_idx], (NoneType, np.ndarray))

                for col_ref in self.binary_refs:
                    cell_md = row.slot_md.get(col_ref.slot_idx)
                    if cell_md is not None and cell_md.binary_md is not None:
                        # the value was written to a file
                        assert row[col_ref.slot_idx] is None
                        row[col_ref.slot_idx] = self._reconstruct_binary(cell_md)
                    else:
                        assert isinstance(row[col_ref.slot_idx], (NoneType, bytes))

            yield batch

    def close(self) -> None:
        """Close all files opened during reconstruction; safe to call more than once."""
        # drop each handle from the cache before closing it, so that a closed file is never handed out again
        while self.file_handles:
            _, fp = self.file_handles.popitem()
            fp.close()

    def _reconstruct_array(self, cell_md: exprs.CellMd) -> np.ndarray:
        """Load the array described by cell_md.array_md from the (single) file referenced by cell_md."""
        assert cell_md.array_md is not None
        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
        fp = self.__get_file_pointer(cell_md.file_urls[0])
        ar = load_array(
            fp, cell_md.array_md.start, cell_md.array_md.end, bool(cell_md.array_md.is_bool), cell_md.array_md.shape
        )
        return ar

    def _reconstruct_binary(self, cell_md: exprs.CellMd) -> bytes:
        """Read the bytes described by cell_md.binary_md from the (single) file referenced by cell_md."""
        assert cell_md.binary_md is not None
        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
        fp = self.__get_file_pointer(cell_md.file_urls[0])
        fp.seek(cell_md.binary_md.start)
        data = fp.read(cell_md.binary_md.end - cell_md.binary_md.start)
        assert fp.tell() == cell_md.binary_md.end
        return data

    def __get_file_pointer(self, file_url: str) -> io.BufferedReader:
        """Return the open file for file_url, opening it (and caching the handle) on first use."""
        local_path = parse_local_file_path(file_url)
        assert local_path is not None
        if local_path not in self.file_handles:
            self.file_handles[local_path] = open(str(local_path), 'rb')  # noqa: SIM115
        return self.file_handles[local_path]
@@ -1,9 +1,6 @@
1
- import inspect
2
- from typing import Iterator, Optional
1
+ from typing import AsyncIterator
3
2
 
4
- import pixeltable.catalog as catalog
5
- import pixeltable.exceptions as excs
6
- import pixeltable.exprs as exprs
3
+ from pixeltable import catalog, exceptions as excs, exprs
7
4
 
8
5
  from .data_row_batch import DataRowBatch
9
6
  from .exec_node import ExecNode
@@ -14,34 +11,37 @@ class ComponentIterationNode(ExecNode):
14
11
 
15
12
  Returns row batches of OUTPUT_BATCH_SIZE size.
16
13
  """
14
+
15
+ view: catalog.TableVersionHandle
16
+
17
17
  __OUTPUT_BATCH_SIZE = 1024
18
18
 
19
- def __init__(self, view: catalog.TableVersion, input: ExecNode):
20
- assert view.is_component_view()
19
+ def __init__(self, view: catalog.TableVersionHandle, input: ExecNode):
20
+ assert view.get().is_component_view
21
21
  super().__init__(input.row_builder, [], [], input)
22
22
  self.view = view
23
- iterator_args = [view.iterator_args.copy()]
23
+ iterator_args = [view.get().iterator_args.copy()]
24
24
  self.row_builder.set_slot_idxs(iterator_args)
25
25
  self.iterator_args = iterator_args[0]
26
26
  assert isinstance(self.iterator_args, exprs.InlineDict)
27
27
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
28
- self.iterator_output_schema, self.unstored_column_names = (
29
- self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
28
+ self.iterator_output_schema, self.unstored_column_names = self.view.get().iterator_cls.output_schema(
29
+ **self.iterator_args.to_kwargs()
30
30
  )
31
31
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
32
32
  self.iterator_output_cols = {
33
- field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields
33
+ field_name: self.view.get().cols_by_name[field_name] for field_name in self.iterator_output_fields
34
34
  }
35
35
  # referenced iterator output fields
36
36
  self.refd_output_slot_idxs = {
37
- e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
37
+ e.col.name: e.slot_idx
38
+ for e in self.row_builder.unique_exprs
38
39
  if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
39
40
  }
40
- self.__output: Optional[Iterator[DataRowBatch]] = None
41
41
 
42
- def __output_batches(self) -> Iterator[DataRowBatch]:
43
- output_batch = DataRowBatch(self.view, self.row_builder)
44
- for input_batch in self.input:
42
+ async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
43
+ output_batch = DataRowBatch(self.row_builder)
44
+ async for input_batch in self.input:
45
45
  for input_row in input_batch:
46
46
  self.row_builder.eval(input_row, self.iterator_args_ctx)
47
47
  iterator_args = input_row[self.iterator_args.slot_idx]
@@ -50,15 +50,16 @@ class ComponentIterationNode(ExecNode):
50
50
  # specified and are not null. If any of them are null, then we skip this row (i.e., we emit 0
51
51
  # output rows for this input row).
52
52
  if self.__non_nullable_args_specified(iterator_args):
53
- iterator = self.view.iterator_cls(**iterator_args)
53
+ iterator = self.view.get().iterator_cls(**iterator_args)
54
54
  for pos, component_dict in enumerate(iterator):
55
- output_row = output_batch.add_row()
55
+ output_row = self.row_builder.make_row()
56
56
  input_row.copy(output_row)
57
57
  # we're expanding the input and need to add the iterator position to the pk
58
58
  self.__populate_output_row(output_row, pos, component_dict)
59
+ output_batch.add_row(output_row)
59
60
  if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
60
61
  yield output_batch
61
- output_batch = DataRowBatch(self.view, self.row_builder)
62
+ output_batch = DataRowBatch(self.row_builder)
62
63
 
63
64
  if len(output_batch) > 0:
64
65
  yield output_batch
@@ -67,7 +68,7 @@ class ComponentIterationNode(ExecNode):
67
68
  """
68
69
  Returns true if all non-nullable iterator arguments are not `None`.
69
70
  """
70
- input_schema = self.view.iterator_cls.input_schema()
71
+ input_schema = self.view.get().iterator_cls.input_schema()
71
72
  for arg_name, arg_value in iterator_args.items():
72
73
  col_type = input_schema[arg_name]
73
74
  if arg_value is None and not col_type.nullable:
@@ -81,7 +82,8 @@ class ComponentIterationNode(ExecNode):
81
82
  for field_name, field_val in component_dict.items():
82
83
  if field_name not in self.iterator_output_fields:
83
84
  raise excs.Error(
84
- f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
85
+ f'Invalid field name {field_name} in output of {self.view.get().iterator_cls.__name__}'
86
+ )
85
87
  if field_name not in self.refd_output_slot_idxs:
86
88
  # we can ignore this
87
89
  continue
@@ -91,10 +93,5 @@ class ComponentIterationNode(ExecNode):
91
93
  if len(component_dict) != len(self.iterator_output_fields):
92
94
  missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
93
95
  raise excs.Error(
94
- f'Invalid output of {self.view.iterator_cls.__name__}: '
95
- f'missing fields {", ".join(missing_fields)}')
96
-
97
- def __next__(self) -> DataRowBatch:
98
- if self.__output is None:
99
- self.__output = self.__output_batches()
100
- return next(self.__output)
96
+ f'Invalid output of {self.view.get().iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
97
+ )
@@ -1,45 +1,31 @@
1
1
  from __future__ import annotations
2
- from typing import Iterator, Optional
3
- import logging
4
2
 
5
- import pixeltable.exprs as exprs
6
- import pixeltable.catalog as catalog
7
- from pixeltable.utils.media_store import MediaStore
3
+ import logging
4
+ from typing import Iterator
8
5
 
6
+ from pixeltable import exprs
9
7
 
10
8
  _logger = logging.getLogger('pixeltable')
11
9
 
10
+
12
11
  class DataRowBatch:
13
12
  """Set of DataRows, indexed by rowid.
14
13
 
15
14
  Contains the metadata needed to initialize DataRows.
15
+
16
+ Requires either num_rows or rows to be specified, but not both.
16
17
  """
17
- tbl: Optional[catalog.TableVersion]
18
+
18
19
  row_builder: exprs.RowBuilder
19
- img_slot_idxs: list[int]
20
- media_slot_idxs: list[int] # non-image media slots
21
- array_slot_idxs: list[int]
22
20
  rows: list[exprs.DataRow]
23
21
 
24
- def __init__(self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, len: int = 0):
25
- self.tbl = tbl
22
+ def __init__(self, row_builder: exprs.RowBuilder, rows: list[exprs.DataRow] | None = None):
26
23
  self.row_builder = row_builder
27
- self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
28
- # non-image media slots
29
- self.media_slot_idxs = [
30
- e.slot_idx for e in row_builder.unique_exprs
31
- if e.col_type.is_media_type() and not e.col_type.is_image_type()
32
- ]
33
- self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
34
- self.rows = [
35
- exprs.DataRow(row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
36
- for _ in range(len)
37
- ]
24
+ self.rows = [] if rows is None else rows
38
25
 
39
- def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
26
+ def add_row(self, row: exprs.DataRow | None) -> exprs.DataRow:
40
27
  if row is None:
41
- row = exprs.DataRow(
42
- self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
28
+ row = self.row_builder.make_row()
43
29
  self.rows.append(row)
44
30
  return row
45
31
 
@@ -52,26 +38,5 @@ class DataRowBatch:
52
38
  def __getitem__(self, index: int) -> exprs.DataRow:
53
39
  return self.rows[index]
54
40
 
55
- def flush_imgs(
56
- self, idx_range: Optional[slice] = None, stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
57
- flushed_slot_idxs: Optional[list[int]] = None
58
- ) -> None:
59
- """Flushes images in the given range of rows."""
60
- assert self.tbl is not None
61
- if stored_img_info is None:
62
- stored_img_info = []
63
- if flushed_slot_idxs is None:
64
- flushed_slot_idxs = []
65
- if len(stored_img_info) == 0 and len(flushed_slot_idxs) == 0:
66
- return
67
- if idx_range is None:
68
- idx_range = slice(0, len(self.rows))
69
- for row in self.rows[idx_range]:
70
- for info in stored_img_info:
71
- filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.version))
72
- row.flush_img(info.slot_idx, filepath)
73
- for slot_idx in flushed_slot_idxs:
74
- row.flush_img(slot_idx)
75
-
76
41
  def __iter__(self) -> Iterator[exprs.DataRow]:
77
42
  return iter(self.rows)