pixeltable 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (57)
  1. pixeltable/__init__.py +4 -0
  2. pixeltable/catalog/catalog.py +105 -51
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +4 -0
  6. pixeltable/catalog/table_version.py +99 -78
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/config.py +6 -0
  9. pixeltable/dataframe.py +10 -5
  10. pixeltable/env.py +48 -19
  11. pixeltable/exec/__init__.py +2 -0
  12. pixeltable/exec/cell_materialization_node.py +231 -0
  13. pixeltable/exec/cell_reconstruction_node.py +135 -0
  14. pixeltable/exec/exec_node.py +1 -1
  15. pixeltable/exec/expr_eval/evaluators.py +1 -0
  16. pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
  17. pixeltable/exec/expr_eval/globals.py +2 -0
  18. pixeltable/exec/globals.py +32 -0
  19. pixeltable/exec/object_store_save_node.py +1 -4
  20. pixeltable/exec/row_update_node.py +16 -9
  21. pixeltable/exec/sql_node.py +107 -14
  22. pixeltable/exprs/__init__.py +1 -1
  23. pixeltable/exprs/arithmetic_expr.py +10 -11
  24. pixeltable/exprs/column_property_ref.py +10 -10
  25. pixeltable/exprs/column_ref.py +2 -2
  26. pixeltable/exprs/data_row.py +106 -37
  27. pixeltable/exprs/expr.py +9 -0
  28. pixeltable/exprs/expr_set.py +14 -7
  29. pixeltable/exprs/inline_expr.py +2 -19
  30. pixeltable/exprs/json_path.py +45 -12
  31. pixeltable/exprs/row_builder.py +54 -22
  32. pixeltable/functions/__init__.py +1 -0
  33. pixeltable/functions/bedrock.py +7 -0
  34. pixeltable/functions/deepseek.py +11 -4
  35. pixeltable/functions/llama_cpp.py +7 -0
  36. pixeltable/functions/math.py +1 -1
  37. pixeltable/functions/ollama.py +7 -0
  38. pixeltable/functions/openai.py +4 -4
  39. pixeltable/functions/openrouter.py +143 -0
  40. pixeltable/globals.py +10 -4
  41. pixeltable/io/globals.py +16 -15
  42. pixeltable/io/table_data_conduit.py +46 -21
  43. pixeltable/metadata/__init__.py +1 -1
  44. pixeltable/metadata/converters/convert_40.py +73 -0
  45. pixeltable/metadata/notes.py +1 -0
  46. pixeltable/plan.py +175 -46
  47. pixeltable/store.py +1 -1
  48. pixeltable/type_system.py +5 -3
  49. pixeltable/utils/console_output.py +4 -1
  50. pixeltable/utils/exception_handler.py +5 -28
  51. pixeltable/utils/image.py +7 -0
  52. pixeltable/utils/misc.py +5 -0
  53. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/METADATA +2 -1
  54. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/RECORD +57 -50
  55. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/WHEEL +0 -0
  56. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/entry_points.txt +0 -0
  57. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,231 @@
+ from __future__ import annotations
+
+ import io
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Any, AsyncIterator
+
+ import numpy as np
+ import pgvector.sqlalchemy  # type: ignore[import-untyped]
+ import PIL.Image
+ import sqlalchemy as sql
+
+ import pixeltable.type_system as ts
+ import pixeltable.utils.image as image_utils
+ from pixeltable import catalog, exprs
+ from pixeltable.env import Env
+ from pixeltable.utils.local_store import LocalStore
+
+ from .data_row_batch import DataRowBatch
+ from .exec_node import ExecNode
+ from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ class CellMaterializationNode(ExecNode):
+     """
+     Node to populate DataRow.cell_vals/cell_md.
+
+     For now, the scope is limited to populating DataRow.cell_vals for json and array columns.
+
+     Array values:
+     - Arrays < MAX_DB_ARRAY_SIZE are stored inline in the db column
+     - Larger arrays are written to inlined_obj_files
+     - Bool arrays are stored as packed bits (uint8)
+     - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
+       (this allows us to query cell_md to get the total external storage size of an array column)
+
+     Json values:
+     - Inlined images and arrays are written to inlined_obj_files and replaced with a dict containing the object
+       location
+     - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
+     - cell_md contains the list of urls for the inlined objects.
+
+     TODO:
+     - execute file IO via asyncio Tasks in a thread pool?
+       (we already seem to be getting 90% of hardware IO throughput)
+     - subsume all cell materialization
+     """
+
+     output_col_info: dict[catalog.Column, int]  # value: slot idx
+
+     # execution state
+     inlined_obj_files: list[Path]  # only [-1] is open for writing
+     buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]
+
+     MIN_FILE_SIZE = 8 * 2**20  # 8MB
+     MAX_DB_ARRAY_SIZE = 512  # max size of array stored in table column; in bytes
+
+     def __init__(self, input: ExecNode):
+         super().__init__(input.row_builder, [], [], input)
+         self.output_col_info = {
+             col: slot_idx
+             for col, slot_idx in input.row_builder.table_columns.items()
+             if slot_idx is not None and (col.col_type.is_json_type() or col.col_type.is_array_type())
+         }
+         self.inlined_obj_files = []
+         self.buffered_writer = None
+
+     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+         async for batch in self.input:
+             for row in batch:
+                 for col, slot_idx in self.output_col_info.items():
+                     if row.has_exc(slot_idx):
+                         # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
+                         row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                         exc = row.get_exc(slot_idx)
+                         row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
+                         continue
+
+                     val = row[slot_idx]
+                     if val is None:
+                         row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                         row.cell_md[col.id] = None
+                         continue
+
+                     if col.col_type.is_json_type():
+                         self._materialize_json_cell(row, col, val)
+                     else:
+                         assert col.col_type.is_array_type()
+                         assert isinstance(val, np.ndarray)
+                         self._materialize_array_cell(row, col, val)
+
+                 # continue with only the currently open file
+                 self.inlined_obj_files = self.inlined_obj_files[-1:]
+
+             yield batch
+
+         self._flush_buffer(finalize=True)
+
+     def init_writer(self) -> None:
+         if self.buffered_writer is None:
+             self._reset_buffer()
+         assert self.buffered_writer is not None
+
+     def close(self) -> None:
+         if self.buffered_writer is not None:
+             # there must have been an error, otherwise _flush_buffer(finalize=True) would have set this to None
+             self.buffered_writer.close()
+             self.buffered_writer = None
+
+     def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
+         if self._json_has_inlined_objs(val):
+             row.cell_vals[col.id] = self._rewrite_json(val)
+             row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
+         else:
+             row.cell_vals[col.id] = val
+             row.cell_md[col.id] = None
+
+     def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
+         if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
+             # this is a vector column (ie, used for a vector index): store the array itself
+             row.cell_vals[col.id] = val
+             row.cell_md[col.id] = None
+         elif val.nbytes <= self.MAX_DB_ARRAY_SIZE:
+             # this array is small enough to store in the db column (type: binary) directly
+             buffer = io.BytesIO()
+             np.save(buffer, val, allow_pickle=False)
+             row.cell_vals[col.id] = buffer.getvalue()
+             row.cell_md[col.id] = None
+         else:
+             # append this array to the buffer and store its location in the cell md
+             ar: np.ndarray
+             if np.issubdtype(val.dtype, np.bool_):
+                 # for bool arrays, store as packed bits, otherwise it's 1 byte per element
+                 ar = np.packbits(val)
+             else:
+                 ar = val
+             self.init_writer()
+             start = self.buffered_writer.tell()
+             np.save(self.buffered_writer, ar, allow_pickle=False)
+             end = self.buffered_writer.tell()
+             row.cell_vals[col.id] = None
+             cell_md = exprs.CellMd(
+                 file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
+             )
+             if np.issubdtype(val.dtype, np.bool_):
+                 cell_md.array_md.is_bool = True
+                 cell_md.array_md.shape = val.shape
+             row.cell_md[col.id] = cell_md
+             self._flush_buffer()
+
+         assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+     def _json_has_inlined_objs(self, element: Any) -> bool:
+         if isinstance(element, list):
+             return any(self._json_has_inlined_objs(v) for v in element)
+         if isinstance(element, dict):
+             return any(self._json_has_inlined_objs(v) for v in element.values())
+         return isinstance(element, (np.ndarray, PIL.Image.Image))
+
+     def _rewrite_json(self, element: Any) -> Any:
+         """Recursively rewrites a JSON structure by writing any inlined arrays or images to self.buffered_writer."""
+         if isinstance(element, list):
+             return [self._rewrite_json(v) for v in element]
+         if isinstance(element, dict):
+             return {k: self._rewrite_json(v) for k, v in element.items()}
+         if isinstance(element, np.ndarray):
+             obj_md = self._write_inlined_array(element)
+             return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+         if isinstance(element, PIL.Image.Image):
+             obj_md = self._write_inlined_image(element)
+             return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+         return element
+
+     def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
+         """Write an ndarray to buffered_writer and return its metadata."""
+         self.init_writer()
+         url_idx = len(self.inlined_obj_files) - 1
+         start = self.buffered_writer.tell()
+         shape: tuple[int, ...] | None
+         is_bool_array: bool
+         if np.issubdtype(ar.dtype, np.bool_):
+             shape = ar.shape
+             ar = np.packbits(ar)
+             is_bool_array = True
+         else:
+             shape = None
+             is_bool_array = False
+         np.save(self.buffered_writer, ar, allow_pickle=False)
+         end = self.buffered_writer.tell()
+         self._flush_buffer()
+         return InlinedObjectMd(
+             type=ts.ColumnType.Type.ARRAY.name,
+             url_idx=url_idx,
+             array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
+         )
+
+     def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
+         """Write a PIL image to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+         self.init_writer()
+         url_idx = len(self.inlined_obj_files) - 1
+         start = self.buffered_writer.tell()
+         img.save(self.buffered_writer, format=image_utils.default_format(img))
+         end = self.buffered_writer.tell()
+         self._flush_buffer()
+         return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)
+
+     def _reset_buffer(self) -> None:
+         local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
+             self.row_builder.tbl.id, 0, self.row_builder.tbl.version
+         )
+         self.inlined_obj_files.append(local_path)
+         fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
+         assert isinstance(fh, io.BufferedWriter)
+         self.buffered_writer = fh
+
+     def _flush_buffer(self, finalize: bool = False) -> None:
+         """Flush buffered_writer to storage if it exceeds its minimum size or finalize is True."""
+         if self.buffered_writer is None:
+             return
+         if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
+             return
+         self.buffered_writer.flush()
+         os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
+         self.buffered_writer.close()
+         if finalize:
+             self.buffered_writer = None
+         else:
+             self._reset_buffer()
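
For context on the bool-array encoding used above, here is a minimal standalone sketch (not part of the diff) of the packbits/np.save offset round trip that CellMaterializationNode writes and load_array later reads back; the buffer, array names, and shapes are illustrative only:

    import io

    import numpy as np

    # Append two arrays back-to-back into one buffer, recording (start, end) offsets,
    # analogous to how arrays are appended to an inlined-object file.
    buf = io.BytesIO()
    bool_arr = np.random.rand(3, 5) > 0.5
    float_arr = np.arange(6, dtype=np.float32).reshape(2, 3)
    offsets = []
    for arr in (np.packbits(bool_arr), float_arr):  # bool arrays are stored as packed bits
        start = buf.tell()
        np.save(buf, arr, allow_pickle=False)
        offsets.append((start, buf.tell()))

    # Restore the bool array: seek to its start offset, load, then unpack and reshape.
    start, end = offsets[0]
    buf.seek(start)
    packed = np.load(buf, allow_pickle=False)
    assert buf.tell() == end
    restored = np.unpackbits(packed, count=int(np.prod(bool_arr.shape))).reshape(bool_arr.shape).astype(bool)
    assert (restored == bool_arr).all()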
@@ -0,0 +1,135 @@
+ from __future__ import annotations
+
+ import io
+ import logging
+ from pathlib import Path
+ from typing import Any, AsyncIterator
+
+ import numpy as np
+ import PIL.Image
+
+ import pixeltable.type_system as ts
+ from pixeltable import exprs
+ from pixeltable.utils import parse_local_file_path
+
+ from .data_row_batch import DataRowBatch
+ from .exec_node import ExecNode
+ from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ def json_has_inlined_objs(element: Any) -> bool:
+     """Returns True if element contains inlined objects produced by CellMaterializationNode."""
+     if isinstance(element, list):
+         return any(json_has_inlined_objs(v) for v in element)
+     if isinstance(element, dict):
+         if INLINED_OBJECT_MD_KEY in element:
+             return True
+         return any(json_has_inlined_objs(v) for v in element.values())
+     return False
+
+
+ def reconstruct_json(element: Any, urls: list[str], file_handles: dict[Path, io.BufferedReader]) -> Any:
+     """Recursively reconstructs inlined objects in a json structure."""
+     if isinstance(element, list):
+         return [reconstruct_json(v, urls, file_handles) for v in element]
+     if isinstance(element, dict):
+         if INLINED_OBJECT_MD_KEY in element:
+             obj_md = InlinedObjectMd.from_dict(element[INLINED_OBJECT_MD_KEY])
+             url = urls[obj_md.url_idx]
+             local_path = parse_local_file_path(url)
+             if local_path not in file_handles:
+                 file_handles[local_path] = open(local_path, 'rb')  # noqa: SIM115
+             fp = file_handles[local_path]
+
+             if obj_md.type == ts.ColumnType.Type.ARRAY.name:
+                 fp.seek(obj_md.array_md.start)
+                 ar = load_array(
+                     fp, obj_md.array_md.start, obj_md.array_md.end, obj_md.array_md.is_bool, obj_md.array_md.shape
+                 )
+                 return ar
+             else:
+                 fp.seek(obj_md.img_start)
+                 bytesio = io.BytesIO(fp.read(obj_md.img_end - obj_md.img_start))
+                 img = PIL.Image.open(bytesio)
+                 img.load()
+                 assert fp.tell() == obj_md.img_end, f'{fp.tell()} != {obj_md.img_end} / {obj_md.img_start}'
+                 return img
+         else:
+             return {k: reconstruct_json(v, urls, file_handles) for k, v in element.items()}
+     return element
+
+
+ def load_array(
+     fh: io.BufferedReader, start: int, end: int, is_bool_array: bool, shape: tuple[int, ...] | None
+ ) -> np.ndarray:
+     """Loads an array from a section of a file."""
+     fh.seek(start)
+     ar = np.load(fh, allow_pickle=False)
+     assert fh.tell() == end
+     if is_bool_array:
+         assert shape is not None
+         ar = np.unpackbits(ar, count=np.prod(shape)).reshape(shape).astype(bool)
+     return ar
+
+
+ class CellReconstructionNode(ExecNode):
+     """
+     Reconstruction of stored json and array cells that were produced by CellMaterializationNode.
+     """
+
+     json_refs: list[exprs.ColumnRef]
+     array_refs: list[exprs.ColumnRef]
+     file_handles: dict[Path, io.BufferedReader]  # key: file path
+
+     def __init__(
+         self,
+         json_refs: list[exprs.ColumnRef],
+         array_refs: list[exprs.ColumnRef],
+         row_builder: exprs.RowBuilder,
+         input: ExecNode | None = None,
+     ):
+         super().__init__(row_builder, [], [], input)
+         self.json_refs = json_refs
+         self.array_refs = array_refs
+         self.file_handles = {}
+
+     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+         async for batch in self.input:
+             for row in batch:
+                 for col_ref in self.json_refs:
+                     val = row[col_ref.slot_idx]
+                     if val is None:
+                         continue
+                     cell_md = row.slot_md.get(col_ref.slot_idx)
+                     if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(row[col_ref.slot_idx]):
+                         continue
+                     row[col_ref.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
+
+                 for col_ref in self.array_refs:
+                     cell_md = row.slot_md.get(col_ref.slot_idx)
+                     if cell_md is not None and cell_md.array_md is not None:
+                         assert row[col_ref.slot_idx] is None
+                         assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
+                         row[col_ref.slot_idx] = self._reconstruct_array(cell_md)
+                     else:
+                         assert row[col_ref.slot_idx] is None or isinstance(row[col_ref.slot_idx], np.ndarray)
+
+             yield batch
+
+     def close(self) -> None:
+         for fp in self.file_handles.values():
+             fp.close()
+
+     def _reconstruct_array(self, cell_md: exprs.CellMd) -> np.ndarray:
+         assert cell_md.array_md is not None
+         local_path = parse_local_file_path(cell_md.file_urls[0])
+         assert local_path is not None
+         if local_path not in self.file_handles:
+             self.file_handles[local_path] = open(str(local_path), 'rb')  # noqa: SIM115
+         fp = self.file_handles[local_path]
+         ar = load_array(
+             fp, cell_md.array_md.start, cell_md.array_md.end, bool(cell_md.array_md.is_bool), cell_md.array_md.shape
+         )
+         return ar
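
Similarly, a small standalone sketch (not part of the diff) of the byte-range round trip that reconstruct_json performs for inlined images; the image size, color, and format below are arbitrary:

    import io

    import PIL.Image

    # Write an image into a shared buffer and record its byte range.
    buf = io.BytesIO()
    img = PIL.Image.new('RGB', (64, 48), color=(200, 30, 30))
    start = buf.tell()
    img.save(buf, format='PNG')
    end = buf.tell()

    # Reconstruct it later by reading exactly end - start bytes from the start offset.
    buf.seek(start)
    restored = PIL.Image.open(io.BytesIO(buf.read(end - start)))
    restored.load()  # force decoding while the underlying bytes are available
    assert restored.size == (64, 48)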
@@ -39,7 +39,7 @@ class ExecNode(abc.ABC):
          self.flushed_img_slots = [
              e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
          ]
-         self.ctx = None  # all nodes of a tree share the same context
+         self.ctx = input.ctx if input is not None else None

      def set_ctx(self, ctx: ExecContext) -> None:
          self.ctx = ctx
@@ -311,6 +311,7 @@ class JsonMapperDispatcher(Evaluator):
              img_slot_idxs=[],
              media_slot_idxs=[],
              array_slot_idxs=[],
+             json_slot_idxs=[],
              parent_row=row,
              parent_slot_idx=self.e.slot_idx,
          )
@@ -306,6 +306,9 @@ class ExprEvalNode(ExecNode):
              task.cancel()
          _ = await asyncio.gather(*active_tasks, return_exceptions=True)

+         # expr cleanup
+         exprs.Expr.release_list(self.exec_ctx.all_exprs)
+
      def dispatch_exc(
          self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
      ) -> None:
@@ -149,6 +149,7 @@ class ExecCtx:
      gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
      eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
      literals: dict[int, Any]  # key: slot idx; value: literal value for this slot; used to pre-populate rows
+     all_exprs: list[exprs.Expr]  # all evaluated exprs; needed for cleanup

      def __init__(
          self,
@@ -165,6 +166,7 @@ class ExecCtx:
          self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False

          output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
+         self.all_exprs = output_ctx.exprs
          self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
          self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
          non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]
@@ -0,0 +1,32 @@
+ from __future__ import annotations
+
+ import dataclasses
+
+ from pixeltable.exprs import ArrayMd
+ from pixeltable.utils.misc import non_none_dict_factory
+
+ INLINED_OBJECT_MD_KEY = '__pxtinlinedobjmd__'
+
+
+ @dataclasses.dataclass
+ class InlinedObjectMd:
+     type: str  # corresponds to ts.ColumnType.Type
+     url_idx: int
+     img_start: int | None = None
+     img_end: int | None = None
+     array_md: ArrayMd | None = None
+
+     @classmethod
+     def from_dict(cls, d: dict) -> InlinedObjectMd:
+         if 'array_md' in d:
+             array_md = ArrayMd(**d['array_md'])
+             del d['array_md']
+             return cls(**d, array_md=array_md)
+         else:
+             return cls(**d)
+
+     def as_dict(self) -> dict:
+         result = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
+         if self.array_md is not None:
+             result['array_md'] = self.array_md.as_dict()
+         return result
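
A standalone sketch (not part of the diff) of the serialization pattern InlinedObjectMd relies on: None-valued fields are dropped on the way out and fall back to dataclass defaults on the way back in. The ObjMd class and drop_none_dict_factory below are stand-ins, not pixeltable APIs:

    import dataclasses
    from typing import Any


    def drop_none_dict_factory(items: list[tuple[str, Any]]) -> dict:
        # stand-in for pixeltable.utils.misc.non_none_dict_factory (assumed to omit None values)
        return {k: v for k, v in items if v is not None}


    @dataclasses.dataclass
    class ObjMd:
        type: str
        url_idx: int
        img_start: int | None = None
        img_end: int | None = None


    md = ObjMd(type='ARRAY', url_idx=0)
    d = dataclasses.asdict(md, dict_factory=drop_none_dict_factory)
    assert d == {'type': 'ARRAY', 'url_idx': 0}  # img_start/img_end omitted
    assert ObjMd(**d) == md  # omitted fields are restored from their defaults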
@@ -7,7 +7,6 @@ from collections import defaultdict, deque
  from concurrent import futures
  from pathlib import Path
  from typing import AsyncIterator, Iterator, NamedTuple, Optional
- from uuid import UUID

  from pixeltable import exprs
  from pixeltable.utils.object_stores import ObjectOps, ObjectPath, StorageTarget
@@ -81,9 +80,7 @@ class ObjectStoreSaveNode(ExecNode):
      num_missing: int  # number of references to media files in this row
      delete_destinations: list[Path]  # paths to delete after all copies are complete

-     def __init__(
-         self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
-     ):
+     def __init__(self, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True):
          # input_/output_exprs=[]: we don't have anything to evaluate
          super().__init__(input.row_builder, [], [], input)
          self.retain_input_order = retain_input_order
@@ -14,10 +14,18 @@ class RowUpdateNode(ExecNode):
      Update individual rows in the input batches, identified by key columns.

      The updates for a row are provided as a dict of column names to new values.
-     The node assumes that all update dicts contain the same keys, and it populates the slots of the columns present in
-     the update list.
+     Populates the slots of the columns present in the update list.
+     Assumptions:
+     - all update dicts contain the same keys
+     - the input node populates DataRow.cell_vals for all primary key columns
      """

+     updates: dict[tuple, dict[catalog.Column, Any]]
+     is_rowid_key: bool  # if True, key_vals_batch contains rowids rather than primary key values
+     col_slot_idxs: dict[catalog.Column, int]
+     pk_columns: list[catalog.Column]
+     matched_key_vals: set[tuple]
+
      def __init__(
          self,
          tbl: catalog.TableVersionPath,
@@ -37,16 +45,16 @@ class RowUpdateNode(ExecNode):
              for col_ref in row_builder.unique_exprs
              if isinstance(col_ref, exprs.ColumnRef)
          }
+         # all update target columns should have assigned slot idxs
+         assert all(col in all_col_slot_idxs for col in col_vals_batch[0])
          self.col_slot_idxs = {col: all_col_slot_idxs[col] for col in col_vals_batch[0]}
-         self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.get().primary_key_columns()}
-         self.matched_key_vals: set[tuple] = set()
+         self.pk_columns = tbl.tbl_version.get().primary_key_columns()
+         self.matched_key_vals = set()

      async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
          async for batch in self.input:
              for row in batch:
-                 key_vals = (
-                     row.rowid if self.is_rowid_key else tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
-                 )
+                 key_vals = row.rowid if self.is_rowid_key else tuple(row.cell_vals[col.id] for col in self.pk_columns)
                  if key_vals not in self.updates:
                      continue
                  self.matched_key_vals.add(key_vals)
@@ -59,11 +67,10 @@ class RowUpdateNode(ExecNode):
      def unmatched_rows(self) -> list[dict[str, Any]]:
          """Return rows that didn't get used in the updates as a list of dicts compatible with TableVersion.insert()."""
          result: list[dict[str, Any]] = []
-         key_cols = self.key_slot_idxs.keys()
          for key_vals, col_vals in self.updates.items():
              if key_vals in self.matched_key_vals:
                  continue
-             row = {col.name: val for col, val in zip(key_cols, key_vals)}
+             row = {col.name: val for col, val in zip(self.pk_columns, key_vals)}
              row.update({col.name: val for col, val in col_vals.items()})
              result.append(row)
          return result
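
Finally, a toy sketch (not part of the diff) of the keyed-update bookkeeping that RowUpdateNode implements: updates are keyed by primary-key tuples, matched rows are patched, and unmatched keys are returned as insertable rows (cf. unmatched_rows()). Column names and values here are made up:

    from typing import Any

    updates: dict[tuple, dict[str, Any]] = {(1,): {'name': 'a'}, (3,): {'name': 'c'}}
    existing_rows = [{'id': 1, 'name': 'x'}, {'id': 2, 'name': 'y'}]
    pk_columns = ['id']
    matched_key_vals: set[tuple] = set()

    for row in existing_rows:
        key_vals = tuple(row[c] for c in pk_columns)
        if key_vals in updates:
            matched_key_vals.add(key_vals)
            row.update(updates[key_vals])  # patch the matched row in place

    # updates whose key never matched become new rows
    unmatched = [
        dict(zip(pk_columns, key_vals), **col_vals)
        for key_vals, col_vals in updates.items()
        if key_vals not in matched_key_vals
    ]
    assert existing_rows[0]['name'] == 'a'
    assert unmatched == [{'id': 3, 'name': 'c'}]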