pixeltable 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (64)
  1. pixeltable/__init__.py +6 -1
  2. pixeltable/catalog/catalog.py +107 -45
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +5 -0
  6. pixeltable/catalog/table_version.py +100 -106
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/catalog/update_status.py +12 -0
  9. pixeltable/config.py +6 -0
  10. pixeltable/dataframe.py +11 -5
  11. pixeltable/env.py +52 -19
  12. pixeltable/exec/__init__.py +2 -0
  13. pixeltable/exec/cell_materialization_node.py +231 -0
  14. pixeltable/exec/cell_reconstruction_node.py +135 -0
  15. pixeltable/exec/exec_node.py +1 -1
  16. pixeltable/exec/expr_eval/evaluators.py +1 -0
  17. pixeltable/exec/expr_eval/expr_eval_node.py +14 -0
  18. pixeltable/exec/expr_eval/globals.py +2 -0
  19. pixeltable/exec/globals.py +32 -0
  20. pixeltable/exec/object_store_save_node.py +1 -4
  21. pixeltable/exec/row_update_node.py +16 -9
  22. pixeltable/exec/sql_node.py +107 -14
  23. pixeltable/exprs/__init__.py +1 -1
  24. pixeltable/exprs/arithmetic_expr.py +10 -11
  25. pixeltable/exprs/column_property_ref.py +10 -10
  26. pixeltable/exprs/column_ref.py +2 -2
  27. pixeltable/exprs/data_row.py +106 -37
  28. pixeltable/exprs/expr.py +9 -0
  29. pixeltable/exprs/expr_set.py +14 -7
  30. pixeltable/exprs/inline_expr.py +2 -19
  31. pixeltable/exprs/json_path.py +45 -12
  32. pixeltable/exprs/row_builder.py +54 -22
  33. pixeltable/functions/__init__.py +1 -0
  34. pixeltable/functions/bedrock.py +7 -0
  35. pixeltable/functions/deepseek.py +11 -4
  36. pixeltable/functions/llama_cpp.py +7 -0
  37. pixeltable/functions/math.py +1 -1
  38. pixeltable/functions/ollama.py +7 -0
  39. pixeltable/functions/openai.py +4 -4
  40. pixeltable/functions/openrouter.py +143 -0
  41. pixeltable/functions/video.py +123 -9
  42. pixeltable/functions/whisperx.py +2 -0
  43. pixeltable/functions/yolox.py +2 -0
  44. pixeltable/globals.py +56 -31
  45. pixeltable/io/__init__.py +1 -0
  46. pixeltable/io/globals.py +16 -15
  47. pixeltable/io/table_data_conduit.py +46 -21
  48. pixeltable/iterators/__init__.py +1 -0
  49. pixeltable/metadata/__init__.py +1 -1
  50. pixeltable/metadata/converters/convert_40.py +73 -0
  51. pixeltable/metadata/notes.py +1 -0
  52. pixeltable/plan.py +175 -46
  53. pixeltable/share/publish.py +0 -1
  54. pixeltable/store.py +2 -2
  55. pixeltable/type_system.py +5 -3
  56. pixeltable/utils/console_output.py +4 -1
  57. pixeltable/utils/exception_handler.py +5 -28
  58. pixeltable/utils/image.py +7 -0
  59. pixeltable/utils/misc.py +5 -0
  60. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/METADATA +2 -1
  61. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/RECORD +64 -57
  62. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/WHEEL +0 -0
  63. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/entry_points.txt +0 -0
  64. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/licenses/LICENSE +0 -0
pixeltable/env.py CHANGED
@@ -27,6 +27,7 @@ from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
+import tzlocal
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
 from sqlalchemy import orm
 from tenacity import retry, stop_after_attempt, wait_exponential_jitter
@@ -71,6 +72,7 @@ class Env:
     _db_server: Optional[pixeltable_pgserver.PostgresServer]  # set only when running in local environment
     _db_url: Optional[str]
     _default_time_zone: Optional[ZoneInfo]
+    _verbosity: int

     # info about optional packages that are utilized by some parts of the code
     __optional_packages: dict[str, PackageInfo]
@@ -218,10 +220,18 @@
         """
         This is not a publicly visible setter; it is only for testing purposes.
         """
-        tz_name = None if tz is None else tz.key
+        if tz is None:
+            tz_name = self._get_tz_name()
+        else:
+            assert isinstance(tz, ZoneInfo)
+            tz_name = tz.key
         self.engine.dispose()
         self._create_engine(time_zone_name=tz_name)

+    @property
+    def verbosity(self) -> int:
+        return self._verbosity
+
     @property
     def conn(self) -> Optional[sql.Connection]:
         assert self._current_conn is not None
@@ -237,6 +247,11 @@
         assert self._dbms is not None
         return self._dbms

+    @property
+    def is_using_cockroachdb(self) -> bool:
+        assert self._dbms is not None
+        return isinstance(self._dbms, CockroachDbms)
+
     @property
     def in_xact(self) -> bool:
         return self._current_conn is not None
@@ -247,7 +262,7 @@
         return self._db_server is not None

     @contextmanager
-    def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
+    def begin_xact(self, *, for_write: bool = False) -> Iterator[sql.Connection]:
         """
         Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.

@@ -350,6 +365,26 @@
     def console_logger(self) -> ConsoleLogger:
         return self._console_logger

+    def _get_tz_name(self) -> str:
+        """Get the time zone name from the configuration, or the system local time zone if not specified.
+
+        Returns:
+            str: The time zone name.
+        """
+        tz_name = Config.get().get_string_value('time_zone')
+        if tz_name is not None:
+            # Validate tzname
+            if not isinstance(tz_name, str):
+                self._logger.error('Invalid time zone specified in configuration.')
+            else:
+                try:
+                    _ = ZoneInfo(tz_name)
+                except ZoneInfoNotFoundError:
+                    self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
+        else:
+            tz_name = tzlocal.get_localzone_name()
+        return tz_name
+
     def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
         if self._initialized:
             return
@@ -393,10 +428,12 @@
         warnings.simplefilter('ignore', category=UserWarning)
         warnings.simplefilter('ignore', category=FutureWarning)

-        # Set verbose level for user visible console messages
-        verbosity = map_level(config.get_int_value('verbosity'))
+        # Set verbosity level for user visible console messages
+        self._verbosity = config.get_int_value('verbosity')
+        if self._verbosity is None:
+            self._verbosity = 1
         stdout_handler = ConsoleOutputHandler(stream=stdout)
-        stdout_handler.setLevel(verbosity)
+        stdout_handler.setLevel(map_level(self._verbosity))
         stdout_handler.addFilter(ConsoleMessageFilter())
         self._logger.addHandler(stdout_handler)
         self._console_logger = ConsoleLogger(self._logger)
@@ -430,6 +467,7 @@
         http_logger.propagate = False

         self.clear_tmp_dir()
+        tz_name = self._get_tz_name()

         # configure pixeltable database
         self._init_db(config)
@@ -439,22 +477,10 @@
                 'Reinitializing pixeltable database is not supported when running in non-local environment'
             )

-        tz_name = config.get_string_value('time_zone')
-        if tz_name is not None:
-            # Validate tzname
-            if not isinstance(tz_name, str):
-                self._logger.error('Invalid time zone specified in configuration.')
-            else:
-                try:
-                    _ = ZoneInfo(tz_name)
-                except ZoneInfoNotFoundError:
-                    self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
-
         if reinit_db and self._store_db_exists():
             self._drop_store_db()

         create_db = not self._store_db_exists()
-
         if create_db:
             self._logger.info(f'creating database at: {self.db_url}')
             self._create_store_db()
@@ -534,13 +560,16 @@
         metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
         metadata.create_system_info(self._sa_engine)

-    def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
-        connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
+    def _create_engine(self, time_zone_name: str, echo: bool = False) -> None:
+        connect_args = {'options': f'-c timezone={time_zone_name}'}
+        self._logger.info(f'Creating SQLAlchemy engine with connection arguments: {connect_args}')
         self._sa_engine = sql.create_engine(
             self.db_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level, connect_args=connect_args
         )

         self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
+        self._logger.info(f'Engine dialect: {self._sa_engine.dialect.name}')
+        self._logger.info(f'Engine driver : {self._sa_engine.dialect.driver}')

         with self.engine.begin() as conn:
             tz_name = conn.execute(sql.text('SHOW TIME ZONE')).scalar()
@@ -770,6 +799,10 @@
             library_name=library_name or package_name,  # defaults to package_name unless specified otherwise
         )

+    def require_binary(self, binary_name: str) -> None:
+        if not shutil.which(binary_name):
+            raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
+
     def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
         """
         Checks whether the specified optional package is available. If not, raises an exception
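
The env.py changes consolidate time-zone resolution: `_get_tz_name()` prefers the configured `time_zone` value and otherwise falls back to the host's zone via `tzlocal`, so `_create_engine()` now always receives a concrete zone name. A simplified sketch of that fallback logic, assuming a plain dict in place of pixeltable's `Config` object and returning the fallback (rather than just logging) on an invalid name:

```python
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

import tzlocal  # same third-party dependency this release introduces


def resolve_tz_name(config: dict) -> str:
    """Prefer a configured IANA zone name; fall back to the system zone."""
    tz_name = config.get('time_zone')
    if isinstance(tz_name, str):
        try:
            _ = ZoneInfo(tz_name)  # raises if not a valid IANA zone name
            return tz_name
        except ZoneInfoNotFoundError:
            pass  # invalid name: fall through to the system default
    return tzlocal.get_localzone_name()


print(resolve_tz_name({'time_zone': 'America/New_York'}))  # America/New_York
print(resolve_tz_name({}))  # whatever the host is set to, e.g. Europe/Berlin
```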
pixeltable/exec/__init__.py CHANGED
@@ -2,6 +2,8 @@

 from .aggregation_node import AggregationNode
 from .cache_prefetch_node import CachePrefetchNode
+from .cell_materialization_node import CellMaterializationNode
+from .cell_reconstruction_node import CellReconstructionNode
 from .component_iteration_node import ComponentIterationNode
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
pixeltable/exec/cell_materialization_node.py ADDED
@@ -0,0 +1,231 @@
+from __future__ import annotations
+
+import io
+import logging
+import os
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
+import PIL.Image
+import sqlalchemy as sql
+
+import pixeltable.type_system as ts
+import pixeltable.utils.image as image_utils
+from pixeltable import catalog, exprs
+from pixeltable.env import Env
+from pixeltable.utils.local_store import LocalStore
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+class CellMaterializationNode(ExecNode):
+    """
+    Node to populate DataRow.cell_vals/cell_md.
+
+    For now, the scope is limited to populating DataRow.cell_vals for json and array columns.
+
+    Array values:
+    - Arrays < MAX_DB_ARRAY_SIZE are stored inline in the db column
+    - Larger arrays are written to inlined_obj_files
+    - Bool arrays are stored as packed bits (uint8)
+    - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
+      (this allows us to query cell_md to get the total external storage size of an array column)
+
+    Json values:
+    - Inlined images and arrays are written to inlined_obj_files and replaced with a dict containing the object
+      location
+    - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
+    - cell_md contains the list of urls for the inlined objects.
+
+    TODO:
+    - execute file IO via asyncio Tasks in a thread pool?
+      (we already seem to be getting 90% of hardware IO throughput)
+    - subsume all cell materialization
+    """
+
+    output_col_info: dict[catalog.Column, int]  # value: slot idx
+
+    # execution state
+    inlined_obj_files: list[Path]  # only [-1] is open for writing
+    buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]
+
+    MIN_FILE_SIZE = 8 * 2**20  # 8MB
+    MAX_DB_ARRAY_SIZE = 512  # max size of array stored in table column; in bytes
+
+    def __init__(self, input: ExecNode):
+        super().__init__(input.row_builder, [], [], input)
+        self.output_col_info = {
+            col: slot_idx
+            for col, slot_idx in input.row_builder.table_columns.items()
+            if slot_idx is not None and (col.col_type.is_json_type() or col.col_type.is_array_type())
+        }
+        self.inlined_obj_files = []
+        self.buffered_writer = None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col, slot_idx in self.output_col_info.items():
+                    if row.has_exc(slot_idx):
+                        # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        exc = row.get_exc(slot_idx)
+                        row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
+                        continue
+
+                    val = row[slot_idx]
+                    if val is None:
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        row.cell_md[col.id] = None
+                        continue
+
+                    if col.col_type.is_json_type():
+                        self._materialize_json_cell(row, col, val)
+                    else:
+                        assert col.col_type.is_array_type()
+                        assert isinstance(val, np.ndarray)
+                        self._materialize_array_cell(row, col, val)
+
+            # continue with only the currently open file
+            self.inlined_obj_files = self.inlined_obj_files[-1:]
+
+            yield batch
+
+        self._flush_buffer(finalize=True)
+
+    def init_writer(self) -> None:
+        if self.buffered_writer is None:
+            self._reset_buffer()
+        assert self.buffered_writer is not None
+
+    def close(self) -> None:
+        if self.buffered_writer is not None:
+            # there must have been an error, otherwise _flush_full_buffer(finalize=True) would have set this to None
+            self.buffered_writer.close()
+            self.buffered_writer = None
+
+    def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
+        if self._json_has_inlined_objs(val):
+            row.cell_vals[col.id] = self._rewrite_json(val)
+            row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
+        else:
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+
+    def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
+        if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
+            # this is a vector column (ie, used for a vector index): store the array itself
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        elif val.nbytes <= self.MAX_DB_ARRAY_SIZE:
+            # this array is small enough to store in the db column (type: binary) directly
+            buffer = io.BytesIO()
+            np.save(buffer, val, allow_pickle=False)
+            row.cell_vals[col.id] = buffer.getvalue()
+            row.cell_md[col.id] = None
+        else:
+            # append this array to the buffer and store its location in the cell md
+            ar: np.ndarray
+            if np.issubdtype(val.dtype, np.bool_):
+                # for bool arrays, store as packed bits, otherwise it's 1 byte per element
+                ar = np.packbits(val)
+            else:
+                ar = val
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            np.save(self.buffered_writer, ar, allow_pickle=False)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
+            )
+            if np.issubdtype(val.dtype, np.bool_):
+                cell_md.array_md.is_bool = True
+                cell_md.array_md.shape = val.shape
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _json_has_inlined_objs(self, element: Any) -> bool:
+        if isinstance(element, list):
+            return any(self._json_has_inlined_objs(v) for v in element)
+        if isinstance(element, dict):
+            return any(self._json_has_inlined_objs(v) for v in element.values())
+        return isinstance(element, (np.ndarray, PIL.Image.Image))
+
+    def _rewrite_json(self, element: Any) -> Any:
+        """Recursively rewrites a JSON structure by writing any inlined arrays or images to self.buffered_writer."""
+        if isinstance(element, list):
+            return [self._rewrite_json(v) for v in element]
+        if isinstance(element, dict):
+            return {k: self._rewrite_json(v) for k, v in element.items()}
+        if isinstance(element, np.ndarray):
+            obj_md = self._write_inlined_array(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, PIL.Image.Image):
+            obj_md = self._write_inlined_image(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        return element
+
+    def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
+        """Write an ndarray to buffered_writer and return its metadata."""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        shape: tuple[int, ...] | None
+        is_bool_array: bool
+        if np.issubdtype(ar.dtype, np.bool_):
+            shape = ar.shape
+            ar = np.packbits(ar)
+            is_bool_array = True
+        else:
+            shape = None
+            is_bool_array = False
+        np.save(self.buffered_writer, ar, allow_pickle=False)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.ARRAY.name,
+            url_idx=url_idx,
+            array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
+        )
+
+    def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
+        """Write a PIL image to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        img.save(self.buffered_writer, format=image_utils.default_format(img))
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)
+
+    def _reset_buffer(self) -> None:
+        local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
+            self.row_builder.tbl.id, 0, self.row_builder.tbl.version
+        )
+        self.inlined_obj_files.append(local_path)
+        fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
+        assert isinstance(fh, io.BufferedWriter)
+        self.buffered_writer = fh
+
+    def _flush_buffer(self, finalize: bool = False) -> None:
+        """Flush buffered_writer to storage if it exceeds its minimum size or finalize is True."""
+        if self.buffered_writer is None:
+            return
+        if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
+            return
+        self.buffered_writer.flush()
+        os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
+        self.buffered_writer.close()
+        if finalize:
+            self.buffered_writer = None
+        else:
+            self._reset_buffer()
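
One subtlety worth calling out: `np.packbits` flattens a bool array and zero-pads it to a byte boundary, so the packed bytes alone are not invertible. That is why the node records `is_bool` and `shape` in `ArrayMd`. A minimal round trip showing the invariant:

```python
import numpy as np

mask = np.random.rand(3, 5) > 0.5  # bool arrays cost 1 byte per element in memory
packed = np.packbits(mask)         # flattened, 8 flags per byte, zero-padded

# count= trims the padding bits; shape restores the original layout
restored = np.unpackbits(packed, count=mask.size).reshape(mask.shape).astype(bool)
assert (restored == mask).all()
```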
pixeltable/exec/cell_reconstruction_node.py ADDED
@@ -0,0 +1,135 @@
+from __future__ import annotations
+
+import io
+import logging
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import PIL.Image
+
+import pixeltable.type_system as ts
+from pixeltable import exprs
+from pixeltable.utils import parse_local_file_path
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+def json_has_inlined_objs(element: Any) -> bool:
+    """Returns True if element contains inlined objects produced by CellMaterializationNode."""
+    if isinstance(element, list):
+        return any(json_has_inlined_objs(v) for v in element)
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            return True
+        return any(json_has_inlined_objs(v) for v in element.values())
+    return False
+
+
+def reconstruct_json(element: Any, urls: list[str], file_handles: dict[Path, io.BufferedReader]) -> Any:
+    """Recursively reconstructs inlined objects in a json structure."""
+    if isinstance(element, list):
+        return [reconstruct_json(v, urls, file_handles) for v in element]
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            obj_md = InlinedObjectMd.from_dict(element[INLINED_OBJECT_MD_KEY])
+            url = urls[obj_md.url_idx]
+            local_path = parse_local_file_path(url)
+            if local_path not in file_handles:
+                file_handles[local_path] = open(local_path, 'rb')  # noqa: SIM115
+            fp = file_handles[local_path]
+
+            if obj_md.type == ts.ColumnType.Type.ARRAY.name:
+                fp.seek(obj_md.array_md.start)
+                ar = load_array(
+                    fp, obj_md.array_md.start, obj_md.array_md.end, obj_md.array_md.is_bool, obj_md.array_md.shape
+                )
+                return ar
+            else:
+                fp.seek(obj_md.img_start)
+                bytesio = io.BytesIO(fp.read(obj_md.img_end - obj_md.img_start))
+                img = PIL.Image.open(bytesio)
+                img.load()
+                assert fp.tell() == obj_md.img_end, f'{fp.tell()} != {obj_md.img_end} / {obj_md.img_start}'
+                return img
+        else:
+            return {k: reconstruct_json(v, urls, file_handles) for k, v in element.items()}
+    return element
+
+
+def load_array(
+    fh: io.BufferedReader, start: int, end: int, is_bool_array: bool, shape: tuple[int, ...] | None
+) -> np.ndarray:
+    """Loads an array from a section of a file."""
+    fh.seek(start)
+    ar = np.load(fh, allow_pickle=False)
+    assert fh.tell() == end
+    if is_bool_array:
+        assert shape is not None
+        ar = np.unpackbits(ar, count=np.prod(shape)).reshape(shape).astype(bool)
+    return ar
+
+
+class CellReconstructionNode(ExecNode):
+    """
+    Reconstruction of stored json and array cells that were produced by CellMaterializationNode.
+    """
+
+    json_refs: list[exprs.ColumnRef]
+    array_refs: list[exprs.ColumnRef]
+    file_handles: dict[Path, io.BufferedReader]  # key: file path
+
+    def __init__(
+        self,
+        json_refs: list[exprs.ColumnRef],
+        array_refs: list[exprs.ColumnRef],
+        row_builder: exprs.RowBuilder,
+        input: ExecNode | None = None,
+    ):
+        super().__init__(row_builder, [], [], input)
+        self.json_refs = json_refs
+        self.array_refs = array_refs
+        self.file_handles = {}
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col_ref in self.json_refs:
+                    val = row[col_ref.slot_idx]
+                    if val is None:
+                        continue
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(row[col_ref.slot_idx]):
+                        continue
+                    row[col_ref.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
+
+                for col_ref in self.array_refs:
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is not None and cell_md.array_md is not None:
+                        assert row[col_ref.slot_idx] is None
+                        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
+                        row[col_ref.slot_idx] = self._reconstruct_array(cell_md)
+                    else:
+                        assert row[col_ref.slot_idx] is None or isinstance(row[col_ref.slot_idx], np.ndarray)
+
+            yield batch
+
+    def close(self) -> None:
+        for fp in self.file_handles.values():
+            fp.close()
+
+    def _reconstruct_array(self, cell_md: exprs.CellMd) -> np.ndarray:
+        assert cell_md.array_md is not None
+        local_path = parse_local_file_path(cell_md.file_urls[0])
+        assert local_path is not None
+        if local_path not in self.file_handles:
+            self.file_handles[local_path] = open(str(local_path), 'rb')  # noqa: SIM115
+        fp = self.file_handles[local_path]
+        ar = load_array(
+            fp, cell_md.array_md.start, cell_md.array_md.end, bool(cell_md.array_md.is_bool), cell_md.array_md.shape
+        )
+        return ar
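
Both nodes exploit the fact that `np.save` and `np.load` operate at the current position of any file-like object, so many arrays can share one file and be addressed by the `(start, end)` byte offsets that `ArrayMd` records. A self-contained sketch of the scheme, with an in-memory buffer standing in for an inlined-object file:

```python
import io

import numpy as np

buf = io.BytesIO()
offsets: list[tuple[int, int]] = []
for ar in (np.arange(10), np.ones((4, 4))):
    start = buf.tell()
    np.save(buf, ar, allow_pickle=False)  # appends .npy header + data in place
    offsets.append((start, buf.tell()))

# read back the second array by seeking to its recorded start offset
start, end = offsets[1]
buf.seek(start)
restored = np.load(buf, allow_pickle=False)
assert buf.tell() == end and restored.shape == (4, 4)
```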
pixeltable/exec/exec_node.py CHANGED
@@ -39,7 +39,7 @@ class ExecNode(abc.ABC):
         self.flushed_img_slots = [
             e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
         ]
-        self.ctx = None  # all nodes of a tree share the same context
+        self.ctx = input.ctx if input is not None else None

     def set_ctx(self, ctx: ExecContext) -> None:
         self.ctx = ctx
pixeltable/exec/expr_eval/evaluators.py CHANGED
@@ -311,6 +311,7 @@ class JsonMapperDispatcher(Evaluator):
             img_slot_idxs=[],
             media_slot_idxs=[],
             array_slot_idxs=[],
+            json_slot_idxs=[],
             parent_row=row,
             parent_slot_idx=self.e.slot_idx,
         )
pixeltable/exec/expr_eval/expr_eval_node.py CHANGED
@@ -306,6 +306,9 @@ class ExprEvalNode(ExecNode):
             task.cancel()
         _ = await asyncio.gather(*active_tasks, return_exceptions=True)

+        # expr cleanup
+        exprs.Expr.release_list(self.exec_ctx.all_exprs)
+
     def dispatch_exc(
         self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
     ) -> None:
@@ -390,6 +393,17 @@
         # end the main loop if we had an unhandled exception
         try:
             t.result()
+        except KeyboardInterrupt:
+            # ExprEvalNode instances are long-running and reused across multiple operations.
+            # When a user interrupts an operation (Ctrl+C), the main evaluation loop properly
+            # handles the KeyboardInterrupt and terminates the current operation. However,
+            # background tasks spawned by evaluators may complete asynchronously after the
+            # operation has ended, and their done callbacks will fire during subsequent
+            # operations. These "phantom" KeyboardInterrupt exceptions from previous
+            # operations' background tasks should not interfere with new operations, so we
+            # absorb them here rather than propagating them via self.error/self.exc_event.
+            _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
+            pass
         except asyncio.CancelledError:
             pass
         except Exception as exc:
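
The new `except KeyboardInterrupt` branch sits in a task done-callback, where `Task.result()` re-raises whatever exception terminated the task; for a long-lived node, a stale failure from a previous operation must not leak into the next one. A generic sketch of the absorb-in-callback pattern (a custom exception stands in for the interrupt, since asyncio gives `KeyboardInterrupt` itself additional propagation semantics):

```python
import asyncio


class StaleInterrupt(Exception):
    """Stand-in for a 'phantom' exception left behind by an earlier operation."""


async def worker() -> None:
    raise StaleInterrupt


def on_done(task: asyncio.Task) -> None:
    try:
        task.result()  # re-raises the exception that ended the task
    except StaleInterrupt:
        print('absorbed stale failure from a finished background task')
    except asyncio.CancelledError:
        pass


async def main() -> None:
    task = asyncio.create_task(worker())
    task.add_done_callback(on_done)
    await asyncio.sleep(0)  # worker runs and fails ...
    await asyncio.sleep(0)  # ... then the done callback fires


asyncio.run(main())
```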
pixeltable/exec/expr_eval/globals.py CHANGED
@@ -149,6 +149,7 @@ class ExecCtx:
     gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
     eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
     literals: dict[int, Any]  # key: slot idx; value: literal value for this slot; used to pre-populate rows
+    all_exprs: list[exprs.Expr]  # all evaluated exprs; needed for cleanup

     def __init__(
         self,
@@ -165,6 +166,7 @@
         self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False

         output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
+        self.all_exprs = output_ctx.exprs
         self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
         self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
         non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]
pixeltable/exec/globals.py ADDED
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import dataclasses
+
+from pixeltable.exprs import ArrayMd
+from pixeltable.utils.misc import non_none_dict_factory
+
+INLINED_OBJECT_MD_KEY = '__pxtinlinedobjmd__'
+
+
+@dataclasses.dataclass
+class InlinedObjectMd:
+    type: str  # corresponds to ts.ColumnType.Type
+    url_idx: int
+    img_start: int | None = None
+    img_end: int | None = None
+    array_md: ArrayMd | None = None
+
+    @classmethod
+    def from_dict(cls, d: dict) -> InlinedObjectMd:
+        if 'array_md' in d:
+            array_md = ArrayMd(**d['array_md'])
+            del d['array_md']
+            return cls(**d, array_md=array_md)
+        else:
+            return cls(**d)
+
+    def as_dict(self) -> dict:
+        result = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
+        if self.array_md is not None:
+            result['array_md'] = self.array_md.as_dict()
+        return result
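
`as_dict()` keeps the stored metadata compact by handing `dataclasses.asdict` a factory that drops `None`-valued fields. The implementation of `non_none_dict_factory` is not part of this diff; the sketch below is a plausible equivalent, named hypothetically:

```python
import dataclasses


def drop_none_factory(items: list[tuple[str, object]]) -> dict:
    # assumed behavior of non_none_dict_factory: omit fields whose value is None
    return {k: v for k, v in items if v is not None}


@dataclasses.dataclass
class Md:
    url_idx: int
    img_start: int | None = None


print(dataclasses.asdict(Md(url_idx=0), dict_factory=drop_none_factory))  # {'url_idx': 0}
```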
pixeltable/exec/object_store_save_node.py CHANGED
@@ -7,7 +7,6 @@ from collections import defaultdict, deque
 from concurrent import futures
 from pathlib import Path
 from typing import AsyncIterator, Iterator, NamedTuple, Optional
-from uuid import UUID

 from pixeltable import exprs
 from pixeltable.utils.object_stores import ObjectOps, ObjectPath, StorageTarget
@@ -81,9 +80,7 @@ class ObjectStoreSaveNode(ExecNode):
     num_missing: int  # number of references to media files in this row
     delete_destinations: list[Path]  # paths to delete after all copies are complete

-    def __init__(
-        self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
-    ):
+    def __init__(self, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True):
         # input_/output_exprs=[]: we don't have anything to evaluate
         super().__init__(input.row_builder, [], [], input)
         self.retain_input_order = retain_input_order