pixeltable 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)
  1. pixeltable/__init__.py +4 -0
  2. pixeltable/catalog/catalog.py +125 -63
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +4 -0
  6. pixeltable/catalog/table_version.py +174 -117
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/catalog/table_version_path.py +0 -11
  9. pixeltable/catalog/view.py +6 -0
  10. pixeltable/config.py +7 -0
  11. pixeltable/dataframe.py +10 -5
  12. pixeltable/env.py +56 -19
  13. pixeltable/exec/__init__.py +2 -0
  14. pixeltable/exec/cell_materialization_node.py +231 -0
  15. pixeltable/exec/cell_reconstruction_node.py +135 -0
  16. pixeltable/exec/exec_node.py +1 -1
  17. pixeltable/exec/expr_eval/evaluators.py +1 -0
  18. pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
  19. pixeltable/exec/expr_eval/globals.py +2 -0
  20. pixeltable/exec/globals.py +32 -0
  21. pixeltable/exec/object_store_save_node.py +1 -4
  22. pixeltable/exec/row_update_node.py +16 -9
  23. pixeltable/exec/sql_node.py +107 -14
  24. pixeltable/exprs/__init__.py +1 -1
  25. pixeltable/exprs/arithmetic_expr.py +23 -18
  26. pixeltable/exprs/column_property_ref.py +10 -10
  27. pixeltable/exprs/column_ref.py +2 -2
  28. pixeltable/exprs/data_row.py +106 -37
  29. pixeltable/exprs/expr.py +9 -0
  30. pixeltable/exprs/expr_set.py +14 -7
  31. pixeltable/exprs/inline_expr.py +2 -19
  32. pixeltable/exprs/json_path.py +45 -12
  33. pixeltable/exprs/row_builder.py +54 -22
  34. pixeltable/functions/__init__.py +1 -0
  35. pixeltable/functions/bedrock.py +7 -0
  36. pixeltable/functions/deepseek.py +11 -4
  37. pixeltable/functions/llama_cpp.py +7 -0
  38. pixeltable/functions/math.py +1 -1
  39. pixeltable/functions/ollama.py +7 -0
  40. pixeltable/functions/openai.py +4 -4
  41. pixeltable/functions/openrouter.py +143 -0
  42. pixeltable/functions/video.py +110 -28
  43. pixeltable/globals.py +10 -4
  44. pixeltable/io/globals.py +18 -17
  45. pixeltable/io/parquet.py +1 -1
  46. pixeltable/io/table_data_conduit.py +47 -22
  47. pixeltable/iterators/document.py +61 -23
  48. pixeltable/iterators/video.py +126 -53
  49. pixeltable/metadata/__init__.py +1 -1
  50. pixeltable/metadata/converters/convert_40.py +73 -0
  51. pixeltable/metadata/notes.py +1 -0
  52. pixeltable/plan.py +175 -46
  53. pixeltable/share/packager.py +155 -26
  54. pixeltable/store.py +2 -3
  55. pixeltable/type_system.py +5 -3
  56. pixeltable/utils/arrow.py +6 -6
  57. pixeltable/utils/av.py +65 -0
  58. pixeltable/utils/console_output.py +4 -1
  59. pixeltable/utils/exception_handler.py +5 -28
  60. pixeltable/utils/image.py +7 -0
  61. pixeltable/utils/misc.py +5 -0
  62. pixeltable/utils/object_stores.py +16 -1
  63. pixeltable/utils/s3_store.py +44 -11
  64. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
  65. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
  66. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
  67. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
  68. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/dataframe.py CHANGED
@@ -23,7 +23,7 @@ from typing import (
 
 import pandas as pd
 import pydantic
-import sqlalchemy as sql
+import sqlalchemy.exc as sql_exc
 
 from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
 from pixeltable.catalog import Catalog, is_valid_identifier
@@ -186,6 +186,8 @@ class DataFrameResultSet:
 
 
 class DataFrame:
+    """Represents a query for retrieving and transforming data from Pixeltable tables."""
+
     _from_clause: plan.FromClause
     _select_list_exprs: list[exprs.Expr]
     _schema: dict[str, ts.ColumnType]
@@ -539,20 +541,23 @@
                 yield [data_row[e.slot_idx] for e in self._select_list_exprs]
         except excs.ExprEvalError as e:
             self._raise_expr_eval_err(e)
-        except sql.exc.DBAPIError as e:
-            raise excs.Error(f'Error during SQL execution:\n{e}') from e
+        except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
+            Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
+            raise  # just re-raise if not converted to a Pixeltable error
 
     def collect(self) -> DataFrameResultSet:
         return DataFrameResultSet(list(self._output_row_iterator()), self.schema)
 
     async def _acollect(self) -> DataFrameResultSet:
+        single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
         try:
            result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec()]
            return DataFrameResultSet(result, self.schema)
        except excs.ExprEvalError as e:
            self._raise_expr_eval_err(e)
-       except sql.exc.DBAPIError as e:
-           raise excs.Error(f'Error during SQL execution:\n{e}') from e
+       except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
+           Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
+           raise  # just re-raise if not converted to a Pixeltable error
 
     def count(self) -> int:
         """Return the number of rows in the DataFrame.
pixeltable/env.py CHANGED
@@ -27,6 +27,7 @@ from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
+import tzlocal
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
 from sqlalchemy import orm
 from tenacity import retry, stop_after_attempt, wait_exponential_jitter
@@ -71,6 +72,7 @@ class Env:
     _db_server: Optional[pixeltable_pgserver.PostgresServer]  # set only when running in local environment
     _db_url: Optional[str]
     _default_time_zone: Optional[ZoneInfo]
+    _verbosity: int
 
     # info about optional packages that are utilized by some parts of the code
     __optional_packages: dict[str, PackageInfo]
@@ -218,10 +220,18 @@
         """
         This is not a publicly visible setter; it is only for testing purposes.
         """
-        tz_name = None if tz is None else tz.key
+        if tz is None:
+            tz_name = self._get_tz_name()
+        else:
+            assert isinstance(tz, ZoneInfo)
+            tz_name = tz.key
         self.engine.dispose()
         self._create_engine(time_zone_name=tz_name)
 
+    @property
+    def verbosity(self) -> int:
+        return self._verbosity
+
     @property
     def conn(self) -> Optional[sql.Connection]:
         assert self._current_conn is not None
@@ -237,6 +247,11 @@
         assert self._dbms is not None
         return self._dbms
 
+    @property
+    def is_using_cockroachdb(self) -> bool:
+        assert self._dbms is not None
+        return isinstance(self._dbms, CockroachDbms)
+
     @property
     def in_xact(self) -> bool:
         return self._current_conn is not None
@@ -247,7 +262,7 @@
         return self._db_server is not None
 
     @contextmanager
-    def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
+    def begin_xact(self, *, for_write: bool = False) -> Iterator[sql.Connection]:
         """
         Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
 
@@ -340,6 +355,8 @@
         # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
         path_parts = list(Path(record.pathname).parts)
         path_parts.reverse()
+        if 'pixeltable' not in path_parts:
+            return False
         max_idx = path_parts.index('pixeltable')
         for module_name in path_parts[:max_idx]:
             if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
@@ -350,6 +367,26 @@
     def console_logger(self) -> ConsoleLogger:
         return self._console_logger
 
+    def _get_tz_name(self) -> str:
+        """Get the time zone name from the configuration, or the system local time zone if not specified.
+
+        Returns:
+            str: The time zone name.
+        """
+        tz_name = Config.get().get_string_value('time_zone')
+        if tz_name is not None:
+            # Validate tz_name
+            if not isinstance(tz_name, str):
+                self._logger.error('Invalid time zone specified in configuration.')
+            else:
+                try:
+                    _ = ZoneInfo(tz_name)
+                except ZoneInfoNotFoundError:
+                    self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
+        else:
+            tz_name = tzlocal.get_localzone_name()
+        return tz_name
+
     def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
         if self._initialized:
             return
@@ -393,10 +430,12 @@
         warnings.simplefilter('ignore', category=UserWarning)
         warnings.simplefilter('ignore', category=FutureWarning)
 
-        # Set verbose level for user visible console messages
-        verbosity = map_level(config.get_int_value('verbosity'))
+        # Set verbosity level for user visible console messages
+        self._verbosity = config.get_int_value('verbosity')
+        if self._verbosity is None:
+            self._verbosity = 1
         stdout_handler = ConsoleOutputHandler(stream=stdout)
-        stdout_handler.setLevel(verbosity)
+        stdout_handler.setLevel(map_level(self._verbosity))
         stdout_handler.addFilter(ConsoleMessageFilter())
         self._logger.addHandler(stdout_handler)
         self._console_logger = ConsoleLogger(self._logger)
@@ -430,6 +469,7 @@
         http_logger.propagate = False
 
         self.clear_tmp_dir()
+        tz_name = self._get_tz_name()
 
         # configure pixeltable database
         self._init_db(config)
@@ -439,22 +479,10 @@
                 'Reinitializing pixeltable database is not supported when running in non-local environment'
             )
 
-        tz_name = config.get_string_value('time_zone')
-        if tz_name is not None:
-            # Validate tz_name
-            if not isinstance(tz_name, str):
-                self._logger.error('Invalid time zone specified in configuration.')
-            else:
-                try:
-                    _ = ZoneInfo(tz_name)
-                except ZoneInfoNotFoundError:
-                    self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
-
         if reinit_db and self._store_db_exists():
             self._drop_store_db()
 
         create_db = not self._store_db_exists()
-
         if create_db:
             self._logger.info(f'creating database at: {self.db_url}')
             self._create_store_db()
@@ -534,19 +562,28 @@
         metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
         metadata.create_system_info(self._sa_engine)
 
-    def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
-        connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
+    def _create_engine(self, time_zone_name: str, echo: bool = False) -> None:
+        connect_args = {'options': f'-c timezone={time_zone_name}'}
+        self._logger.info(f'Creating SQLAlchemy engine with connection arguments: {connect_args}')
         self._sa_engine = sql.create_engine(
             self.db_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level, connect_args=connect_args
         )
 
         self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
+        self._logger.info(f'Engine dialect: {self._sa_engine.dialect.name}')
+        self._logger.info(f'Engine driver : {self._sa_engine.dialect.driver}')
 
         with self.engine.begin() as conn:
             tz_name = conn.execute(sql.text('SHOW TIME ZONE')).scalar()
             assert isinstance(tz_name, str)
             self._logger.info(f'Database time zone is now: {tz_name}')
             self._default_time_zone = ZoneInfo(tz_name)
+            if self.is_using_cockroachdb:
+                # This could be set when the database is created, but we set it now
+                conn.execute(sql.text('SET null_ordered_last = true;'))
+                null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
+                assert isinstance(null_ordered_last, str)
+                self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
 
     def _store_db_exists(self) -> bool:
         assert self._db_name is not None
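
Note: `_create_engine()` now always pins the database session time zone via libpq's `options` connection parameter. A standalone sketch of that mechanism (the URL is hypothetical; this is not Pixeltable's actual `Env` code):

```python
import sqlalchemy as sql

# '-c timezone=...' sets a server session variable at connect time;
# SQLAlchemy forwards connect_args to the DBAPI driver unchanged.
engine = sql.create_engine(
    'postgresql+psycopg://localhost/pixeltable',  # hypothetical URL
    connect_args={'options': '-c timezone=UTC'},
)
with engine.begin() as conn:
    print(conn.execute(sql.text('SHOW TIME ZONE')).scalar())  # -> 'UTC'
```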
pixeltable/exec/__init__.py CHANGED
@@ -2,6 +2,8 @@
 
 from .aggregation_node import AggregationNode
 from .cache_prefetch_node import CachePrefetchNode
+from .cell_materialization_node import CellMaterializationNode
+from .cell_reconstruction_node import CellReconstructionNode
 from .component_iteration_node import ComponentIterationNode
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
pixeltable/exec/cell_materialization_node.py ADDED
@@ -0,0 +1,231 @@
+from __future__ import annotations
+
+import io
+import logging
+import os
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
+import PIL.Image
+import sqlalchemy as sql
+
+import pixeltable.type_system as ts
+import pixeltable.utils.image as image_utils
+from pixeltable import catalog, exprs
+from pixeltable.env import Env
+from pixeltable.utils.local_store import LocalStore
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+class CellMaterializationNode(ExecNode):
+    """
+    Node to populate DataRow.cell_vals/cell_md.
+
+    For now, the scope is limited to populating DataRow.cell_vals for json and array columns.
+
+    Array values:
+    - Arrays < MAX_DB_ARRAY_SIZE are stored inline in the db column
+    - Larger arrays are written to inlined_obj_files
+    - Bool arrays are stored as packed bits (uint8)
+    - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
+      (this allows us to query cell_md to get the total external storage size of an array column)
+
+    Json values:
+    - Inlined images and arrays are written to inlined_obj_files and replaced with a dict containing the object
+      location
+    - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
+    - cell_md contains the list of urls for the inlined objects.
+
+    TODO:
+    - execute file IO via asyncio Tasks in a thread pool?
+      (we already seem to be getting 90% of hardware IO throughput)
+    - subsume all cell materialization
+    """
+
+    output_col_info: dict[catalog.Column, int]  # value: slot idx
+
+    # execution state
+    inlined_obj_files: list[Path]  # only [-1] is open for writing
+    buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]
+
+    MIN_FILE_SIZE = 8 * 2**20  # 8MB
+    MAX_DB_ARRAY_SIZE = 512  # max size of array stored in table column; in bytes
+
+    def __init__(self, input: ExecNode):
+        super().__init__(input.row_builder, [], [], input)
+        self.output_col_info = {
+            col: slot_idx
+            for col, slot_idx in input.row_builder.table_columns.items()
+            if slot_idx is not None and (col.col_type.is_json_type() or col.col_type.is_array_type())
+        }
+        self.inlined_obj_files = []
+        self.buffered_writer = None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col, slot_idx in self.output_col_info.items():
+                    if row.has_exc(slot_idx):
+                        # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        exc = row.get_exc(slot_idx)
+                        row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
+                        continue
+
+                    val = row[slot_idx]
+                    if val is None:
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        row.cell_md[col.id] = None
+                        continue
+
+                    if col.col_type.is_json_type():
+                        self._materialize_json_cell(row, col, val)
+                    else:
+                        assert col.col_type.is_array_type()
+                        assert isinstance(val, np.ndarray)
+                        self._materialize_array_cell(row, col, val)
+
+            # continue with only the currently open file
+            self.inlined_obj_files = self.inlined_obj_files[-1:]
+
+            yield batch
+
+        self._flush_buffer(finalize=True)
+
+    def init_writer(self) -> None:
+        if self.buffered_writer is None:
+            self._reset_buffer()
+        assert self.buffered_writer is not None
+
+    def close(self) -> None:
+        if self.buffered_writer is not None:
+            # there must have been an error, otherwise _flush_full_buffer(finalize=True) would have set this to None
+            self.buffered_writer.close()
+            self.buffered_writer = None
+
+    def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
+        if self._json_has_inlined_objs(val):
+            row.cell_vals[col.id] = self._rewrite_json(val)
+            row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
+        else:
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+
+    def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
+        if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
+            # this is a vector column (ie, used for a vector index): store the array itself
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        elif val.nbytes <= self.MAX_DB_ARRAY_SIZE:
+            # this array is small enough to store in the db column (type: binary) directly
+            buffer = io.BytesIO()
+            np.save(buffer, val, allow_pickle=False)
+            row.cell_vals[col.id] = buffer.getvalue()
+            row.cell_md[col.id] = None
+        else:
+            # append this array to the buffer and store its location in the cell md
+            ar: np.ndarray
+            if np.issubdtype(val.dtype, np.bool_):
+                # for bool arrays, store as packed bits, otherwise it's 1 byte per element
+                ar = np.packbits(val)
+            else:
+                ar = val
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            np.save(self.buffered_writer, ar, allow_pickle=False)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
+            )
+            if np.issubdtype(val.dtype, np.bool_):
+                cell_md.array_md.is_bool = True
+                cell_md.array_md.shape = val.shape
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _json_has_inlined_objs(self, element: Any) -> bool:
+        if isinstance(element, list):
+            return any(self._json_has_inlined_objs(v) for v in element)
+        if isinstance(element, dict):
+            return any(self._json_has_inlined_objs(v) for v in element.values())
+        return isinstance(element, (np.ndarray, PIL.Image.Image))
+
+    def _rewrite_json(self, element: Any) -> Any:
+        """Recursively rewrites a JSON structure by writing any inlined arrays or images to self.buffered_writer."""
+        if isinstance(element, list):
+            return [self._rewrite_json(v) for v in element]
+        if isinstance(element, dict):
+            return {k: self._rewrite_json(v) for k, v in element.items()}
+        if isinstance(element, np.ndarray):
+            obj_md = self._write_inlined_array(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, PIL.Image.Image):
+            obj_md = self._write_inlined_image(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        return element
+
+    def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
+        """Write an ndarray to buffered_writer and return its metadata."""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        shape: tuple[int, ...] | None
+        is_bool_array: bool
+        if np.issubdtype(ar.dtype, np.bool_):
+            shape = ar.shape
+            ar = np.packbits(ar)
+            is_bool_array = True
+        else:
+            shape = None
+            is_bool_array = False
+        np.save(self.buffered_writer, ar, allow_pickle=False)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.ARRAY.name,
+            url_idx=url_idx,
+            array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
+        )
+
+    def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
+        """Write a PIL image to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        img.save(self.buffered_writer, format=image_utils.default_format(img))
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)
+
+    def _reset_buffer(self) -> None:
+        local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
+            self.row_builder.tbl.id, 0, self.row_builder.tbl.version
+        )
+        self.inlined_obj_files.append(local_path)
+        fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
+        assert isinstance(fh, io.BufferedWriter)
+        self.buffered_writer = fh
+
+    def _flush_buffer(self, finalize: bool = False) -> None:
+        """Flush buffered_writer to storage if it exceeds its minimum size or finalize is True."""
+        if self.buffered_writer is None:
+            return
+        if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
+            return
+        self.buffered_writer.flush()
+        os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
+        self.buffered_writer.close()
+        if finalize:
+            self.buffered_writer = None
+        else:
+            self._reset_buffer()
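
Note: the bool-array encoding described in the class docstring above (packed bits plus a recorded shape) round-trips as follows; a minimal sketch, independent of Pixeltable:

```python
import numpy as np

mask = np.random.rand(5, 7) > 0.5           # bool array, 35 elements
packed = np.packbits(mask)                   # uint8, ceil(35 / 8) = 5 bytes
restored = (
    np.unpackbits(packed, count=mask.size)   # count drops the padding bits
    .reshape(mask.shape)
    .astype(bool)
)
assert (restored == mask).all()
```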
pixeltable/exec/cell_reconstruction_node.py ADDED
@@ -0,0 +1,135 @@
+from __future__ import annotations
+
+import io
+import logging
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import PIL.Image
+
+import pixeltable.type_system as ts
+from pixeltable import exprs
+from pixeltable.utils import parse_local_file_path
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+def json_has_inlined_objs(element: Any) -> bool:
+    """Returns True if element contains inlined objects produced by CellMaterializationNode."""
+    if isinstance(element, list):
+        return any(json_has_inlined_objs(v) for v in element)
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            return True
+        return any(json_has_inlined_objs(v) for v in element.values())
+    return False
+
+
+def reconstruct_json(element: Any, urls: list[str], file_handles: dict[Path, io.BufferedReader]) -> Any:
+    """Recursively reconstructs inlined objects in a json structure."""
+    if isinstance(element, list):
+        return [reconstruct_json(v, urls, file_handles) for v in element]
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            obj_md = InlinedObjectMd.from_dict(element[INLINED_OBJECT_MD_KEY])
+            url = urls[obj_md.url_idx]
+            local_path = parse_local_file_path(url)
+            if local_path not in file_handles:
+                file_handles[local_path] = open(local_path, 'rb')  # noqa: SIM115
+            fp = file_handles[local_path]
+
+            if obj_md.type == ts.ColumnType.Type.ARRAY.name:
+                fp.seek(obj_md.array_md.start)
+                ar = load_array(
+                    fp, obj_md.array_md.start, obj_md.array_md.end, obj_md.array_md.is_bool, obj_md.array_md.shape
+                )
+                return ar
+            else:
+                fp.seek(obj_md.img_start)
+                bytesio = io.BytesIO(fp.read(obj_md.img_end - obj_md.img_start))
+                img = PIL.Image.open(bytesio)
+                img.load()
+                assert fp.tell() == obj_md.img_end, f'{fp.tell()} != {obj_md.img_end} / {obj_md.img_start}'
+                return img
+        else:
+            return {k: reconstruct_json(v, urls, file_handles) for k, v in element.items()}
+    return element
+
+
+def load_array(
+    fh: io.BufferedReader, start: int, end: int, is_bool_array: bool, shape: tuple[int, ...] | None
+) -> np.ndarray:
+    """Loads an array from a section of a file."""
+    fh.seek(start)
+    ar = np.load(fh, allow_pickle=False)
+    assert fh.tell() == end
+    if is_bool_array:
+        assert shape is not None
+        ar = np.unpackbits(ar, count=np.prod(shape)).reshape(shape).astype(bool)
+    return ar
+
+
+class CellReconstructionNode(ExecNode):
+    """
+    Reconstruction of stored json and array cells that were produced by CellMaterializationNode.
+    """
+
+    json_refs: list[exprs.ColumnRef]
+    array_refs: list[exprs.ColumnRef]
+    file_handles: dict[Path, io.BufferedReader]  # key: file path
+
+    def __init__(
+        self,
+        json_refs: list[exprs.ColumnRef],
+        array_refs: list[exprs.ColumnRef],
+        row_builder: exprs.RowBuilder,
+        input: ExecNode | None = None,
+    ):
+        super().__init__(row_builder, [], [], input)
+        self.json_refs = json_refs
+        self.array_refs = array_refs
+        self.file_handles = {}
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col_ref in self.json_refs:
+                    val = row[col_ref.slot_idx]
+                    if val is None:
+                        continue
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(row[col_ref.slot_idx]):
+                        continue
+                    row[col_ref.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
+
+                for col_ref in self.array_refs:
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is not None and cell_md.array_md is not None:
+                        assert row[col_ref.slot_idx] is None
+                        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
+                        row[col_ref.slot_idx] = self._reconstruct_array(cell_md)
+                    else:
+                        assert row[col_ref.slot_idx] is None or isinstance(row[col_ref.slot_idx], np.ndarray)
+
+            yield batch
+
+    def close(self) -> None:
+        for fp in self.file_handles.values():
+            fp.close()
+
+    def _reconstruct_array(self, cell_md: exprs.CellMd) -> np.ndarray:
+        assert cell_md.array_md is not None
+        local_path = parse_local_file_path(cell_md.file_urls[0])
+        assert local_path is not None
+        if local_path not in self.file_handles:
+            self.file_handles[local_path] = open(str(local_path), 'rb')  # noqa: SIM115
+        fp = self.file_handles[local_path]
+        ar = load_array(
+            fp, cell_md.array_md.start, cell_md.array_md.end, bool(cell_md.array_md.is_bool), cell_md.array_md.shape
+        )
+        return ar
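
Note: `CellMaterializationNode` and `CellReconstructionNode` share a simple file layout: arrays are appended to one file with `np.save`, each addressed by its `(start, end)` byte offsets, and seeking to `start` before `np.load` recovers exactly one of them. A minimal sketch of that layout, using an in-memory buffer in place of a real inlined-object file:

```python
import io

import numpy as np

buf = io.BytesIO()
offsets = []
for ar in (np.arange(10), np.ones((3, 3))):
    start = buf.tell()
    np.save(buf, ar, allow_pickle=False)   # append one array
    offsets.append((start, buf.tell()))    # record its byte range

start, end = offsets[1]                    # read back only the second array
buf.seek(start)
second = np.load(buf, allow_pickle=False)
assert buf.tell() == end and (second == np.ones((3, 3))).all()
```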
pixeltable/exec/exec_node.py CHANGED
@@ -39,7 +39,7 @@ class ExecNode(abc.ABC):
         self.flushed_img_slots = [
             e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
         ]
-        self.ctx = None  # all nodes of a tree share the same context
+        self.ctx = input.ctx if input is not None else None
 
     def set_ctx(self, ctx: ExecContext) -> None:
         self.ctx = ctx
pixeltable/exec/expr_eval/evaluators.py CHANGED
@@ -311,6 +311,7 @@ class JsonMapperDispatcher(Evaluator):
             img_slot_idxs=[],
             media_slot_idxs=[],
             array_slot_idxs=[],
+            json_slot_idxs=[],
             parent_row=row,
             parent_slot_idx=self.e.slot_idx,
         )
pixeltable/exec/expr_eval/expr_eval_node.py CHANGED
@@ -306,6 +306,9 @@ class ExprEvalNode(ExecNode):
             task.cancel()
         _ = await asyncio.gather(*active_tasks, return_exceptions=True)
 
+        # expr cleanup
+        exprs.Expr.release_list(self.exec_ctx.all_exprs)
+
     def dispatch_exc(
         self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
     ) -> None:
pixeltable/exec/expr_eval/globals.py CHANGED
@@ -149,6 +149,7 @@ class ExecCtx:
     gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
     eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
     literals: dict[int, Any]  # key: slot idx; value: literal value for this slot; used to pre-populate rows
+    all_exprs: list[exprs.Expr]  # all evaluated exprs; needed for cleanup
 
     def __init__(
         self,
@@ -165,6 +166,7 @@
         self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False
 
         output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
+        self.all_exprs = output_ctx.exprs
         self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
         self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
         non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]
pixeltable/exec/globals.py ADDED
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import dataclasses
+
+from pixeltable.exprs import ArrayMd
+from pixeltable.utils.misc import non_none_dict_factory
+
+INLINED_OBJECT_MD_KEY = '__pxtinlinedobjmd__'
+
+
+@dataclasses.dataclass
+class InlinedObjectMd:
+    type: str  # corresponds to ts.ColumnType.Type
+    url_idx: int
+    img_start: int | None = None
+    img_end: int | None = None
+    array_md: ArrayMd | None = None
+
+    @classmethod
+    def from_dict(cls, d: dict) -> InlinedObjectMd:
+        if 'array_md' in d:
+            array_md = ArrayMd(**d['array_md'])
+            del d['array_md']
+            return cls(**d, array_md=array_md)
+        else:
+            return cls(**d)
+
+    def as_dict(self) -> dict:
+        result = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
+        if self.array_md is not None:
+            result['array_md'] = self.array_md.as_dict()
+        return result
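
Note: `non_none_dict_factory` is imported from `pixeltable.utils.misc`, but its definition is not part of this diff. Judging by how `as_dict()` uses it, a plausible implementation (an assumption, not the actual package code) would drop `None`-valued fields:

```python
from typing import Any


def non_none_dict_factory(items: list[tuple[str, Any]]) -> dict[str, Any]:
    # dataclasses.asdict() passes a list of (field_name, value) pairs;
    # keep only the fields that are actually set (hypothetical definition)
    return {k: v for k, v in items if v is not None}
```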