pixeltable 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +4 -0
- pixeltable/catalog/catalog.py +125 -63
- pixeltable/catalog/column.py +7 -2
- pixeltable/catalog/table.py +1 -0
- pixeltable/catalog/table_metadata.py +4 -0
- pixeltable/catalog/table_version.py +174 -117
- pixeltable/catalog/table_version_handle.py +4 -1
- pixeltable/catalog/table_version_path.py +0 -11
- pixeltable/catalog/view.py +6 -0
- pixeltable/config.py +7 -0
- pixeltable/dataframe.py +10 -5
- pixeltable/env.py +56 -19
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +1 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
- pixeltable/exec/expr_eval/globals.py +2 -0
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/object_store_save_node.py +1 -4
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +107 -14
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +23 -18
- pixeltable/exprs/column_property_ref.py +10 -10
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/data_row.py +106 -37
- pixeltable/exprs/expr.py +9 -0
- pixeltable/exprs/expr_set.py +14 -7
- pixeltable/exprs/inline_expr.py +2 -19
- pixeltable/exprs/json_path.py +45 -12
- pixeltable/exprs/row_builder.py +54 -22
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/bedrock.py +7 -0
- pixeltable/functions/deepseek.py +11 -4
- pixeltable/functions/llama_cpp.py +7 -0
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/ollama.py +7 -0
- pixeltable/functions/openai.py +4 -4
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/video.py +110 -28
- pixeltable/globals.py +10 -4
- pixeltable/io/globals.py +18 -17
- pixeltable/io/parquet.py +1 -1
- pixeltable/io/table_data_conduit.py +47 -22
- pixeltable/iterators/document.py +61 -23
- pixeltable/iterators/video.py +126 -53
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +175 -46
- pixeltable/share/packager.py +155 -26
- pixeltable/store.py +2 -3
- pixeltable/type_system.py +5 -3
- pixeltable/utils/arrow.py +6 -6
- pixeltable/utils/av.py +65 -0
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/exception_handler.py +5 -28
- pixeltable/utils/image.py +7 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +16 -1
- pixeltable/utils/s3_store.py +44 -11
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/dataframe.py
CHANGED

@@ -23,7 +23,7 @@ from typing import (
 
 import pandas as pd
 import pydantic
-import sqlalchemy as sql
+import sqlalchemy.exc as sql_exc
 
 from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
 from pixeltable.catalog import Catalog, is_valid_identifier

@@ -186,6 +186,8 @@ class DataFrameResultSet:
 
 
 class DataFrame:
+    """Represents a query for retrieving and transforming data from Pixeltable tables."""
+
     _from_clause: plan.FromClause
     _select_list_exprs: list[exprs.Expr]
     _schema: dict[str, ts.ColumnType]

@@ -539,20 +541,23 @@
                 yield [data_row[e.slot_idx] for e in self._select_list_exprs]
         except excs.ExprEvalError as e:
             self._raise_expr_eval_err(e)
-        except
-
+        except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
+            Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
+            raise  # just re-raise if not converted to a Pixeltable error
 
     def collect(self) -> DataFrameResultSet:
         return DataFrameResultSet(list(self._output_row_iterator()), self.schema)
 
     async def _acollect(self) -> DataFrameResultSet:
+        single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
         try:
             result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec()]
             return DataFrameResultSet(result, self.schema)
         except excs.ExprEvalError as e:
             self._raise_expr_eval_err(e)
-        except
-
+        except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
+            Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
+            raise  # just re-raise if not converted to a Pixeltable error
 
     def count(self) -> int:
         """Return the number of rows in the DataFrame.
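Both collect paths now follow the same convert-or-reraise pattern: the SQLAlchemy error is handed to Catalog.convert_sql_exc(), which raises a Pixeltable error for conditions it recognizes; if it returns instead, the bare raise re-raises the original exception. A minimal, self-contained sketch of the pattern (the Error class and convert_sql_exc below are stand-ins, not Pixeltable's implementations):

import sqlalchemy.exc as sql_exc

class Error(Exception):
    """Stand-in for pixeltable.exceptions.Error."""

def convert_sql_exc(e: Exception) -> None:
    # Hypothetical conversion rule: raise a Pixeltable error for recognized
    # conditions, return for everything else so the caller can re-raise.
    if isinstance(e, sql_exc.OperationalError):
        raise Error(f'database error: {e}') from e

def run_collect(run_query):
    try:
        return run_query()
    except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
        convert_sql_exc(e)
        raise  # just re-raise if not converted to a Pixeltable error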
pixeltable/env.py
CHANGED

@@ -27,6 +27,7 @@ from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
+import tzlocal
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
 from sqlalchemy import orm
 from tenacity import retry, stop_after_attempt, wait_exponential_jitter

@@ -71,6 +72,7 @@ class Env:
     _db_server: Optional[pixeltable_pgserver.PostgresServer]  # set only when running in local environment
     _db_url: Optional[str]
     _default_time_zone: Optional[ZoneInfo]
+    _verbosity: int
 
     # info about optional packages that are utilized by some parts of the code
     __optional_packages: dict[str, PackageInfo]

@@ -218,10 +220,18 @@
         """
         This is not a publicly visible setter; it is only for testing purposes.
         """
-
+        if tz is None:
+            tz_name = self._get_tz_name()
+        else:
+            assert isinstance(tz, ZoneInfo)
+            tz_name = tz.key
         self.engine.dispose()
         self._create_engine(time_zone_name=tz_name)
 
+    @property
+    def verbosity(self) -> int:
+        return self._verbosity
+
     @property
     def conn(self) -> Optional[sql.Connection]:
         assert self._current_conn is not None

@@ -237,6 +247,11 @@
         assert self._dbms is not None
         return self._dbms
 
+    @property
+    def is_using_cockroachdb(self) -> bool:
+        assert self._dbms is not None
+        return isinstance(self._dbms, CockroachDbms)
+
     @property
     def in_xact(self) -> bool:
         return self._current_conn is not None

@@ -247,7 +262,7 @@
         return self._db_server is not None
 
     @contextmanager
-    def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
+    def begin_xact(self, *, for_write: bool = False) -> Iterator[sql.Connection]:
         """
         Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
 

@@ -340,6 +355,8 @@
         # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
         path_parts = list(Path(record.pathname).parts)
         path_parts.reverse()
+        if 'pixeltable' not in path_parts:
+            return False
         max_idx = path_parts.index('pixeltable')
         for module_name in path_parts[:max_idx]:
             if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:

@@ -350,6 +367,26 @@
     def console_logger(self) -> ConsoleLogger:
         return self._console_logger
 
+    def _get_tz_name(self) -> str:
+        """Get the time zone name from the configuration, or the system local time zone if not specified.
+
+        Returns:
+            str: The time zone name.
+        """
+        tz_name = Config.get().get_string_value('time_zone')
+        if tz_name is not None:
+            # Validate tzname
+            if not isinstance(tz_name, str):
+                self._logger.error('Invalid time zone specified in configuration.')
+            else:
+                try:
+                    _ = ZoneInfo(tz_name)
+                except ZoneInfoNotFoundError:
+                    self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
+        else:
+            tz_name = tzlocal.get_localzone_name()
+        return tz_name
+
     def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
         if self._initialized:
             return

@@ -393,10 +430,12 @@
             warnings.simplefilter('ignore', category=UserWarning)
             warnings.simplefilter('ignore', category=FutureWarning)
 
-        # Set
-
+        # Set verbosity level for user visible console messages
+        self._verbosity = config.get_int_value('verbosity')
+        if self._verbosity is None:
+            self._verbosity = 1
         stdout_handler = ConsoleOutputHandler(stream=stdout)
-        stdout_handler.setLevel(
+        stdout_handler.setLevel(map_level(self._verbosity))
         stdout_handler.addFilter(ConsoleMessageFilter())
         self._logger.addHandler(stdout_handler)
         self._console_logger = ConsoleLogger(self._logger)

@@ -430,6 +469,7 @@
         http_logger.propagate = False
 
         self.clear_tmp_dir()
+        tz_name = self._get_tz_name()
 
         # configure pixeltable database
         self._init_db(config)

@@ -439,22 +479,10 @@
                 'Reinitializing pixeltable database is not supported when running in non-local environment'
             )
 
-        tz_name = config.get_string_value('time_zone')
-        if tz_name is not None:
-            # Validate tzname
-            if not isinstance(tz_name, str):
-                self._logger.error('Invalid time zone specified in configuration.')
-            else:
-                try:
-                    _ = ZoneInfo(tz_name)
-                except ZoneInfoNotFoundError:
-                    self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
-
         if reinit_db and self._store_db_exists():
             self._drop_store_db()
 
         create_db = not self._store_db_exists()
-
         if create_db:
             self._logger.info(f'creating database at: {self.db_url}')
             self._create_store_db()

@@ -534,19 +562,28 @@
         metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
         metadata.create_system_info(self._sa_engine)
 
-    def _create_engine(self, time_zone_name:
-        connect_args = {
+    def _create_engine(self, time_zone_name: str, echo: bool = False) -> None:
+        connect_args = {'options': f'-c timezone={time_zone_name}'}
+        self._logger.info(f'Creating SQLAlchemy engine with connection arguments: {connect_args}')
         self._sa_engine = sql.create_engine(
             self.db_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level, connect_args=connect_args
         )
 
         self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
+        self._logger.info(f'Engine dialect: {self._sa_engine.dialect.name}')
+        self._logger.info(f'Engine driver : {self._sa_engine.dialect.driver}')
 
         with self.engine.begin() as conn:
             tz_name = conn.execute(sql.text('SHOW TIME ZONE')).scalar()
             assert isinstance(tz_name, str)
             self._logger.info(f'Database time zone is now: {tz_name}')
             self._default_time_zone = ZoneInfo(tz_name)
+            if self.is_using_cockroachdb:
+                # This could be set when the database is created, but we set it now
+                conn.execute(sql.text('SET null_ordered_last = true;'))
+                null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
+                assert isinstance(null_ordered_last, str)
+                self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
 
     def _store_db_exists(self) -> bool:
         assert self._db_name is not None
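The new _get_tz_name() centralizes time-zone resolution: a configured value is validated against the IANA database (errors are logged, the value is still returned), and tzlocal supplies the system zone as the fallback. A condensed, self-contained sketch of that resolution order (resolve_tz_name is a hypothetical free-function version; the real method logs through the Env logger instead of printing):

import tzlocal
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

def resolve_tz_name(configured: str | None) -> str:
    """Prefer the configured zone; fall back to the system's local zone."""
    if configured is None:
        return tzlocal.get_localzone_name()  # e.g. 'America/Los_Angeles'
    try:
        _ = ZoneInfo(configured)  # raises if not a known IANA zone name
    except ZoneInfoNotFoundError:
        print(f'Invalid time zone specified in configuration: {configured}')
    return configured  # returned even if invalid, mirroring the diff's behavior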
pixeltable/exec/__init__.py
CHANGED

@@ -2,6 +2,8 @@
 
 from .aggregation_node import AggregationNode
 from .cache_prefetch_node import CachePrefetchNode
+from .cell_materialization_node import CellMaterializationNode
+from .cell_reconstruction_node import CellReconstructionNode
 from .component_iteration_node import ComponentIterationNode
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
pixeltable/exec/cell_materialization_node.py
ADDED

@@ -0,0 +1,231 @@
+from __future__ import annotations
+
+import io
+import logging
+import os
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
+import PIL.Image
+import sqlalchemy as sql
+
+import pixeltable.type_system as ts
+import pixeltable.utils.image as image_utils
+from pixeltable import catalog, exprs
+from pixeltable.env import Env
+from pixeltable.utils.local_store import LocalStore
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+class CellMaterializationNode(ExecNode):
+    """
+    Node to populate DataRow.cell_vals/cell_md.
+
+    For now, the scope is limited to populating DataRow.cell_vals for json and array columns.
+
+    Array values:
+    - Arrays < MAX_DB_ARRAY_SIZE are stored inline in the db column
+    - Larger arrays are written to inlined_obj_files
+    - Bool arrays are stored as packed bits (uint8)
+    - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
+      (this allows us to query cell_md to get the total external storage size of an array column)
+
+    Json values:
+    - Inlined images and arrays are written to inlined_obj_files and replaced with a dict containing the object
+      location
+    - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
+    - cell_md contains the list of urls for the inlined objects.
+
+    TODO:
+    - execute file IO via asyncio Tasks in a thread pool?
+      (we already seem to be getting 90% of hardware IO throughput)
+    - subsume all cell materialization
+    """
+
+    output_col_info: dict[catalog.Column, int]  # value: slot idx
+
+    # execution state
+    inlined_obj_files: list[Path]  # only [-1] is open for writing
+    buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]
+
+    MIN_FILE_SIZE = 8 * 2**20  # 8MB
+    MAX_DB_ARRAY_SIZE = 512  # max size of array stored in table column; in bytes
+
+    def __init__(self, input: ExecNode):
+        super().__init__(input.row_builder, [], [], input)
+        self.output_col_info = {
+            col: slot_idx
+            for col, slot_idx in input.row_builder.table_columns.items()
+            if slot_idx is not None and (col.col_type.is_json_type() or col.col_type.is_array_type())
+        }
+        self.inlined_obj_files = []
+        self.buffered_writer = None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col, slot_idx in self.output_col_info.items():
+                    if row.has_exc(slot_idx):
+                        # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        exc = row.get_exc(slot_idx)
+                        row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
+                        continue
+
+                    val = row[slot_idx]
+                    if val is None:
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        row.cell_md[col.id] = None
+                        continue
+
+                    if col.col_type.is_json_type():
+                        self._materialize_json_cell(row, col, val)
+                    else:
+                        assert col.col_type.is_array_type()
+                        assert isinstance(val, np.ndarray)
+                        self._materialize_array_cell(row, col, val)
+
+            # continue with only the currently open file
+            self.inlined_obj_files = self.inlined_obj_files[-1:]
+
+            yield batch
+
+        self._flush_buffer(finalize=True)
+
+    def init_writer(self) -> None:
+        if self.buffered_writer is None:
+            self._reset_buffer()
+        assert self.buffered_writer is not None
+
+    def close(self) -> None:
+        if self.buffered_writer is not None:
+            # there must have been an error, otherwise _flush_full_buffer(finalize=True) would have set this to None
+            self.buffered_writer.close()
+            self.buffered_writer = None
+
+    def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
+        if self._json_has_inlined_objs(val):
+            row.cell_vals[col.id] = self._rewrite_json(val)
+            row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
+        else:
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+
+    def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
+        if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
+            # this is a vector column (ie, used for a vector index): store the array itself
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        elif val.nbytes <= self.MAX_DB_ARRAY_SIZE:
+            # this array is small enough to store in the db column (type: binary) directly
+            buffer = io.BytesIO()
+            np.save(buffer, val, allow_pickle=False)
+            row.cell_vals[col.id] = buffer.getvalue()
+            row.cell_md[col.id] = None
+        else:
+            # append this array to the buffer and store its location in the cell md
+            ar: np.ndarray
+            if np.issubdtype(val.dtype, np.bool_):
+                # for bool arrays, store as packed bits, otherwise it's 1 byte per element
+                ar = np.packbits(val)
+            else:
+                ar = val
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            np.save(self.buffered_writer, ar, allow_pickle=False)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
+            )
+            if np.issubdtype(val.dtype, np.bool_):
+                cell_md.array_md.is_bool = True
+                cell_md.array_md.shape = val.shape
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _json_has_inlined_objs(self, element: Any) -> bool:
+        if isinstance(element, list):
+            return any(self._json_has_inlined_objs(v) for v in element)
+        if isinstance(element, dict):
+            return any(self._json_has_inlined_objs(v) for v in element.values())
+        return isinstance(element, (np.ndarray, PIL.Image.Image))
+
+    def _rewrite_json(self, element: Any) -> Any:
+        """Recursively rewrites a JSON structure by writing any inlined arrays or images to self.buffered_writer."""
+        if isinstance(element, list):
+            return [self._rewrite_json(v) for v in element]
+        if isinstance(element, dict):
+            return {k: self._rewrite_json(v) for k, v in element.items()}
+        if isinstance(element, np.ndarray):
+            obj_md = self._write_inlined_array(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, PIL.Image.Image):
+            obj_md = self._write_inlined_image(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        return element
+
+    def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
+        """Write an ndarray to buffered_writer and return its metadata."""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        shape: tuple[int, ...] | None
+        is_bool_array: bool
+        if np.issubdtype(ar.dtype, np.bool_):
+            shape = ar.shape
+            ar = np.packbits(ar)
+            is_bool_array = True
+        else:
+            shape = None
+            is_bool_array = False
+        np.save(self.buffered_writer, ar, allow_pickle=False)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.ARRAY.name,
+            url_idx=url_idx,
+            array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
+        )
+
+    def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
+        """Write a PIL image to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        img.save(self.buffered_writer, format=image_utils.default_format(img))
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)
+
+    def _reset_buffer(self) -> None:
+        local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
+            self.row_builder.tbl.id, 0, self.row_builder.tbl.version
+        )
+        self.inlined_obj_files.append(local_path)
+        fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
+        assert isinstance(fh, io.BufferedWriter)
+        self.buffered_writer = fh
+
+    def _flush_buffer(self, finalize: bool = False) -> None:
+        """Flush buffered_writer to storage if it exceeds its minimum size or finalize is True."""
+        if self.buffered_writer is None:
+            return
+        if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
+            return
+        self.buffered_writer.flush()
+        os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
+        self.buffered_writer.close()
+        if finalize:
+            self.buffered_writer = None
+        else:
+            self._reset_buffer()
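For context, a minimal sketch (not part of the diff) of the offset-based layout the node writes: consecutive np.save calls append self-describing .npy records to a single buffer, so a cell can be recovered later from nothing but its (start, end) offsets.

import io

import numpy as np

buf = io.BytesIO()  # stands in for the node's BufferedWriter over an inlined-objects file
offsets: list[tuple[int, int]] = []
for ar in (np.arange(1000), np.ones((64, 64), dtype=np.float32)):
    start = buf.tell()
    np.save(buf, ar, allow_pickle=False)  # writes .npy header + data
    offsets.append((start, buf.tell()))

# recover the second array from its recorded offsets alone
start, end = offsets[1]
buf.seek(start)
restored = np.load(buf, allow_pickle=False)
assert buf.tell() == end and restored.shape == (64, 64)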
pixeltable/exec/cell_reconstruction_node.py
ADDED

@@ -0,0 +1,135 @@
+from __future__ import annotations
+
+import io
+import logging
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import PIL.Image
+
+import pixeltable.type_system as ts
+from pixeltable import exprs
+from pixeltable.utils import parse_local_file_path
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+def json_has_inlined_objs(element: Any) -> bool:
+    """Returns True if element contains inlined objects produced by CellMaterializationNode."""
+    if isinstance(element, list):
+        return any(json_has_inlined_objs(v) for v in element)
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            return True
+        return any(json_has_inlined_objs(v) for v in element.values())
+    return False
+
+
+def reconstruct_json(element: Any, urls: list[str], file_handles: dict[Path, io.BufferedReader]) -> Any:
+    """Recursively reconstructs inlined objects in a json structure."""
+    if isinstance(element, list):
+        return [reconstruct_json(v, urls, file_handles) for v in element]
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            obj_md = InlinedObjectMd.from_dict(element[INLINED_OBJECT_MD_KEY])
+            url = urls[obj_md.url_idx]
+            local_path = parse_local_file_path(url)
+            if local_path not in file_handles:
+                file_handles[local_path] = open(local_path, 'rb')  # noqa: SIM115
+            fp = file_handles[local_path]
+
+            if obj_md.type == ts.ColumnType.Type.ARRAY.name:
+                fp.seek(obj_md.array_md.start)
+                ar = load_array(
+                    fp, obj_md.array_md.start, obj_md.array_md.end, obj_md.array_md.is_bool, obj_md.array_md.shape
+                )
+                return ar
+            else:
+                fp.seek(obj_md.img_start)
+                bytesio = io.BytesIO(fp.read(obj_md.img_end - obj_md.img_start))
+                img = PIL.Image.open(bytesio)
+                img.load()
+                assert fp.tell() == obj_md.img_end, f'{fp.tell()} != {obj_md.img_end} / {obj_md.img_start}'
+                return img
+        else:
+            return {k: reconstruct_json(v, urls, file_handles) for k, v in element.items()}
+    return element
+
+
+def load_array(
+    fh: io.BufferedReader, start: int, end: int, is_bool_array: bool, shape: tuple[int, ...] | None
+) -> np.ndarray:
+    """Loads an array from a section of a file."""
+    fh.seek(start)
+    ar = np.load(fh, allow_pickle=False)
+    assert fh.tell() == end
+    if is_bool_array:
+        assert shape is not None
+        ar = np.unpackbits(ar, count=np.prod(shape)).reshape(shape).astype(bool)
+    return ar
+
+
+class CellReconstructionNode(ExecNode):
+    """
+    Reconstruction of stored json and array cells that were produced by CellMaterializationNode.
+    """
+
+    json_refs: list[exprs.ColumnRef]
+    array_refs: list[exprs.ColumnRef]
+    file_handles: dict[Path, io.BufferedReader]  # key: file path
+
+    def __init__(
+        self,
+        json_refs: list[exprs.ColumnRef],
+        array_refs: list[exprs.ColumnRef],
+        row_builder: exprs.RowBuilder,
+        input: ExecNode | None = None,
+    ):
+        super().__init__(row_builder, [], [], input)
+        self.json_refs = json_refs
+        self.array_refs = array_refs
+        self.file_handles = {}
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col_ref in self.json_refs:
+                    val = row[col_ref.slot_idx]
+                    if val is None:
+                        continue
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(row[col_ref.slot_idx]):
+                        continue
+                    row[col_ref.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
+
+                for col_ref in self.array_refs:
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is not None and cell_md.array_md is not None:
+                        assert row[col_ref.slot_idx] is None
+                        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
+                        row[col_ref.slot_idx] = self._reconstruct_array(cell_md)
+                    else:
+                        assert row[col_ref.slot_idx] is None or isinstance(row[col_ref.slot_idx], np.ndarray)
+
+            yield batch
+
+    def close(self) -> None:
+        for fp in self.file_handles.values():
+            fp.close()
+
+    def _reconstruct_array(self, cell_md: exprs.CellMd) -> np.ndarray:
+        assert cell_md.array_md is not None
+        local_path = parse_local_file_path(cell_md.file_urls[0])
+        assert local_path is not None
+        if local_path not in self.file_handles:
+            self.file_handles[local_path] = open(str(local_path), 'rb')  # noqa: SIM115
+        fp = self.file_handles[local_path]
+        ar = load_array(
+            fp, cell_md.array_md.start, cell_md.array_md.end, bool(cell_md.array_md.is_bool), cell_md.array_md.shape
+        )
+        return ar
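A short sketch of the packed-bits round trip both nodes use for bool arrays (1 bit per element on disk instead of 1 byte); the count= argument trims the zero padding that np.packbits adds to reach a byte boundary:

import numpy as np

mask = np.random.rand(10, 17) > 0.5  # bool array, 170 elements
packed = np.packbits(mask)           # uint8 array of ceil(170 / 8) = 22 bytes
restored = (
    np.unpackbits(packed, count=int(np.prod(mask.shape)))
    .reshape(mask.shape)
    .astype(bool)
)
assert np.array_equal(mask, restored)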
pixeltable/exec/exec_node.py
CHANGED

@@ -39,7 +39,7 @@ class ExecNode(abc.ABC):
         self.flushed_img_slots = [
             e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
         ]
-        self.ctx =
+        self.ctx = input.ctx if input is not None else None
 
     def set_ctx(self, ctx: ExecContext) -> None:
         self.ctx = ctx
pixeltable/exec/expr_eval/expr_eval_node.py
CHANGED

@@ -306,6 +306,9 @@ class ExprEvalNode(ExecNode):
                     task.cancel()
             _ = await asyncio.gather(*active_tasks, return_exceptions=True)
 
+        # expr cleanup
+        exprs.Expr.release_list(self.exec_ctx.all_exprs)
+
     def dispatch_exc(
         self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
     ) -> None:
pixeltable/exec/expr_eval/globals.py
CHANGED

@@ -149,6 +149,7 @@ class ExecCtx:
     gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
     eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
     literals: dict[int, Any]  # key: slot idx; value: literal value for this slot; used to pre-populate rows
+    all_exprs: list[exprs.Expr]  # all evaluated exprs; needed for cleanup
 
     def __init__(
         self,

@@ -165,6 +166,7 @@ class ExecCtx:
         self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False
 
         output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
+        self.all_exprs = output_ctx.exprs
         self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
         self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
         non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]
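A brief sketch of the boolean slot-mask idiom ExecCtx uses: a dense bool array indexed by slot idx, filled in one step from a list of slot indices (the sizes and indices below are made up):

import numpy as np

num_materialized = 16  # hypothetical slot count
eval_ctx = np.zeros(num_materialized, dtype=bool)
non_literal_slot_idxs = [2, 3, 7]        # hypothetical slot indices
eval_ctx[non_literal_slot_idxs] = True   # fancy indexing sets the mask at once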
pixeltable/exec/globals.py
ADDED

@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import dataclasses
+
+from pixeltable.exprs import ArrayMd
+from pixeltable.utils.misc import non_none_dict_factory
+
+INLINED_OBJECT_MD_KEY = '__pxtinlinedobjmd__'
+
+
+@dataclasses.dataclass
+class InlinedObjectMd:
+    type: str  # corresponds to ts.ColumnType.Type
+    url_idx: int
+    img_start: int | None = None
+    img_end: int | None = None
+    array_md: ArrayMd | None = None
+
+    @classmethod
+    def from_dict(cls, d: dict) -> InlinedObjectMd:
+        if 'array_md' in d:
+            array_md = ArrayMd(**d['array_md'])
+            del d['array_md']
+            return cls(**d, array_md=array_md)
+        else:
+            return cls(**d)
+
+    def as_dict(self) -> dict:
+        result = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
+        if self.array_md is not None:
+            result['array_md'] = self.array_md.as_dict()
+        return result