pixeltable 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to one of the supported registries, and is provided for informational purposes only.
Potentially problematic release.
- pixeltable/__init__.py +4 -0
- pixeltable/catalog/catalog.py +105 -51
- pixeltable/catalog/column.py +7 -2
- pixeltable/catalog/table.py +1 -0
- pixeltable/catalog/table_metadata.py +4 -0
- pixeltable/catalog/table_version.py +99 -78
- pixeltable/catalog/table_version_handle.py +4 -1
- pixeltable/config.py +6 -0
- pixeltable/dataframe.py +10 -5
- pixeltable/env.py +48 -19
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +1 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
- pixeltable/exec/expr_eval/globals.py +2 -0
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/object_store_save_node.py +1 -4
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +107 -14
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +10 -11
- pixeltable/exprs/column_property_ref.py +10 -10
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/data_row.py +106 -37
- pixeltable/exprs/expr.py +9 -0
- pixeltable/exprs/expr_set.py +14 -7
- pixeltable/exprs/inline_expr.py +2 -19
- pixeltable/exprs/json_path.py +45 -12
- pixeltable/exprs/row_builder.py +54 -22
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/bedrock.py +7 -0
- pixeltable/functions/deepseek.py +11 -4
- pixeltable/functions/llama_cpp.py +7 -0
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/ollama.py +7 -0
- pixeltable/functions/openai.py +4 -4
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/globals.py +10 -4
- pixeltable/io/globals.py +16 -15
- pixeltable/io/table_data_conduit.py +46 -21
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +175 -46
- pixeltable/store.py +1 -1
- pixeltable/type_system.py +5 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/exception_handler.py +5 -28
- pixeltable/utils/image.py +7 -0
- pixeltable/utils/misc.py +5 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/METADATA +2 -1
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/RECORD +57 -50
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/licenses/LICENSE +0 -0
pixeltable/io/table_data_conduit.py
CHANGED
@@ -10,7 +10,9 @@ from dataclasses import dataclass, field, fields
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, cast
 
+import numpy as np
 import pandas as pd
+import PIL
 from pyarrow.parquet import ParquetDataset
 
 import pixeltable as pxt
@@ -325,7 +327,11 @@ class JsonTableDataConduit(TableDataConduit):
 
 
 class HFTableDataConduit(TableDataConduit):
-
+    """
+    TODO:
+    - use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
+    """
+
     column_name_for_split: Optional[str] = None
     categorical_features: dict[str, dict[int, str]]
     dataset_dict: dict[str, datasets.Dataset] = None
@@ -339,9 +345,19 @@ class HFTableDataConduit(TableDataConduit):
         import datasets
 
         assert isinstance(tds.source, (datasets.Dataset, datasets.DatasetDict))
-        t.hf_ds = tds.source
         if 'column_name_for_split' in t.extra_fields:
             t.column_name_for_split = t.extra_fields['column_name_for_split']
+
+        # make sure we get numpy arrays for arrays, not Python lists
+        source = tds.source.with_format(type='numpy')
+        if isinstance(source, datasets.Dataset):
+            # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
+            raw_name = source.split._name
+            split_name = raw_name.split('[')[0] if raw_name is not None else None
+            t.dataset_dict = {split_name: source}
+        else:
+            assert isinstance(source, datasets.DatasetDict)
+            t.dataset_dict = source
         return t
 
     @classmethod
@@ -361,7 +377,7 @@ class HFTableDataConduit(TableDataConduit):
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
                 self.src_schema_overrides = {}
-            self.hf_schema_source = _get_hf_schema(self.
+            self.hf_schema_source = _get_hf_schema(self.source)
             self.src_schema = huggingface_schema_to_pxt_schema(
                 self.hf_schema_source, self.src_schema_overrides, self.src_pk
             )
@@ -396,15 +412,6 @@ class HFTableDataConduit(TableDataConduit):
     def prepare_insert(self) -> None:
         import datasets
 
-        if isinstance(self.source, datasets.Dataset):
-            # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-            raw_name = self.source.split._name
-            split_name = raw_name.split('[')[0] if raw_name is not None else None
-            self.dataset_dict = {split_name: self.source}
-        else:
-            assert isinstance(self.source, datasets.DatasetDict)
-            self.dataset_dict = self.source
-
         # extract all class labels from the dataset to translate category ints to strings
         self.categorical_features = {
             feature_name: feature_type.names
@@ -415,26 +422,44 @@ class HFTableDataConduit(TableDataConduit):
         self.source_column_map = {}
         self.check_source_columns_are_insertable(self.hf_schema_source.keys())
 
-    def _translate_row(self, row: dict[str, Any], split_name: str) -> dict[str, Any]:
+    def _translate_row(self, row: dict[str, Any], split_name: str, features: datasets.Features) -> dict[str, Any]:
         output_row: dict[str, Any] = {}
         for col_name, val in row.items():
            # translate category ints to strings
            new_val = self.categorical_features[col_name][val] if col_name in self.categorical_features else val
            mapped_col_name = self.source_column_map.get(col_name, col_name)
 
-
-
-                checked_val = self.pxt_schema[mapped_col_name].create_literal(new_val)
-            except TypeError as e:
-                msg = str(e)
-                raise excs.Error(f'Error in column {col_name}: {msg[0].lower() + msg[1:]}\nRow: {row}') from e
-            output_row[mapped_col_name] = checked_val
+            new_val = self._translate_val(new_val, features[col_name])
+            output_row[mapped_col_name] = new_val
 
         # add split name to output row
         if self.column_name_for_split is not None:
            output_row[self.column_name_for_split] = split_name
         return output_row
 
+    def _translate_val(self, val: Any, feature: datasets.Feature) -> Any:
+        """Convert numpy scalars to Python types and images to PIL.Image.Image"""
+        import datasets
+
+        if isinstance(feature, datasets.Value):
+            if isinstance(val, (np.generic, np.ndarray)):
+                # a scalar, which we want as a standard Python type
+                assert np.ndim(val) == 0
+                return val.item()
+            else:
+                # a standard Python object
+                return val
+        elif isinstance(feature, datasets.Sequence):
+            assert np.ndim(val) > 0
+            return val
+        elif isinstance(feature, datasets.Image):
+            return PIL.Image.fromarray(val)
+        elif isinstance(feature, dict):
+            assert isinstance(val, dict)
+            return {k: self._translate_val(v, feature[k]) for k, v in val.items()}
+        else:
+            return val
+
     def valid_row_batch(self) -> Iterator[RowData]:
         for split_name, split_dataset in self.dataset_dict.items():
             num_batches = split_dataset.size_in_bytes / self._K_BATCH_SIZE_BYTES
@@ -443,7 +468,7 @@ class HFTableDataConduit(TableDataConduit):
 
             batch = []
             for row in split_dataset:
-                batch.append(self._translate_row(row, split_name))
+                batch.append(self._translate_row(row, split_name, split_dataset.features))
                 if len(batch) >= tuples_per_batch:
                     yield batch
                     batch = []
pixeltable/metadata/__init__.py
CHANGED
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 _logger = logging.getLogger('pixeltable')
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 40
+VERSION = 41
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_40.py
ADDED
@@ -0,0 +1,73 @@
+import logging
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=40)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_modifier=__table_modifier)
+
+
+def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+    store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+    store_name = f'{store_prefix}_{tbl_id.hex}'
+
+    # Get the list of column names that need _cellmd columns
+    _logger.info(f'Checking table {orig_table_md["name"]} ({store_name})')
+    col_ids = find_target_columns(orig_table_md)
+    if len(col_ids) == 0:
+        _logger.info(f'No Array or Json columns found in table {orig_table_md["name"]}. Skipping migration.')
+        return
+
+    # Check which columns already exist in the table
+    check_columns_sql = sql.text(f"""
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = '{store_name}'
+    """)
+    existing_columns = {row[0] for row in conn.execute(check_columns_sql)}
+
+    # Filter out columns that already have _cellmd
+    col_ids_to_add: list[int] = []
+    for col_id in col_ids:
+        cellmd_col = f'col_{col_id}_cellmd'
+        if cellmd_col not in existing_columns:
+            col_ids_to_add.append(col_id)
+        else:
+            _logger.info(f'Column {cellmd_col} already exists in table {orig_table_md["name"]}. Skipping.')
+
+    if len(col_ids_to_add) == 0:
+        _logger.info(f'All _cellmd columns already exist in table {orig_table_md["name"]}. Skipping migration.')
+        return
+
+    return add_cellmd_columns(conn, store_name, col_ids_to_add)
+
+
+def find_target_columns(table_md: dict) -> list[int]:
+    """Returns ids of stored array and json columns"""
+    result: list[int] = []
+    for col_id, col_md in table_md['column_md'].items():
+        col_type = col_md['col_type']
+        classname = col_type.get('_classname')
+        if classname in ['ArrayType', 'JsonType'] and col_md.get('stored', False):
+            result.append(col_id)
+            _logger.info(f'Found {classname} column: {col_id}')
+    return result
+
+
+def add_cellmd_columns(conn: sql.Connection, store_name: str, col_ids: list[int]) -> None:
+    try:
+        # Add new columns
+        add_column_str = ', '.join(f'ADD COLUMN col_{col_id}_cellmd JSONB DEFAULT NULL' for col_id in col_ids)
+        add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+        conn.execute(add_column_sql)
+        _logger.info(f'Added columns to {store_name}: {", ".join(f"col_{col_id}_cellmd" for col_id in col_ids)}')
+    except sql.exc.SQLAlchemyError as e:
+        _logger.error(f'Migration for table {store_name} failed: {e}')
+        raise
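
In plain terms, the version-40 converter adds one col_<id>_cellmd JSONB column per stored array/json column to each table's Postgres store table, skipping columns that already exist. A small illustration of the statement it assembles (the column ids and store table name below are made up, not taken from any real catalog):

    col_ids = [2, 5]  # hypothetical result of find_target_columns()
    store_name = 'view_0f3a9c1d2e4b4c58a1b2c3d4e5f60718'  # hypothetical store table name
    add_column_str = ', '.join(f'ADD COLUMN col_{i}_cellmd JSONB DEFAULT NULL' for i in col_ids)
    print(f'ALTER TABLE {store_name} {add_column_str}')
    # ALTER TABLE view_0f3a9c1d2e4b4c58a1b2c3d4e5f60718 ADD COLUMN col_2_cellmd JSONB DEFAULT NULL, ADD COLUMN col_5_cellmd JSONB DEFAULT NULL
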
pixeltable/metadata/notes.py
CHANGED
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    41: 'Cellmd columns for array and json columns',
     40: 'Convert error property columns to cellmd columns',
     39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',
pixeltable/plan.py
CHANGED
@@ -3,9 +3,10 @@ from __future__ import annotations
 import dataclasses
 import enum
 from textwrap import dedent
-from typing import Any, Iterable, Literal, Optional, Sequence
+from typing import Any, Iterable, Literal, Optional, Sequence, cast
 from uuid import UUID
 
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
 import sqlalchemy as sql
 
 import pixeltable as pxt
@@ -385,7 +386,7 @@ class Planner:
             TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
         )
 
-        plan = cls.
+        plan = cls._add_prefetch_node(tbl.id, row_builder.input_exprs, input_node=plan)
 
         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         if len(computed_exprs) > 0:
@@ -393,6 +394,8 @@
             plan = exec.ExprEvalNode(
                 row_builder, computed_exprs, plan.output_exprs, input=plan, maintain_input_order=False
             )
+        if any(c.col_type.is_json_type() or c.col_type.is_array_type() for c in stored_cols):
+            plan = exec.CellMaterializationNode(plan)
 
         plan.set_ctx(
             exec.ExecContext(
@@ -403,7 +406,7 @@
                 ignore_errors=ignore_errors,
             )
         )
-        plan = cls.
+        plan = cls._add_save_node(plan)
 
         return plan
 
@@ -422,10 +425,17 @@
         plan = df._create_query_plan()  # ExecNode constructed by the DataFrame
 
         # Modify the plan RowBuilder to register the output columns
+        needs_cell_materialization = False
         for col_name, expr in zip(df.schema.keys(), df._select_list_exprs):
             assert col_name in tbl.cols_by_name
             col = tbl.cols_by_name[col_name]
             plan.row_builder.add_table_column(col, expr.slot_idx)
+            needs_cell_materialization = (
+                needs_cell_materialization or col.col_type.is_json_type() or col.col_type.is_array_type()
+            )
+
+        if needs_cell_materialization:
+            plan = exec.CellMaterializationNode(plan)
 
         plan.set_ctx(
             exec.ExecContext(
@@ -446,12 +456,14 @@
         cascade: bool,
     ) -> tuple[exec.ExecNode, list[str], list[catalog.Column]]:
         """Creates a plan to materialize updated rows.
+
         The plan:
         - retrieves rows that are visible at the current version of the table
         - materializes all stored columns and the update targets
         - if cascade is True, recomputes all computed columns that transitively depend on the updated columns
           and copies the values of all other stored columns
         - if cascade is False, copies all columns that aren't update targets from the original rows
+
         Returns:
         - root node of the plan
        - list of qualified column names that are getting updated
@@ -477,14 +489,16 @@
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
+        # our query plan
+        # - evaluates the update targets and recomputed columns
+        # - copies all other stored columns
         recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
             if col.is_stored and col not in updated_cols and col not in recomputed_base_cols
         ]
-        select_list: list[exprs.Expr] =
-        select_list.extend(update_targets.values())
+        select_list: list[exprs.Expr] = list(update_targets.values())
 
         recomputed_exprs = [
             c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols
@@ -495,14 +509,22 @@
         select_list.extend(recomputed_exprs)
 
         # we need to retrieve the PK columns of the existing rows
-        plan = cls.create_query_plan(
-
+        plan = cls.create_query_plan(
+            FromClause(tbls=[tbl]),
+            select_list=select_list,
+            columns=copied_cols,
+            where_clause=where_clause,
+            ignore_errors=True,
+        )
+        evaluated_cols = updated_cols + list(recomputed_base_cols)  # same order as select_list
         # update row builder with column information
-
+        plan.row_builder.add_table_columns(copied_cols)
+        for i, col in enumerate(evaluated_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         plan.ctx.num_computed_exprs = len(recomputed_exprs)
 
-        plan = cls.
+        plan = cls._add_cell_materialization_node(plan)
+        plan = cls._add_save_node(plan)
 
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
@@ -525,6 +547,79 @@
                 .format(validation_error=col.value_expr.validation_error)
             )
 
+    @classmethod
+    def _cell_md_col_refs(cls, expr_list: Iterable[exprs.Expr]) -> list[exprs.ColumnRef]:
+        """Return list of ColumnRefs that need their cellmd values for reconstruction"""
+        json_col_refs = list(
+            exprs.Expr.list_subexprs(
+                expr_list,
+                expr_class=exprs.ColumnRef,
+                filter=lambda e: cast(exprs.ColumnRef, e).col.col_type.is_json_type(),
+                traverse_matches=False,
+            )
+        )
+
+        def needs_reconstruction(e: exprs.Expr) -> bool:
+            assert isinstance(e, exprs.ColumnRef)
+            # Vector-typed array columns are used for vector indexes, and are stored in the db
+            return e.col.col_type.is_array_type() and not isinstance(e.col.sa_col_type, pgvector.sqlalchemy.Vector)
+
+        array_col_refs = list(
+            exprs.Expr.list_subexprs(
+                expr_list, expr_class=exprs.ColumnRef, filter=needs_reconstruction, traverse_matches=False
+            )
+        )
+
+        return json_col_refs + array_col_refs
+
+    @classmethod
+    def _add_cell_materialization_node(cls, input: exec.ExecNode) -> exec.ExecNode:
+        # we need a CellMaterializationNode if any of the evaluated output columns are json or array-typed
+        has_target_cols = any(
+            col.col_type.is_json_type() or col.col_type.is_array_type()
+            for col, slot_idx in input.row_builder.table_columns.items()
+            if slot_idx is not None
+        )
+        if has_target_cols:
+            return exec.CellMaterializationNode(input)
+        else:
+            return input
+
+    @classmethod
+    def _add_cell_reconstruction_node(cls, expr_list: list[exprs.Expr], input: exec.ExecNode) -> exec.ExecNode:
+        """
+        Add a CellReconstructionNode, if required by any of the exprs in expr_list.
+
+        Cell reconstruction is required for
+        1) all json-typed ColumnRefs that are not used as part of a JsonPath (the latter does its own reconstruction)
+           or as part of a ColumnPropertyRef
+        2) all array-typed ColumnRefs that are not used as part of a ColumnPropertyRef
+        """
+
+        def json_filter(e: exprs.Expr) -> bool:
+            if isinstance(e, exprs.JsonPath):
+                return not e.is_relative_path() and isinstance(e.anchor, exprs.ColumnRef)
+            if isinstance(e, exprs.ColumnPropertyRef):
+                return e.col_ref.col.col_type.is_json_type()
+            return isinstance(e, exprs.ColumnRef) and e.col.col_type.is_json_type()
+
+        def array_filter(e: exprs.Expr) -> bool:
+            if isinstance(e, exprs.ColumnPropertyRef):
+                return e.col_ref.col.col_type.is_array_type()
+            if not isinstance(e, exprs.ColumnRef):
+                return False
+            # Vector-typed array columns are used for vector indexes, and are stored in the db
+            return e.col.col_type.is_array_type() and not isinstance(e.col.sa_col_type, pgvector.sqlalchemy.Vector)
+
+        json_candidates = list(exprs.Expr.list_subexprs(expr_list, filter=json_filter, traverse_matches=False))
+        json_refs = [e for e in json_candidates if isinstance(e, exprs.ColumnRef)]
+        array_candidates = list(exprs.Expr.list_subexprs(expr_list, filter=array_filter, traverse_matches=False))
+        array_refs = [e for e in array_candidates if isinstance(e, exprs.ColumnRef)]
+        if len(json_refs) > 0 or len(array_refs) > 0:
+            return exec.CellReconstructionNode(json_refs, array_refs, input.row_builder, input=input)
+        else:
+            return input
+
     @classmethod
     def create_batch_update_plan(
         cls,
@@ -543,8 +638,8 @@
         """
         assert isinstance(tbl, catalog.TableVersionPath)
         target = tbl.tbl_version.get()  # the one we need to update
-        sa_key_cols: list[sql.Column]
-        key_vals: list[tuple]
+        sa_key_cols: list[sql.Column]
+        key_vals: list[tuple]
         if len(rowids) > 0:
             sa_key_cols = target.store_tbl.rowid_columns()
             key_vals = rowids
@@ -567,8 +662,7 @@
             for col in target.cols_by_id.values()
             if col.is_stored and col not in updated_cols and col not in recomputed_base_cols
         ]
-        select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in
-        select_list.extend(exprs.ColumnRef(col) for col in updated_cols)
+        select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in updated_cols]
 
         recomputed_exprs = [
             c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols
@@ -586,23 +680,37 @@
         )
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs, target)
         analyzer.finalize(row_builder)
-
+
+        cell_md_col_refs = cls._cell_md_col_refs(sql_exprs)
+        sql_lookup_node = exec.SqlLookupNode(
+            tbl,
+            row_builder,
+            sql_exprs,
+            columns=copied_cols,
+            sa_key_cols=sa_key_cols,
+            key_vals=key_vals,
+            cell_md_col_refs=cell_md_col_refs,
+        )
         col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
         row_update_node = exec.RowUpdateNode(tbl, key_vals, len(rowids) > 0, col_vals, row_builder, sql_lookup_node)
         plan: exec.ExecNode = row_update_node
         if not cls._is_contained_in(analyzer.select_list, sql_exprs):
             # we need an ExprEvalNode to evaluate the remaining output exprs
             plan = exec.ExprEvalNode(row_builder, analyzer.select_list, sql_exprs, input=plan)
+
         # update row builder with column information
-
+        evaluated_cols = list(updated_cols) + list(recomputed_base_cols)  # same order as select_list
         row_builder.set_slot_idxs(select_list, remove_duplicates=False)
-
+        plan.row_builder.add_table_columns(copied_cols)
+        for i, col in enumerate(evaluated_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         ctx = exec.ExecContext(row_builder, num_computed_exprs=len(recomputed_exprs))
-        #
+        # TODO: correct batch size?
         ctx.batch_size = 0
         plan.set_ctx(ctx)
-
+
+        plan = cls._add_cell_materialization_node(plan)
+        plan = cls._add_save_node(plan)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return (
             plan,
@@ -653,10 +761,11 @@
             exact_version_only=view.get_bases(),
         )
         plan.ctx.num_computed_exprs = len(recomputed_exprs)
-
+        materialized_cols = copied_cols + list(recomputed_cols)  # same order as select_list
+        for i, col in enumerate(materialized_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
-
-        plan = cls.
+        plan = cls._add_cell_materialization_node(plan)
+        plan = cls._add_save_node(plan)
 
         return plan
 
@@ -726,7 +835,9 @@
 
         exec_ctx.ignore_errors = True
         plan.set_ctx(exec_ctx)
-
+        if any(c.col_type.is_json_type() or c.col_type.is_array_type() for c in stored_cols):
+            plan = exec.CellMaterializationNode(plan)
+        plan = cls._add_save_node(plan)
 
         return plan, len(row_builder.default_eval_ctx.target_exprs)
 
@@ -773,15 +884,13 @@
         return combined_ordering
 
     @classmethod
-    def
-
-
-
-        if len(stored_media_cols) == 0:
+    def _add_save_node(cls, input_node: exec.ExecNode) -> exec.ExecNode:
+        """Add an ObjectStoreSaveNode, if needed."""
+        media_col_info = input_node.row_builder.media_output_col_info
+        if len(media_col_info) == 0:
             return input_node
-
-
-        return save_node
+        else:
+            return exec.ObjectStoreSaveNode(media_col_info, input_node)
 
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
@@ -789,10 +898,10 @@
         return {e.id for e in l1} <= {e.id for e in l2}
 
     @classmethod
-    def
+    def _add_prefetch_node(
         cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
     ) -> exec.ExecNode:
-        """
+        """Add a CachePrefetch node, if needed."""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
         # aren't explicitly captured as dependencies
@@ -808,21 +917,30 @@
     def create_query_plan(
         cls,
         from_clause: FromClause,
-        select_list:
-
-
-
-
-
+        select_list: list[exprs.Expr] | None = None,
+        columns: list[catalog.Column] | None = None,
+        where_clause: exprs.Expr | None = None,
+        group_by_clause: list[exprs.Expr] | None = None,
+        order_by_clause: list[tuple[exprs.Expr, bool]] | None = None,
+        limit: exprs.Expr | None = None,
+        sample_clause: SampleClause | None = None,
         ignore_errors: bool = False,
-        exact_version_only:
+        exact_version_only: list[catalog.TableVersionHandle] | None = None,
     ) -> exec.ExecNode:
-        """
+        """
+        Return plan for executing a query.
+
+        The plan:
+        - materializes the values of select_list exprs into their respective slots
+        - materializes cell values of 'columns' (and their cellmd, if applicable) into DataRow.cell_vals/cell_md
+
         Updates 'select_list' in place to make it executable.
         TODO: make exact_version_only a flag and use the versions from tbl
         """
         if select_list is None:
             select_list = []
+        if columns is None:
+            columns = []
         if order_by_clause is None:
             order_by_clause = []
         if exact_version_only is None:
@@ -850,6 +968,7 @@
             row_builder=row_builder,
             analyzer=analyzer,
             eval_ctx=eval_ctx,
+            columns=columns,
             limit=limit,
             with_pk=True,
             exact_version_only=exact_version_only,
@@ -865,9 +984,10 @@
         row_builder: exprs.RowBuilder,
         analyzer: Analyzer,
         eval_ctx: exprs.RowBuilder.EvalCtx,
+        columns: list[catalog.Column] | None = None,
         limit: Optional[exprs.Expr] = None,
         with_pk: bool = False,
-        exact_version_only:
+        exact_version_only: list[catalog.TableVersionHandle] | None = None,
     ) -> exec.ExecNode:
         """
         Create plan to materialize eval_ctx.
@@ -877,6 +997,8 @@
         in the context of that table version (eg, if 'tbl' is a view, 'plan_target' might be the base)
         TODO: make exact_version_only a flag and use the versions from tbl
         """
+        if columns is None:
+            columns = []
         if exact_version_only is None:
             exact_version_only = []
         sql_elements = analyzer.sql_elements
@@ -934,8 +1056,15 @@
                     traverse_matches=False,
                 )
             )
+
             plan = exec.SqlScanNode(
-                tbl,
+                tbl,
+                row_builder,
+                select_list=tbl_scan_exprs,
+                columns=[c for c in columns if c.tbl.id == tbl.tbl_id],
+                set_pk=with_pk,
+                cell_md_col_refs=cls._cell_md_col_refs(tbl_scan_exprs),
+                exact_version_only=exact_version_only,
             )
             tbl_scan_plans.append(plan)
 
@@ -966,7 +1095,8 @@
             stratify_exprs=analyzer.stratify_exprs,
         )
 
-        plan = cls.
+        plan = cls._add_prefetch_node(tbl.tbl_version.id, row_builder.unique_exprs, plan)
+        plan = cls._add_cell_reconstruction_node(analyzer.all_exprs, plan)
 
         if analyzer.group_by_clause is not None:
             # we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
@@ -1010,7 +1140,7 @@
             if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
                 plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
-            plan = cls.
+            plan = cls._add_save_node(plan)
         else:
             if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
@@ -1062,7 +1192,6 @@
             plan.ctx.ignore_errors = True
         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
-
-        plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
+        plan = cls._add_save_node(plan)
 
        return plan
pixeltable/store.py
CHANGED
@@ -321,7 +321,7 @@ class StoreBase:
             table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
             num_excs += num_row_exc
 
-            if show_progress:
+            if show_progress and Env.get().verbosity >= 1:
                 if progress_bar is None:
                     warnings.simplefilter('ignore', category=TqdmWarning)
                     progress_bar = tqdm(