pixeltable 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +8 -3
- pixeltable/catalog/globals.py +8 -0
- pixeltable/catalog/table.py +25 -9
- pixeltable/catalog/table_version.py +30 -55
- pixeltable/catalog/view.py +1 -1
- pixeltable/env.py +4 -4
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/row_update_node.py +61 -0
- pixeltable/exec/{sql_scan_node.py → sql_node.py} +120 -56
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +41 -16
- pixeltable/exprs/expr.py +72 -22
- pixeltable/exprs/function_call.py +64 -29
- pixeltable/exprs/globals.py +5 -1
- pixeltable/exprs/inline_array.py +18 -11
- pixeltable/exprs/method_ref.py +63 -0
- pixeltable/ext/__init__.py +9 -0
- pixeltable/ext/functions/__init__.py +8 -0
- pixeltable/ext/functions/whisperx.py +45 -5
- pixeltable/ext/functions/yolox.py +60 -14
- pixeltable/func/callable_function.py +12 -4
- pixeltable/func/expr_template_function.py +1 -1
- pixeltable/func/function.py +12 -2
- pixeltable/func/function_registry.py +24 -9
- pixeltable/func/udf.py +32 -4
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/fireworks.py +33 -0
- pixeltable/functions/huggingface.py +96 -6
- pixeltable/functions/image.py +226 -41
- pixeltable/functions/json.py +46 -0
- pixeltable/functions/openai.py +214 -0
- pixeltable/functions/string.py +195 -218
- pixeltable/functions/timestamp.py +210 -0
- pixeltable/functions/together.py +106 -0
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/{eval.py → vision.py} +170 -27
- pixeltable/functions/whisper.py +32 -0
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +2 -2
- pixeltable/io/globals.py +133 -1
- pixeltable/io/pandas.py +82 -31
- pixeltable/iterators/video.py +55 -23
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_18.py +39 -0
- pixeltable/metadata/notes.py +10 -0
- pixeltable/plan.py +76 -1
- pixeltable/store.py +65 -28
- pixeltable/tool/create_test_db_dump.py +8 -9
- pixeltable/tool/doc_plugins/griffe.py +4 -0
- pixeltable/type_system.py +84 -63
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/METADATA +2 -2
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/RECORD +57 -51
- pixeltable/exprs/image_member_access.py +0 -96
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/entry_points.txt +0 -0
pixeltable/__init__.py
CHANGED
|
@@ -21,7 +21,7 @@ from .type_system import (
|
|
|
21
21
|
)
|
|
22
22
|
from .utils.help import help
|
|
23
23
|
|
|
24
|
-
from . import functions, io, iterators
|
|
24
|
+
from . import ext, functions, io, iterators
|
|
25
25
|
from .__version__ import __version__, __version_tuple__
|
|
26
26
|
|
|
27
27
|
# This is the safest / most maintainable way to do this: start with the default and "blacklist" stuff that
|
pixeltable/__version__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
# These version placeholders will be replaced during build.
|
|
2
|
-
__version__ = "0.2.
|
|
3
|
-
__version_tuple__ = (0, 2,
|
|
2
|
+
__version__ = "0.2.15"
|
|
3
|
+
__version_tuple__ = (0, 2, 15)
|
pixeltable/catalog/column.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from typing import
|
|
5
|
-
from uuid import UUID
|
|
4
|
+
from typing import Any, Callable, Optional, Union
|
|
6
5
|
|
|
7
6
|
import sqlalchemy as sql
|
|
8
7
|
|
|
9
8
|
import pixeltable.exceptions as excs
|
|
10
9
|
import pixeltable.type_system as ts
|
|
10
|
+
|
|
11
11
|
from .globals import is_valid_identifier
|
|
12
12
|
|
|
13
13
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -21,7 +21,7 @@ class Column:
|
|
|
21
21
|
def __init__(
|
|
22
22
|
self, name: Optional[str], col_type: Optional[ts.ColumnType] = None,
|
|
23
23
|
computed_with: Optional[Union['Expr', Callable]] = None,
|
|
24
|
-
is_pk: bool = False, stored:
|
|
24
|
+
is_pk: bool = False, stored: bool = True,
|
|
25
25
|
col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
|
|
26
26
|
schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None,
|
|
27
27
|
records_errors: Optional[bool] = None, value_expr_dict: Optional[dict[str, Any]] = None,
|
|
@@ -152,6 +152,11 @@ class Column:
|
|
|
152
152
|
return self._records_errors
|
|
153
153
|
return self.is_stored and (self.is_computed or self.col_type.is_media_type())
|
|
154
154
|
|
|
155
|
+
@property
|
|
156
|
+
def qualified_name(self) -> str:
|
|
157
|
+
assert self.tbl is not None
|
|
158
|
+
return f'{self.tbl.name}.{self.name}'
|
|
159
|
+
|
|
155
160
|
def source(self) -> None:
|
|
156
161
|
"""
|
|
157
162
|
If this is a computed col and the top-level expr is a function call, print the source, if possible.
|
pixeltable/catalog/globals.py
CHANGED
|
@@ -19,6 +19,14 @@ class UpdateStatus:
|
|
|
19
19
|
updated_cols: List[str] = dataclasses.field(default_factory=list)
|
|
20
20
|
cols_with_excs: List[str] = dataclasses.field(default_factory=list)
|
|
21
21
|
|
|
22
|
+
def __iadd__(self, other: 'UpdateStatus') -> 'UpdateStatus':
|
|
23
|
+
self.num_rows += other.num_rows
|
|
24
|
+
self.num_computed_values += other.num_computed_values
|
|
25
|
+
self.num_excs += other.num_excs
|
|
26
|
+
self.updated_cols = list(dict.fromkeys(self.updated_cols + other.updated_cols))
|
|
27
|
+
self.cols_with_excs = list(dict.fromkeys(self.cols_with_excs + other.cols_with_excs))
|
|
28
|
+
return self
|
|
29
|
+
|
|
22
30
|
def is_valid_identifier(name: str) -> bool:
|
|
23
31
|
return name.isidentifier() and not name.startswith('_')
|
|
24
32
|
|
pixeltable/catalog/table.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Union, Any, Optional, Callable, Set, Tuple, Iterable, overload, Type
|
|
6
|
+
from typing import Union, Any, Optional, Callable, Set, Tuple, Iterable, overload, Type, Literal
|
|
7
7
|
from uuid import UUID
|
|
8
8
|
import abc
|
|
9
9
|
|
|
@@ -434,8 +434,8 @@ class Table(SchemaObject):
|
|
|
434
434
|
for name, spec in schema.items():
|
|
435
435
|
col_type: Optional[ts.ColumnType] = None
|
|
436
436
|
value_expr: Optional[exprs.Expr] = None
|
|
437
|
-
stored: Optional[bool] = None
|
|
438
437
|
primary_key: Optional[bool] = None
|
|
438
|
+
stored = True
|
|
439
439
|
|
|
440
440
|
if isinstance(spec, ts.ColumnType):
|
|
441
441
|
# TODO: create copy
|
|
@@ -455,7 +455,7 @@ class Table(SchemaObject):
|
|
|
455
455
|
if value_expr is not None and isinstance(value_expr, exprs.Expr):
|
|
456
456
|
# create copy so we can modify it
|
|
457
457
|
value_expr = value_expr.copy()
|
|
458
|
-
stored = spec.get('stored')
|
|
458
|
+
stored = spec.get('stored', True)
|
|
459
459
|
primary_key = spec.get('primary_key')
|
|
460
460
|
|
|
461
461
|
column = Column(
|
|
@@ -478,12 +478,10 @@ class Table(SchemaObject):
|
|
|
478
478
|
raise excs.Error(f'Column name conflicts with a registered query: {col.name!r}')
|
|
479
479
|
if col.stored is False and not (col.is_computed and col.col_type.is_image_type()):
|
|
480
480
|
raise excs.Error(f'Column {col.name!r}: stored={col.stored} only applies to computed image columns')
|
|
481
|
-
if col.stored is False and
|
|
481
|
+
if col.stored is False and col.has_window_fn_call():
|
|
482
482
|
raise excs.Error((
|
|
483
483
|
f'Column {col.name!r}: stored={col.stored} is not valid for image columns computed with a streaming '
|
|
484
484
|
f'function'))
|
|
485
|
-
if col.stored is None:
|
|
486
|
-
col.stored = not (col.is_computed and col.col_type.is_image_type() and not col.has_window_fn_call())
|
|
487
485
|
|
|
488
486
|
@classmethod
|
|
489
487
|
def _verify_schema(cls, schema: list[Column]) -> None:
|
|
@@ -745,18 +743,34 @@ class Table(SchemaObject):
|
|
|
745
743
|
self._check_is_dropped()
|
|
746
744
|
return self._tbl_version.update(value_spec, where, cascade)
|
|
747
745
|
|
|
748
|
-
def batch_update(
|
|
746
|
+
def batch_update(
|
|
747
|
+
self, rows: Iterable[dict[str, Any]], cascade: bool = True,
|
|
748
|
+
if_not_exists: Literal['error', 'ignore', 'insert'] = 'error'
|
|
749
|
+
) -> UpdateStatus:
|
|
749
750
|
"""Update rows in this table.
|
|
750
751
|
|
|
751
752
|
Args:
|
|
752
753
|
rows: an Iterable of dictionaries containing values for the updated columns plus values for the primary key
|
|
753
754
|
columns.
|
|
754
755
|
cascade: if True, also update all computed columns that transitively depend on the updated columns.
|
|
756
|
+
if_not_exists: Specifies the behavior if a row to update does not exist:
|
|
757
|
+
|
|
758
|
+
- `'error'`: Raise an error.
|
|
759
|
+
- `'ignore'`: Skip the row silently.
|
|
760
|
+
- `'insert'`: Insert the row.
|
|
755
761
|
|
|
756
762
|
Examples:
|
|
757
|
-
Update the
|
|
763
|
+
Update the `name` and `age` columns for the rows with ids 1 and 2 (assuming `id` is the primary key).
|
|
764
|
+
If either row does not exist, this raises an error:
|
|
758
765
|
|
|
759
766
|
>>> tbl.update([{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 2, 'name': 'Bob', 'age': 40}])
|
|
767
|
+
|
|
768
|
+
Update the `name` and `age` columns for the row with `id` 1 (assuming `id` is the primary key) and insert
|
|
769
|
+
the row with new `id` 3 (assuming this key does not exist):
|
|
770
|
+
|
|
771
|
+
>>> tbl.update(
|
|
772
|
+
[{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
|
|
773
|
+
if_not_exists='insert')
|
|
760
774
|
"""
|
|
761
775
|
if self._tbl_version_path.is_snapshot():
|
|
762
776
|
raise excs.Error('Cannot update a snapshot')
|
|
@@ -784,7 +798,9 @@ class Table(SchemaObject):
|
|
|
784
798
|
missing_cols = pk_col_names - set(col.name for col in col_vals.keys())
|
|
785
799
|
raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
|
|
786
800
|
row_updates.append(col_vals)
|
|
787
|
-
return self._tbl_version.batch_update(
|
|
801
|
+
return self._tbl_version.batch_update(
|
|
802
|
+
row_updates, rowids, error_if_not_exists=if_not_exists == 'error',
|
|
803
|
+
insert_if_not_exists=if_not_exists == 'insert', cascade=cascade)
|
|
788
804
|
|
|
789
805
|
def delete(self, where: Optional['pixeltable.exprs.Expr'] = None) -> UpdateStatus:
|
|
790
806
|
"""Delete rows in this table.
|
|
@@ -702,10 +702,18 @@ class TableVersion:
|
|
|
702
702
|
raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
|
|
703
703
|
|
|
704
704
|
with Env.get().engine.begin() as conn:
|
|
705
|
-
|
|
705
|
+
plan, updated_cols, recomputed_cols = (
|
|
706
|
+
Planner.create_update_plan(self.path, update_spec, [], where, cascade)
|
|
707
|
+
)
|
|
708
|
+
result = self.propagate_update(
|
|
709
|
+
plan, where.sql_expr() if where is not None else None, recomputed_cols,
|
|
710
|
+
base_versions=[], conn=conn, timestamp=time.time(), cascade=cascade, show_progress=True)
|
|
711
|
+
result.updated_cols = updated_cols
|
|
712
|
+
return result
|
|
706
713
|
|
|
707
714
|
def batch_update(
|
|
708
|
-
self, batch: list[dict[Column, 'exprs.Expr']], rowids: list[tuple[int, ...]],
|
|
715
|
+
self, batch: list[dict[Column, 'exprs.Expr']], rowids: list[tuple[int, ...]], insert_if_not_exists: bool,
|
|
716
|
+
error_if_not_exists: bool, cascade: bool = True,
|
|
709
717
|
) -> UpdateStatus:
|
|
710
718
|
"""Update rows in batch.
|
|
711
719
|
Args:
|
|
@@ -714,62 +722,26 @@ class TableVersion:
|
|
|
714
722
|
"""
|
|
715
723
|
# if we do lookups of rowids, we must have one for each row in the batch
|
|
716
724
|
assert len(rowids) == 0 or len(rowids) == len(batch)
|
|
717
|
-
result_status = UpdateStatus()
|
|
718
725
|
cols_with_excs: set[str] = set()
|
|
719
|
-
updated_cols: set[str] = set()
|
|
720
|
-
pk_cols = self.primary_key_columns()
|
|
721
|
-
use_rowids = len(rowids) > 0
|
|
722
726
|
|
|
723
727
|
with Env.get().engine.begin() as conn:
|
|
724
|
-
|
|
725
|
-
where_clause: Optional[exprs.Expr] = None
|
|
726
|
-
if use_rowids:
|
|
727
|
-
# construct Where clause to match rowid
|
|
728
|
-
num_rowid_cols = len(self.store_tbl.rowid_columns())
|
|
729
|
-
for col_idx in range(num_rowid_cols):
|
|
730
|
-
assert len(rowids[i]) == num_rowid_cols, f'len({rowids[i]}) != {num_rowid_cols}'
|
|
731
|
-
clause = exprs.RowidRef(self, col_idx) == rowids[i][col_idx]
|
|
732
|
-
if where_clause is None:
|
|
733
|
-
where_clause = clause
|
|
734
|
-
else:
|
|
735
|
-
where_clause = where_clause & clause
|
|
736
|
-
else:
|
|
737
|
-
# construct Where clause for primary key columns
|
|
738
|
-
for col in pk_cols:
|
|
739
|
-
assert col in row
|
|
740
|
-
clause = exprs.ColumnRef(col) == row[col]
|
|
741
|
-
if where_clause is None:
|
|
742
|
-
where_clause = clause
|
|
743
|
-
else:
|
|
744
|
-
where_clause = where_clause & clause
|
|
745
|
-
|
|
746
|
-
update_targets = {col: row[col] for col in row if col not in pk_cols}
|
|
747
|
-
status = self._update(conn, update_targets, where_clause, cascade, show_progress=False)
|
|
748
|
-
result_status.num_rows += status.num_rows
|
|
749
|
-
result_status.num_excs += status.num_excs
|
|
750
|
-
result_status.num_computed_values += status.num_computed_values
|
|
751
|
-
cols_with_excs.update(status.cols_with_excs)
|
|
752
|
-
updated_cols.update(status.updated_cols)
|
|
753
|
-
|
|
754
|
-
result_status.cols_with_excs = list(cols_with_excs)
|
|
755
|
-
result_status.updated_cols = list(updated_cols)
|
|
756
|
-
return result_status
|
|
757
|
-
|
|
758
|
-
def _update(
|
|
759
|
-
self, conn: sql.engine.Connection, update_targets: dict[Column, 'pixeltable.exprs.Expr'],
|
|
760
|
-
where_clause: Optional['pixeltable.exprs.Expr'] = None, cascade: bool = True,
|
|
761
|
-
show_progress: bool = True
|
|
762
|
-
) -> UpdateStatus:
|
|
763
|
-
from pixeltable.plan import Planner
|
|
728
|
+
from pixeltable.plan import Planner
|
|
764
729
|
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
730
|
+
plan, row_update_node, delete_where_clause, updated_cols, recomputed_cols = \
|
|
731
|
+
Planner.create_batch_update_plan(self.path, batch, rowids, cascade=cascade)
|
|
732
|
+
result = self.propagate_update(
|
|
733
|
+
plan, delete_where_clause, recomputed_cols, base_versions=[], conn=conn, timestamp=time.time(),
|
|
734
|
+
cascade=cascade)
|
|
735
|
+
result.updated_cols = [c.qualified_name for c in updated_cols]
|
|
736
|
+
|
|
737
|
+
unmatched_rows = row_update_node.unmatched_rows()
|
|
738
|
+
if len(unmatched_rows) > 0:
|
|
739
|
+
if error_if_not_exists:
|
|
740
|
+
raise excs.Error(f'batch_update(): {len(unmatched_rows)} row(s) not found')
|
|
741
|
+
if insert_if_not_exists:
|
|
742
|
+
insert_status = self.insert(unmatched_rows, print_stats=False, fail_on_exception=False)
|
|
743
|
+
result += insert_status
|
|
744
|
+
return result
|
|
773
745
|
|
|
774
746
|
def _validate_update_spec(
|
|
775
747
|
self, value_spec: dict[str, Any], allow_pk: bool, allow_exprs: bool
|
|
@@ -779,7 +751,10 @@ class TableVersion:
|
|
|
779
751
|
if not isinstance(col_name, str):
|
|
780
752
|
raise excs.Error(f'Update specification: dict key must be column name, got {col_name!r}')
|
|
781
753
|
if col_name == _ROWID_COLUMN_NAME:
|
|
782
|
-
#
|
|
754
|
+
# a valid rowid is a list of ints, one per rowid column
|
|
755
|
+
assert len(val) == len(self.store_tbl.rowid_columns())
|
|
756
|
+
for el in val:
|
|
757
|
+
assert isinstance(el, int)
|
|
783
758
|
continue
|
|
784
759
|
col = self.path.get_column(col_name, include_bases=False)
|
|
785
760
|
if col is None:
|
pixeltable/catalog/view.py
CHANGED
|
@@ -92,7 +92,7 @@ class View(Table):
|
|
|
92
92
|
]
|
|
93
93
|
sig = func.Signature(InvalidType(), params)
|
|
94
94
|
from pixeltable.exprs import FunctionCall
|
|
95
|
-
FunctionCall.
|
|
95
|
+
FunctionCall.normalize_args(sig, bound_args)
|
|
96
96
|
except TypeError as e:
|
|
97
97
|
raise Error(f'Cannot instantiate iterator with given arguments: {e}')
|
|
98
98
|
|
pixeltable/env.py
CHANGED
|
@@ -16,7 +16,7 @@ from dataclasses import dataclass
|
|
|
16
16
|
from pathlib import Path
|
|
17
17
|
from typing import Callable, Optional, Dict, Any, List, TYPE_CHECKING
|
|
18
18
|
|
|
19
|
-
import
|
|
19
|
+
import pixeltable_pgserver
|
|
20
20
|
import sqlalchemy as sql
|
|
21
21
|
import yaml
|
|
22
22
|
from tqdm import TqdmWarning
|
|
@@ -60,7 +60,7 @@ class Env:
|
|
|
60
60
|
self._sa_engine: Optional[sql.engine.base.Engine] = None
|
|
61
61
|
self._pgdata_dir: Optional[Path] = None
|
|
62
62
|
self._db_name: Optional[str] = None
|
|
63
|
-
self._db_server: Optional[
|
|
63
|
+
self._db_server: Optional[pixeltable_pgserver.PostgresServer] = None
|
|
64
64
|
self._db_url: Optional[str] = None
|
|
65
65
|
|
|
66
66
|
# info about installed packages that are utilized by some parts of the code;
|
|
@@ -266,8 +266,8 @@ class Env:
|
|
|
266
266
|
self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
|
|
267
267
|
self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
|
|
268
268
|
|
|
269
|
-
# in
|
|
270
|
-
self._db_server =
|
|
269
|
+
# in pixeltable_pgserver.get_server(): cleanup_mode=None will leave db on for debugging purposes
|
|
270
|
+
self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
|
|
271
271
|
self._db_url = self._db_server.get_uri(database=self._db_name)
|
|
272
272
|
|
|
273
273
|
if reinit_db:
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -5,6 +5,7 @@ from .exec_context import ExecContext
|
|
|
5
5
|
from .exec_node import ExecNode
|
|
6
6
|
from .expr_eval_node import ExprEvalNode
|
|
7
7
|
from .in_memory_data_node import InMemoryDataNode
|
|
8
|
-
from .
|
|
8
|
+
from .sql_node import SqlScanNode, SqlLookupNode
|
|
9
|
+
from .row_update_node import RowUpdateNode
|
|
9
10
|
from .media_validation_node import MediaValidationNode
|
|
10
11
|
from .data_row_batch import DataRowBatch
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import pixeltable.catalog as catalog
|
|
5
|
+
import pixeltable.exprs as exprs
|
|
6
|
+
from pixeltable.utils.media_store import MediaStore
|
|
7
|
+
from .data_row_batch import DataRowBatch
|
|
8
|
+
from .exec_node import ExecNode
|
|
9
|
+
|
|
10
|
+
_logger = logging.getLogger('pixeltable')
|
|
11
|
+
|
|
12
|
+
class RowUpdateNode(ExecNode):
|
|
13
|
+
"""
|
|
14
|
+
Update individual rows in the input batches, identified by key columns.
|
|
15
|
+
|
|
16
|
+
The updates for a row are provided as a dict of column names to new values.
|
|
17
|
+
The node assumes that all update dicts contain the same keys, and it populates the slots of the columns present in
|
|
18
|
+
the update list.
|
|
19
|
+
"""
|
|
20
|
+
def __init__(
|
|
21
|
+
self, tbl: catalog.TableVersionPath, key_vals_batch: list[tuple], is_rowid_key: bool,
|
|
22
|
+
col_vals_batch: list[dict[catalog.Column, Any]], row_builder: exprs.RowBuilder, input: ExecNode,
|
|
23
|
+
):
|
|
24
|
+
super().__init__(row_builder, [], [], input)
|
|
25
|
+
self.updates = {key_vals: col_vals for key_vals, col_vals in zip(key_vals_batch, col_vals_batch)}
|
|
26
|
+
self.is_rowid_key = is_rowid_key
|
|
27
|
+
# determine slot idxs of all columns we need to read or write
|
|
28
|
+
# retrieve ColumnRefs from the RowBuilder (has slot_idx set)
|
|
29
|
+
all_col_slot_idxs = {
|
|
30
|
+
col_ref.col: col_ref.slot_idx
|
|
31
|
+
for col_ref in row_builder.unique_exprs if isinstance(col_ref, exprs.ColumnRef)
|
|
32
|
+
}
|
|
33
|
+
self.col_slot_idxs = {col: all_col_slot_idxs[col] for col in col_vals_batch[0].keys()}
|
|
34
|
+
self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.primary_key_columns()}
|
|
35
|
+
self.matched_key_vals: set[tuple] = set()
|
|
36
|
+
|
|
37
|
+
def __next__(self) -> DataRowBatch:
|
|
38
|
+
batch = next(self.input)
|
|
39
|
+
for row in batch:
|
|
40
|
+
key_vals = row.rowid if self.is_rowid_key else \
|
|
41
|
+
tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
|
|
42
|
+
if key_vals not in self.updates:
|
|
43
|
+
continue
|
|
44
|
+
self.matched_key_vals.add(key_vals)
|
|
45
|
+
col_vals = self.updates[key_vals]
|
|
46
|
+
for col, val in col_vals.items():
|
|
47
|
+
slot_idx = self.col_slot_idxs[col]
|
|
48
|
+
row[slot_idx] = val
|
|
49
|
+
return batch
|
|
50
|
+
|
|
51
|
+
def unmatched_rows(self) -> list[dict[str, Any]]:
|
|
52
|
+
"""Return rows that didn't get used in the updates as a list of dicts compatible with TableVersion.insert()."""
|
|
53
|
+
result: list[dict[str, Any]] = []
|
|
54
|
+
key_cols = self.key_slot_idxs.keys()
|
|
55
|
+
for key_vals, col_vals in self.updates.items():
|
|
56
|
+
if key_vals in self.matched_key_vals:
|
|
57
|
+
continue
|
|
58
|
+
row = {col.name: val for col, val in zip(key_cols, key_vals)}
|
|
59
|
+
row.update({col.name: val for col, val in col_vals.items()})
|
|
60
|
+
result.append(row)
|
|
61
|
+
return result
|
|
@@ -13,30 +13,23 @@ import pixeltable.catalog as catalog
|
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
15
15
|
|
|
16
|
-
class
|
|
17
|
-
"""Materializes data from the store via
|
|
18
|
-
|
|
16
|
+
class SqlNode(ExecNode):
|
|
17
|
+
"""Materializes data from the store via a Select stmt."""
|
|
18
|
+
|
|
19
19
|
def __init__(
|
|
20
20
|
self, tbl: catalog.TableVersionPath, row_builder: exprs.RowBuilder,
|
|
21
|
-
select_list: Iterable[exprs.Expr],
|
|
22
|
-
where_clause: Optional[exprs.Expr] = None, filter: Optional[exprs.Expr] = None,
|
|
23
|
-
order_by_items: Optional[List[Tuple[exprs.Expr, bool]]] = None,
|
|
24
|
-
limit: int = 0, set_pk: bool = False, exact_version_only: Optional[List[catalog.TableVersion]] = None
|
|
21
|
+
select_list: Iterable[exprs.Expr], set_pk: bool = False
|
|
25
22
|
):
|
|
26
23
|
"""
|
|
24
|
+
Initialize self.stmt with expressions derived from select_list.
|
|
25
|
+
|
|
26
|
+
This only provides the select list. The subclass is responsible for the From clause and any additional clauses.
|
|
27
|
+
|
|
27
28
|
Args:
|
|
28
29
|
select_list: output of the query
|
|
29
|
-
sql_where_clause: SQL Where clause
|
|
30
|
-
filter: additional Where-clause predicate that can't be evaluated via SQL
|
|
31
|
-
limit: max number of rows to return: 0 = no limit
|
|
32
30
|
set_pk: if True, sets the primary for each DataRow
|
|
33
|
-
exact_version_only: tables for which we only want to see rows created at the current version
|
|
34
31
|
"""
|
|
35
32
|
# create Select stmt
|
|
36
|
-
if order_by_items is None:
|
|
37
|
-
order_by_items = []
|
|
38
|
-
if exact_version_only is None:
|
|
39
|
-
exact_version_only = []
|
|
40
33
|
self.tbl = tbl
|
|
41
34
|
target = tbl.tbl_version # the stored table we're scanning
|
|
42
35
|
self.sql_exprs = exprs.ExprSet(select_list)
|
|
@@ -45,21 +38,15 @@ class SqlScanNode(ExecNode):
|
|
|
45
38
|
sql_subexprs = iter_arg.subexprs(filter=lambda e: e.sql_expr() is not None, traverse_matches=False)
|
|
46
39
|
[self.sql_exprs.append(e) for e in sql_subexprs]
|
|
47
40
|
super().__init__(row_builder, self.sql_exprs, [], None) # we materialize self.sql_exprs
|
|
48
|
-
self.filter = filter
|
|
49
|
-
self.filter_eval_ctx = \
|
|
50
|
-
row_builder.create_eval_ctx([filter], exclude=select_list) if filter is not None else None
|
|
51
|
-
self.limit = limit
|
|
52
41
|
|
|
53
42
|
# change rowid refs against a base table to rowid refs against the target table, so that we minimize
|
|
54
43
|
# the number of tables that need to be joined to the target table
|
|
55
44
|
for rowid_ref in [e for e in self.sql_exprs if isinstance(e, exprs.RowidRef)]:
|
|
56
45
|
rowid_ref.set_tbl(tbl)
|
|
57
46
|
|
|
58
|
-
where_clause_tbl_ids = where_clause.tbl_ids() if where_clause is not None else set()
|
|
59
|
-
refd_tbl_ids = exprs.Expr.list_tbl_ids(self.sql_exprs) | where_clause_tbl_ids
|
|
60
47
|
sql_select_list = [e.sql_expr() for e in self.sql_exprs]
|
|
61
48
|
assert len(sql_select_list) == len(self.sql_exprs)
|
|
62
|
-
assert all(
|
|
49
|
+
assert all(e is not None for e in sql_select_list)
|
|
63
50
|
self.set_pk = set_pk
|
|
64
51
|
self.num_pk_cols = 0
|
|
65
52
|
if set_pk:
|
|
@@ -69,42 +56,12 @@ class SqlScanNode(ExecNode):
|
|
|
69
56
|
sql_select_list += pk_columns
|
|
70
57
|
|
|
71
58
|
self.stmt = sql.select(*sql_select_list)
|
|
72
|
-
self.stmt = self.create_from_clause(
|
|
73
|
-
tbl, self.stmt, refd_tbl_ids, exact_version_only={t.id for t in exact_version_only})
|
|
74
|
-
|
|
75
|
-
# change rowid refs against a base table to rowid refs against the target table, so that we minimize
|
|
76
|
-
# the number of tables that need to be joined to the target table
|
|
77
|
-
for rowid_ref in [e for e, _ in order_by_items if isinstance(e, exprs.RowidRef)]:
|
|
78
|
-
rowid_ref.set_tbl(tbl)
|
|
79
|
-
order_by_clause: List[sql.ClauseElement] = []
|
|
80
|
-
for e, asc in order_by_items:
|
|
81
|
-
if isinstance(e, exprs.SimilarityExpr):
|
|
82
|
-
order_by_clause.append(e.as_order_by_clause(asc))
|
|
83
|
-
else:
|
|
84
|
-
order_by_clause.append(e.sql_expr().desc() if not asc else e.sql_expr())
|
|
85
|
-
|
|
86
|
-
if where_clause is not None:
|
|
87
|
-
sql_where_clause = where_clause.sql_expr()
|
|
88
|
-
assert sql_where_clause is not None
|
|
89
|
-
self.stmt = self.stmt.where(sql_where_clause)
|
|
90
|
-
if len(order_by_clause) > 0:
|
|
91
|
-
self.stmt = self.stmt.order_by(*order_by_clause)
|
|
92
|
-
elif target.id in row_builder.unstored_iter_args:
|
|
93
|
-
# we are referencing unstored iter columns from this view and try to order by our primary key,
|
|
94
|
-
# which ensures that iterators will see monotonically increasing pos values
|
|
95
|
-
self.stmt = self.stmt.order_by(*self.tbl.store_tbl.rowid_columns())
|
|
96
|
-
if limit != 0 and self.filter is None:
|
|
97
|
-
# if we need to do post-SQL filtering, we can't use LIMIT
|
|
98
|
-
self.stmt = self.stmt.limit(limit)
|
|
99
59
|
|
|
60
|
+
# additional state
|
|
100
61
|
self.result_cursor: Optional[sql.engine.CursorResult] = None
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
stmt_str = str(self.stmt.compile(compile_kwargs={'literal_binds': True}))
|
|
105
|
-
_logger.debug(f'SqlScanNode stmt:\n{stmt_str}')
|
|
106
|
-
except Exception as e:
|
|
107
|
-
pass
|
|
62
|
+
# the filter is provided by the subclass
|
|
63
|
+
self.filter: Optional[exprs.Expr] = None
|
|
64
|
+
self.filter_eval_ctx: Optional[exprs.EvalContext] = None
|
|
108
65
|
|
|
109
66
|
@classmethod
|
|
110
67
|
def create_from_clause(
|
|
@@ -224,3 +181,110 @@ class SqlScanNode(ExecNode):
|
|
|
224
181
|
if self.result_cursor is not None:
|
|
225
182
|
self.result_cursor.close()
|
|
226
183
|
|
|
184
|
+
|
|
185
|
+
class SqlScanNode(SqlNode):
|
|
186
|
+
"""
|
|
187
|
+
Materializes data from the store via a Select stmt.
|
|
188
|
+
|
|
189
|
+
Supports filtering and ordering.
|
|
190
|
+
"""
|
|
191
|
+
def __init__(
|
|
192
|
+
self, tbl: catalog.TableVersionPath, row_builder: exprs.RowBuilder,
|
|
193
|
+
select_list: Iterable[exprs.Expr],
|
|
194
|
+
where_clause: Optional[exprs.Expr] = None, filter: Optional[exprs.Expr] = None,
|
|
195
|
+
order_by_items: Optional[List[Tuple[exprs.Expr, bool]]] = None,
|
|
196
|
+
limit: int = 0, set_pk: bool = False, exact_version_only: Optional[List[catalog.TableVersion]] = None
|
|
197
|
+
):
|
|
198
|
+
"""
|
|
199
|
+
Args:
|
|
200
|
+
select_list: output of the query
|
|
201
|
+
sql_where_clause: SQL Where clause
|
|
202
|
+
filter: additional Where-clause predicate that can't be evaluated via SQL
|
|
203
|
+
limit: max number of rows to return: 0 = no limit
|
|
204
|
+
set_pk: if True, sets the primary for each DataRow
|
|
205
|
+
exact_version_only: tables for which we only want to see rows created at the current version
|
|
206
|
+
"""
|
|
207
|
+
super().__init__(tbl, row_builder, select_list, set_pk=set_pk)
|
|
208
|
+
# create Select stmt
|
|
209
|
+
if order_by_items is None:
|
|
210
|
+
order_by_items = []
|
|
211
|
+
if exact_version_only is None:
|
|
212
|
+
exact_version_only = []
|
|
213
|
+
target = tbl.tbl_version # the stored table we're scanning
|
|
214
|
+
self.filter = filter
|
|
215
|
+
self.filter_eval_ctx = \
|
|
216
|
+
row_builder.create_eval_ctx([filter], exclude=select_list) if filter is not None else None
|
|
217
|
+
self.limit = limit
|
|
218
|
+
|
|
219
|
+
where_clause_tbl_ids = where_clause.tbl_ids() if where_clause is not None else set()
|
|
220
|
+
refd_tbl_ids = exprs.Expr.list_tbl_ids(self.sql_exprs) | where_clause_tbl_ids
|
|
221
|
+
self.stmt = self.create_from_clause(
|
|
222
|
+
tbl, self.stmt, refd_tbl_ids, exact_version_only={t.id for t in exact_version_only})
|
|
223
|
+
|
|
224
|
+
# change rowid refs against a base table to rowid refs against the target table, so that we minimize
|
|
225
|
+
# the number of tables that need to be joined to the target table
|
|
226
|
+
for rowid_ref in [e for e, _ in order_by_items if isinstance(e, exprs.RowidRef)]:
|
|
227
|
+
rowid_ref.set_tbl(tbl)
|
|
228
|
+
order_by_clause: List[sql.ClauseElement] = []
|
|
229
|
+
for e, asc in order_by_items:
|
|
230
|
+
if isinstance(e, exprs.SimilarityExpr):
|
|
231
|
+
order_by_clause.append(e.as_order_by_clause(asc))
|
|
232
|
+
else:
|
|
233
|
+
order_by_clause.append(e.sql_expr().desc() if not asc else e.sql_expr())
|
|
234
|
+
|
|
235
|
+
if where_clause is not None:
|
|
236
|
+
sql_where_clause = where_clause.sql_expr()
|
|
237
|
+
assert sql_where_clause is not None
|
|
238
|
+
self.stmt = self.stmt.where(sql_where_clause)
|
|
239
|
+
if len(order_by_clause) > 0:
|
|
240
|
+
self.stmt = self.stmt.order_by(*order_by_clause)
|
|
241
|
+
elif target.id in row_builder.unstored_iter_args:
|
|
242
|
+
# we are referencing unstored iter columns from this view and try to order by our primary key,
|
|
243
|
+
# which ensures that iterators will see monotonically increasing pos values
|
|
244
|
+
self.stmt = self.stmt.order_by(*self.tbl.store_tbl.rowid_columns())
|
|
245
|
+
if limit != 0 and self.filter is None:
|
|
246
|
+
# if we need to do post-SQL filtering, we can't use LIMIT
|
|
247
|
+
self.stmt = self.stmt.limit(limit)
|
|
248
|
+
|
|
249
|
+
try:
|
|
250
|
+
# log stmt, if possible
|
|
251
|
+
stmt_str = str(self.stmt.compile(compile_kwargs={'literal_binds': True}))
|
|
252
|
+
_logger.debug(f'SqlScanNode stmt:\n{stmt_str}')
|
|
253
|
+
except Exception as e:
|
|
254
|
+
pass
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class SqlLookupNode(SqlNode):
|
|
258
|
+
"""
|
|
259
|
+
Materializes data from the store via a Select stmt with a WHERE clause that matches a list of key values
|
|
260
|
+
"""
|
|
261
|
+
def __init__(
|
|
262
|
+
self, tbl: catalog.TableVersionPath, row_builder: exprs.RowBuilder,
|
|
263
|
+
select_list: Iterable[exprs.Expr], sa_key_cols: list[sql.Column], key_vals: list[tuple],
|
|
264
|
+
):
|
|
265
|
+
"""
|
|
266
|
+
Args:
|
|
267
|
+
select_list: output of the query
|
|
268
|
+
sa_key_cols: list of key columns in the store table
|
|
269
|
+
key_vals: list of key values to look up
|
|
270
|
+
"""
|
|
271
|
+
super().__init__(tbl, row_builder, select_list, set_pk=True)
|
|
272
|
+
target = tbl.tbl_version # the stored table we're scanning
|
|
273
|
+
refd_tbl_ids = exprs.Expr.list_tbl_ids(self.sql_exprs)
|
|
274
|
+
self.stmt = self.create_from_clause(tbl, self.stmt, refd_tbl_ids)
|
|
275
|
+
# Where clause: (key-col-1, key-col-2, ...) IN ((val-1, val-2, ...), ...)
|
|
276
|
+
self.where_clause = sql.tuple_(*sa_key_cols).in_(key_vals)
|
|
277
|
+
self.stmt = self.stmt.where(self.where_clause)
|
|
278
|
+
|
|
279
|
+
if target.id in row_builder.unstored_iter_args:
|
|
280
|
+
# we are referencing unstored iter columns from this view and try to order by our primary key,
|
|
281
|
+
# which ensures that iterators will see monotonically increasing pos values
|
|
282
|
+
self.stmt = self.stmt.order_by(*self.tbl.store_tbl.rowid_columns())
|
|
283
|
+
|
|
284
|
+
try:
|
|
285
|
+
# log stmt, if possible
|
|
286
|
+
stmt_str = str(self.stmt.compile(compile_kwargs={'literal_binds': True}))
|
|
287
|
+
_logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
|
|
288
|
+
except Exception as e:
|
|
289
|
+
pass
|
|
290
|
+
|
pixeltable/exprs/__init__.py
CHANGED
|
@@ -8,7 +8,6 @@ from .data_row import DataRow
|
|
|
8
8
|
from .expr import Expr
|
|
9
9
|
from .expr_set import ExprSet
|
|
10
10
|
from .function_call import FunctionCall
|
|
11
|
-
from .image_member_access import ImageMemberAccess
|
|
12
11
|
from .in_predicate import InPredicate
|
|
13
12
|
from .inline_array import InlineArray
|
|
14
13
|
from .inline_dict import InlineDict
|
|
@@ -16,6 +15,7 @@ from .is_null import IsNull
|
|
|
16
15
|
from .json_mapper import JsonMapper
|
|
17
16
|
from .json_path import RELATIVE_PATH_ROOT, JsonPath
|
|
18
17
|
from .literal import Literal
|
|
18
|
+
from .method_ref import MethodRef
|
|
19
19
|
from .object_ref import ObjectRef
|
|
20
20
|
from .row_builder import RowBuilder, ColumnSlotIdx, ExecProfile
|
|
21
21
|
from .rowid_ref import RowidRef
|