pixeltable 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +5 -3
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -0
- pixeltable/catalog/catalog.py +335 -128
- pixeltable/catalog/column.py +21 -5
- pixeltable/catalog/dir.py +19 -6
- pixeltable/catalog/insertable_table.py +34 -37
- pixeltable/catalog/named_function.py +0 -4
- pixeltable/catalog/schema_object.py +28 -42
- pixeltable/catalog/table.py +195 -158
- pixeltable/catalog/table_version.py +187 -232
- pixeltable/catalog/table_version_handle.py +50 -0
- pixeltable/catalog/table_version_path.py +49 -33
- pixeltable/catalog/view.py +56 -96
- pixeltable/config.py +103 -0
- pixeltable/dataframe.py +90 -90
- pixeltable/env.py +98 -168
- pixeltable/exec/aggregation_node.py +5 -4
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/component_iteration_node.py +13 -9
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +0 -4
- pixeltable/exec/exec_node.py +3 -2
- pixeltable/exec/expr_eval/schedulers.py +2 -1
- pixeltable/exec/in_memory_data_node.py +9 -4
- pixeltable/exec/row_update_node.py +1 -2
- pixeltable/exec/sql_node.py +20 -16
- pixeltable/exprs/column_ref.py +9 -9
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +4 -4
- pixeltable/exprs/expr.py +20 -5
- pixeltable/exprs/function_call.py +98 -58
- pixeltable/exprs/json_mapper.py +25 -8
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/object_ref.py +16 -5
- pixeltable/exprs/row_builder.py +15 -15
- pixeltable/exprs/rowid_ref.py +21 -7
- pixeltable/func/__init__.py +1 -1
- pixeltable/func/function.py +38 -6
- pixeltable/func/query_template_function.py +3 -6
- pixeltable/func/tools.py +26 -26
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +2 -0
- pixeltable/functions/anthropic.py +9 -3
- pixeltable/functions/fireworks.py +7 -4
- pixeltable/functions/globals.py +4 -5
- pixeltable/functions/huggingface.py +1 -5
- pixeltable/functions/image.py +17 -7
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +4 -4
- pixeltable/functions/openai.py +26 -23
- pixeltable/functions/string.py +23 -30
- pixeltable/functions/timestamp.py +11 -6
- pixeltable/functions/together.py +14 -12
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +5 -4
- pixeltable/functions/vision.py +6 -9
- pixeltable/functions/whisper.py +3 -3
- pixeltable/globals.py +246 -260
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +1 -1
- pixeltable/index/btree.py +3 -1
- pixeltable/index/embedding_index.py +11 -5
- pixeltable/io/external_store.py +11 -12
- pixeltable/io/label_studio.py +4 -3
- pixeltable/io/parquet.py +57 -56
- pixeltable/iterators/__init__.py +4 -2
- pixeltable/iterators/audio.py +11 -11
- pixeltable/iterators/document.py +10 -10
- pixeltable/iterators/string.py +1 -2
- pixeltable/iterators/video.py +14 -15
- pixeltable/metadata/__init__.py +9 -5
- pixeltable/metadata/converters/convert_10.py +0 -1
- pixeltable/metadata/converters/convert_15.py +0 -2
- pixeltable/metadata/converters/convert_23.py +0 -2
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_27.py +0 -2
- pixeltable/metadata/converters/convert_28.py +0 -2
- pixeltable/metadata/converters/convert_29.py +7 -8
- pixeltable/metadata/converters/util.py +7 -7
- pixeltable/metadata/schema.py +27 -19
- pixeltable/plan.py +68 -40
- pixeltable/share/packager.py +12 -9
- pixeltable/store.py +37 -38
- pixeltable/type_system.py +41 -28
- pixeltable/utils/filecache.py +2 -1
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/METADATA +1 -1
- pixeltable-0.3.7.dist-info/RECORD +174 -0
- pixeltable-0.3.5.dist-info/RECORD +0 -172
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/entry_points.txt +0 -0
@@ -13,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
 
 def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
     if k == 'path' and (
-        v in
+        v in {'pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image'}
     ):
         return 'path', 'pixeltable.functions.huggingface.clip'
     return None
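The hunk above shows the converters' substitution protocol: a function receives (key, value) pairs from a recursive walk over the stored metadata and returns a replacement pair, or None to leave the entry unchanged. A minimal sketch of the same pattern, with hypothetical module paths standing in for the real ones:

```python
from typing import Any, Optional

# Hypothetical sketch of the substitution protocol: return a replacement
# (key, value) pair, or None to keep the entry unchanged.
def substitute(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
    if k == 'path' and v in {'pkg.mod.old_name_a', 'pkg.mod.old_name_b'}:
        # both legacy paths collapse to a single new path
        return 'path', 'pkg.mod.new_name'
    return None
```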
@@ -1,12 +1,10 @@
 import logging
-from typing import Any, Optional
 from uuid import UUID
 
 import sqlalchemy as sql
 
 from pixeltable.metadata import register_converter
 from pixeltable.metadata.converters.util import convert_table_md
-from pixeltable.metadata.schema import Table
 
 _logger = logging.getLogger('pixeltable')
 

@@ -63,13 +63,12 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
     # is an edge case that won't migrate properly.
     parameters: list[dict] = v['fn']['signature']['parameters']
     for i, param in enumerate(parameters):
-        if param['kind'] == 'VAR_POSITIONAL':
-
-
-
-
-
-            new_args = new_args[:i] + new_args[i + 1 :]
+        if param['kind'] == 'VAR_POSITIONAL' and new_args_len > i:
+            # For peculiar historical reasons, variable kwargs might show up in args. Thus variable
+            # positional args is not necessarily the last element of args; it might be the second-to-last.
+            assert new_args_len <= i + 2, new_args
+            rolled_args = new_args[i]
+            new_args = new_args[:i] + new_args[i + 1 :]
         if param['kind'] == 'VAR_KEYWORD':
             # As noted above, variable kwargs might show up either in args or in kwargs. If it's in args, it
             # is necessarily the last element.

@@ -81,7 +80,7 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
             rolled_kwargs = kwargs.pop(param['name'])
 
     if rolled_args is not None:
-        assert rolled_args['_classname'] in
+        assert rolled_args['_classname'] in {'InlineArray', 'InlineList'}
         new_args.extend(rolled_args['components'])
     if rolled_kwargs is not None:
         assert rolled_kwargs['_classname'] == 'InlineDict'

@@ -34,12 +34,12 @@ def convert_table_md(
     """
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Table)):
-
+            tbl_id = row[0]
             table_md = row[2]
             assert isinstance(table_md, dict)
             updated_table_md = copy.deepcopy(table_md)
             if table_md_updater is not None:
-                table_md_updater(updated_table_md,
+                table_md_updater(updated_table_md, tbl_id)
             if column_md_updater is not None:
                 __update_column_md(updated_table_md, column_md_updater)
             if external_store_md_updater is not None:

@@ -47,19 +47,19 @@ def convert_table_md(
             if substitution_fn is not None:
                 updated_table_md = __substitute_md_rec(updated_table_md, substitution_fn)
             if updated_table_md != table_md:
-                __logger.info(f'Updating schema for table: {
-                conn.execute(sql.update(Table).where(Table.id ==
+                __logger.info(f'Updating schema for table: {tbl_id}')
+                conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
 
         for row in conn.execute(sql.select(Function)):
-
+            fn_id = row[0]
             function_md = row[2]
             assert isinstance(function_md, dict)
             updated_function_md = copy.deepcopy(function_md)
             if substitution_fn is not None:
                 updated_function_md = __substitute_md_rec(updated_function_md, substitution_fn)
             if updated_function_md != function_md:
-                __logger.info(f'Updating function: {
-                conn.execute(sql.update(Function).where(Function.id ==
+                __logger.info(f'Updating function: {fn_id}')
+                conn.execute(sql.update(Function).where(Function.id == fn_id).values(md=updated_function_md))
 
 
 def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
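The `convert_table_md` hunks above switch to reading the row id once and reusing it in the logging and `UPDATE` statements. A minimal sketch of that read-modify-write loop, assuming `Table` is the declarative ORM class defined in schema.py (rows of `(id, ..., md)`, with `md` a JSONB dict):

```python
import copy
from typing import Any, Callable

import sqlalchemy as sql

# Sketch of the conversion loop, assuming `Table` is the ORM class from schema.py.
def rewrite_md(engine: sql.engine.Engine, table_md_updater: Callable[[dict, Any], None]) -> None:
    with engine.begin() as conn:
        for row in conn.execute(sql.select(Table)):
            tbl_id, table_md = row[0], row[2]
            updated = copy.deepcopy(table_md)
            table_md_updater(updated, tbl_id)
            if updated != table_md:  # only write back rows that actually changed
                conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated))
```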
pixeltable/metadata/schema.py CHANGED
@@ -4,16 +4,14 @@ import uuid
 from typing import Any, Optional, TypeVar, Union, get_type_hints
 
 import sqlalchemy as sql
-
-from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary
+from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
 from sqlalchemy.dialects.postgresql import JSONB, UUID
-from sqlalchemy.orm import declarative_base
 from sqlalchemy.orm.decl_api import DeclarativeMeta
 
 # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
 # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
 # outside of the module in a typesafe way.
-Base: type = declarative_base()
+Base: type = orm.declarative_base()
 assert isinstance(Base, DeclarativeMeta)
 base_metadata = Base.metadata
 

@@ -23,7 +21,7 @@ T = TypeVar('T')
 def md_from_dict(data_class_type: type[T], data: Any) -> T:
     """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
     if dataclasses.is_dataclass(data_class_type):
-        fieldtypes =
+        fieldtypes = get_type_hints(data_class_type)
         return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})  # type: ignore[return-value]
 
     origin = typing.get_origin(data_class_type)
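The `get_type_hints` fix above restores the recursive rehydration of nested dataclasses from plain dicts. A self-contained sketch of the same pattern (the example classes are hypothetical):

```python
import dataclasses
from typing import Any, TypeVar, get_type_hints

T = TypeVar('T')

@dataclasses.dataclass
class Inner:
    x: int

@dataclasses.dataclass
class Outer:
    name: str
    inner: Inner

def from_dict(data_class_type: type[T], data: Any) -> T:
    if dataclasses.is_dataclass(data_class_type):
        # get_type_hints() resolves each field's annotation (including forward
        # references), so nested dataclass fields can be rebuilt recursively
        fieldtypes = get_type_hints(data_class_type)
        return data_class_type(**{f: from_dict(fieldtypes[f], data[f]) for f in data})
    return data

outer = from_dict(Outer, {'name': 'a', 'inner': {'x': 1}})
assert outer.inner == Inner(x=1)
```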
@@ -43,7 +41,7 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
         elif origin is tuple:
             return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(type_args, data))  # type: ignore[return-value]
         else:
-
+            raise AssertionError(origin)
     else:
         return data
 

@@ -85,7 +83,7 @@ class Dir(Base):
         UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False
     )
     parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
-    md =
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # DirMd
 
 
 @dataclasses.dataclass

@@ -131,13 +129,17 @@ class IndexMd:
     init_args: dict[str, Any]
 
 
+# a stored table version path is a list of (table id as str, effective table version)
+TableVersionPath = list[tuple[str, Optional[int]]]
+
+
 @dataclasses.dataclass
 class ViewMd:
     is_snapshot: bool
     include_base_columns: bool
 
     # (table id, version); for mutable views, all versions are None
-    base_versions:
+    base_versions: TableVersionPath
 
     # filter predicate applied to the base table; view-only
     predicate: Optional[dict[str, Any]]
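The new `TableVersionPath` alias makes the stored shape of a version path explicit. Illustrative values (the ids here are hypothetical):

```python
from typing import Optional

# stored form of a table version path: (table id as str, effective version)
TableVersionPath = list[tuple[str, Optional[int]]]

# a snapshot pins concrete versions; a mutable view stores None throughout
snapshot_base_versions: TableVersionPath = [('1b2c3d4e', 12), ('5f6a7b8c', 4)]
mutable_base_versions: TableVersionPath = [('1b2c3d4e', None)]
```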
@@ -192,7 +194,7 @@ class Table(Base):
 
     id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, nullable=False)
     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
-    md =
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableMd
 
 
 @dataclasses.dataclass

@@ -205,9 +207,11 @@ class TableVersionMd:
 
 class TableVersion(Base):
     __tablename__ = 'tableversions'
-    tbl_id
-
-
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    version: orm.Mapped[int] = orm.mapped_column(BigInteger, primary_key=True, nullable=False)
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)
 
 
 @dataclasses.dataclass

@@ -246,9 +250,11 @@ class TableSchemaVersionMd:
 class TableSchemaVersion(Base):
     __tablename__ = 'tableschemaversions'
 
-    tbl_id
-
-
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    schema_version: orm.Mapped[int] = orm.mapped_column(BigInteger, primary_key=True, nullable=False)
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableSchemaVersionMd
 
 
 @dataclasses.dataclass

@@ -271,7 +277,9 @@ class Function(Base):
 
     __tablename__ = 'functions'
 
-    id
-
-
-
+    id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False
+    )
+    dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # FunctionMd
+    binary_obj: orm.Mapped[Optional[bytes]] = orm.mapped_column(LargeBinary, nullable=True)
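All of the schema.py tables now use SQLAlchemy 2.0-style typed declarative mappings. A minimal sketch of the pattern, assuming SQLAlchemy 2.x with the PostgreSQL dialect (the `Example` class is hypothetical):

```python
import uuid
from typing import Any

from sqlalchemy import orm
from sqlalchemy.dialects.postgresql import JSONB, UUID

Base: type = orm.declarative_base()

class Example(Base):
    __tablename__ = 'examples'

    # orm.Mapped[...] carries the Python-side type; orm.mapped_column() the SQL side
    id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)
```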
pixeltable/plan.py CHANGED
@@ -2,14 +2,15 @@ from __future__ import annotations
 
 import dataclasses
 import enum
+from textwrap import dedent
 from typing import Any, Iterable, Literal, Optional, Sequence
 from uuid import UUID
 
 import sqlalchemy as sql
 
 import pixeltable as pxt
-
-from pixeltable import
+from pixeltable import catalog, exceptions as excs, exec, exprs
+from pixeltable.catalog import Column, TableVersionHandle
 from pixeltable.exec.sql_node import OrderByClause, OrderByItem, combine_order_by_clauses, print_order_by_clause
 
 

@@ -54,9 +55,9 @@ class JoinType(enum.Enum):
     def validated(cls, name: str, error_prefix: str) -> JoinType:
         try:
             return cls[name.upper()]
-        except KeyError:
-            val_strs = ', '.join(f'{s.lower()!r}' for s in cls.__members__
-            raise excs.Error(f'{error_prefix} must be one of: [{val_strs}]')
+        except KeyError as exc:
+            val_strs = ', '.join(f'{s.lower()!r}' for s in cls.__members__)
+            raise excs.Error(f'{error_prefix} must be one of: [{val_strs}]') from exc
 
 
 @dataclasses.dataclass
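The `from exc` change in `validated` preserves the original `KeyError` as `__cause__` in the traceback. A stdlib-only sketch of the same pattern, using `ValueError` in place of pixeltable's `excs.Error`:

```python
import enum

class JoinType(enum.Enum):
    INNER = 0
    LEFT = 1

def validated(name: str, error_prefix: str) -> JoinType:
    try:
        return JoinType[name.upper()]
    except KeyError as exc:
        val_strs = ', '.join(f'{s.lower()!r}' for s in JoinType.__members__)
        # chaining with `from exc` keeps the KeyError in the traceback as __cause__
        raise ValueError(f'{error_prefix} must be one of: [{val_strs}]') from exc

validated('inner', 'join type')    # ok
# validated('cross', 'join type')  # ValueError: join type must be one of: ['inner', 'left']
```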
@@ -177,19 +178,21 @@ class Analyzer:
         )
 
         # check that Where clause and filter doesn't contain aggregates
-        if self.sql_where_clause is not None
-
-
-
-
-
+        if self.sql_where_clause is not None and any(
+            _is_agg_fn_call(e) for e in self.sql_where_clause.subexprs(expr_class=exprs.FunctionCall)
+        ):
+            raise excs.Error(f'where() cannot contain aggregate functions: {self.sql_where_clause}')
+        if self.filter is not None and any(
+            _is_agg_fn_call(e) for e in self.filter.subexprs(expr_class=exprs.FunctionCall)
+        ):
+            raise excs.Error(f'where() cannot contain aggregate functions: {self.filter}')
 
         # check that grouping exprs don't contain aggregates and can be expressed as SQL (we perform sort-based
         # aggregation and rely on the SqlScanNode returning data in the correct order)
         for e in self.group_by_clause:
             if not self.sql_elements.contains(e):
                 raise excs.Error(f'Invalid grouping expression, needs to be expressible in SQL: {e}')
-            if e._contains(filter=
+            if e._contains(filter=_is_agg_fn_call):
                 raise excs.Error(f'Grouping expression contains aggregate function: {e}')
 
     def _determine_agg_status(self, e: exprs.Expr, grouping_expr_ids: set[int]) -> tuple[bool, bool]:
@@ -207,7 +210,7 @@ class Analyzer:
             return True, False
         elif isinstance(e, exprs.Literal):
             return True, True
-        elif isinstance(e, exprs.ColumnRef
+        elif isinstance(e, (exprs.ColumnRef, exprs.RowidRef)):
             # we already know that this isn't a grouping expr
             return False, True
         else:

@@ -275,14 +278,19 @@ class Planner:
         cls, tbl: catalog.TableVersion, rows: list[dict[str, Any]], ignore_errors: bool
     ) -> exec.ExecNode:
         """Creates a plan for TableVersion.insert()"""
-        assert not tbl.is_view
+        assert not tbl.is_view
         # stored_cols: all cols we need to store, incl computed cols (and indices)
         stored_cols = [c for c in tbl.cols_by_id.values() if c.is_stored]
         assert len(stored_cols) > 0  # there needs to be something to store
+
+        cls.__check_valid_columns(tbl, stored_cols, 'inserted into')
+
         row_builder = exprs.RowBuilder([], stored_cols, [])
 
         # create InMemoryDataNode for 'rows'
-        plan: exec.ExecNode = exec.InMemoryDataNode(
+        plan: exec.ExecNode = exec.InMemoryDataNode(
+            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_rowid
+        )
 
         media_input_col_info = [
             exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)

@@ -318,7 +326,7 @@ class Planner:
     def create_df_insert_plan(
         cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
     ) -> exec.ExecNode:
-        assert not tbl.is_view
+        assert not tbl.is_view
         plan = df._create_query_plan()  # ExecNode constructed by the DataFrame
 
         # Modify the plan RowBuilder to register the output columns

@@ -363,7 +371,7 @@ class Planner:
         """
         # retrieve all stored cols and all target exprs
         assert isinstance(tbl, catalog.TableVersionPath)
-        target = tbl.tbl_version  # the one we need to update
+        target = tbl.tbl_version.get()  # the one we need to update
         updated_cols = list(update_targets.keys())
         if len(recompute_targets) > 0:
             recomputed_cols = set(recompute_targets)

@@ -374,11 +382,14 @@ class Planner:
             recomputed_cols.update(idx_val_cols)
         # we only need to recompute stored columns (unstored ones are substituted away)
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
-
+
+        cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
+
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
-            if col.is_stored and not
+            if col.is_stored and col not in updated_cols and col not in recomputed_base_cols
         ]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         select_list.extend(update_targets.values())

@@ -398,7 +409,25 @@ class Planner:
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
-        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
+        return plan, [f'{c.tbl.get().name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
+
+    @classmethod
+    def __check_valid_columns(
+        cls, tbl: catalog.TableVersion, cols: Iterable[Column], op_name: Literal['inserted into', 'updated in']
+    ) -> None:
+        for col in cols:
+            if col.value_expr is not None and not col.value_expr.is_valid:
+                raise excs.Error(
+                    dedent(
+                        f"""
+                        Data cannot be {op_name} the table {tbl.name!r},
+                        because the column {col.name!r} is currently invalid:
+                        {{validation_error}}
+                        """
+                    )
+                    .strip()
+                    .format(validation_error=col.value_expr.validation_error)
+                )
 
     @classmethod
     def create_batch_update_plan(
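The doubled braces in the new `__check_valid_columns` message are the interesting detail: `{op_name}` and the table/column names are interpolated by the f-string immediately, while `{{validation_error}}` survives as a literal `{validation_error}` placeholder that the trailing `.format()` fills after `dedent()` has stripped the indentation. A standalone sketch of that message-building pattern:

```python
from textwrap import dedent

def invalid_column_message(op_name: str, table: str, column: str, validation_error: str) -> str:
    # the f-string fills op_name/table/column now; {{validation_error}} becomes a
    # literal {validation_error} placeholder that .format() fills after dedent()
    return (
        dedent(
            f"""
            Data cannot be {op_name} the table {table!r},
            because the column {column!r} is currently invalid:
            {{validation_error}}
            """
        )
        .strip()
        .format(validation_error=validation_error)
    )

print(invalid_column_message('inserted into', 'films', 'summary', 'function not found'))
```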
@@ -417,7 +446,7 @@ class Planner:
         - list of user-visible columns that are being recomputed
         """
         assert isinstance(tbl, catalog.TableVersionPath)
-        target = tbl.tbl_version  # the one we need to update
+        target = tbl.tbl_version.get()  # the one we need to update
         sa_key_cols: list[sql.Column] = []
         key_vals: list[tuple] = []
         if len(rowids) > 0:

@@ -440,7 +469,7 @@ class Planner:
         copied_cols = [
             col
             for col in target.cols_by_id.values()
-            if col.is_stored and not
+            if col.is_stored and col not in updated_cols and col not in recomputed_base_cols
         ]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         select_list.extend(exprs.ColumnRef(col) for col in updated_cols)

@@ -507,11 +536,11 @@ class Planner:
         - list of columns that are being recomputed
         """
         assert isinstance(view, catalog.TableVersionPath)
-        assert view.is_view
-        target = view.tbl_version  # the one we need to update
+        assert view.is_view
+        target = view.tbl_version.get()  # the one we need to update
         # retrieve all stored cols and all target exprs
         recomputed_cols = set(recompute_targets.copy())
-        copied_cols = [col for col in target.cols_by_id.values() if col.is_stored and not
+        copied_cols = [col for col in target.cols_by_id.values() if col.is_stored and col not in recomputed_cols]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         # resolve recomputed exprs to stored columns in the base
         recomputed_exprs = [

@@ -551,13 +580,13 @@ class Planner:
         - number of materialized values per row
         """
         assert isinstance(view, catalog.TableVersionPath)
-        assert view.is_view
+        assert view.is_view
         # things we need to materialize as DataRows:
         # 1. stored computed cols
         #    - iterator columns are effectively computed, just not with a value_expr
         #    - we can ignore stored non-computed columns because they have a default value that is supplied directly by
         #      the store
-        target = view.tbl_version  # the one we need to populate
+        target = view.tbl_version.get()  # the one we need to populate
         stored_cols = [c for c in target.cols_by_id.values() if c.is_stored]
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []

@@ -585,8 +614,8 @@ class Planner:
             exact_version_only=view.get_bases() if propagates_insert else [],
         )
         exec_ctx = plan.ctx
-        if target.is_component_view
-            plan = exec.ComponentIterationNode(
+        if target.is_component_view:
+            plan = exec.ComponentIterationNode(view.tbl_version, plan)
         if len(view_output_exprs) > 0:
             plan = exec.ExprEvalNode(
                 row_builder, output_exprs=view_output_exprs, input_exprs=base_output_exprs, input=plan

@@ -639,11 +668,12 @@ class Planner:
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
         """Returns True if l1 is contained in l2"""
-
-        return s1 <= s2
+        return {e.id for e in l1} <= {e.id for e in l2}
 
     @classmethod
-    def _insert_prefetch_node(
+    def _insert_prefetch_node(
+        cls, tbl_id: UUID, row_builder: exprs.RowBuilder, input_node: exec.ExecNode
+    ) -> exec.ExecNode:
         """Returns a CachePrefetchNode into the plan if needed, otherwise returns input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
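`_is_contained_in` now builds the id sets inline and compares them with Python's subset operator. A self-contained sketch of the idiom (the `Expr` stand-in here is hypothetical):

```python
from dataclasses import dataclass
from typing import Iterable

@dataclass
class Expr:
    id: int

def is_contained_in(l1: Iterable[Expr], l2: Iterable[Expr]) -> bool:
    # set comprehension over ids; `<=` is the subset test
    return {e.id for e in l1} <= {e.id for e in l2}

assert is_contained_in([Expr(1)], [Expr(2), Expr(1)])
assert not is_contained_in([Expr(3)], [Expr(2), Expr(1)])
```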
@@ -652,10 +682,10 @@ class Planner:
             e for e in list(row_builder.unique_exprs) if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()
         ]
         if len(media_col_refs) == 0:
-            return
+            return input_node
         # we need to prefetch external files for media column types
         file_col_info = [exprs.ColumnSlotIdx(e.col, e.slot_idx) for e in media_col_refs]
-        prefetch_node = exec.CachePrefetchNode(tbl_id, file_col_info,
+        prefetch_node = exec.CachePrefetchNode(tbl_id, file_col_info, input_node)
         return prefetch_node
 
     @classmethod

@@ -668,7 +698,7 @@ class Planner:
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
         limit: Optional[exprs.Expr] = None,
         ignore_errors: bool = False,
-        exact_version_only: Optional[list[catalog.
+        exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
         """Return plan for executing a query.
         Updates 'select_list' in place to make it executable.

@@ -714,7 +744,7 @@ class Planner:
         eval_ctx: exprs.RowBuilder.EvalCtx,
         limit: Optional[exprs.Expr] = None,
         with_pk: bool = False,
-        exact_version_only: Optional[list[catalog.
+        exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
         """
         Create plan to materialize eval_ctx.

@@ -752,13 +782,11 @@ class Planner:
         )
         if analyzer.filter is not None:
             candidates.extend(
-                exprs.Expr.subexprs(analyzer.filter, filter=
+                exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
             )
         if is_python_agg and analyzer.group_by_clause is not None:
             candidates.extend(
-                exprs.Expr.list_subexprs(
-                    analyzer.group_by_clause, filter=lambda e: sql_elements.contains(e), traverse_matches=False
-                )
+                exprs.Expr.list_subexprs(analyzer.group_by_clause, filter=sql_elements.contains, traverse_matches=False)
             )
         # not isinstance(...): we don't want to materialize Literals via a Select
         sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
pixeltable/share/packager.py CHANGED
@@ -68,11 +68,13 @@ class TablePackager:
                 'table_id': str(t._tbl_version.id),
                 # These are temporary; will replace with a better solution once the concurrency changes to catalog have
                 # been merged
-                'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                'table_md': dataclasses.asdict(t._tbl_version.get()._create_tbl_md()),
                 'table_version_md': dataclasses.asdict(
-                    t._tbl_version._create_version_md(datetime.now().timestamp())
+                    t._tbl_version.get()._create_version_md(datetime.now().timestamp())
+                ),
+                'table_schema_version_md': dataclasses.asdict(
+                    t._tbl_version.get()._create_schema_version_md(0)
                 ),
-                'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
             }
             for t in (table, *table._bases)
         ]

@@ -91,10 +93,11 @@ class TablePackager:
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
-
-
-
-
+        with Env.get().begin_xact():
+            ancestors = (self.table, *self.table._bases)
+            for t in ancestors:
+                _logger.info(f"Exporting table '{t._path}'.")
+                self.__export_table(t)
 _logger.info(f'Building archive.')
         bundle_path = self.__build_tarball()
         _logger.info(f'Packaging complete: {bundle_path}')
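The export loop now runs inside a single catalog transaction, so every ancestor table is packaged from a consistent snapshot of the catalog. A rough sketch of that shape, assuming (as the hunk above suggests) that `Env.get().begin_xact()` is usable as a context manager, with a hypothetical `export_table` standing in for the private `__export_table`:

```python
from pixeltable.env import Env

def export_all(table, export_table) -> None:
    # one transaction for the whole ancestor chain, so the exported
    # tables reflect a consistent catalog state
    with Env.get().begin_xact():
        for t in (table, *table._bases):
            export_table(t)
```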
@@ -117,7 +120,7 @@ class TablePackager:
         # to get the column types, since we'll be substituting `fileurl`s for media columns.
         actual_col_types: list[ts.ColumnType] = []
 
-        for col_name, col in t._tbl_version.cols_by_name.items():
+        for col_name, col in t._tbl_version.get().cols_by_name.items():
             if not col.is_stored:
                 continue
             if col.col_type.is_media_type():

@@ -150,7 +153,7 @@ class TablePackager:
         """
         Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
         """
-        parent_path = table._parent._path
+        parent_path = table._parent()._path()
         if len(parent_path) == 0:
             return 'pxt'
         else: