pixeltable 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -2
- pixeltable/catalog/column.py +1 -1
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/table.py +1 -1
- pixeltable/catalog/table_version.py +12 -2
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +64 -20
- pixeltable/dataframe.py +10 -5
- pixeltable/env.py +12 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -2
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
- pixeltable/exprs/comparison.py +8 -4
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +2 -2
- pixeltable/exprs/function_call.py +155 -313
- pixeltable/func/aggregate_function.py +29 -15
- pixeltable/func/callable_function.py +11 -8
- pixeltable/func/expr_template_function.py +3 -9
- pixeltable/func/function.py +148 -74
- pixeltable/func/signature.py +65 -30
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/deepseek.py +121 -0
- pixeltable/functions/image.py +7 -7
- pixeltable/functions/openai.py +23 -9
- pixeltable/functions/video.py +14 -7
- pixeltable/globals.py +14 -3
- pixeltable/index/embedding_index.py +4 -13
- pixeltable/io/globals.py +88 -77
- pixeltable/io/hf_datasets.py +34 -34
- pixeltable/io/pandas.py +75 -76
- pixeltable/io/parquet.py +19 -27
- pixeltable/io/utils.py +115 -0
- pixeltable/iterators/audio.py +2 -1
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/__init__.py +2 -1
- pixeltable/metadata/converters/convert_15.py +18 -8
- pixeltable/metadata/converters/convert_27.py +31 -0
- pixeltable/metadata/converters/convert_28.py +15 -0
- pixeltable/metadata/converters/convert_29.py +111 -0
- pixeltable/metadata/converters/util.py +12 -1
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/share/__init__.py +1 -0
- pixeltable/share/packager.py +41 -13
- pixeltable/share/publish.py +97 -0
- pixeltable/type_system.py +40 -14
- pixeltable/utils/__init__.py +41 -0
- pixeltable/utils/arrow.py +40 -7
- pixeltable/utils/formatter.py +1 -1
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/METADATA +34 -49
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/RECORD +57 -51
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/entry_points.txt +0 -0
pixeltable/metadata/converters/convert_15.py
CHANGED
@@ -17,7 +17,7 @@ _logger = logging.getLogger('pixeltable')
 def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Function)):
-            id,
+            id, _, md, binary_obj = row
             md['md'] = __update_md(md['md'], binary_obj)
             _logger.info(f'Updating function: {id}')
             conn.execute(sql.update(Function).where(Function.id == id).values(md=md))
@@ -27,14 +27,24 @@ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
     # construct dict produced by CallableFunction.to_store()
     py_fn = cloudpickle.loads(binary_obj)
     py_params = inspect.signature(py_fn).parameters
-    return_type =
-    params: list[
+    return_type = orig_d['return_type']
+    params: list[dict] = []
     for name, col_type_dict, kind_int, is_batched in orig_d['parameters']:
-        col_type = ts.ColumnType.from_dict(col_type_dict) if col_type_dict is not None else None
         default = py_params[name].default
-        kind = inspect._ParameterKind(kind_int)
-        params.append(
+        kind = inspect._ParameterKind(kind_int)
+        params.append(
+            {
+                'name': name,
+                'col_type': col_type_dict,
+                'kind': str(kind),
+                'is_batched': is_batched,
+                'has_default': default is not inspect.Parameter.empty,
+                'default': None if default is inspect.Parameter.empty else default,
+            }
+        )
     is_batched = 'batch_size' in orig_d
-
-
+    d = {
+        'signature': {'return_type': return_type, 'parameters': params, 'is_batched': is_batched},
+        'batch_size': orig_d['batch_size'] if is_batched else None,
+    }
     return d
pixeltable/metadata/converters/convert_27.py
CHANGED
@@ -0,0 +1,31 @@
+import logging
+from typing import Any, Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=27)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """Update the view metadata to add the include_base_columns boolean if it is missing
+
+    Args:
+        table_md (dict): copy of the original table metadata. this gets updated in place.
+        table_id (UUID): the table id
+
+    """
+    if table_md['view_md'] is None:
+        return
+    if 'include_base_columns' not in table_md['view_md']:
+        table_md['view_md']['include_base_columns'] = True
+        _logger.info(f'Updating view metadata for table: {table_id}')
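For orientation: every migration in pixeltable/metadata/converters follows this pattern. A module registers a callback for a source schema version, and convert_table_md applies it to each stored table's metadata. A minimal sketch of a hypothetical converter using the same API (the version number and field name are invented for illustration):

from uuid import UUID

import sqlalchemy as sql

from pixeltable.metadata import register_converter
from pixeltable.metadata.converters.util import convert_table_md


@register_converter(version=99)  # hypothetical version number
def _(engine: sql.engine.Engine) -> None:
    convert_table_md(engine, table_md_updater=__add_new_field)


def __add_new_field(table_md: dict, table_id: UUID) -> None:
    # Mutates the metadata dict in place, mirroring __update_table_md above.
    table_md.setdefault('new_field', None)  # 'new_field' is illustrative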
pixeltable/metadata/converters/convert_28.py
CHANGED
@@ -0,0 +1,15 @@
+import logging
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Dir, Table, TableSchemaVersion, TableVersion
+
+
+@register_converter(version=28)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        conn.execute(sql.update(Dir).values(md=Dir.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(Table).values(md=Table.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(TableVersion).values(md=TableVersion.md.concat({'additional_md': {}})))
+        conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat({'additional_md': {}})))
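The version-28 converter adds the new metadata fields (see the schema.py changes below) by concatenating keys onto the JSON md column of every row, one UPDATE per table. Per row, the effect is roughly this dict merge (a sketch of the semantics, not the actual execution path):

# Rough per-row equivalent of Dir.md.concat({'user': None, 'additional_md': {}}):
dir_md = {'name': 'my_dir'}  # existing md JSON (illustrative)
dir_md = {**dir_md, 'user': None, 'additional_md': {}}  # keys added by the migration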
pixeltable/metadata/converters/convert_29.py
CHANGED
@@ -0,0 +1,111 @@
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable import exprs
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=29)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    # Defaults are now stored as literals in signatures
+    if k == 'parameters':
+        for param in v:
+            assert isinstance(param, dict)
+            has_default = param.get('has_default') or (param.get('default') is not None)
+            if 'has_default' in param:
+                del param['has_default']
+            literal = exprs.Expr.from_object(param['default']) if has_default else None
+            assert literal is None or isinstance(literal, exprs.Literal)
+            param['default'] = None if literal is None else literal.as_dict()
+        return k, v
+
+    # Method of organizing argument expressions has changed
+    if isinstance(v, dict) and v.get('_classname') == 'FunctionCall':
+        args = v['args']
+        kwargs = v['kwargs']
+        components = v['components']
+        group_by_start_idx = v['group_by_start_idx']
+        group_by_stop_idx = v['group_by_stop_idx']
+        order_by_start_idx = v['order_by_start_idx']
+
+        new_args = []
+        for arg in args:
+            if arg[0] is not None:
+                assert isinstance(arg[0], int)
+                new_args.append(components[arg[0]])
+            else:
+                literal = exprs.Expr.from_object(arg[1])
+                new_args.append(literal.as_dict())
+
+        new_kwargs = {}
+        for name, kwarg in kwargs.items():
+            if kwarg[0] is not None:
+                assert isinstance(kwarg[0], int)
+                new_kwargs[name] = components[kwarg[0]]
+            else:
+                literal = exprs.Expr.from_object(kwarg[1])
+                new_kwargs[name] = literal.as_dict()
+
+        # We need to expand ("unroll") any var-args or var-kwargs.
+
+        new_args_len = len(new_args)
+        rolled_args: Optional[dict] = None
+        rolled_kwargs: Optional[dict] = None
+
+        if 'signature' in v['fn']:
+            # If it's a pickled function, there's no signature, so we're out of luck; varargs in a pickled function
+            # is an edge case that won't migrate properly.
+            parameters: list[dict] = v['fn']['signature']['parameters']
+            for i, param in enumerate(parameters):
+                if param['kind'] == 'VAR_POSITIONAL':
+                    if new_args_len > i:
+                        # For peculiar historical reasons, variable kwargs might show up in args. Thus variable
+                        # positional args is not necessarily the last element of args; it might be the second-to-last.
+                        assert new_args_len <= i + 2, new_args
+                        rolled_args = new_args[i]
+                        new_args = new_args[:i] + new_args[i + 1 :]
+                if param['kind'] == 'VAR_KEYWORD':
+                    # As noted above, variable kwargs might show up either in args or in kwargs. If it's in args, it
+                    # is necessarily the last element.
+                    if new_args_len > i:
+                        assert new_args_len <= i + 1, new_args
+                        rolled_kwargs = new_args.pop()
+                    if param['name'] in kwargs:
+                        assert rolled_kwargs is None
+                        rolled_kwargs = kwargs.pop(param['name'])
+
+        if rolled_args is not None:
+            assert rolled_args['_classname'] in ('InlineArray', 'InlineList')
+            new_args.extend(rolled_args['components'])
+        if rolled_kwargs is not None:
+            assert rolled_kwargs['_classname'] == 'InlineDict'
+            new_kwargs.update(zip(rolled_kwargs['keys'], rolled_kwargs['components']))
+
+        group_by_exprs = [components[i] for i in range(group_by_start_idx, group_by_stop_idx)]
+        order_by_exprs = [components[i] for i in range(order_by_start_idx, len(components))]
+
+        new_components = [*new_args, *new_kwargs.values(), *group_by_exprs, *order_by_exprs]
+
+        newv = {
+            'fn': v['fn'],
+            'arg_idxs': list(range(len(new_args))),
+            'kwarg_idxs': {name: i + len(new_args) for i, name in enumerate(new_kwargs.keys())},
+            'group_by_start_idx': len(new_args) + len(new_kwargs),
+            'group_by_stop_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'order_by_start_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'is_method_call': False,
+            '_classname': 'FunctionCall',
+            'components': new_components,
+        }
+        if 'return_type' in v:
+            newv['return_type'] = v['return_type']
+
+        return k, newv
+
+    return None
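To make the version-29 migration concrete: the old FunctionCall serialization stored each argument as a pair (index into components, raw literal value), with exactly one of the two set; the new serialization turns raw literals into Literal exprs, stores every argument in components, and keeps only index lists. A schematic before/after (all values invented for illustration):

# Old serialization (schematic):
old_md = {
    '_classname': 'FunctionCall',
    'args': [[0, None], [None, 'hello']],  # [component idx, raw literal] pairs
    'kwargs': {},
    'components': [{'_classname': 'ColumnRef'}],
    'group_by_start_idx': 1, 'group_by_stop_idx': 1, 'order_by_start_idx': 1,
}

# New serialization produced by __substitute_md (schematic):
new_md = {
    '_classname': 'FunctionCall',
    'arg_idxs': [0, 1],  # every argument is now a component
    'kwarg_idxs': {},
    'components': [{'_classname': 'ColumnRef'}, {'_classname': 'Literal', 'val': 'hello'}],
    'group_by_start_idx': 2, 'group_by_stop_idx': 2, 'order_by_start_idx': 2,
    'is_method_call': False,
}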
pixeltable/metadata/converters/util.py
CHANGED
@@ -5,7 +5,7 @@ from uuid import UUID
 
 import sqlalchemy as sql
 
-from pixeltable.metadata.schema import Table, TableSchemaVersion
+from pixeltable.metadata.schema import Function, Table, TableSchemaVersion
 
 __logger = logging.getLogger('pixeltable')
 
@@ -50,6 +50,17 @@ def convert_table_md(
             __logger.info(f'Updating schema for table: {id}')
             conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
 
+        for row in conn.execute(sql.select(Function)):
+            id = row[0]
+            function_md = row[2]
+            assert isinstance(function_md, dict)
+            updated_function_md = copy.deepcopy(function_md)
+            if substitution_fn is not None:
+                updated_function_md = __substitute_md_rec(updated_function_md, substitution_fn)
+            if updated_function_md != function_md:
+                __logger.info(f'Updating function: {id}')
+                conn.execute(sql.update(Function).where(Function.id == id).values(md=updated_function_md))
+
 
 def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
     columns_md = table_md['column_md']
pixeltable/metadata/notes.py
CHANGED
@@ -2,6 +2,9 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    30: 'Store default values and constant arguments as literals',
+    29: 'Add user and additional_md fields to metadata structs',
+    28: 'Enable view creation from DataFrame with select clause',
     27: 'Enable pxt.query parameterization of limit clauses',
     26: 'Rename clip_text and clip_image to clip',
     25: 'Functions with multiple signatures',
pixeltable/metadata/schema.py
CHANGED
@@ -74,6 +74,8 @@ class SystemInfo(Base):
 @dataclasses.dataclass
 class DirMd:
     name: str
+    user: Optional[str]
+    additional_md: dict[str, Any]
 
 
 class Dir(Base):
@@ -132,6 +134,7 @@ class IndexMd:
 @dataclasses.dataclass
 class ViewMd:
     is_snapshot: bool
+    include_base_columns: bool
 
     # (table id, version); for mutable views, all versions are None
     base_versions: list[tuple[str, Optional[int]]]
@@ -150,6 +153,8 @@ class ViewMd:
 class TableMd:
     name: str
 
+    user: Optional[str]
+
     # monotonically increasing w/in Table for both data and schema changes, starting at 0
     current_version: int
     # each version has a corresponding schema version (current_version >= current_schema_version)
@@ -169,6 +174,7 @@ class TableMd:
     column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
     index_md: dict[int, IndexMd]  # index_id -> IndexMd
     view_md: Optional[ViewMd]
+    additional_md: dict[str, Any]
 
 
 class Table(Base):
@@ -194,6 +200,7 @@ class TableVersionMd:
     created_at: float  # time.time()
     version: int
     schema_version: int
+    additional_md: dict[str, Any]
 
 
 class TableVersion(Base):
@@ -232,6 +239,7 @@ class TableSchemaVersionMd:
     # default validation strategy for any media column of this table
     # stores column.MediaValiation.name.lower()
     media_validation: str
+    additional_md: dict[str, Any]
 
 
 # versioning: each table schema change results in a new record
pixeltable/share/__init__.py
CHANGED
@@ -0,0 +1 @@
+from .publish import publish_snapshot
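This export makes publish_snapshot (defined in publish.py below) available as pixeltable.share.publish_snapshot. A minimal usage sketch, assuming an existing local table and a configured API key; the destination URI format is invented for illustration:

import pixeltable as pxt
from pixeltable.share import publish_snapshot

tbl = pxt.get_table('films')  # assumes this table already exists locally
uri = publish_snapshot('pxt://my-org/films-snapshot', tbl)
print(f'published at: {uri}')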
pixeltable/share/packager.py
CHANGED
@@ -1,3 +1,4 @@
+import dataclasses
 import io
 import json
 import logging
@@ -5,8 +6,9 @@ import tarfile
 import urllib.parse
 import urllib.request
 import uuid
+from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional
 
 import more_itertools
 import numpy as np
@@ -15,7 +17,8 @@ import pyiceberg.catalog
 
 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exprs
+from pixeltable import catalog, exprs, metadata
+from pixeltable.dataframe import DataFrame
 from pixeltable.env import Env
 from pixeltable.utils.arrow import PXT_TO_PA_TYPES
 from pixeltable.utils.iceberg import sqlite_catalog
@@ -28,6 +31,7 @@ class TablePackager:
     Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
     is as follows:
 
+    metadata.json  # Pixeltable metadata for the packaged table
     warehouse/catalog.db  # sqlite Iceberg catalog
     warehouse/pxt.db/**  # Iceberg metadata and data files (parquet/avro/json)
     media/**  # Local media files
@@ -43,16 +47,40 @@ class TablePackager:
     'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
     """
 
-    table:
+    table: catalog.Table  # The table to be packaged
     tmp_dir: Path  # Temporary directory where the package will reside
     iceberg_catalog: pyiceberg.catalog.Catalog
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+    md: dict[str, Any]
 
-    def __init__(self, table:
+    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
+        # Generate metadata
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {
+                'tables': [
+                    {
+                        'table_id': str(t._tbl_version.id),
+                        # These are temporary; will replace with a better solution once the concurrency changes to catalog have
+                        # been merged
+                        'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                        'table_version_md': dataclasses.asdict(
+                            t._tbl_version._create_version_md(datetime.now().timestamp())
+                        ),
+                        'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
+                    }
+                    for t in (table, *table._bases)
+                ]
+            },
+        }
+        if additional_md is not None:
+            self.md.update(additional_md)
+
     def package(self) -> Path:
         """
         Export the table to a tarball containing Iceberg tables and media files.
@@ -60,8 +88,10 @@ class TablePackager:
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
+        with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
+            json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
-        ancestors =
+        ancestors = (self.table, *self.table._bases)
         for t in ancestors:
             _logger.info(f"Exporting table '{t._path}'.")
             self.__export_table(t)
@@ -70,7 +100,7 @@ class TablePackager:
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path
 
-    def __export_table(self, t:
+    def __export_table(self, t: catalog.Table) -> None:
         """
         Exports the data from `t` into an Iceberg table.
         """
@@ -116,7 +146,7 @@ class TablePackager:
         iceberg_tbl.append(pa_table)
 
     @classmethod
-    def __iceberg_namespace(cls, table:
+    def __iceberg_namespace(cls, table: catalog.Table) -> str:
         """
         Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
         """
@@ -149,11 +179,7 @@ class TablePackager:
         return PXT_TO_PA_TYPES.get(col_type.__class__)
 
     def __to_pa_tables(
-        self,
-        df: pxt.DataFrame,
-        actual_col_types: list[pxt.ColumnType],
-        arrow_schema: pa.Schema,
-        batch_size: int = 1_000,
+        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
     ) -> Iterator[pa.Table]:
         """
         Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
@@ -165,7 +191,7 @@ class TablePackager:
         cols['_v_min'] = [row[-1] for row in rows]
         yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_rows(self, df:
+    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
        for row in df._exec():
             vals = [row[e.slot_idx] for e in df._select_list_exprs]
             result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
@@ -210,6 +236,8 @@ class TablePackager:
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add metadata json
+            tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
             # Add the Iceberg warehouse dir (including the catalog)
             tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
             # Add the media files
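Based on the constructor above, the new metadata.json has roughly the following shape (values are illustrative; the *_md entries are dataclasses.asdict dumps of the TableMd/TableVersionMd/TableSchemaVersionMd structs from schema.py):

bundle_md = {
    'pxt_version': '0.3.5',
    'pxt_md_version': 30,  # metadata.VERSION at packaging time (assumed)
    'md': {
        'tables': [
            {
                'table_id': '123e4567-e89b-12d3-a456-426614174000',
                'table_md': {},  # dataclasses.asdict(...) output, elided here
                'table_version_md': {},  # elided
                'table_schema_version_md': {},  # elided
            }
        ]
    },
    # plus any additional_md keys, e.g. 'table_uri' merged in by publish_snapshot below
}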
pixeltable/share/publish.py
CHANGED
@@ -0,0 +1,97 @@
+import dataclasses
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs, metadata
+from pixeltable.env import Env
+from pixeltable.utils import sha256sum
+
+from .packager import TablePackager
+
+# These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
+# pixeltable.com URLs are available.
+_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
+_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+
+def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+    request_json = packager.md
+    headers_json = {'X-api-key': Env.get().pxt_api_key}
+
+    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error publishing snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+        raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
+    upload_id = response_json['upload_id']
+    destination_uri = response_json['destination_uri']
+
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+
+    bundle = packager.package()
+
+    parsed_location = urllib.parse.urlparse(destination_uri)
+    if parsed_location.scheme == 's3':
+        _upload_bundle_to_s3(bundle, parsed_location)
+    else:
+        raise excs.Error(f'Unsupported destination: {destination_uri}')
+
+    Env.get().console_logger.info(f'Finalizing snapshot ...')
+
+    finalize_request_json = {
+        'upload_id': upload_id,
+        'datafile': bundle.name,
+        'size': bundle.stat().st_size,
+        'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+    }
+
+    # TODO: Use Pydantic for validation
+    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    if finalize_response.status_code != 200:
+        raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
+    finalize_response_json = finalize_response.json()
+    if not isinstance(finalize_response_json, dict) or 'confirmed_table_uri' not in finalize_response_json:
+        raise excs.Error(f'Error finalizing snapshot: unexpected response from server.\n{finalize_response_json}')
+
+    confirmed_tbl_uri = finalize_response_json['confirmed_table_uri']
+    Env.get().console_logger.info(f'The published snapshot is now available at: {confirmed_tbl_uri}')
+    return confirmed_tbl_uri
+
+
+def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    upload_args = {'ChecksumAlgorithm': 'SHA256'}
+
+    progress_bar = tqdm(
+        desc=f'Uploading',
+        total=bundle.stat().st_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,  # Update every iteration (should be fine for an upload)
+        ncols=100,
+        file=sys.stdout,
+    )
+    s3_client.upload_file(
+        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+    )
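publish_snapshot depends on pixeltable.utils.sha256sum, added in pixeltable/utils/__init__.py (+41, not shown in this diff). A typical chunked implementation, offered only as a sketch of what such a helper presumably does:

import hashlib
from pathlib import Path


def sha256sum(path: Path) -> str:
    # Hash the file in fixed-size chunks so large bundles never need to fit in memory.
    h = hashlib.sha256()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(1 << 20), b''):  # 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()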
pixeltable/type_system.py
CHANGED
@@ -8,10 +8,9 @@ import json
 import typing
 import urllib.parse
 import urllib.request
-from pathlib import Path
 from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union
 
-import av
+import av
 import jsonschema
 import jsonschema.protocols
 import jsonschema.validators
@@ -22,6 +21,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.utils import parse_local_file_path
 
 from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
 
@@ -47,8 +47,8 @@ class ColumnType:
     @classmethod
     def supertype(
         cls,
-        type1: 'ColumnType.Type',
-        type2: 'ColumnType.Type',
+        type1: Optional['ColumnType.Type'],
+        type2: Optional['ColumnType.Type'],
         # we need to pass this in because we can't easily append it as a class member
        common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
    ) -> Optional['ColumnType.Type']:
@@ -93,6 +93,9 @@ class ColumnType:
         self._type = t
         self._nullable = nullable
 
+    def has_supertype(self) -> bool:
+        return True
+
     @property
     def nullable(self) -> bool:
         return self._nullable
@@ -271,8 +274,10 @@ class ColumnType:
                 inferred_type = val_type
             else:
                 inferred_type = inferred_type.supertype(val_type)
-
-
+            if inferred_type is None:
+                return None
+            if not inferred_type.has_supertype():
+                return inferred_type
         return inferred_type
 
     @classmethod
@@ -397,12 +402,9 @@ class ColumnType:
     def _validate_file_path(self, val: Any) -> None:
         """Raises TypeError if not a valid local file path or not a path/byte sequence"""
         if isinstance(val, str):
-
-            if
-
-            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
-            if not path.is_file():
-                raise TypeError(f'File not found: {str(path)}')
+            path = parse_local_file_path(val)
+            if path is not None and not path.is_file():
+                raise TypeError(f'File not found: {path}')
         else:
             if not isinstance(val, bytes):
                 raise TypeError(f'expected file path or bytes, got {type(val)}')
@@ -495,7 +497,7 @@ class InvalidType(ColumnType):
         super().__init__(self.Type.INVALID, nullable=nullable)
 
     def to_sa_type(self) -> sql.types.TypeEngine:
-
+        return sql.types.NullType()
 
     def print_value(self, val: Any) -> str:
         return str(val)
@@ -508,6 +510,9 @@ class StringType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.STRING, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
@@ -591,6 +596,9 @@ class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.TIMESTAMP(timezone=True)
 
@@ -601,6 +609,8 @@ class TimestampType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
+        if isinstance(val, datetime.datetime):
+            return val
         return val
 
 
@@ -651,6 +661,10 @@ class JsonType(ColumnType):
         return val_type.print_value(val)
 
     def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, tuple):
+            val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            val = val.model_dump()
         if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
         if self.__validator is not None:
@@ -818,14 +832,20 @@ class ArrayType(ColumnType):
         return hash((self._type, self.nullable, self.shape, self.dtype))
 
     def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ArrayType)
+            return basic_supertype
+
         if not isinstance(other, ArrayType):
             return None
+
         super_dtype = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
         super_shape: Optional[tuple[Optional[int], ...]]
-        if len(self.shape) != len(other.shape):
+        if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
             super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))
@@ -1009,8 +1029,14 @@ class ImageType(ColumnType):
         return hash((self._type, self.nullable, self.size, self.mode))
 
     def supertype(self, other: ColumnType) -> Optional[ImageType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ImageType)
+            return basic_supertype
+
         if not isinstance(other, ImageType):
             return None
+
         width = self.width if self.width == other.width else None
         height = self.height if self.height == other.height else None
         mode = self.mode if self.mode == other.mode else None