pixeltable 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -2
- pixeltable/catalog/column.py +1 -1
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/table.py +1 -1
- pixeltable/catalog/table_version.py +12 -2
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +64 -20
- pixeltable/dataframe.py +14 -14
- pixeltable/env.py +20 -3
- pixeltable/exec/component_iteration_node.py +1 -2
- pixeltable/exec/expr_eval/evaluators.py +4 -2
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
- pixeltable/exprs/comparison.py +8 -4
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +9 -2
- pixeltable/exprs/function_call.py +155 -313
- pixeltable/func/aggregate_function.py +29 -15
- pixeltable/func/callable_function.py +11 -8
- pixeltable/func/expr_template_function.py +3 -9
- pixeltable/func/function.py +148 -74
- pixeltable/func/signature.py +65 -30
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/deepseek.py +121 -0
- pixeltable/functions/image.py +7 -7
- pixeltable/functions/openai.py +49 -10
- pixeltable/functions/video.py +14 -7
- pixeltable/globals.py +14 -3
- pixeltable/index/embedding_index.py +4 -13
- pixeltable/io/globals.py +88 -77
- pixeltable/io/hf_datasets.py +34 -34
- pixeltable/io/pandas.py +75 -87
- pixeltable/io/parquet.py +19 -27
- pixeltable/io/utils.py +115 -0
- pixeltable/iterators/audio.py +2 -1
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/__init__.py +2 -1
- pixeltable/metadata/converters/convert_15.py +18 -8
- pixeltable/metadata/converters/convert_27.py +31 -0
- pixeltable/metadata/converters/convert_28.py +15 -0
- pixeltable/metadata/converters/convert_29.py +111 -0
- pixeltable/metadata/converters/util.py +12 -1
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/share/__init__.py +1 -0
- pixeltable/share/packager.py +246 -0
- pixeltable/share/publish.py +97 -0
- pixeltable/type_system.py +87 -42
- pixeltable/utils/__init__.py +41 -0
- pixeltable/utils/arrow.py +45 -12
- pixeltable/utils/formatter.py +1 -1
- pixeltable/utils/iceberg.py +14 -0
- pixeltable/utils/media_store.py +1 -1
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/METADATA +37 -50
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/RECORD +60 -51
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/entry_points.txt +0 -0
pixeltable/metadata/converters/convert_15.py
CHANGED
@@ -17,7 +17,7 @@ _logger = logging.getLogger('pixeltable')
 def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Function)):
-            id, md, binary_obj = row
+            id, _, md, binary_obj = row
             md['md'] = __update_md(md['md'], binary_obj)
             _logger.info(f'Updating function: {id}')
             conn.execute(sql.update(Function).where(Function.id == id).values(md=md))
@@ -27,14 +27,24 @@ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
     # construct dict produced by CallableFunction.to_store()
     py_fn = cloudpickle.loads(binary_obj)
     py_params = inspect.signature(py_fn).parameters
-    return_type =
-    params: list[
+    return_type = orig_d['return_type']
+    params: list[dict] = []
     for name, col_type_dict, kind_int, is_batched in orig_d['parameters']:
-        col_type = ts.ColumnType.from_dict(col_type_dict) if col_type_dict is not None else None
         default = py_params[name].default
-        kind = inspect._ParameterKind(kind_int)
-        params.append(
+        kind = inspect._ParameterKind(kind_int)
+        params.append(
+            {
+                'name': name,
+                'col_type': col_type_dict,
+                'kind': str(kind),
+                'is_batched': is_batched,
+                'has_default': default is not inspect.Parameter.empty,
+                'default': None if default is inspect.Parameter.empty else default,
+            }
+        )
     is_batched = 'batch_size' in orig_d
-
-
+    d = {
+        'signature': {'return_type': return_type, 'parameters': params, 'is_batched': is_batched},
+        'batch_size': orig_d['batch_size'] if is_batched else None,
+    }
     return d
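The converter above rebuilds each stored parameter as a plain dict. As a rough illustration (not taken from the package), the snippet below builds the same dict shape from an inspected Python function; sample_udf and the placeholder col_type value are assumptions for the sketch.

import inspect

def sample_udf(a: int, b: str = 'hi') -> str:
    # hypothetical stand-in for the unpickled py_fn; only its signature matters here
    return b * a

params = []
for name, p in inspect.signature(sample_udf).parameters.items():
    params.append({
        'name': name,
        'col_type': None,  # the real converter reuses the serialized type dict from the old metadata
        'kind': str(inspect._ParameterKind(p.kind)),  # same normalization the converter applies
        'is_batched': False,
        'has_default': p.default is not inspect.Parameter.empty,
        'default': None if p.default is inspect.Parameter.empty else p.default,
    })

print([(p['name'], p['has_default'], p['default']) for p in params])
# expected: [('a', False, None), ('b', True, 'hi')]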
pixeltable/metadata/converters/convert_27.py
ADDED
@@ -0,0 +1,31 @@
+import logging
+from typing import Any, Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=27)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """Update the view metadata to add the include_base_columns boolean if it is missing
+
+    Args:
+        table_md (dict): copy of the original table metadata. this gets updated in place.
+        table_id (UUID): the table id
+
+    """
+    if table_md['view_md'] is None:
+        return
+    if 'include_base_columns' not in table_md['view_md']:
+        table_md['view_md']['include_base_columns'] = True
+        _logger.info(f'Updating view metadata for table: {table_id}')
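For reference, a minimal sketch of the in-place update this converter performs on a single table's metadata; the view_md contents shown are assumed, not taken from a real catalog.

# hypothetical pre-v28 view metadata: no include_base_columns key yet
table_md = {'view_md': {'is_snapshot': False, 'base_versions': [('1234', None)]}}

if table_md['view_md'] is not None and 'include_base_columns' not in table_md['view_md']:
    # pre-existing views always carried all base columns, so default to True
    table_md['view_md']['include_base_columns'] = True

print(table_md['view_md']['include_base_columns'])  # True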
pixeltable/metadata/converters/convert_28.py
ADDED
@@ -0,0 +1,15 @@
+import logging
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Dir, Table, TableSchemaVersion, TableVersion
+
+
+@register_converter(version=28)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        conn.execute(sql.update(Dir).values(md=Dir.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(Table).values(md=Table.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(TableVersion).values(md=TableVersion.md.concat({'additional_md': {}})))
+        conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat({'additional_md': {}})))
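This converter uses SQLAlchemy's concat on the md columns (JSONB concatenation in Postgres) to merge the new keys into every row. A rough Python equivalent of the per-row effect, with an assumed pre-existing md payload:

# assumed pre-v29 Dir.md payload; existing keys are left untouched, new keys are added
dir_md = {'name': 'my_dir'}
dir_md = {**dir_md, 'user': None, 'additional_md': {}}
print(dir_md)  # {'name': 'my_dir', 'user': None, 'additional_md': {}}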
pixeltable/metadata/converters/convert_29.py
ADDED
@@ -0,0 +1,111 @@
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable import exprs
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=29)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    # Defaults are now stored as literals in signatures
+    if k == 'parameters':
+        for param in v:
+            assert isinstance(param, dict)
+            has_default = param.get('has_default') or (param.get('default') is not None)
+            if 'has_default' in param:
+                del param['has_default']
+            literal = exprs.Expr.from_object(param['default']) if has_default else None
+            assert literal is None or isinstance(literal, exprs.Literal)
+            param['default'] = None if literal is None else literal.as_dict()
+        return k, v
+
+    # Method of organizing argument expressions has changed
+    if isinstance(v, dict) and v.get('_classname') == 'FunctionCall':
+        args = v['args']
+        kwargs = v['kwargs']
+        components = v['components']
+        group_by_start_idx = v['group_by_start_idx']
+        group_by_stop_idx = v['group_by_stop_idx']
+        order_by_start_idx = v['order_by_start_idx']
+
+        new_args = []
+        for arg in args:
+            if arg[0] is not None:
+                assert isinstance(arg[0], int)
+                new_args.append(components[arg[0]])
+            else:
+                literal = exprs.Expr.from_object(arg[1])
+                new_args.append(literal.as_dict())
+
+        new_kwargs = {}
+        for name, kwarg in kwargs.items():
+            if kwarg[0] is not None:
+                assert isinstance(kwarg[0], int)
+                new_kwargs[name] = components[kwarg[0]]
+            else:
+                literal = exprs.Expr.from_object(kwarg[1])
+                new_kwargs[name] = literal.as_dict()
+
+        # We need to expand ("unroll") any var-args or var-kwargs.
+
+        new_args_len = len(new_args)
+        rolled_args: Optional[dict] = None
+        rolled_kwargs: Optional[dict] = None
+
+        if 'signature' in v['fn']:
+            # If it's a pickled function, there's no signature, so we're out of luck; varargs in a pickled function
+            # is an edge case that won't migrate properly.
+            parameters: list[dict] = v['fn']['signature']['parameters']
+            for i, param in enumerate(parameters):
+                if param['kind'] == 'VAR_POSITIONAL':
+                    if new_args_len > i:
+                        # For peculiar historical reasons, variable kwargs might show up in args. Thus variable
+                        # positional args is not necessarily the last element of args; it might be the second-to-last.
+                        assert new_args_len <= i + 2, new_args
+                        rolled_args = new_args[i]
+                        new_args = new_args[:i] + new_args[i + 1 :]
+                if param['kind'] == 'VAR_KEYWORD':
+                    # As noted above, variable kwargs might show up either in args or in kwargs. If it's in args, it
+                    # is necessarily the last element.
+                    if new_args_len > i:
+                        assert new_args_len <= i + 1, new_args
+                        rolled_kwargs = new_args.pop()
+                    if param['name'] in kwargs:
+                        assert rolled_kwargs is None
+                        rolled_kwargs = kwargs.pop(param['name'])
+
+        if rolled_args is not None:
+            assert rolled_args['_classname'] in ('InlineArray', 'InlineList')
+            new_args.extend(rolled_args['components'])
+        if rolled_kwargs is not None:
+            assert rolled_kwargs['_classname'] == 'InlineDict'
+            new_kwargs.update(zip(rolled_kwargs['keys'], rolled_kwargs['components']))
+
+        group_by_exprs = [components[i] for i in range(group_by_start_idx, group_by_stop_idx)]
+        order_by_exprs = [components[i] for i in range(order_by_start_idx, len(components))]
+
+        new_components = [*new_args, *new_kwargs.values(), *group_by_exprs, *order_by_exprs]
+
+        newv = {
+            'fn': v['fn'],
+            'arg_idxs': list(range(len(new_args))),
+            'kwarg_idxs': {name: i + len(new_args) for i, name in enumerate(new_kwargs.keys())},
+            'group_by_start_idx': len(new_args) + len(new_kwargs),
+            'group_by_stop_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'order_by_start_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'is_method_call': False,
+            '_classname': 'FunctionCall',
+            'components': new_components,
+        }
+        if 'return_type' in v:
+            newv['return_type'] = v['return_type']
+
+        return k, newv
+
+    return None
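A hedged sketch of the index bookkeeping behind the new FunctionCall layout: every argument expression lives in components, and arg_idxs / kwarg_idxs point into that list, with constants converted to Literal exprs. The placeholder strings stand in for serialized expressions and are not the real wire format.

new_args = ['<ColumnRef>', '<Literal 42>']       # placeholders for serialized argument exprs
new_kwargs = {'scale': '<Literal 2.0>'}          # placeholder for a keyword argument expr
group_by_exprs, order_by_exprs = [], []

new_components = [*new_args, *new_kwargs.values(), *group_by_exprs, *order_by_exprs]
arg_idxs = list(range(len(new_args)))
kwarg_idxs = {name: i + len(new_args) for i, name in enumerate(new_kwargs)}
print(arg_idxs, kwarg_idxs)  # [0, 1] {'scale': 2}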
pixeltable/metadata/converters/util.py
CHANGED
@@ -5,7 +5,7 @@ from uuid import UUID
 
 import sqlalchemy as sql
 
-from pixeltable.metadata.schema import Table, TableSchemaVersion
+from pixeltable.metadata.schema import Function, Table, TableSchemaVersion
 
 __logger = logging.getLogger('pixeltable')
 
@@ -50,6 +50,17 @@ def convert_table_md(
             __logger.info(f'Updating schema for table: {id}')
             conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
 
+        for row in conn.execute(sql.select(Function)):
+            id = row[0]
+            function_md = row[2]
+            assert isinstance(function_md, dict)
+            updated_function_md = copy.deepcopy(function_md)
+            if substitution_fn is not None:
+                updated_function_md = __substitute_md_rec(updated_function_md, substitution_fn)
+            if updated_function_md != function_md:
+                __logger.info(f'Updating function: {id}')
+                conn.execute(sql.update(Function).where(Function.id == id).values(md=updated_function_md))
+
 
 def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
     columns_md = table_md['column_md']
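A sketch of how a future converter might use the substitution_fn hook that now also covers stored Function metadata; _rename_classname, OldExprName, and NewExprName are hypothetical names used only for illustration.

from typing import Any, Optional

def _rename_classname(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
    # hypothetical substitution: rewrite a serialized expr's _classname;
    # returning None leaves the visited node unchanged
    if isinstance(v, dict) and v.get('_classname') == 'OldExprName':
        return k, {**v, '_classname': 'NewExprName'}
    return None

# a converter would then be registered along the lines of convert_29 above:
# @register_converter(version=NN)
# def _(engine): convert_table_md(engine, substitution_fn=_rename_classname)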
pixeltable/metadata/notes.py
CHANGED
@@ -2,6 +2,9 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    30: 'Store default values and constant arguments as literals',
+    29: 'Add user and additional_md fields to metadata structs',
+    28: 'Enable view creation from DataFrame with select clause',
     27: 'Enable pxt.query parameterization of limit clauses',
     26: 'Rename clip_text and clip_image to clip',
     25: 'Functions with multiple signatures',
pixeltable/metadata/schema.py
CHANGED
@@ -74,6 +74,8 @@ class SystemInfo(Base):
 @dataclasses.dataclass
 class DirMd:
     name: str
+    user: Optional[str]
+    additional_md: dict[str, Any]
 
 
 class Dir(Base):
@@ -132,6 +134,7 @@ class IndexMd:
 @dataclasses.dataclass
 class ViewMd:
     is_snapshot: bool
+    include_base_columns: bool
 
     # (table id, version); for mutable views, all versions are None
     base_versions: list[tuple[str, Optional[int]]]
@@ -150,6 +153,8 @@ class ViewMd:
 class TableMd:
     name: str
 
+    user: Optional[str]
+
     # monotonically increasing w/in Table for both data and schema changes, starting at 0
     current_version: int
     # each version has a corresponding schema version (current_version >= current_schema_version)
@@ -169,6 +174,7 @@ class TableMd:
     column_md: dict[int, ColumnMd] # col_id -> ColumnMd
     index_md: dict[int, IndexMd] # index_id -> IndexMd
     view_md: Optional[ViewMd]
+    additional_md: dict[str, Any]
 
 
 class Table(Base):
@@ -194,6 +200,7 @@ class TableVersionMd:
     created_at: float # time.time()
     version: int
     schema_version: int
+    additional_md: dict[str, Any]
 
 
 class TableVersion(Base):
@@ -232,6 +239,7 @@ class TableSchemaVersionMd:
     # default validation strategy for any media column of this table
    # stores column.MediaValiation.name.lower()
     media_validation: str
+    additional_md: dict[str, Any]
 
 
 # versioning: each table schema change results in a new record
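To illustrate the new fields (the values here are made up), a standalone copy of the extended DirMd struct:

import dataclasses
from typing import Any, Optional

@dataclasses.dataclass
class DirMd:  # mirror of the struct above, reproduced so the snippet runs on its own
    name: str
    user: Optional[str]
    additional_md: dict[str, Any]

md = DirMd(name='demo_dir', user=None, additional_md={})
print(dataclasses.asdict(md))  # {'name': 'demo_dir', 'user': None, 'additional_md': {}}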
pixeltable/share/__init__.py
ADDED
@@ -0,0 +1 @@
+from .publish import publish_snapshot
pixeltable/share/packager.py
ADDED
@@ -0,0 +1,246 @@
+import dataclasses
+import io
+import json
+import logging
+import tarfile
+import urllib.parse
+import urllib.request
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Iterator, Optional
+
+import more_itertools
+import numpy as np
+import pyarrow as pa
+import pyiceberg.catalog
+
+import pixeltable as pxt
+import pixeltable.type_system as ts
+from pixeltable import catalog, exprs, metadata
+from pixeltable.dataframe import DataFrame
+from pixeltable.env import Env
+from pixeltable.utils.arrow import PXT_TO_PA_TYPES
+from pixeltable.utils.iceberg import sqlite_catalog
+
+_logger = logging.getLogger('pixeltable')
+
+
+class TablePackager:
+    """
+    Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
+    is as follows:
+
+    metadata.json # Pixeltable metadata for the packaged table
+    warehouse/catalog.db # sqlite Iceberg catalog
+    warehouse/pxt.db/** # Iceberg metadata and data files (parquet/avro/json)
+    media/** # Local media files
+
+    If the table being archived is a view, then the Iceberg catalog will contain separate tables for the view and each
+    of its ancestors. All rows will be exported with additional _rowid and _v_min columns. Currently, only the most
+    recent version of the table can be exported, and only the full table contents.
+
+    If the table contains media columns, they are handled as follows:
+    - If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
+      stored in the Iceberg table.
+    - If a media file is a local file, then it will be copied into the tarball as a file of the form
+      'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
+    """
+
+    table: catalog.Table # The table to be packaged
+    tmp_dir: Path # Temporary directory where the package will reside
+    iceberg_catalog: pyiceberg.catalog.Catalog
+    media_files: dict[Path, str] # Mapping from local media file paths to their tarball names
+    md: dict[str, Any]
+
+    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
+        self.table = table
+        self.tmp_dir = Path(Env.get().create_tmp_path())
+        self.media_files = {}
+
+        # Generate metadata
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {
+                'tables': [
+                    {
+                        'table_id': str(t._tbl_version.id),
+                        # These are temporary; will replace with a better solution once the concurrency changes to catalog have
+                        # been merged
+                        'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                        'table_version_md': dataclasses.asdict(
+                            t._tbl_version._create_version_md(datetime.now().timestamp())
+                        ),
+                        'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
+                    }
+                    for t in (table, *table._bases)
+                ]
+            },
+        }
+        if additional_md is not None:
+            self.md.update(additional_md)
+
+    def package(self) -> Path:
+        """
+        Export the table to a tarball containing Iceberg tables and media files.
+        """
+        assert not self.tmp_dir.exists() # Packaging can only be done once per TablePackager instance
+        _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+        self.tmp_dir.mkdir()
+        with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
+            json.dump(self.md, fp)
+        self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
+        ancestors = (self.table, *self.table._bases)
+        for t in ancestors:
+            _logger.info(f"Exporting table '{t._path}'.")
+            self.__export_table(t)
+        _logger.info(f'Building archive.')
+        bundle_path = self.__build_tarball()
+        _logger.info(f'Packaging complete: {bundle_path}')
+        return bundle_path
+
+    def __export_table(self, t: catalog.Table) -> None:
+        """
+        Exports the data from `t` into an Iceberg table.
+        """
+        # First generate a select list for the data we want to extract from `t`. This includes:
+        # - all stored columns, including computed columns;
+        # - errortype and errormsg fields whenever they're defined.
+        # We select only those columns that are defined in this table (columns inherited from ancestor tables will be
+        # handled separately).
+        # For media columns, we substitute `col.fileurl` so that we always get the URL (which may be a file:// URL;
+        # these will be specially handled later)
+        select_exprs: dict[str, exprs.Expr] = {}
+
+        # As we generate the select list, we construct a separate list of column types. We can't rely on df._schema
+        # to get the column types, since we'll be substituting `fileurl`s for media columns.
+        actual_col_types: list[ts.ColumnType] = []
+
+        for col_name, col in t._tbl_version.cols_by_name.items():
+            if not col.is_stored:
+                continue
+            if col.col_type.is_media_type():
+                select_exprs[col_name] = t[col_name].fileurl
+            else:
+                select_exprs[col_name] = t[col_name]
+            actual_col_types.append(col.col_type)
+            if col.records_errors:
+                select_exprs[f'{col_name}_errortype'] = t[col_name].errortype
+                actual_col_types.append(ts.StringType())
+                select_exprs[f'{col_name}_errormsg'] = t[col_name].errormsg
+                actual_col_types.append(ts.StringType())
+
+        # Run the select() on `self.table`, not `t`, so that we export only those rows that are actually present in
+        # `self.table`.
+        df = self.table.select(**select_exprs)
+        namespace = self.__iceberg_namespace(t)
+        self.iceberg_catalog.create_namespace_if_not_exists(namespace)
+        iceberg_schema = self.__to_iceberg_schema(df._schema)
+        iceberg_tbl = self.iceberg_catalog.create_table(f'{namespace}.{t._name}', schema=iceberg_schema)
+
+        # Populate the Iceberg table with data.
+        # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
+        # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Iceberg table on disk.
+        for pa_table in self.__to_pa_tables(df, actual_col_types, iceberg_schema):
+            iceberg_tbl.append(pa_table)
+
+    @classmethod
+    def __iceberg_namespace(cls, table: catalog.Table) -> str:
+        """
+        Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
+        """
+        parent_path = table._parent._path
+        if len(parent_path) == 0:
+            return 'pxt'
+        else:
+            return f'pxt.{parent_path}'
+
+    # The following methods are responsible for schema and data conversion from Pixeltable to Iceberg. Some of this
+    # logic might be consolidated into arrow.py and unified with general Parquet export, but there are several
+    # major differences:
+    # - Iceberg has no array type; we export all arrays as binary blobs
+    # - We include _rowid and _v_min columns in the Iceberg table
+    # - Media columns are handled specially as indicated above
+
+    @classmethod
+    def __to_iceberg_schema(cls, pxt_schema: dict[str, ts.ColumnType]) -> pa.Schema:
+        entries = [(name, cls.__to_iceberg_type(col_type)) for name, col_type in pxt_schema.items()]
+        entries.append(('_rowid', pa.list_(pa.int64())))
+        entries.append(('_v_min', pa.int64()))
+        return pa.schema(entries) # type: ignore[arg-type]
+
+    @classmethod
+    def __to_iceberg_type(cls, col_type: ts.ColumnType) -> pa.DataType:
+        if col_type.is_array_type():
+            return pa.binary()
+        if col_type.is_media_type():
+            return pa.string()
+        return PXT_TO_PA_TYPES.get(col_type.__class__)
+
+    def __to_pa_tables(
+        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
+    ) -> Iterator[pa.Table]:
+        """
+        Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
+        to avoid excessive memory usage.
+        """
+        for rows in more_itertools.batched(self.__to_pa_rows(df, actual_col_types), batch_size):
+            cols = {col_name: [row[idx] for row in rows] for idx, col_name in enumerate(df._schema.keys())}
+            cols['_rowid'] = [row[-2] for row in rows]
+            cols['_v_min'] = [row[-1] for row in rows]
+            yield pa.Table.from_pydict(cols, schema=arrow_schema)
+
+    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
+        for row in df._exec():
+            vals = [row[e.slot_idx] for e in df._select_list_exprs]
+            result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
+            result.append(row.rowid)
+            result.append(row.v_min)
+            yield result
+
+    def __to_pa_value(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+        if col_type.is_array_type():
+            # Export arrays as binary
+            assert isinstance(val, np.ndarray)
+            arr = io.BytesIO()
+            np.save(arr, val)
+            return arr.getvalue()
+        if col_type.is_json_type():
+            # Export JSON as strings
+            return json.dumps(val)
+        if col_type.is_media_type():
+            # Handle media files as described above
+            assert isinstance(val, str) # Media columns are always referenced by `fileurl`
+            return self.__process_media_url(val)
+        return val
+
+    def __process_media_url(self, url: str) -> str:
+        parsed_url = urllib.parse.urlparse(url)
+        if parsed_url.scheme == 'file':
+            # It's the URL of a local file. Replace it with a pxtmedia:// URI.
+            # (We can't use an actual pxt:// URI, because the eventual pxt:// table name might not be known at this
+            # time. The pxtmedia:// URI serves as a relative reference into the tarball that can be replaced with an
+            # actual URL when the table is reconstituted.)
+            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
+            if path not in self.media_files:
+                # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
+                dest_name = f'{uuid.uuid4().hex}{path.suffix}'
+                self.media_files[path] = dest_name
+            return f'pxtmedia://{self.media_files[path]}'
+        # For any type of URL other than a local file, just return the URL as-is.
+        return url
+
+    def __build_tarball(self) -> Path:
+        bundle_path = self.tmp_dir / 'bundle.tar.bz2'
+        with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add metadata json
+            tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
+            # Add the Iceberg warehouse dir (including the catalog)
+            tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
+            # Add the media files
+            for src_file, dest_name in self.media_files.items():
+                tf.add(src_file, arcname=f'media/{dest_name}')
+        return bundle_path
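A hedged usage sketch for the new packager. TablePackager is internal API and the table name 'films' is hypothetical; the intended public entry point is publish_snapshot in publish.py below.

import pixeltable as pxt
from pixeltable.share.packager import TablePackager

tbl = pxt.get_table('films')                          # hypothetical existing table
packager = TablePackager(tbl, additional_md={'note': 'demo export'})
bundle = packager.package()                           # Path to bundle.tar.bz2 in a tmp dir
print(bundle)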
pixeltable/share/publish.py
ADDED
@@ -0,0 +1,97 @@
+import dataclasses
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs, metadata
+from pixeltable.env import Env
+from pixeltable.utils import sha256sum
+
+from .packager import TablePackager
+
+# These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
+# pixeltable.com URLs are available.
+_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
+_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+
+def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+    request_json = packager.md
+    headers_json = {'X-api-key': Env.get().pxt_api_key}
+
+    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error publishing snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+        raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
+    upload_id = response_json['upload_id']
+    destination_uri = response_json['destination_uri']
+
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+
+    bundle = packager.package()
+
+    parsed_location = urllib.parse.urlparse(destination_uri)
+    if parsed_location.scheme == 's3':
+        _upload_bundle_to_s3(bundle, parsed_location)
+    else:
+        raise excs.Error(f'Unsupported destination: {destination_uri}')
+
+    Env.get().console_logger.info(f'Finalizing snapshot ...')
+
+    finalize_request_json = {
+        'upload_id': upload_id,
+        'datafile': bundle.name,
+        'size': bundle.stat().st_size,
+        'sha256': sha256sum(bundle), # Generate our own SHA for independent verification
+    }
+
+    # TODO: Use Pydantic for validation
+    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    if finalize_response.status_code != 200:
+        raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
+    finalize_response_json = finalize_response.json()
+    if not isinstance(finalize_response_json, dict) or 'confirmed_table_uri' not in finalize_response_json:
+        raise excs.Error(f'Error finalizing snapshot: unexpected response from server.\n{finalize_response_json}')
+
+    confirmed_tbl_uri = finalize_response_json['confirmed_table_uri']
+    Env.get().console_logger.info(f'The published snapshot is now available at: {confirmed_tbl_uri}')
+    return confirmed_tbl_uri
+
+
+def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle.name)[1:] # Remove initial /
+
+    Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    upload_args = {'ChecksumAlgorithm': 'SHA256'}
+
+    progress_bar = tqdm(
+        desc=f'Uploading',
+        total=bundle.stat().st_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1, # Update every iteration (should be fine for an upload)
+        ncols=100,
+        file=sys.stdout,
+    )
+    s3_client.upload_file(
+        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+    )
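A hedged end-to-end sketch of the new publish API. The endpoint URLs are read from environment variables (per the comment in the diff), so the placeholder URLs, the table name 'films', and the destination URI 'pxt://my_org/films_snapshot' are all assumptions; the call issues real HTTP requests against whatever endpoints are configured.

import os

# placeholder endpoints; must be set before the share module is imported, since it reads them at import time
os.environ.setdefault('PIXELTABLE_PUBLISH_URL', 'https://example.com/publish')
os.environ.setdefault('PIXELTABLE_FINALIZE_URL', 'https://example.com/finalize')

import pixeltable as pxt
from pixeltable.share import publish_snapshot

tbl = pxt.get_table('films')                                # hypothetical existing table
uri = publish_snapshot('pxt://my_org/films_snapshot', tbl)  # hypothetical destination URI
print(uri)                                                  # confirmed URI returned by the server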