pixeltable 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -2
  4. pixeltable/catalog/column.py +1 -1
  5. pixeltable/catalog/dir.py +1 -1
  6. pixeltable/catalog/table.py +1 -1
  7. pixeltable/catalog/table_version.py +12 -2
  8. pixeltable/catalog/table_version_path.py +2 -2
  9. pixeltable/catalog/view.py +64 -20
  10. pixeltable/dataframe.py +14 -14
  11. pixeltable/env.py +20 -3
  12. pixeltable/exec/component_iteration_node.py +1 -2
  13. pixeltable/exec/expr_eval/evaluators.py +4 -2
  14. pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
  15. pixeltable/exprs/comparison.py +8 -4
  16. pixeltable/exprs/data_row.py +5 -3
  17. pixeltable/exprs/expr.py +9 -2
  18. pixeltable/exprs/function_call.py +155 -313
  19. pixeltable/func/aggregate_function.py +29 -15
  20. pixeltable/func/callable_function.py +11 -8
  21. pixeltable/func/expr_template_function.py +3 -9
  22. pixeltable/func/function.py +148 -74
  23. pixeltable/func/signature.py +65 -30
  24. pixeltable/func/udf.py +1 -1
  25. pixeltable/functions/__init__.py +1 -0
  26. pixeltable/functions/deepseek.py +121 -0
  27. pixeltable/functions/image.py +7 -7
  28. pixeltable/functions/openai.py +49 -10
  29. pixeltable/functions/video.py +14 -7
  30. pixeltable/globals.py +14 -3
  31. pixeltable/index/embedding_index.py +4 -13
  32. pixeltable/io/globals.py +88 -77
  33. pixeltable/io/hf_datasets.py +34 -34
  34. pixeltable/io/pandas.py +75 -87
  35. pixeltable/io/parquet.py +19 -27
  36. pixeltable/io/utils.py +115 -0
  37. pixeltable/iterators/audio.py +2 -1
  38. pixeltable/iterators/video.py +1 -1
  39. pixeltable/metadata/__init__.py +2 -1
  40. pixeltable/metadata/converters/convert_15.py +18 -8
  41. pixeltable/metadata/converters/convert_27.py +31 -0
  42. pixeltable/metadata/converters/convert_28.py +15 -0
  43. pixeltable/metadata/converters/convert_29.py +111 -0
  44. pixeltable/metadata/converters/util.py +12 -1
  45. pixeltable/metadata/notes.py +3 -0
  46. pixeltable/metadata/schema.py +8 -0
  47. pixeltable/share/__init__.py +1 -0
  48. pixeltable/share/packager.py +246 -0
  49. pixeltable/share/publish.py +97 -0
  50. pixeltable/type_system.py +87 -42
  51. pixeltable/utils/__init__.py +41 -0
  52. pixeltable/utils/arrow.py +45 -12
  53. pixeltable/utils/formatter.py +1 -1
  54. pixeltable/utils/iceberg.py +14 -0
  55. pixeltable/utils/media_store.py +1 -1
  56. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/METADATA +37 -50
  57. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/RECORD +60 -51
  58. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/WHEEL +1 -1
  59. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/LICENSE +0 -0
  60. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/entry_points.txt +0 -0

pixeltable/metadata/converters/convert_15.py

@@ -17,7 +17,7 @@ _logger = logging.getLogger('pixeltable')
 def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Function)):
-            id, dir_id, md, binary_obj = row
+            id, _, md, binary_obj = row
             md['md'] = __update_md(md['md'], binary_obj)
             _logger.info(f'Updating function: {id}')
             conn.execute(sql.update(Function).where(Function.id == id).values(md=md))
@@ -27,14 +27,24 @@ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
     # construct dict produced by CallableFunction.to_store()
     py_fn = cloudpickle.loads(binary_obj)
     py_params = inspect.signature(py_fn).parameters
-    return_type = ts.ColumnType.from_dict(orig_d['return_type'])
-    params: list[func.Parameter] = []
+    return_type = orig_d['return_type']
+    params: list[dict] = []
     for name, col_type_dict, kind_int, is_batched in orig_d['parameters']:
-        col_type = ts.ColumnType.from_dict(col_type_dict) if col_type_dict is not None else None
         default = py_params[name].default
-        kind = inspect._ParameterKind(kind_int)  # is there a way to avoid referencing a private type?
-        params.append(func.Parameter(name=name, col_type=col_type, kind=kind, default=default, is_batched=is_batched))
+        kind = inspect._ParameterKind(kind_int)
+        params.append(
+            {
+                'name': name,
+                'col_type': col_type_dict,
+                'kind': str(kind),
+                'is_batched': is_batched,
+                'has_default': default is not inspect.Parameter.empty,
+                'default': None if default is inspect.Parameter.empty else default,
+            }
+        )
     is_batched = 'batch_size' in orig_d
-    sig = func.Signature(return_type, params, is_batched=is_batched)
-    d = {'signature': sig.as_dict(), 'batch_size': orig_d['batch_size'] if is_batched else None}
+    d = {
+        'signature': {'return_type': return_type, 'parameters': params, 'is_batched': is_batched},
+        'batch_size': orig_d['batch_size'] if is_batched else None,
+    }
     return d
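
For reference, a hypothetical sketch of the stored-function metadata this converter reshapes. The serialized ColumnType dicts are passed through unchanged, so their contents are elided here; all field values are made up.

    # Old layout: parameters are (name, col_type_dict, kind_int, is_batched) tuples.
    string_type_md: dict = {}  # placeholder for a serialized ColumnType dict
    old_md = {
        'return_type': string_type_md,
        'parameters': [('prompt', string_type_md, 1, True)],  # 1 == POSITIONAL_OR_KEYWORD
        'batch_size': 16,
    }

    # New layout: a single 'signature' dict; parameters become dicts and kinds are stored as strings.
    new_md = {
        'signature': {
            'return_type': string_type_md,
            'parameters': [
                {
                    'name': 'prompt',
                    'col_type': string_type_md,
                    'kind': 'POSITIONAL_OR_KEYWORD',
                    'is_batched': True,
                    'has_default': False,
                    'default': None,
                }
            ],
            'is_batched': True,  # True because 'batch_size' is present in the old dict
        },
        'batch_size': 16,
    }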

pixeltable/metadata/converters/convert_27.py

@@ -0,0 +1,31 @@
+import logging
+from typing import Any, Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=27)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """Update the view metadata to add the include_base_columns boolean if it is missing
+
+    Args:
+        table_md (dict): copy of the original table metadata. this gets updated in place.
+        table_id (UUID): the table id
+
+    """
+    if table_md['view_md'] is None:
+        return
+    if 'include_base_columns' not in table_md['view_md']:
+        table_md['view_md']['include_base_columns'] = True
+        _logger.info(f'Updating view metadata for table: {table_id}')

pixeltable/metadata/converters/convert_28.py

@@ -0,0 +1,15 @@
+import logging
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Dir, Table, TableSchemaVersion, TableVersion
+
+
+@register_converter(version=28)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        conn.execute(sql.update(Dir).values(md=Dir.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(Table).values(md=Table.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(TableVersion).values(md=TableVersion.md.concat({'additional_md': {}})))
+        conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat({'additional_md': {}})))
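
Per row, the JSON concatenation amounts to merging the new top-level keys into the existing md value. A minimal illustration in plain Python (the md contents are hypothetical):

    # Illustration only: the converter adds 'user' and 'additional_md' to each Dir/Table md dict.
    old_dir_md = {'name': 'my_dir'}
    new_dir_md = {**old_dir_md, 'user': None, 'additional_md': {}}
    assert new_dir_md == {'name': 'my_dir', 'user': None, 'additional_md': {}}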

pixeltable/metadata/converters/convert_29.py

@@ -0,0 +1,111 @@
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable import exprs
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=29)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    # Defaults are now stored as literals in signatures
+    if k == 'parameters':
+        for param in v:
+            assert isinstance(param, dict)
+            has_default = param.get('has_default') or (param.get('default') is not None)
+            if 'has_default' in param:
+                del param['has_default']
+            literal = exprs.Expr.from_object(param['default']) if has_default else None
+            assert literal is None or isinstance(literal, exprs.Literal)
+            param['default'] = None if literal is None else literal.as_dict()
+        return k, v
+
+    # Method of organizing argument expressions has changed
+    if isinstance(v, dict) and v.get('_classname') == 'FunctionCall':
+        args = v['args']
+        kwargs = v['kwargs']
+        components = v['components']
+        group_by_start_idx = v['group_by_start_idx']
+        group_by_stop_idx = v['group_by_stop_idx']
+        order_by_start_idx = v['order_by_start_idx']
+
+        new_args = []
+        for arg in args:
+            if arg[0] is not None:
+                assert isinstance(arg[0], int)
+                new_args.append(components[arg[0]])
+            else:
+                literal = exprs.Expr.from_object(arg[1])
+                new_args.append(literal.as_dict())
+
+        new_kwargs = {}
+        for name, kwarg in kwargs.items():
+            if kwarg[0] is not None:
+                assert isinstance(kwarg[0], int)
+                new_kwargs[name] = components[kwarg[0]]
+            else:
+                literal = exprs.Expr.from_object(kwarg[1])
+                new_kwargs[name] = literal.as_dict()
+
+        # We need to expand ("unroll") any var-args or var-kwargs.
+
+        new_args_len = len(new_args)
+        rolled_args: Optional[dict] = None
+        rolled_kwargs: Optional[dict] = None
+
+        if 'signature' in v['fn']:
+            # If it's a pickled function, there's no signature, so we're out of luck; varargs in a pickled function
+            # is an edge case that won't migrate properly.
+            parameters: list[dict] = v['fn']['signature']['parameters']
+            for i, param in enumerate(parameters):
+                if param['kind'] == 'VAR_POSITIONAL':
+                    if new_args_len > i:
+                        # For peculiar historical reasons, variable kwargs might show up in args. Thus variable
+                        # positional args is not necessarily the last element of args; it might be the second-to-last.
+                        assert new_args_len <= i + 2, new_args
+                        rolled_args = new_args[i]
+                        new_args = new_args[:i] + new_args[i + 1 :]
+                if param['kind'] == 'VAR_KEYWORD':
+                    # As noted above, variable kwargs might show up either in args or in kwargs. If it's in args, it
+                    # is necessarily the last element.
+                    if new_args_len > i:
+                        assert new_args_len <= i + 1, new_args
+                        rolled_kwargs = new_args.pop()
+                    if param['name'] in kwargs:
+                        assert rolled_kwargs is None
+                        rolled_kwargs = kwargs.pop(param['name'])
+
+        if rolled_args is not None:
+            assert rolled_args['_classname'] in ('InlineArray', 'InlineList')
+            new_args.extend(rolled_args['components'])
+        if rolled_kwargs is not None:
+            assert rolled_kwargs['_classname'] == 'InlineDict'
+            new_kwargs.update(zip(rolled_kwargs['keys'], rolled_kwargs['components']))
+
+        group_by_exprs = [components[i] for i in range(group_by_start_idx, group_by_stop_idx)]
+        order_by_exprs = [components[i] for i in range(order_by_start_idx, len(components))]
+
+        new_components = [*new_args, *new_kwargs.values(), *group_by_exprs, *order_by_exprs]
+
+        newv = {
+            'fn': v['fn'],
+            'arg_idxs': list(range(len(new_args))),
+            'kwarg_idxs': {name: i + len(new_args) for i, name in enumerate(new_kwargs.keys())},
+            'group_by_start_idx': len(new_args) + len(new_kwargs),
+            'group_by_stop_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'order_by_start_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'is_method_call': False,
+            '_classname': 'FunctionCall',
+            'components': new_components,
+        }
+        if 'return_type' in v:
+            newv['return_type'] = v['return_type']

+        return k, newv
+
+    return None
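
To make the reorganization concrete, here is a hypothetical before/after sketch of the serialized metadata for a call like fn(some_column, 3). In the old layout each argument is a (component_index, literal_value) pair; in the new layout every argument is a component and its position is recorded in arg_idxs/kwarg_idxs. The placeholder dicts stand in for fully serialized expressions.

    col_ref = {'_classname': 'ColumnRef'}  # placeholder for a serialized column reference
    three = {'_classname': 'Literal'}      # placeholder for the serialized literal 3

    # Old layout: args mix component indices and raw constants.
    old_call = {
        '_classname': 'FunctionCall',
        'fn': {'signature': {'parameters': []}},  # contents elided
        'args': [[0, None], [None, 3]],           # component 0, then the constant 3
        'kwargs': {},
        'components': [col_ref],
        'group_by_start_idx': 1,
        'group_by_stop_idx': 1,
        'order_by_start_idx': 1,
    }

    # New layout: the constant becomes a Literal component; positions are plain indices.
    new_call = {
        '_classname': 'FunctionCall',
        'fn': old_call['fn'],
        'arg_idxs': [0, 1],
        'kwarg_idxs': {},
        'group_by_start_idx': 2,
        'group_by_stop_idx': 2,
        'order_by_start_idx': 2,
        'is_method_call': False,
        'components': [col_ref, three],
    }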

pixeltable/metadata/converters/util.py

@@ -5,7 +5,7 @@ from uuid import UUID
 
 import sqlalchemy as sql
 
-from pixeltable.metadata.schema import Table, TableSchemaVersion
+from pixeltable.metadata.schema import Function, Table, TableSchemaVersion
 
 __logger = logging.getLogger('pixeltable')
 
@@ -50,6 +50,17 @@ def convert_table_md(
             __logger.info(f'Updating schema for table: {id}')
             conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
 
+        for row in conn.execute(sql.select(Function)):
+            id = row[0]
+            function_md = row[2]
+            assert isinstance(function_md, dict)
+            updated_function_md = copy.deepcopy(function_md)
+            if substitution_fn is not None:
+                updated_function_md = __substitute_md_rec(updated_function_md, substitution_fn)
+            if updated_function_md != function_md:
+                __logger.info(f'Updating function: {id}')
+                conn.execute(sql.update(Function).where(Function.id == id).values(md=updated_function_md))
+
 
 def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
     columns_md = table_md['column_md']

pixeltable/metadata/notes.py

@@ -2,6 +2,9 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    30: 'Store default values and constant arguments as literals',
+    29: 'Add user and additional_md fields to metadata structs',
+    28: 'Enable view creation from DataFrame with select clause',
     27: 'Enable pxt.query parameterization of limit clauses',
     26: 'Rename clip_text and clip_image to clip',
     25: 'Functions with multiple signatures',

pixeltable/metadata/schema.py

@@ -74,6 +74,8 @@ class SystemInfo(Base):
 @dataclasses.dataclass
 class DirMd:
     name: str
+    user: Optional[str]
+    additional_md: dict[str, Any]
 
 
 class Dir(Base):
@@ -132,6 +134,7 @@ class IndexMd:
 @dataclasses.dataclass
 class ViewMd:
     is_snapshot: bool
+    include_base_columns: bool
 
     # (table id, version); for mutable views, all versions are None
     base_versions: list[tuple[str, Optional[int]]]
@@ -150,6 +153,8 @@ class ViewMd:
 class TableMd:
     name: str
 
+    user: Optional[str]
+
     # monotonically increasing w/in Table for both data and schema changes, starting at 0
     current_version: int
     # each version has a corresponding schema version (current_version >= current_schema_version)
@@ -169,6 +174,7 @@
     column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
     index_md: dict[int, IndexMd]  # index_id -> IndexMd
     view_md: Optional[ViewMd]
+    additional_md: dict[str, Any]
 
 
 class Table(Base):
@@ -194,6 +200,7 @@ class TableVersionMd:
     created_at: float  # time.time()
     version: int
     schema_version: int
+    additional_md: dict[str, Any]
 
 
 class TableVersion(Base):
@@ -232,6 +239,7 @@ class TableSchemaVersionMd:
     # default validation strategy for any media column of this table
     # stores column.MediaValiation.name.lower()
     media_validation: str
+    additional_md: dict[str, Any]
 
 
 # versioning: each table schema change results in a new record

pixeltable/share/__init__.py

@@ -0,0 +1 @@
+from .publish import publish_snapshot

pixeltable/share/packager.py

@@ -0,0 +1,246 @@
+import dataclasses
+import io
+import json
+import logging
+import tarfile
+import urllib.parse
+import urllib.request
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Iterator, Optional
+
+import more_itertools
+import numpy as np
+import pyarrow as pa
+import pyiceberg.catalog
+
+import pixeltable as pxt
+import pixeltable.type_system as ts
+from pixeltable import catalog, exprs, metadata
+from pixeltable.dataframe import DataFrame
+from pixeltable.env import Env
+from pixeltable.utils.arrow import PXT_TO_PA_TYPES
+from pixeltable.utils.iceberg import sqlite_catalog
+
+_logger = logging.getLogger('pixeltable')
+
+
+class TablePackager:
+    """
+    Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
+    is as follows:
+
+        metadata.json         # Pixeltable metadata for the packaged table
+        warehouse/catalog.db  # sqlite Iceberg catalog
+        warehouse/pxt.db/**   # Iceberg metadata and data files (parquet/avro/json)
+        media/**              # Local media files
+
+    If the table being archived is a view, then the Iceberg catalog will contain separate tables for the view and each
+    of its ancestors. All rows will be exported with additional _rowid and _v_min columns. Currently, only the most
+    recent version of the table can be exported, and only the full table contents.
+
+    If the table contains media columns, they are handled as follows:
+    - If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
+      stored in the Iceberg table.
+    - If a media file is a local file, then it will be copied into the tarball as a file of the form
+      'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
+    """
+
+    table: catalog.Table  # The table to be packaged
+    tmp_dir: Path  # Temporary directory where the package will reside
+    iceberg_catalog: pyiceberg.catalog.Catalog
+    media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+    md: dict[str, Any]
+
+    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
+        self.table = table
+        self.tmp_dir = Path(Env.get().create_tmp_path())
+        self.media_files = {}
+
+        # Generate metadata
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {
+                'tables': [
+                    {
+                        'table_id': str(t._tbl_version.id),
+                        # These are temporary; will replace with a better solution once the concurrency changes to catalog have
+                        # been merged
+                        'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                        'table_version_md': dataclasses.asdict(
+                            t._tbl_version._create_version_md(datetime.now().timestamp())
+                        ),
+                        'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
+                    }
+                    for t in (table, *table._bases)
+                ]
+            },
+        }
+        if additional_md is not None:
+            self.md.update(additional_md)
+
+    def package(self) -> Path:
+        """
+        Export the table to a tarball containing Iceberg tables and media files.
+        """
+        assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
+        _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+        self.tmp_dir.mkdir()
+        with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
+            json.dump(self.md, fp)
+        self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
+        ancestors = (self.table, *self.table._bases)
+        for t in ancestors:
+            _logger.info(f"Exporting table '{t._path}'.")
+            self.__export_table(t)
+        _logger.info(f'Building archive.')
+        bundle_path = self.__build_tarball()
+        _logger.info(f'Packaging complete: {bundle_path}')
+        return bundle_path
+
+    def __export_table(self, t: catalog.Table) -> None:
+        """
+        Exports the data from `t` into an Iceberg table.
+        """
+        # First generate a select list for the data we want to extract from `t`. This includes:
+        # - all stored columns, including computed columns;
+        # - errortype and errormsg fields whenever they're defined.
+        # We select only those columns that are defined in this table (columns inherited from ancestor tables will be
+        # handled separately).
+        # For media columns, we substitute `col.fileurl` so that we always get the URL (which may be a file:// URL;
+        # these will be specially handled later)
+        select_exprs: dict[str, exprs.Expr] = {}
+
+        # As we generate the select list, we construct a separate list of column types. We can't rely on df._schema
+        # to get the column types, since we'll be substituting `fileurl`s for media columns.
+        actual_col_types: list[ts.ColumnType] = []
+
+        for col_name, col in t._tbl_version.cols_by_name.items():
+            if not col.is_stored:
+                continue
+            if col.col_type.is_media_type():
+                select_exprs[col_name] = t[col_name].fileurl
+            else:
+                select_exprs[col_name] = t[col_name]
+            actual_col_types.append(col.col_type)
+            if col.records_errors:
+                select_exprs[f'{col_name}_errortype'] = t[col_name].errortype
+                actual_col_types.append(ts.StringType())
+                select_exprs[f'{col_name}_errormsg'] = t[col_name].errormsg
+                actual_col_types.append(ts.StringType())
+
+        # Run the select() on `self.table`, not `t`, so that we export only those rows that are actually present in
+        # `self.table`.
+        df = self.table.select(**select_exprs)
+        namespace = self.__iceberg_namespace(t)
+        self.iceberg_catalog.create_namespace_if_not_exists(namespace)
+        iceberg_schema = self.__to_iceberg_schema(df._schema)
+        iceberg_tbl = self.iceberg_catalog.create_table(f'{namespace}.{t._name}', schema=iceberg_schema)
+
+        # Populate the Iceberg table with data.
+        # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
+        # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Iceberg table on disk.
+        for pa_table in self.__to_pa_tables(df, actual_col_types, iceberg_schema):
+            iceberg_tbl.append(pa_table)
+
+    @classmethod
+    def __iceberg_namespace(cls, table: catalog.Table) -> str:
+        """
+        Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
+        """
+        parent_path = table._parent._path
+        if len(parent_path) == 0:
+            return 'pxt'
+        else:
+            return f'pxt.{parent_path}'
+
+    # The following methods are responsible for schema and data conversion from Pixeltable to Iceberg. Some of this
+    # logic might be consolidated into arrow.py and unified with general Parquet export, but there are several
+    # major differences:
+    # - Iceberg has no array type; we export all arrays as binary blobs
+    # - We include _rowid and _v_min columns in the Iceberg table
+    # - Media columns are handled specially as indicated above
+
+    @classmethod
+    def __to_iceberg_schema(cls, pxt_schema: dict[str, ts.ColumnType]) -> pa.Schema:
+        entries = [(name, cls.__to_iceberg_type(col_type)) for name, col_type in pxt_schema.items()]
+        entries.append(('_rowid', pa.list_(pa.int64())))
+        entries.append(('_v_min', pa.int64()))
+        return pa.schema(entries)  # type: ignore[arg-type]
+
+    @classmethod
+    def __to_iceberg_type(cls, col_type: ts.ColumnType) -> pa.DataType:
+        if col_type.is_array_type():
+            return pa.binary()
+        if col_type.is_media_type():
+            return pa.string()
+        return PXT_TO_PA_TYPES.get(col_type.__class__)
+
+    def __to_pa_tables(
+        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
+    ) -> Iterator[pa.Table]:
+        """
+        Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
+        to avoid excessive memory usage.
+        """
+        for rows in more_itertools.batched(self.__to_pa_rows(df, actual_col_types), batch_size):
+            cols = {col_name: [row[idx] for row in rows] for idx, col_name in enumerate(df._schema.keys())}
+            cols['_rowid'] = [row[-2] for row in rows]
+            cols['_v_min'] = [row[-1] for row in rows]
+            yield pa.Table.from_pydict(cols, schema=arrow_schema)
+
+    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
+        for row in df._exec():
+            vals = [row[e.slot_idx] for e in df._select_list_exprs]
+            result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
+            result.append(row.rowid)
+            result.append(row.v_min)
+            yield result
+
+    def __to_pa_value(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+        if col_type.is_array_type():
+            # Export arrays as binary
+            assert isinstance(val, np.ndarray)
+            arr = io.BytesIO()
+            np.save(arr, val)
+            return arr.getvalue()
+        if col_type.is_json_type():
+            # Export JSON as strings
+            return json.dumps(val)
+        if col_type.is_media_type():
+            # Handle media files as described above
+            assert isinstance(val, str)  # Media columns are always referenced by `fileurl`
+            return self.__process_media_url(val)
+        return val
+
+    def __process_media_url(self, url: str) -> str:
+        parsed_url = urllib.parse.urlparse(url)
+        if parsed_url.scheme == 'file':
+            # It's the URL of a local file. Replace it with a pxtmedia:// URI.
+            # (We can't use an actual pxt:// URI, because the eventual pxt:// table name might not be known at this
+            # time. The pxtmedia:// URI serves as a relative reference into the tarball that can be replaced with an
+            # actual URL when the table is reconstituted.)
+            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
+            if path not in self.media_files:
+                # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
+                dest_name = f'{uuid.uuid4().hex}{path.suffix}'
+                self.media_files[path] = dest_name
+            return f'pxtmedia://{self.media_files[path]}'
+        # For any type of URL other than a local file, just return the URL as-is.
+        return url
+
+    def __build_tarball(self) -> Path:
+        bundle_path = self.tmp_dir / 'bundle.tar.bz2'
+        with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add metadata json
+            tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
+            # Add the Iceberg warehouse dir (including the catalog)
+            tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
+            # Add the media files
+            for src_file, dest_name in self.media_files.items():
+                tf.add(src_file, arcname=f'media/{dest_name}')
+        return bundle_path
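
A hedged usage sketch of the new packager (TablePackager is an internal class, so this is illustrative rather than a supported API; the table name and columns are made up):

    import tarfile

    import pixeltable as pxt
    from pixeltable.share.packager import TablePackager

    # Hypothetical table; any existing table or view would be packaged the same way.
    tbl = pxt.create_table('films', {'title': pxt.String, 'year': pxt.Int})
    tbl.insert([{'title': 'Alien', 'year': 1979}])

    packager = TablePackager(tbl)
    bundle = packager.package()  # Path to bundle.tar.bz2 in a temporary directory

    # The tarball holds metadata.json, the Iceberg warehouse/ dir, and any local media files.
    with tarfile.open(bundle, 'r:bz2') as tf:
        print(tf.getnames())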

pixeltable/share/publish.py

@@ -0,0 +1,97 @@
+import dataclasses
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs, metadata
+from pixeltable.env import Env
+from pixeltable.utils import sha256sum
+
+from .packager import TablePackager
+
+# These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
+# pixeltable.com URLs are available.
+_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
+_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+
+def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+    request_json = packager.md
+    headers_json = {'X-api-key': Env.get().pxt_api_key}
+
+    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error publishing snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+        raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
+    upload_id = response_json['upload_id']
+    destination_uri = response_json['destination_uri']
+
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+
+    bundle = packager.package()
+
+    parsed_location = urllib.parse.urlparse(destination_uri)
+    if parsed_location.scheme == 's3':
+        _upload_bundle_to_s3(bundle, parsed_location)
+    else:
+        raise excs.Error(f'Unsupported destination: {destination_uri}')
+
+    Env.get().console_logger.info(f'Finalizing snapshot ...')
+
+    finalize_request_json = {
+        'upload_id': upload_id,
+        'datafile': bundle.name,
+        'size': bundle.stat().st_size,
+        'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+    }
+
+    # TODO: Use Pydantic for validation
+    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    if finalize_response.status_code != 200:
+        raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
+    finalize_response_json = finalize_response.json()
+    if not isinstance(finalize_response_json, dict) or 'confirmed_table_uri' not in finalize_response_json:
+        raise excs.Error(f'Error finalizing snapshot: unexpected response from server.\n{finalize_response_json}')
+
+    confirmed_tbl_uri = finalize_response_json['confirmed_table_uri']
+    Env.get().console_logger.info(f'The published snapshot is now available at: {confirmed_tbl_uri}')
+    return confirmed_tbl_uri
+
+
+def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    upload_args = {'ChecksumAlgorithm': 'SHA256'}
+
+    progress_bar = tqdm(
+        desc=f'Uploading',
+        total=bundle.stat().st_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,  # Update every iteration (should be fine for an upload)
+        ncols=100,
+        file=sys.stdout,
+    )
+    s3_client.upload_file(
+        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+    )
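
And a hedged sketch of how publish_snapshot would be invoked (the destination URI is hypothetical; per the code above, the endpoints come from the PIXELTABLE_PUBLISH_URL and PIXELTABLE_FINALIZE_URL environment variables and the request is authenticated with the configured Pixeltable API key):

    import pixeltable as pxt
    from pixeltable.share import publish_snapshot

    tbl = pxt.get_table('films')  # hypothetical source table
    confirmed_uri = publish_snapshot('pxt://my_org/films_snapshot', tbl)
    print(confirmed_uri)  # the table URI confirmed by the server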