pixeltable 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (57)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -2
  4. pixeltable/catalog/column.py +1 -1
  5. pixeltable/catalog/dir.py +1 -1
  6. pixeltable/catalog/table.py +1 -1
  7. pixeltable/catalog/table_version.py +12 -2
  8. pixeltable/catalog/table_version_path.py +2 -2
  9. pixeltable/catalog/view.py +64 -20
  10. pixeltable/dataframe.py +10 -5
  11. pixeltable/env.py +12 -0
  12. pixeltable/exec/expr_eval/evaluators.py +4 -2
  13. pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
  14. pixeltable/exprs/comparison.py +8 -4
  15. pixeltable/exprs/data_row.py +5 -3
  16. pixeltable/exprs/expr.py +2 -2
  17. pixeltable/exprs/function_call.py +155 -313
  18. pixeltable/func/aggregate_function.py +29 -15
  19. pixeltable/func/callable_function.py +11 -8
  20. pixeltable/func/expr_template_function.py +3 -9
  21. pixeltable/func/function.py +148 -74
  22. pixeltable/func/signature.py +65 -30
  23. pixeltable/func/udf.py +1 -1
  24. pixeltable/functions/__init__.py +1 -0
  25. pixeltable/functions/deepseek.py +121 -0
  26. pixeltable/functions/image.py +7 -7
  27. pixeltable/functions/openai.py +23 -9
  28. pixeltable/functions/video.py +14 -7
  29. pixeltable/globals.py +14 -3
  30. pixeltable/index/embedding_index.py +4 -13
  31. pixeltable/io/globals.py +88 -77
  32. pixeltable/io/hf_datasets.py +34 -34
  33. pixeltable/io/pandas.py +75 -76
  34. pixeltable/io/parquet.py +19 -27
  35. pixeltable/io/utils.py +115 -0
  36. pixeltable/iterators/audio.py +2 -1
  37. pixeltable/iterators/video.py +1 -1
  38. pixeltable/metadata/__init__.py +2 -1
  39. pixeltable/metadata/converters/convert_15.py +18 -8
  40. pixeltable/metadata/converters/convert_27.py +31 -0
  41. pixeltable/metadata/converters/convert_28.py +15 -0
  42. pixeltable/metadata/converters/convert_29.py +111 -0
  43. pixeltable/metadata/converters/util.py +12 -1
  44. pixeltable/metadata/notes.py +3 -0
  45. pixeltable/metadata/schema.py +8 -0
  46. pixeltable/share/__init__.py +1 -0
  47. pixeltable/share/packager.py +41 -13
  48. pixeltable/share/publish.py +97 -0
  49. pixeltable/type_system.py +40 -14
  50. pixeltable/utils/__init__.py +41 -0
  51. pixeltable/utils/arrow.py +40 -7
  52. pixeltable/utils/formatter.py +1 -1
  53. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/METADATA +34 -49
  54. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/RECORD +57 -51
  55. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/WHEEL +1 -1
  56. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/entry_points.txt +0 -0
pixeltable/metadata/converters/convert_15.py CHANGED
@@ -17,7 +17,7 @@ _logger = logging.getLogger('pixeltable')
 def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Function)):
-            id, dir_id, md, binary_obj = row
+            id, _, md, binary_obj = row
             md['md'] = __update_md(md['md'], binary_obj)
             _logger.info(f'Updating function: {id}')
             conn.execute(sql.update(Function).where(Function.id == id).values(md=md))
@@ -27,14 +27,24 @@ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
     # construct dict produced by CallableFunction.to_store()
     py_fn = cloudpickle.loads(binary_obj)
     py_params = inspect.signature(py_fn).parameters
-    return_type = ts.ColumnType.from_dict(orig_d['return_type'])
-    params: list[func.Parameter] = []
+    return_type = orig_d['return_type']
+    params: list[dict] = []
     for name, col_type_dict, kind_int, is_batched in orig_d['parameters']:
-        col_type = ts.ColumnType.from_dict(col_type_dict) if col_type_dict is not None else None
         default = py_params[name].default
-        kind = inspect._ParameterKind(kind_int)  # is there a way to avoid referencing a private type?
-        params.append(func.Parameter(name=name, col_type=col_type, kind=kind, default=default, is_batched=is_batched))
+        kind = inspect._ParameterKind(kind_int)
+        params.append(
+            {
+                'name': name,
+                'col_type': col_type_dict,
+                'kind': str(kind),
+                'is_batched': is_batched,
+                'has_default': default is not inspect.Parameter.empty,
+                'default': None if default is inspect.Parameter.empty else default,
+            }
+        )
     is_batched = 'batch_size' in orig_d
-    sig = func.Signature(return_type, params, is_batched=is_batched)
-    d = {'signature': sig.as_dict(), 'batch_size': orig_d['batch_size'] if is_batched else None}
+    d = {
+        'signature': {'return_type': return_type, 'parameters': params, 'is_batched': is_batched},
+        'batch_size': orig_d['batch_size'] if is_batched else None,
+    }
     return d
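
For reference, a sketch of the converted function metadata this now produces for a hypothetical UDF `def add_k(x: int, k: int = 1) -> int`; the type dicts are placeholders, and the 'kind' strings are the inspect parameter-kind names (e.g. 'POSITIONAL_OR_KEYWORD'), matching what converter 29 later looks for:

# Illustrative output of __update_md for a hypothetical UDF `def add_k(x: int, k: int = 1) -> int`;
# 'return_type' and 'col_type' carry over whatever type dicts were already stored in the old metadata.
converted_md = {
    'signature': {
        'return_type': {'_classname': 'IntType', 'nullable': False},  # placeholder type dict
        'parameters': [
            {
                'name': 'x',
                'col_type': {'_classname': 'IntType', 'nullable': False},  # placeholder type dict
                'kind': 'POSITIONAL_OR_KEYWORD',
                'is_batched': False,
                'has_default': False,
                'default': None,
            },
            {
                'name': 'k',
                'col_type': {'_classname': 'IntType', 'nullable': False},  # placeholder type dict
                'kind': 'POSITIONAL_OR_KEYWORD',
                'is_batched': False,
                'has_default': True,
                'default': 1,
            },
        ],
        'is_batched': False,
    },
    'batch_size': None,
}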
pixeltable/metadata/converters/convert_27.py ADDED
@@ -0,0 +1,31 @@
+import logging
+from typing import Any, Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=27)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """Update the view metadata to add the include_base_columns boolean if it is missing
+
+    Args:
+        table_md (dict): copy of the original table metadata. this gets updated in place.
+        table_id (UUID): the table id
+
+    """
+    if table_md['view_md'] is None:
+        return
+    if 'include_base_columns' not in table_md['view_md']:
+        table_md['view_md']['include_base_columns'] = True
+        _logger.info(f'Updating view metadata for table: {table_id}')
pixeltable/metadata/converters/convert_28.py ADDED
@@ -0,0 +1,15 @@
+import logging
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Dir, Table, TableSchemaVersion, TableVersion
+
+
+@register_converter(version=28)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        conn.execute(sql.update(Dir).values(md=Dir.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(Table).values(md=Table.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(TableVersion).values(md=TableVersion.md.concat({'additional_md': {}})))
+        conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat({'additional_md': {}})))
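
These updates rely on Postgres JSONB concatenation (`md || patch`) via SQLAlchemy's `concat`: existing keys are kept and the new keys are merged in. A minimal plain-Python sketch of the effect on a single, hypothetical Dir row:

# Illustrative only: what the JSONB concatenation does to a Dir row's md value.
old_dir_md = {'name': 'my_dir'}               # hypothetical pre-v28 value
patch = {'user': None, 'additional_md': {}}   # what the converter concatenates
new_dir_md = {**old_dir_md, **patch}          # `jsonb || jsonb` keeps existing keys, adds new ones
assert new_dir_md == {'name': 'my_dir', 'user': None, 'additional_md': {}}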
pixeltable/metadata/converters/convert_29.py ADDED
@@ -0,0 +1,111 @@
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable import exprs
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=29)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    # Defaults are now stored as literals in signatures
+    if k == 'parameters':
+        for param in v:
+            assert isinstance(param, dict)
+            has_default = param.get('has_default') or (param.get('default') is not None)
+            if 'has_default' in param:
+                del param['has_default']
+            literal = exprs.Expr.from_object(param['default']) if has_default else None
+            assert literal is None or isinstance(literal, exprs.Literal)
+            param['default'] = None if literal is None else literal.as_dict()
+        return k, v
+
+    # Method of organizing argument expressions has changed
+    if isinstance(v, dict) and v.get('_classname') == 'FunctionCall':
+        args = v['args']
+        kwargs = v['kwargs']
+        components = v['components']
+        group_by_start_idx = v['group_by_start_idx']
+        group_by_stop_idx = v['group_by_stop_idx']
+        order_by_start_idx = v['order_by_start_idx']
+
+        new_args = []
+        for arg in args:
+            if arg[0] is not None:
+                assert isinstance(arg[0], int)
+                new_args.append(components[arg[0]])
+            else:
+                literal = exprs.Expr.from_object(arg[1])
+                new_args.append(literal.as_dict())
+
+        new_kwargs = {}
+        for name, kwarg in kwargs.items():
+            if kwarg[0] is not None:
+                assert isinstance(kwarg[0], int)
+                new_kwargs[name] = components[kwarg[0]]
+            else:
+                literal = exprs.Expr.from_object(kwarg[1])
+                new_kwargs[name] = literal.as_dict()
+
+        # We need to expand ("unroll") any var-args or var-kwargs.
+
+        new_args_len = len(new_args)
+        rolled_args: Optional[dict] = None
+        rolled_kwargs: Optional[dict] = None
+
+        if 'signature' in v['fn']:
+            # If it's a pickled function, there's no signature, so we're out of luck; varargs in a pickled function
+            # is an edge case that won't migrate properly.
+            parameters: list[dict] = v['fn']['signature']['parameters']
+            for i, param in enumerate(parameters):
+                if param['kind'] == 'VAR_POSITIONAL':
+                    if new_args_len > i:
+                        # For peculiar historical reasons, variable kwargs might show up in args. Thus variable
+                        # positional args is not necessarily the last element of args; it might be the second-to-last.
+                        assert new_args_len <= i + 2, new_args
+                        rolled_args = new_args[i]
+                        new_args = new_args[:i] + new_args[i + 1 :]
+                if param['kind'] == 'VAR_KEYWORD':
+                    # As noted above, variable kwargs might show up either in args or in kwargs. If it's in args, it
+                    # is necessarily the last element.
+                    if new_args_len > i:
+                        assert new_args_len <= i + 1, new_args
+                        rolled_kwargs = new_args.pop()
+                    if param['name'] in kwargs:
+                        assert rolled_kwargs is None
+                        rolled_kwargs = kwargs.pop(param['name'])
+
+        if rolled_args is not None:
+            assert rolled_args['_classname'] in ('InlineArray', 'InlineList')
+            new_args.extend(rolled_args['components'])
+        if rolled_kwargs is not None:
+            assert rolled_kwargs['_classname'] == 'InlineDict'
+            new_kwargs.update(zip(rolled_kwargs['keys'], rolled_kwargs['components']))
+
+        group_by_exprs = [components[i] for i in range(group_by_start_idx, group_by_stop_idx)]
+        order_by_exprs = [components[i] for i in range(order_by_start_idx, len(components))]
+
+        new_components = [*new_args, *new_kwargs.values(), *group_by_exprs, *order_by_exprs]
+
+        newv = {
+            'fn': v['fn'],
+            'arg_idxs': list(range(len(new_args))),
+            'kwarg_idxs': {name: i + len(new_args) for i, name in enumerate(new_kwargs.keys())},
+            'group_by_start_idx': len(new_args) + len(new_kwargs),
+            'group_by_stop_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'order_by_start_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'is_method_call': False,
+            '_classname': 'FunctionCall',
+            'components': new_components,
+        }
+        if 'return_type' in v:
+            newv['return_type'] = v['return_type']
+
+        return k, newv
+
+    return None
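
To make the reorganization concrete, here is a minimal before/after sketch of the FunctionCall metadata shape this converter handles; the component and literal entries are placeholders rather than real serialized expressions:

# Illustrative only: FunctionCall metadata before and after converter 29 (no var-args, no group-by/order-by).
old_call_md = {
    '_classname': 'FunctionCall',
    'fn': {'signature': {'parameters': []}},        # hypothetical function metadata
    'args': [[0, None], [None, 'some literal']],    # (component_idx, literal_value) pairs
    'kwargs': {'k': [None, 3]},
    'components': ['<expr 0>'],
    'group_by_start_idx': 1, 'group_by_stop_idx': 1, 'order_by_start_idx': 1,
}
# After conversion: every argument becomes a component (constants are wrapped as Literal exprs),
# and args/kwargs are replaced by index lists into `components`.
new_call_md = {
    '_classname': 'FunctionCall',
    'fn': old_call_md['fn'],
    'arg_idxs': [0, 1],              # positions of positional args within `components`
    'kwarg_idxs': {'k': 2},          # kwarg name -> position within `components`
    'components': ['<expr 0>', '<literal: some literal>', '<literal: 3>'],
    'group_by_start_idx': 3, 'group_by_stop_idx': 3, 'order_by_start_idx': 3,
    'is_method_call': False,
}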
pixeltable/metadata/converters/util.py CHANGED
@@ -5,7 +5,7 @@ from uuid import UUID
 
 import sqlalchemy as sql
 
-from pixeltable.metadata.schema import Table, TableSchemaVersion
+from pixeltable.metadata.schema import Function, Table, TableSchemaVersion
 
 __logger = logging.getLogger('pixeltable')
 
@@ -50,6 +50,17 @@ def convert_table_md(
             __logger.info(f'Updating schema for table: {id}')
             conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
 
+        for row in conn.execute(sql.select(Function)):
+            id = row[0]
+            function_md = row[2]
+            assert isinstance(function_md, dict)
+            updated_function_md = copy.deepcopy(function_md)
+            if substitution_fn is not None:
+                updated_function_md = __substitute_md_rec(updated_function_md, substitution_fn)
+            if updated_function_md != function_md:
+                __logger.info(f'Updating function: {id}')
+                conn.execute(sql.update(Function).where(Function.id == id).values(md=updated_function_md))
+
 
 def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
     columns_md = table_md['column_md']
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,9 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    30: 'Store default values and constant arguments as literals',
+    29: 'Add user and additional_md fields to metadata structs',
+    28: 'Enable view creation from DataFrame with select clause',
     27: 'Enable pxt.query parameterization of limit clauses',
     26: 'Rename clip_text and clip_image to clip',
     25: 'Functions with multiple signatures',
pixeltable/metadata/schema.py CHANGED
@@ -74,6 +74,8 @@ class SystemInfo(Base):
 @dataclasses.dataclass
 class DirMd:
     name: str
+    user: Optional[str]
+    additional_md: dict[str, Any]
 
 
 class Dir(Base):
@@ -132,6 +134,7 @@ class IndexMd:
 @dataclasses.dataclass
 class ViewMd:
     is_snapshot: bool
+    include_base_columns: bool
 
     # (table id, version); for mutable views, all versions are None
     base_versions: list[tuple[str, Optional[int]]]
@@ -150,6 +153,8 @@ class ViewMd:
 class TableMd:
     name: str
 
+    user: Optional[str]
+
     # monotonically increasing w/in Table for both data and schema changes, starting at 0
     current_version: int
     # each version has a corresponding schema version (current_version >= current_schema_version)
@@ -169,6 +174,7 @@ class TableMd:
     column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
     index_md: dict[int, IndexMd]  # index_id -> IndexMd
     view_md: Optional[ViewMd]
+    additional_md: dict[str, Any]
 
 
 class Table(Base):
@@ -194,6 +200,7 @@ class TableVersionMd:
     created_at: float  # time.time()
     version: int
     schema_version: int
+    additional_md: dict[str, Any]
 
 
 class TableVersion(Base):
@@ -232,6 +239,7 @@ class TableSchemaVersionMd:
     # default validation strategy for any media column of this table
     # stores column.MediaValiation.name.lower()
     media_validation: str
+    additional_md: dict[str, Any]
 
 
 # versioning: each table schema change results in a new record
pixeltable/share/__init__.py ADDED
@@ -0,0 +1 @@
+from .publish import publish_snapshot
pixeltable/share/packager.py CHANGED
@@ -1,3 +1,4 @@
+import dataclasses
 import io
 import json
 import logging
@@ -5,8 +6,9 @@ import tarfile
 import urllib.parse
 import urllib.request
 import uuid
+from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional
 
 import more_itertools
 import numpy as np
@@ -15,7 +17,8 @@ import pyiceberg.catalog
 
 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exprs
+from pixeltable import catalog, exprs, metadata
+from pixeltable.dataframe import DataFrame
 from pixeltable.env import Env
 from pixeltable.utils.arrow import PXT_TO_PA_TYPES
 from pixeltable.utils.iceberg import sqlite_catalog
@@ -28,6 +31,7 @@ class TablePackager:
     Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
     is as follows:
 
+    metadata.json  # Pixeltable metadata for the packaged table
     warehouse/catalog.db  # sqlite Iceberg catalog
     warehouse/pxt.db/**  # Iceberg metadata and data files (parquet/avro/json)
     media/**  # Local media files
@@ -43,16 +47,40 @@ class TablePackager:
     'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
     """
 
-    table: pxt.Table  # The table to be packaged
+    table: catalog.Table  # The table to be packaged
     tmp_dir: Path  # Temporary directory where the package will reside
    iceberg_catalog: pyiceberg.catalog.Catalog
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+    md: dict[str, Any]
 
-    def __init__(self, table: pxt.Table) -> None:
+    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
+        # Generate metadata
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {
+                'tables': [
+                    {
+                        'table_id': str(t._tbl_version.id),
+                        # These are temporary; will replace with a better solution once the concurrency changes to catalog have
+                        # been merged
+                        'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                        'table_version_md': dataclasses.asdict(
+                            t._tbl_version._create_version_md(datetime.now().timestamp())
+                        ),
+                        'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
+                    }
+                    for t in (table, *table._bases)
+                ]
+            },
+        }
+        if additional_md is not None:
+            self.md.update(additional_md)
+
     def package(self) -> Path:
         """
         Export the table to a tarball containing Iceberg tables and media files.
@@ -60,8 +88,10 @@ class TablePackager:
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
+        with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
+            json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
-        ancestors = [self.table] + self.table._bases
+        ancestors = (self.table, *self.table._bases)
         for t in ancestors:
             _logger.info(f"Exporting table '{t._path}'.")
             self.__export_table(t)
@@ -70,7 +100,7 @@ class TablePackager:
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path
 
-    def __export_table(self, t: pxt.Table) -> None:
+    def __export_table(self, t: catalog.Table) -> None:
         """
         Exports the data from `t` into an Iceberg table.
         """
@@ -116,7 +146,7 @@ class TablePackager:
         iceberg_tbl.append(pa_table)
 
     @classmethod
-    def __iceberg_namespace(cls, table: pxt.Table) -> str:
+    def __iceberg_namespace(cls, table: catalog.Table) -> str:
         """
         Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
         """
@@ -149,11 +179,7 @@ class TablePackager:
         return PXT_TO_PA_TYPES.get(col_type.__class__)
 
     def __to_pa_tables(
-        self,
-        df: pxt.DataFrame,
-        actual_col_types: list[pxt.ColumnType],
-        arrow_schema: pa.Schema,
-        batch_size: int = 1_000,
+        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
     ) -> Iterator[pa.Table]:
         """
         Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
@@ -165,7 +191,7 @@ class TablePackager:
         cols['_v_min'] = [row[-1] for row in rows]
         yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_rows(self, df: pxt.DataFrame, actual_col_types: list[pxt.ColumnType]) -> Iterator[list]:
+    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
         for row in df._exec():
             vals = [row[e.slot_idx] for e in df._select_list_exprs]
             result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
@@ -210,6 +236,8 @@ class TablePackager:
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add metadata json
+            tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
            # Add the Iceberg warehouse dir (including the catalog)
            tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
            # Add the media files
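
For orientation, a rough sketch of the metadata.json payload the packager now writes at the top of the bundle; the version values are illustrative (metadata.VERSION is assumed to be 30 in this release), and the per-table dicts are simply the dataclass dumps produced above:

# Illustrative shape of metadata.json / TablePackager.md
bundle_md = {
    'pxt_version': '0.3.5',                            # pxt.__version__
    'pxt_md_version': 30,                              # metadata.VERSION (assumed)
    'md': {
        'tables': [
            {
                'table_id': '<uuid of the table version>',      # str(t._tbl_version.id)
                'table_md': {},                                  # dataclasses.asdict(TableMd), elided
                'table_version_md': {},                          # dataclasses.asdict(TableVersionMd), elided
                'table_schema_version_md': {},                   # dataclasses.asdict(TableSchemaVersionMd), elided
            },
            # ... one entry per table in (table, *table._bases)
        ]
    },
    'table_uri': '<dest_tbl_uri>',   # example of additional_md merged in by publish_snapshot
}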
pixeltable/share/publish.py ADDED
@@ -0,0 +1,97 @@
+import dataclasses
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs, metadata
+from pixeltable.env import Env
+from pixeltable.utils import sha256sum
+
+from .packager import TablePackager
+
+# These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
+# pixeltable.com URLs are available.
+_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
+_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+
+def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+    request_json = packager.md
+    headers_json = {'X-api-key': Env.get().pxt_api_key}
+
+    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error publishing snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+        raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
+    upload_id = response_json['upload_id']
+    destination_uri = response_json['destination_uri']
+
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+
+    bundle = packager.package()
+
+    parsed_location = urllib.parse.urlparse(destination_uri)
+    if parsed_location.scheme == 's3':
+        _upload_bundle_to_s3(bundle, parsed_location)
+    else:
+        raise excs.Error(f'Unsupported destination: {destination_uri}')
+
+    Env.get().console_logger.info(f'Finalizing snapshot ...')
+
+    finalize_request_json = {
+        'upload_id': upload_id,
+        'datafile': bundle.name,
+        'size': bundle.stat().st_size,
+        'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+    }
+
+    # TODO: Use Pydantic for validation
+    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    if finalize_response.status_code != 200:
+        raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
+    finalize_response_json = finalize_response.json()
+    if not isinstance(finalize_response_json, dict) or 'confirmed_table_uri' not in finalize_response_json:
+        raise excs.Error(f'Error finalizing snapshot: unexpected response from server.\n{finalize_response_json}')
+
+    confirmed_tbl_uri = finalize_response_json['confirmed_table_uri']
+    Env.get().console_logger.info(f'The published snapshot is now available at: {confirmed_tbl_uri}')
+    return confirmed_tbl_uri
+
+
+def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    upload_args = {'ChecksumAlgorithm': 'SHA256'}
+
+    progress_bar = tqdm(
+        desc=f'Uploading',
+        total=bundle.stat().st_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,  # Update every iteration (should be fine for an upload)
+        ncols=100,
+        file=sys.stdout,
+    )
+    s3_client.upload_file(
+        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+    )
pixeltable/type_system.py CHANGED
@@ -8,10 +8,9 @@ import json
 import typing
 import urllib.parse
 import urllib.request
-from pathlib import Path
 from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union
 
-import av  # type: ignore
+import av
 import jsonschema
 import jsonschema.protocols
 import jsonschema.validators
@@ -22,6 +21,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.utils import parse_local_file_path
 
 from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
 
@@ -47,8 +47,8 @@ class ColumnType:
     @classmethod
     def supertype(
         cls,
-        type1: 'ColumnType.Type',
-        type2: 'ColumnType.Type',
+        type1: Optional['ColumnType.Type'],
+        type2: Optional['ColumnType.Type'],
         # we need to pass this in because we can't easily append it as a class member
         common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
     ) -> Optional['ColumnType.Type']:
@@ -93,6 +93,9 @@ class ColumnType:
         self._type = t
         self._nullable = nullable
 
+    def has_supertype(self) -> bool:
+        return True
+
     @property
     def nullable(self) -> bool:
         return self._nullable
@@ -271,8 +274,10 @@ class ColumnType:
                 inferred_type = val_type
             else:
                 inferred_type = inferred_type.supertype(val_type)
-            if inferred_type is None:
-                return None
+                if inferred_type is None:
+                    return None
+                if not inferred_type.has_supertype():
+                    return inferred_type
         return inferred_type
 
     @classmethod
@@ -397,12 +402,9 @@ class ColumnType:
     def _validate_file_path(self, val: Any) -> None:
         """Raises TypeError if not a valid local file path or not a path/byte sequence"""
         if isinstance(val, str):
-            parsed = urllib.parse.urlparse(val)
-            if parsed.scheme != '' and parsed.scheme != 'file':
-                return
-            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
-            if not path.is_file():
-                raise TypeError(f'File not found: {str(path)}')
+            path = parse_local_file_path(val)
+            if path is not None and not path.is_file():
+                raise TypeError(f'File not found: {path}')
         else:
             if not isinstance(val, bytes):
                 raise TypeError(f'expected file path or bytes, got {type(val)}')
@@ -495,7 +497,7 @@ class InvalidType(ColumnType):
         super().__init__(self.Type.INVALID, nullable=nullable)
 
     def to_sa_type(self) -> sql.types.TypeEngine:
-        assert False
+        return sql.types.NullType()
 
     def print_value(self, val: Any) -> str:
         return str(val)
@@ -508,6 +510,9 @@ class StringType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.STRING, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
@@ -591,6 +596,9 @@ class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.TIMESTAMP(timezone=True)
 
@@ -601,6 +609,8 @@ class TimestampType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
+        if isinstance(val, datetime.datetime):
+            return val
         return val
 
 
@@ -651,6 +661,10 @@ class JsonType(ColumnType):
         return val_type.print_value(val)
 
     def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, tuple):
+            val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            val = val.model_dump()
         if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
         if self.__validator is not None:
@@ -818,14 +832,20 @@ class ArrayType(ColumnType):
         return hash((self._type, self.nullable, self.shape, self.dtype))
 
     def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ArrayType)
+            return basic_supertype
+
         if not isinstance(other, ArrayType):
             return None
+
         super_dtype = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
         super_shape: Optional[tuple[Optional[int], ...]]
-        if len(self.shape) != len(other.shape):
+        if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
             super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))
@@ -1009,8 +1029,14 @@ class ImageType(ColumnType):
         return hash((self._type, self.nullable, self.size, self.mode))
 
     def supertype(self, other: ColumnType) -> Optional[ImageType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ImageType)
+            return basic_supertype
+
         if not isinstance(other, ImageType):
             return None
+
         width = self.width if self.width == other.width else None
         height = self.height if self.height == other.height else None
         mode = self.mode if self.mode == other.mode else None
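
A standalone sketch (not the Pixeltable implementation itself) of the shape-merging rule used by ArrayType.supertype, including the None-shape case this change now handles: dimensions that agree are kept, differing dimensions become None (unknown), and a missing shape on either side yields an unconstrained shape.

from typing import Optional

def merge_shapes(
    s1: Optional[tuple[Optional[int], ...]], s2: Optional[tuple[Optional[int], ...]]
) -> Optional[tuple[Optional[int], ...]]:
    # Mirrors the logic in the diff above: shapes of different (or unknown) rank have no common shape.
    if s1 is None or s2 is None or len(s1) != len(s2):
        return None
    return tuple(n1 if n1 == n2 else None for n1, n2 in zip(s1, s2))

assert merge_shapes((3, 224, 224), (3, 512, 512)) == (3, None, None)
assert merge_shapes((3, 224, 224), None) is None   # the case the len() comparison used to crash on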