pixeltable 0.3.13__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (51)
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -7
  4. pixeltable/catalog/column.py +6 -2
  5. pixeltable/catalog/dir.py +2 -1
  6. pixeltable/catalog/insertable_table.py +1 -1
  7. pixeltable/catalog/schema_object.py +2 -1
  8. pixeltable/catalog/table.py +12 -8
  9. pixeltable/catalog/table_version.py +19 -0
  10. pixeltable/catalog/table_version_path.py +7 -0
  11. pixeltable/catalog/view.py +3 -3
  12. pixeltable/dataframe.py +48 -5
  13. pixeltable/env.py +1 -1
  14. pixeltable/exec/aggregation_node.py +14 -0
  15. pixeltable/exec/cache_prefetch_node.py +1 -1
  16. pixeltable/exec/expr_eval/expr_eval_node.py +1 -1
  17. pixeltable/exprs/column_ref.py +42 -17
  18. pixeltable/exprs/data_row.py +3 -0
  19. pixeltable/exprs/globals.py +1 -1
  20. pixeltable/exprs/literal.py +11 -1
  21. pixeltable/exprs/rowid_ref.py +4 -1
  22. pixeltable/exprs/similarity_expr.py +1 -1
  23. pixeltable/func/function.py +1 -1
  24. pixeltable/functions/__init__.py +1 -0
  25. pixeltable/functions/date.py +185 -0
  26. pixeltable/functions/gemini.py +22 -20
  27. pixeltable/functions/globals.py +1 -16
  28. pixeltable/functions/json.py +2 -1
  29. pixeltable/functions/math.py +40 -0
  30. pixeltable/functions/string.py +1 -2
  31. pixeltable/functions/video.py +2 -2
  32. pixeltable/globals.py +26 -9
  33. pixeltable/io/hf_datasets.py +2 -2
  34. pixeltable/io/pandas.py +16 -4
  35. pixeltable/io/parquet.py +2 -0
  36. pixeltable/metadata/__init__.py +1 -1
  37. pixeltable/metadata/converters/convert_34.py +21 -0
  38. pixeltable/metadata/notes.py +1 -0
  39. pixeltable/plan.py +12 -5
  40. pixeltable/share/__init__.py +1 -1
  41. pixeltable/share/packager.py +219 -119
  42. pixeltable/share/publish.py +61 -16
  43. pixeltable/store.py +45 -20
  44. pixeltable/type_system.py +46 -2
  45. pixeltable/utils/arrow.py +8 -2
  46. pixeltable/utils/pytorch.py +4 -0
  47. {pixeltable-0.3.13.dist-info → pixeltable-0.3.14.dist-info}/METADATA +2 -4
  48. {pixeltable-0.3.13.dist-info → pixeltable-0.3.14.dist-info}/RECORD +51 -49
  49. {pixeltable-0.3.13.dist-info → pixeltable-0.3.14.dist-info}/LICENSE +0 -0
  50. {pixeltable-0.3.13.dist-info → pixeltable-0.3.14.dist-info}/WHEEL +0 -0
  51. {pixeltable-0.3.13.dist-info → pixeltable-0.3.14.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py CHANGED
@@ -635,8 +635,8 @@ class Planner:
                 raise excs.Error(f'Join predicate {join_clause.join_predicate} not expressible in SQL')
 
     @classmethod
-    def _verify_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> None:
-        """Verify that the various ordering requirements don't conflict"""
+    def _create_combined_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> Optional[OrderByClause]:
+        """Verify that the various ordering requirements don't conflict and return a combined ordering"""
         ob_clauses: list[OrderByClause] = [analyzer.order_by_clause.copy()]
 
         if verify_agg:
@@ -652,8 +652,11 @@ class Planner:
                     OrderByItem(e, True) for e in fn_call.get_agg_order_by()
                 ]
                 ob_clauses.append(ordering)
-        if len(ob_clauses) <= 1:
-            return
+
+        if len(ob_clauses) == 0:
+            return None
+        elif len(ob_clauses) == 1:
+            return ob_clauses[0]
 
         combined_ordering = ob_clauses[0]
         for ordering in ob_clauses[1:]:
@@ -664,6 +667,7 @@ class Planner:
                     f'{print_order_by_clause(combined_ordering)} vs {print_order_by_clause(ordering)}'
                 )
             combined_ordering = combined
+        return combined_ordering
 
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
@@ -761,7 +765,7 @@ class Planner:
             analyzer.window_fn_calls
         )
         ctx = exec.ExecContext(row_builder)
-        cls._verify_ordering(analyzer, verify_agg=is_python_agg)
+        combined_ordering = cls._create_combined_ordering(analyzer, verify_agg=is_python_agg)
        cls._verify_join_clauses(analyzer)
 
        # materialized with SQL table scans (ie, single-table SELECT statements):
@@ -859,6 +863,9 @@ class Planner:
                row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause
            )
        else:
+            input_sql_node = plan.get_node(exec.SqlNode)
+            assert combined_ordering is not None
+            input_sql_node.set_order_by(combined_ordering)
            plan = exec.AggregationNode(
                tbl.tbl_version,
                row_builder,
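
The renamed `_create_combined_ordering` merges the query's ORDER BY with the orderings required by aggregate/window calls, and the caller now pushes the combined ordering down into the input SqlNode. A standalone sketch of the merging idea, assuming a simple prefix-compatibility rule; the `combine_orderings` helper and the tuple representation below are illustrative, not Pixeltable's API:

from typing import Optional

OrderBy = list[tuple[str, bool]]  # (expression name, ascending)

def combine_orderings(a: OrderBy, b: OrderBy) -> Optional[OrderBy]:
    # Two orderings are compatible if one is a prefix of the other; the combined ordering is the longer one.
    shorter, longer = (a, b) if len(a) <= len(b) else (b, a)
    if longer[: len(shorter)] != shorter:
        return None  # conflicting ordering requirements
    return longer

agg_ordering = [('user_id', True)]
query_ordering = [('user_id', True), ('ts', False)]
print(combine_orderings(agg_ordering, query_ordering))   # [('user_id', True), ('ts', False)]
print(combine_orderings([('ts', True)], query_ordering))  # None: the planner would report a conflict
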
pixeltable/share/__init__.py CHANGED
@@ -1,3 +1,3 @@
 # ruff: noqa: F401
 
-from .publish import publish_snapshot
+from .publish import pull_replica, push_replica
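
With this change the module's public entry points are `push_replica` and `pull_replica` (signatures as defined in publish.py below). A hypothetical usage sketch; the destination URI format and the table names are made up for illustration:

import pixeltable as pxt
from pixeltable.share import push_replica, pull_replica

snap = pxt.get_table('my_dir.my_snapshot')            # must be a snapshot; push_replica enforces this
uri = push_replica('pxt://my-org/my-snapshot', snap)  # package the snapshot and upload it
tbl = pull_replica('my_dir.my_replica', uri)          # later, or elsewhere: materialize a local replica
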
pixeltable/share/packager.py CHANGED
@@ -1,4 +1,4 @@
-import io
+import datetime
 import json
 import logging
 import tarfile
@@ -9,45 +9,38 @@ from pathlib import Path
 from typing import Any, Iterator, Optional
 
 import more_itertools
-import numpy as np
 import pyarrow as pa
-import pyiceberg.catalog
+import pyarrow.parquet as pq
+import sqlalchemy as sql
 
 import pixeltable as pxt
-import pixeltable.type_system as ts
-from pixeltable import catalog, exprs, metadata
-from pixeltable.dataframe import DataFrame
+from pixeltable import catalog, exceptions as excs, metadata
 from pixeltable.env import Env
-from pixeltable.utils.arrow import PXT_TO_PA_TYPES
-from pixeltable.utils.iceberg import sqlite_catalog
+from pixeltable.metadata import schema
+from pixeltable.utils.media_store import MediaStore
 
 _logger = logging.getLogger('pixeltable')
 
 
 class TablePackager:
     """
-    Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
+    Packages a pixeltable Table into a tarball containing Parquet tables and media files. The structure of the tarball
     is as follows:
 
-    metadata.json  # Pixeltable metadata for the packaged table
-    warehouse/catalog.db  # sqlite Iceberg catalog
-    warehouse/pxt.db/**  # Iceberg metadata and data files (parquet/avro/json)
+    metadata.json  # Pixeltable metadata for the packaged table and its ancestors
+    tables/**  # Parquet tables for the packaged table and its ancestors, each table in a directory 'tbl_{tbl_id.hex}'
     media/**  # Local media files
 
-    If the table being archived is a view, then the Iceberg catalog will contain separate tables for the view and each
-    of its ancestors. All rows will be exported with additional _rowid and _v_min columns. Currently, only the most
-    recent version of the table can be exported, and only the full table contents.
-
     If the table contains media columns, they are handled as follows:
     - If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
-      stored in the Iceberg table.
+      stored in the Parquet table.
     - If a media file is a local file, then it will be copied into the tarball as a file of the form
-      'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
+      'media/{uuid}{extension}', and the Parquet table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
     """
 
     table: catalog.Table  # The table to be packaged
     tmp_dir: Path  # Temporary directory where the package will reside
-    iceberg_catalog: pyiceberg.catalog.Catalog
+    tables_dir: Path  # Directory where the Parquet tables will be written
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
     md: dict[str, Any]
 
@@ -69,138 +62,113 @@ class TablePackager:
 
     def package(self) -> Path:
         """
-        Export the table to a tarball containing Iceberg tables and media files.
+        Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
-        self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
+        self.tables_dir = self.tmp_dir / 'tables'
+        self.tables_dir.mkdir()
         with Env.get().begin_xact():
-            ancestors = (self.table, *self.table._base_tables)
-            for t in ancestors:
-                _logger.info(f"Exporting table '{t._path}'.")
-                self.__export_table(t)
+            for tv in self.table._tbl_version_path.get_tbl_versions():
+                _logger.info(f"Exporting table '{tv.get().name}:{tv.get().version}'.")
+                self.__export_table(tv.get())
         _logger.info('Building archive.')
         bundle_path = self.__build_tarball()
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path
 
-    def __export_table(self, t: catalog.Table) -> None:
+    def __export_table(self, tv: catalog.TableVersion) -> None:
         """
-        Exports the data from `t` into an Iceberg table.
+        Exports the data from `t` into a Parquet table.
         """
-        # First generate a select list for the data we want to extract from `t`. This includes:
-        # - all stored columns, including computed columns;
-        # - errortype and errormsg fields whenever they're defined.
-        # We select only those columns that are defined in this table (columns inherited from ancestor tables will be
-        # handled separately).
-        # For media columns, we substitute `col.fileurl` so that we always get the URL (which may be a file:// URL;
-        # these will be specially handled later)
-        select_exprs: dict[str, exprs.Expr] = {}
-
-        # As we generate the select list, we construct a separate list of column types. We can't rely on df._schema
-        # to get the column types, since we'll be substituting `fileurl`s for media columns.
-        actual_col_types: list[ts.ColumnType] = []
-
-        for col_name, col in t._tbl_version.get().cols_by_name.items():
-            if not col.is_stored:
-                continue
-            if col.col_type.is_media_type():
-                select_exprs[col_name] = t[col_name].fileurl
-            else:
-                select_exprs[col_name] = t[col_name]
-            actual_col_types.append(col.col_type)
-            if col.records_errors:
-                select_exprs[f'{col_name}_errortype'] = t[col_name].errortype
-                actual_col_types.append(ts.StringType())
-                select_exprs[f'{col_name}_errormsg'] = t[col_name].errormsg
-                actual_col_types.append(ts.StringType())
-
-        # Run the select() on `self.table`, not `t`, so that we export only those rows that are actually present in
-        # `self.table`.
-        df = self.table.select(**select_exprs)
-        namespace = self.__iceberg_namespace(t)
-        self.iceberg_catalog.create_namespace_if_not_exists(namespace)
-        iceberg_schema = self.__to_iceberg_schema(df._schema)
-        iceberg_tbl = self.iceberg_catalog.create_table(f'{namespace}.{t._name}', schema=iceberg_schema)
-
-        # Populate the Iceberg table with data.
-        # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
-        # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Iceberg table on disk.
-        for pa_table in self.__to_pa_tables(df, actual_col_types, iceberg_schema):
-            iceberg_tbl.append(pa_table)
+        # `tv` must be an ancestor of the primary table
+        assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
+        sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
+        media_cols: set[str] = set()
+        for col in tv.cols_by_name.values():
+            if col.is_stored and col.col_type.is_media_type():
+                media_cols.add(col.store_name())
 
-    @classmethod
-    def __iceberg_namespace(cls, table: catalog.Table) -> str:
-        """
-        Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
-        """
-        parent_path = table._parent()._path()
-        if len(parent_path) == 0:
-            return 'pxt'
-        else:
-            return f'pxt.{parent_path}'
+        parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
+        # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
+        # future support for this.)
+        parquet_dir = self.tables_dir / f'tbl_{tv.id.hex}'
+        parquet_dir.mkdir()
+        parquet_file = parquet_dir / f'tbl_{tv.id.hex}.00000.parquet'
+        _logger.info(f'Creating parquet table: {parquet_file}')
 
-    # The following methods are responsible for schema and data conversion from Pixeltable to Iceberg. Some of this
-    # logic might be consolidated into arrow.py and unified with general Parquet export, but there are several
-    # major differences:
-    # - Iceberg has no array type; we export all arrays as binary blobs
-    # - We include _rowid and _v_min columns in the Iceberg table
-    # - Media columns are handled specially as indicated above
+        # Populate the Parquet table with data.
+        # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
+        # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
+        # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
+        # faster compression should provide good performance while still reducing temporary storage utilization.
+        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
+        filter_tv = self.table._tbl_version.get()
+        row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
+        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+            parquet_writer.write_table(pa_table)
+        parquet_writer.close()
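
For reference, a self-contained sketch of the batched Parquet-writing pattern used above (a single snappy-compressed file, written in fixed-size row batches); the schema, rows, and file name are invented and this is not Pixeltable's internal code:

import more_itertools
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([('id', pa.int64()), ('payload', pa.string())])
rows = ({'id': i, 'payload': f'row {i}'} for i in range(10_000))  # any iterator of row dicts

writer = pq.ParquetWriter('tbl_example.00000.parquet', schema, compression='SNAPPY')
for batch in more_itertools.batched(rows, 1_000):
    # pivot the batch of row dicts into columns and append it to the single Parquet file
    cols = {name: [row[name] for row in batch] for name in schema.names}
    writer.write_table(pa.Table.from_pydict(cols, schema=schema))
writer.close()
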
+
+    # The following methods are responsible for schema and data conversion from Pixeltable to Parquet.
 
     @classmethod
-    def __to_iceberg_schema(cls, pxt_schema: dict[str, ts.ColumnType]) -> pa.Schema:
-        entries = [(name, cls.__to_iceberg_type(col_type)) for name, col_type in pxt_schema.items()]
-        entries.append(('_rowid', pa.list_(pa.int64())))
-        entries.append(('_v_min', pa.int64()))
+    def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
+        entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
         return pa.schema(entries)  # type: ignore[arg-type]
 
     @classmethod
-    def __to_iceberg_type(cls, col_type: ts.ColumnType) -> pa.DataType:
-        if col_type.is_array_type():
-            return pa.binary()
-        if col_type.is_media_type():
+    def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
+        if isinstance(col_type, sql.String):
             return pa.string()
-        return PXT_TO_PA_TYPES.get(col_type.__class__)
+        if isinstance(col_type, sql.Boolean):
+            return pa.bool_()
+        if isinstance(col_type, sql.BigInteger):
+            return pa.int64()
+        if isinstance(col_type, sql.Float):
+            return pa.float32()
+        if isinstance(col_type, sql.TIMESTAMP):
+            return pa.timestamp('us', tz=datetime.timezone.utc)
+        if isinstance(col_type, sql.Date):
+            return pa.date32()
+        if isinstance(col_type, sql.JSON):
+            return pa.string()  # JSON will be exported as strings
+        if isinstance(col_type, sql.LargeBinary):
+            return pa.binary()
+        raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')
 
     def __to_pa_tables(
-        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
+        self,
+        row_iter: Iterator[dict[str, Any]],
+        sql_types: dict[str, sql.types.TypeEngine[Any]],
+        media_cols: set[str],
+        arrow_schema: pa.Schema,
+        batch_size: int = 1_000,
     ) -> Iterator[pa.Table]:
         """
-        Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
-        to avoid excessive memory usage.
+        Group rows into a sequence of pyarrow tables, batched into smaller chunks to minimize memory utilization.
+        The row dictionaries have the format {store_col_name: value}, where the values reflect the unprocessed contents
+        of the store database (as returned by `StoreTable.dump_rows()`).
         """
-        for rows in more_itertools.batched(self.__to_pa_rows(df, actual_col_types), batch_size):
-            cols = {col_name: [row[idx] for row in rows] for idx, col_name in enumerate(df._schema.keys())}
-            cols['_rowid'] = [row[-2] for row in rows]
-            cols['_v_min'] = [row[-1] for row in rows]
+        for rows in more_itertools.batched(row_iter, batch_size):
+            cols = {}
+            for name, sql_type in sql_types.items():
+                is_media_col = name in media_cols
+                values = [self.__to_pa_value(row.get(name), sql_type, is_media_col) for row in rows]
+                cols[name] = values
             yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
-        for row in df._exec():
-            vals = [row[e.slot_idx] for e in df._select_list_exprs]
-            result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
-            result.append(row.rowid)
-            result.append(row.v_min)
-            yield result
-
-    def __to_pa_value(self, val: Any, col_type: ts.ColumnType) -> Any:
+    def __to_pa_value(self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool) -> Any:
         if val is None:
             return None
-        if col_type.is_array_type():
-            # Export arrays as binary
-            assert isinstance(val, np.ndarray)
-            arr = io.BytesIO()
-            np.save(arr, val)
-            return arr.getvalue()
-        if col_type.is_json_type():
+        if isinstance(sql_type, sql.JSON):
             # Export JSON as strings
             return json.dumps(val)
-        if col_type.is_media_type():
+        if is_media_col:
             # Handle media files as described above
-            assert isinstance(val, str)  # Media columns are always referenced by `fileurl`
+            assert isinstance(val, str)
             return self.__process_media_url(val)
         return val
 
@@ -225,9 +193,141 @@ class TablePackager:
         with tarfile.open(bundle_path, 'w:bz2') as tf:
             # Add metadata json
             tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
-            # Add the Iceberg warehouse dir (including the catalog)
-            tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
+            # Add the dir containing Parquet tables
+            tf.add(self.tables_dir, arcname='tables')
             # Add the media files
             for src_file, dest_name in self.media_files.items():
                 tf.add(src_file, arcname=f'media/{dest_name}')
         return bundle_path
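
For illustration, a sketch of inspecting a bundle with the layout documented in the class docstring (metadata.json, tables/**, media/**); 'bundle.tar.bz2' is a placeholder path:

import json
import tarfile

with tarfile.open('bundle.tar.bz2', 'r:bz2') as tf:
    names = tf.getnames()
    print([n for n in names if n.startswith('tables/')][:5])  # one 'tbl_{tbl_id.hex}' dir per ancestor
    print([n for n in names if n.startswith('media/')][:5])   # relocated local media files
    member = tf.extractfile('metadata.json')                   # Pixeltable metadata for the table and its ancestors
    md = json.load(member)
    print(sorted(md.keys()))
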
+
+
+class TableRestorer:
+    """
+    Creates a replica table from a tarball containing Parquet tables and media files. See the `TablePackager` docs for
+    details on the tarball structure.
+
+    Args:
+        tbl_path: Pixeltable path (such as 'my_dir.my_table') where the materialized table will be made visible.
+        md: Optional metadata dictionary. If not provided, metadata will be read from the tarball's `metadata.json`.
+            The metadata contains table_md, table_version_md, and table_schema_version_md entries for each ancestor
+            of the table being restored, as written out by `TablePackager`.
+    """
+
+    tbl_path: str
+    md: Optional[dict[str, Any]]
+    tmp_dir: Path
+    media_files: dict[str, str]  # Mapping from pxtmedia:// URLs to local file:// URLs
+
+    def __init__(self, tbl_path: str, md: Optional[dict[str, Any]] = None) -> None:
+        self.tbl_path = tbl_path
+        self.md = md
+        self.tmp_dir = Path(Env.get().create_tmp_path())
+        self.media_files = {}
+
+    def restore(self, bundle_path: Path) -> pxt.Table:
+        # Extract tarball
+        print(f'Extracting table data into: {self.tmp_dir}')
+        with tarfile.open(bundle_path, 'r:bz2') as tf:
+            tf.extractall(path=self.tmp_dir)
+
+        if self.md is None:
+            # No metadata supplied; read it from the archive
+            with open(self.tmp_dir / 'metadata.json', 'r', encoding='utf8') as fp:
+                self.md = json.load(fp)
+
+        pxt_md_version = self.md['pxt_md_version']
+        assert isinstance(pxt_md_version, int)
+
+        if pxt_md_version != metadata.VERSION:
+            raise excs.Error(
+                f'Pixeltable metadata version mismatch: {pxt_md_version} != {metadata.VERSION}.\n'
+                'Please upgrade Pixeltable to use this dataset: pip install -U pixeltable'
+            )
+
+        tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
+
+        # Create the replica table
+        # TODO: This needs to be made concurrency-safe.
+        replica_tbl = catalog.Catalog.get().create_replica(catalog.Path(self.tbl_path), tbl_md)
+        assert replica_tbl._tbl_version.get().is_snapshot
+
+        # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
+        # replica_tbl itself if it's a pure snapshot.
+        if replica_tbl._id != replica_tbl._tbl_version.id:
+            ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
+        else:
+            ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
+
+        # Instantiate data from the Parquet tables.
+        with Env.get().begin_xact():
+            for md in ancestor_md[::-1]:  # Base table first
+                # Create a TableVersion instance (and a store table) for this ancestor.
+                tv = catalog.TableVersion.create_replica(md)
+                # Now import data from Parquet.
+                _logger.info(f'Importing table {tv.name!r}.')
+                self.__import_table(self.tmp_dir, tv, md)
+
+        return replica_tbl
+
+    def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
+        """
+        Import the Parquet table into the Pixeltable catalog.
+        """
+        tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
+        parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
+        parquet_table = pq.read_table(str(parquet_dir))
+
+        for batch in parquet_table.to_batches():
+            pydict = batch.to_pydict()
+            rows = self.__from_pa_pydict(tv, pydict)
+            tv.store_tbl.load_rows(rows)
+
+    def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
+        # Data conversions from pyarrow to Pixeltable
+        sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
+        for col_name in pydict:
+            assert col_name in tv.store_tbl.sa_tbl.columns
+            sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
+        media_col_ids: dict[str, int] = {}
+        for col in tv.cols_by_name.values():
+            if col.is_stored and col.col_type.is_media_type():
+                media_col_ids[col.store_name()] = col.id
+
+        row_count = len(next(iter(pydict.values())))
+        rows: list[dict[str, Any]] = []
+        for i in range(row_count):
+            row = {
+                col_name: self.__from_pa_value(tv, col_vals[i], sql_types[col_name], media_col_ids.get(col_name))
+                for col_name, col_vals in pydict.items()
+            }
+            rows.append(row)
+
+        return rows
+
+    def __from_pa_value(
+        self, tv: catalog.TableVersion, val: Any, sql_type: sql.types.TypeEngine[Any], media_col_id: Optional[int]
+    ) -> Any:
+        if val is None:
+            return None
+        if isinstance(sql_type, sql.JSON):
+            return json.loads(val)
+        if media_col_id is not None:
+            assert isinstance(val, str)
+            return self.__relocate_media_file(tv, media_col_id, val)
+        return val
+
+    def __relocate_media_file(self, tv: catalog.TableVersion, media_col_id: int, url: str) -> str:
+        # If this is a pxtmedia:// URL, relocate it
+        parsed_url = urllib.parse.urlparse(url)
+        assert parsed_url.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs
+        if parsed_url.scheme == 'pxtmedia':
+            if url not in self.media_files:
+                # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
+                # in self.media_files.
+                src_path = self.tmp_dir / 'media' / parsed_url.netloc
+                dest_path = MediaStore.prepare_media_path(tv.id, media_col_id, tv.version, ext=src_path.suffix)
+                src_path.rename(dest_path)
+                self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+            return self.media_files[url]
+        # For any type of URL other than a local file, just return the URL as-is.
+        return url
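
The pxtmedia:// relocation step above can be illustrated with a standard-library-only sketch; the paths, the destination directory, and the `relocate` helper are hypothetical (in Pixeltable, `MediaStore.prepare_media_path()` chooses the real destination):

import urllib.parse
import urllib.request
from pathlib import Path

def relocate(url: str, media_dir: Path, dest_dir: Path, cache: dict[str, str]) -> str:
    parsed = urllib.parse.urlparse(url)
    if parsed.scheme != 'pxtmedia':
        return url  # external URLs (http://, s3://, ...) pass through unchanged
    if url not in cache:
        src = media_dir / parsed.netloc   # 'media/{uuid}{extension}' inside the extracted bundle
        dest = dest_dir / parsed.netloc   # stand-in for the media store's destination path
        src.rename(dest)
        cache[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest)))
    return cache[url]
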
pixeltable/share/publish.py CHANGED
@@ -9,10 +9,9 @@ from tqdm import tqdm
 import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
-from pixeltable.metadata.schema import FullTableMd
 from pixeltable.utils import sha256sum
 
-from .packager import TablePackager
+from .packager import TablePackager, TableRestorer
 
 # These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
 # pixeltable.com URLs are available.
@@ -20,7 +19,10 @@ from .packager import TablePackager
 PIXELTABLE_API_URL = 'https://internal-api.pixeltable.com'
 
 
-def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    if not src_tbl._tbl_version.get().is_snapshot:
+        raise excs.Error('Only snapshots may be published.')
+
     packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
     request_json = packager.md | {'operation_type': 'publish_snapshot'}
     headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
@@ -65,18 +67,6 @@ def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     return confirmed_tbl_uri
 
 
-def clone_snapshot(dest_tbl_uri: str) -> list[FullTableMd]:
-    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
-    clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': dest_tbl_uri}
-    response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=headers_json)
-    if response.status_code != 200:
-        raise excs.Error(f'Error cloning snapshot: {response.text}')
-    response_json = response.json()
-    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
-        raise excs.Error(f'Unexpected response from server.\n{response_json}')
-    return [FullTableMd.from_dict(t) for t in response_json['md']['tables']]
-
-
 def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
     from pixeltable.utils.s3 import get_client
 
@@ -102,5 +92,60 @@ def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult
         file=sys.stdout,
     )
     s3_client.upload_file(
-        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+        Filename=str(bundle), Bucket=bucket, Key=remote_path, ExtraArgs=upload_args, Callback=progress_bar.update
+    )
+
+
+def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': src_tbl_uri}
+    response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error cloning snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
+        raise excs.Error(f'Error cloning snapshot: unexpected response from server.\n{response_json}')
+
+    primary_tbl_additional_md = response_json['md']['tables'][0]['table_md']['additional_md']
+    bundle_uri = primary_tbl_additional_md['destination_uri']
+    bundle_filename = primary_tbl_additional_md['datafile']
+    parsed_location = urllib.parse.urlparse(bundle_uri)
+    if parsed_location.scheme == 's3':
+        bundle_path = _download_bundle_from_s3(parsed_location, bundle_filename)
+    else:
+        raise excs.Error(f'Unexpected response from server: unsupported bundle uri: {bundle_uri}')
+
+    restorer = TableRestorer(dest_path, response_json)
+    tbl = restorer.restore(bundle_path)
+    Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+    return tbl
+
+
+def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_filename: str) -> Path:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle_filename)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Downloading snapshot from: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
+    bundle_size = obj['ContentLength']
+
+    bundle_path = Path(Env.get().create_tmp_path())
+    progress_bar = tqdm(
+        desc='Downloading',
+        total=bundle_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,
+        ncols=100,
+        file=sys.stdout,
     )
+    s3_client.download_file(Bucket=bucket, Key=remote_path, Filename=str(bundle_path), Callback=progress_bar.update)
+    return bundle_path
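
A small sketch of how an s3:// bundle URI resolves to a bucket and key, mirroring the parsing in `_download_bundle_from_s3`; the URI is a made-up example and `PurePosixPath` is used here instead of `Path` for platform independence:

import urllib.parse
from pathlib import PurePosixPath

uri = 's3://example-bucket/bundles/org/table'
parsed = urllib.parse.urlparse(uri)
bucket = parsed.netloc                                                               # 'example-bucket'
key = str(PurePosixPath(urllib.parse.unquote(parsed.path)) / 'bundle.tar.bz2')[1:]   # strip the leading '/'
print(bucket, key)                                                                   # example-bucket bundles/org/table/bundle.tar.bz2
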