pixeltable 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable has been flagged as potentially problematic by the registry.

Files changed (50)
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -7
  4. pixeltable/catalog/column.py +6 -2
  5. pixeltable/catalog/dir.py +2 -1
  6. pixeltable/catalog/insertable_table.py +1 -1
  7. pixeltable/catalog/schema_object.py +2 -1
  8. pixeltable/catalog/table.py +12 -8
  9. pixeltable/catalog/table_version.py +21 -0
  10. pixeltable/catalog/view.py +3 -3
  11. pixeltable/dataframe.py +48 -5
  12. pixeltable/env.py +1 -1
  13. pixeltable/exec/aggregation_node.py +14 -0
  14. pixeltable/exec/cache_prefetch_node.py +1 -1
  15. pixeltable/exec/expr_eval/expr_eval_node.py +1 -1
  16. pixeltable/exprs/column_ref.py +42 -17
  17. pixeltable/exprs/data_row.py +3 -0
  18. pixeltable/exprs/globals.py +1 -1
  19. pixeltable/exprs/literal.py +11 -1
  20. pixeltable/exprs/rowid_ref.py +4 -1
  21. pixeltable/exprs/similarity_expr.py +1 -1
  22. pixeltable/func/function.py +1 -1
  23. pixeltable/functions/__init__.py +1 -0
  24. pixeltable/functions/date.py +185 -0
  25. pixeltable/functions/gemini.py +184 -49
  26. pixeltable/functions/globals.py +1 -16
  27. pixeltable/functions/json.py +2 -1
  28. pixeltable/functions/math.py +103 -0
  29. pixeltable/functions/string.py +1 -2
  30. pixeltable/functions/video.py +2 -2
  31. pixeltable/globals.py +26 -9
  32. pixeltable/io/hf_datasets.py +2 -2
  33. pixeltable/io/pandas.py +16 -4
  34. pixeltable/io/parquet.py +4 -2
  35. pixeltable/metadata/__init__.py +1 -1
  36. pixeltable/metadata/converters/convert_34.py +21 -0
  37. pixeltable/metadata/notes.py +1 -0
  38. pixeltable/plan.py +12 -5
  39. pixeltable/share/__init__.py +1 -1
  40. pixeltable/share/packager.py +397 -120
  41. pixeltable/share/publish.py +61 -16
  42. pixeltable/store.py +57 -20
  43. pixeltable/type_system.py +46 -2
  44. pixeltable/utils/arrow.py +8 -2
  45. pixeltable/utils/pytorch.py +4 -0
  46. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/METADATA +2 -4
  47. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/RECORD +50 -48
  48. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/LICENSE +0 -0
  49. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/WHEEL +0 -0
  50. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/entry_points.txt +0 -0
@@ -1,4 +1,4 @@
- import io
+ import datetime
  import json
  import logging
  import tarfile
@@ -9,45 +9,39 @@ from pathlib import Path
  from typing import Any, Iterator, Optional

  import more_itertools
- import numpy as np
  import pyarrow as pa
- import pyiceberg.catalog
+ import pyarrow.parquet as pq
+ import sqlalchemy as sql

  import pixeltable as pxt
- import pixeltable.type_system as ts
- from pixeltable import catalog, exprs, metadata
- from pixeltable.dataframe import DataFrame
+ from pixeltable import catalog, exceptions as excs, metadata
  from pixeltable.env import Env
- from pixeltable.utils.arrow import PXT_TO_PA_TYPES
- from pixeltable.utils.iceberg import sqlite_catalog
+ from pixeltable.metadata import schema
+ from pixeltable.utils import sha256sum
+ from pixeltable.utils.media_store import MediaStore

  _logger = logging.getLogger('pixeltable')


  class TablePackager:
  """
- Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
+ Packages a pixeltable Table into a tarball containing Parquet tables and media files. The structure of the tarball
  is as follows:

- metadata.json # Pixeltable metadata for the packaged table
- warehouse/catalog.db # sqlite Iceberg catalog
- warehouse/pxt.db/** # Iceberg metadata and data files (parquet/avro/json)
+ metadata.json # Pixeltable metadata for the packaged table and its ancestors
+ tables/** # Parquet tables for the packaged table and its ancestors, each table in a directory 'tbl_{tbl_id.hex}'
  media/** # Local media files

- If the table being archived is a view, then the Iceberg catalog will contain separate tables for the view and each
- of its ancestors. All rows will be exported with additional _rowid and _v_min columns. Currently, only the most
- recent version of the table can be exported, and only the full table contents.
-
  If the table contains media columns, they are handled as follows:
  - If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
- stored in the Iceberg table.
+ stored in the Parquet table.
  - If a media file is a local file, then it will be copied into the tarball as a file of the form
- 'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
+ 'media/{uuid}{extension}', and the Parquet table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
  """

  table: catalog.Table # The table to be packaged
  tmp_dir: Path # Temporary directory where the package will reside
- iceberg_catalog: pyiceberg.catalog.Catalog
+ tables_dir: Path # Directory where the Parquet tables will be written
  media_files: dict[Path, str] # Mapping from local media file paths to their tarball names
  md: dict[str, Any]

@@ -69,138 +63,113 @@ class TablePackager:

  def package(self) -> Path:
  """
- Export the table to a tarball containing Iceberg tables and media files.
+ Export the table to a tarball containing Parquet tables and media files.
  """
  assert not self.tmp_dir.exists() # Packaging can only be done once per TablePackager instance
  _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
  self.tmp_dir.mkdir()
  with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
  json.dump(self.md, fp)
- self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
+ self.tables_dir = self.tmp_dir / 'tables'
+ self.tables_dir.mkdir()
  with Env.get().begin_xact():
- ancestors = (self.table, *self.table._base_tables)
- for t in ancestors:
- _logger.info(f"Exporting table '{t._path}'.")
- self.__export_table(t)
+ for tv in self.table._tbl_version_path.get_tbl_versions():
+ _logger.info(f"Exporting table '{tv.get().name}:{tv.get().version}'.")
+ self.__export_table(tv.get())
  _logger.info('Building archive.')
  bundle_path = self.__build_tarball()
  _logger.info(f'Packaging complete: {bundle_path}')
  return bundle_path

- def __export_table(self, t: catalog.Table) -> None:
+ def __export_table(self, tv: catalog.TableVersion) -> None:
  """
- Exports the data from `t` into an Iceberg table.
+ Exports the data from `t` into a Parquet table.
  """
- # First generate a select list for the data we want to extract from `t`. This includes:
- # - all stored columns, including computed columns;
- # - errortype and errormsg fields whenever they're defined.
- # We select only those columns that are defined in this table (columns inherited from ancestor tables will be
- # handled separately).
- # For media columns, we substitute `col.fileurl` so that we always get the URL (which may be a file:// URL;
- # these will be specially handled later)
- select_exprs: dict[str, exprs.Expr] = {}
-
- # As we generate the select list, we construct a separate list of column types. We can't rely on df._schema
- # to get the column types, since we'll be substituting `fileurl`s for media columns.
- actual_col_types: list[ts.ColumnType] = []
-
- for col_name, col in t._tbl_version.get().cols_by_name.items():
- if not col.is_stored:
- continue
- if col.col_type.is_media_type():
- select_exprs[col_name] = t[col_name].fileurl
- else:
- select_exprs[col_name] = t[col_name]
- actual_col_types.append(col.col_type)
- if col.records_errors:
- select_exprs[f'{col_name}_errortype'] = t[col_name].errortype
- actual_col_types.append(ts.StringType())
- select_exprs[f'{col_name}_errormsg'] = t[col_name].errormsg
- actual_col_types.append(ts.StringType())
-
- # Run the select() on `self.table`, not `t`, so that we export only those rows that are actually present in
- # `self.table`.
- df = self.table.select(**select_exprs)
- namespace = self.__iceberg_namespace(t)
- self.iceberg_catalog.create_namespace_if_not_exists(namespace)
- iceberg_schema = self.__to_iceberg_schema(df._schema)
- iceberg_tbl = self.iceberg_catalog.create_table(f'{namespace}.{t._name}', schema=iceberg_schema)
-
- # Populate the Iceberg table with data.
- # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
- # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Iceberg table on disk.
- for pa_table in self.__to_pa_tables(df, actual_col_types, iceberg_schema):
- iceberg_tbl.append(pa_table)
+ # `tv` must be an ancestor of the primary table
+ assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
+ sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
+ media_cols: set[str] = set()
+ for col in tv.cols:
+ if col.is_stored and col.col_type.is_media_type():
+ media_cols.add(col.store_name())

- @classmethod
- def __iceberg_namespace(cls, table: catalog.Table) -> str:
- """
- Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
- """
- parent_path = table._parent()._path()
- if len(parent_path) == 0:
- return 'pxt'
- else:
- return f'pxt.{parent_path}'
+ parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
+ # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
+ # future support for this.)
+ parquet_dir = self.tables_dir / f'tbl_{tv.id.hex}'
+ parquet_dir.mkdir()
+ parquet_file = parquet_dir / f'tbl_{tv.id.hex}.00000.parquet'
+ _logger.info(f'Creating parquet table: {parquet_file}')
+
+ # Populate the Parquet table with data.
+ # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
+ # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
+ # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
+ # faster compression should provide good performance while still reducing temporary storage utilization.
+ parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
+ filter_tv = self.table._tbl_version.get()
+ row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
+ for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+ parquet_writer.write_table(pa_table)
+ parquet_writer.close()

- # The following methods are responsible for schema and data conversion from Pixeltable to Iceberg. Some of this
- # logic might be consolidated into arrow.py and unified with general Parquet export, but there are several
- # major differences:
- # - Iceberg has no array type; we export all arrays as binary blobs
- # - We include _rowid and _v_min columns in the Iceberg table
- # - Media columns are handled specially as indicated above
+ # The following methods are responsible for schema and data conversion from Pixeltable to Parquet.

  @classmethod
- def __to_iceberg_schema(cls, pxt_schema: dict[str, ts.ColumnType]) -> pa.Schema:
- entries = [(name, cls.__to_iceberg_type(col_type)) for name, col_type in pxt_schema.items()]
- entries.append(('_rowid', pa.list_(pa.int64())))
- entries.append(('_v_min', pa.int64()))
+ def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
+ entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
  return pa.schema(entries) # type: ignore[arg-type]

  @classmethod
- def __to_iceberg_type(cls, col_type: ts.ColumnType) -> pa.DataType:
- if col_type.is_array_type():
- return pa.binary()
- if col_type.is_media_type():
+ def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
+ if isinstance(col_type, sql.String):
  return pa.string()
- return PXT_TO_PA_TYPES.get(col_type.__class__)
+ if isinstance(col_type, sql.Boolean):
+ return pa.bool_()
+ if isinstance(col_type, sql.BigInteger):
+ return pa.int64()
+ if isinstance(col_type, sql.Float):
+ return pa.float32()
+ if isinstance(col_type, sql.TIMESTAMP):
+ return pa.timestamp('us', tz=datetime.timezone.utc)
+ if isinstance(col_type, sql.Date):
+ return pa.date32()
+ if isinstance(col_type, sql.JSON):
+ return pa.string() # JSON will be exported as strings
+ if isinstance(col_type, sql.LargeBinary):
+ return pa.binary()
+ raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')

  def __to_pa_tables(
- self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
+ self,
+ row_iter: Iterator[dict[str, Any]],
+ sql_types: dict[str, sql.types.TypeEngine[Any]],
+ media_cols: set[str],
+ arrow_schema: pa.Schema,
+ batch_size: int = 1_000,
  ) -> Iterator[pa.Table]:
  """
- Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
- to avoid excessive memory usage.
+ Group rows into a sequence of pyarrow tables, batched into smaller chunks to minimize memory utilization.
+ The row dictionaries have the format {store_col_name: value}, where the values reflect the unprocessed contents
+ of the store database (as returned by `StoreTable.dump_rows()`).
  """
- for rows in more_itertools.batched(self.__to_pa_rows(df, actual_col_types), batch_size):
- cols = {col_name: [row[idx] for row in rows] for idx, col_name in enumerate(df._schema.keys())}
- cols['_rowid'] = [row[-2] for row in rows]
- cols['_v_min'] = [row[-1] for row in rows]
+ for rows in more_itertools.batched(row_iter, batch_size):
+ cols = {}
+ for name, sql_type in sql_types.items():
+ is_media_col = name in media_cols
+ values = [self.__to_pa_value(row.get(name), sql_type, is_media_col) for row in rows]
+ cols[name] = values
  yield pa.Table.from_pydict(cols, schema=arrow_schema)

- def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
- for row in df._exec():
- vals = [row[e.slot_idx] for e in df._select_list_exprs]
- result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
- result.append(row.rowid)
- result.append(row.v_min)
- yield result
-
- def __to_pa_value(self, val: Any, col_type: ts.ColumnType) -> Any:
+ def __to_pa_value(self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool) -> Any:
  if val is None:
  return None
- if col_type.is_array_type():
- # Export arrays as binary
- assert isinstance(val, np.ndarray)
- arr = io.BytesIO()
- np.save(arr, val)
- return arr.getvalue()
- if col_type.is_json_type():
+ if isinstance(sql_type, sql.JSON):
  # Export JSON as strings
  return json.dumps(val)
- if col_type.is_media_type():
+ if is_media_col:
  # Handle media files as described above
- assert isinstance(val, str) # Media columns are always referenced by `fileurl`
+ assert isinstance(val, str)
  return self.__process_media_url(val)
  return val

@@ -214,7 +183,12 @@ class TablePackager:
  path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
  if path not in self.media_files:
  # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
- dest_name = f'{uuid.uuid4().hex}{path.suffix}'
+ # We name the media files in the archive by their SHA256 hash. This ensures that we can properly
+ # deduplicate and validate them later.
+ # If we get a collision, it's not a problem; it just means we have two identical files (which will
+ # be conveniently deduplicated in the bundle).
+ sha = sha256sum(path)
+ dest_name = f'{sha}{path.suffix}'
  self.media_files[path] = dest_name
  return f'pxtmedia://{self.media_files[path]}'
  # For any type of URL other than a local file, just return the URL as-is.
@@ -225,9 +199,312 @@ class TablePackager:
  with tarfile.open(bundle_path, 'w:bz2') as tf:
  # Add metadata json
  tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
- # Add the Iceberg warehouse dir (including the catalog)
- tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
+ # Add the dir containing Parquet tables
+ tf.add(self.tables_dir, arcname='tables')
  # Add the media files
  for src_file, dest_name in self.media_files.items():
  tf.add(src_file, arcname=f'media/{dest_name}')
  return bundle_path
+
+
+ class TableRestorer:
+ """
+ Creates a replica table from a tarball containing Parquet tables and media files. See the `TablePackager` docs for
+ details on the tarball structure.
+
+ Args:
+ tbl_path: Pixeltable path (such as 'my_dir.my_table') where the materialized table will be made visible.
+ md: Optional metadata dictionary. If not provided, metadata will be read from the tarball's `metadata.json`.
+ The metadata contains table_md, table_version_md, and table_schema_version_md entries for each ancestor
+ of the table being restored, as written out by `TablePackager`.
+ """
+
+ tbl_path: str
+ md: Optional[dict[str, Any]]
+ tmp_dir: Path
+ media_files: dict[str, str] # Mapping from pxtmedia:// URLs to local file:// URLs
+
+ def __init__(self, tbl_path: str, md: Optional[dict[str, Any]] = None) -> None:
+ self.tbl_path = tbl_path
+ self.md = md
+ self.tmp_dir = Path(Env.get().create_tmp_path())
+ self.media_files = {}
+
+ def restore(self, bundle_path: Path) -> pxt.Table:
+ # Extract tarball
+ print(f'Extracting table data into: {self.tmp_dir}')
+ with tarfile.open(bundle_path, 'r:bz2') as tf:
+ tf.extractall(path=self.tmp_dir)
+
+ if self.md is None:
+ # No metadata supplied; read it from the archive
+ with open(self.tmp_dir / 'metadata.json', 'r', encoding='utf8') as fp:
+ self.md = json.load(fp)
+
+ pxt_md_version = self.md['pxt_md_version']
+ assert isinstance(pxt_md_version, int)
+
+ if pxt_md_version != metadata.VERSION:
+ raise excs.Error(
+ f'Pixeltable metadata version mismatch: {pxt_md_version} != {metadata.VERSION}.\n'
+ 'Please upgrade Pixeltable to use this dataset: pip install -U pixeltable'
+ )
+
+ tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
+
+ # Create the replica table
+ # TODO: This needs to be made concurrency-safe.
+ replica_tbl = catalog.Catalog.get().create_replica(catalog.Path(self.tbl_path), tbl_md)
+ assert replica_tbl._tbl_version.get().is_snapshot
+
+ # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
+ # replica_tbl itself if it's a pure snapshot.
+ if replica_tbl._id != replica_tbl._tbl_version.id:
+ ancestor_md = tbl_md[1:] # Pure snapshot; skip replica_tbl
+ else:
+ ancestor_md = tbl_md # Not a pure snapshot; include replica_tbl
+
+ # Instantiate data from the Parquet tables.
+ with Env.get().begin_xact():
+ for md in ancestor_md[::-1]: # Base table first
+ # Create a TableVersion instance (and a store table) for this ancestor.
+ tv = catalog.TableVersion.create_replica(md)
+ # Now import data from Parquet.
+ _logger.info(f'Importing table {tv.name!r}.')
+ self.__import_table(self.tmp_dir, tv, md)
+
+ return replica_tbl
+
+ def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
+ """
+ Import the Parquet table into the Pixeltable catalog.
+ """
+ tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
+ parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
+ parquet_table = pq.read_table(str(parquet_dir))
+ replica_version = tv.version
+
+ conn = Env.get().conn
+ store_sa_tbl = tv.store_tbl.sa_tbl
+ store_sa_tbl_name = tv.store_tbl._storage_name()
+
+ # Sometimes we are importing a table that has never been seen before. Other times, however, we are importing
+ # an existing replica table, and the table version and/or row selection differs from what was imported
+ # previously. Care must be taken to ensure that the new data is merged with existing data in a way that
+ # yields an internally consistent version history for each row.
+
+ # The overall strategy is this:
+ # 1. Import the parquet data into a temporary table;
+ # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
+ # 3. Delete any row instances from the temporary table that are already present in the existing table;
+ # 4. Copy the remaining rows from the temporary table into the existing table.
+
+ # Create a temporary table for the initial data load, containing columns for all columns present in the
+ # parquet table. The parquet columns have identical names to those in the store table, so we can use the
+ # store table schema to get their SQL types (which are not necessarily derivable from their Parquet types,
+ # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
+ temp_cols: dict[str, sql.Column] = {}
+ for field in parquet_table.schema:
+ assert field.name in store_sa_tbl.columns
+ col_type = store_sa_tbl.columns[field.name].type
+ temp_cols[field.name] = sql.Column(field.name, col_type)
+ temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
+ _logger.debug(f'Creating temporary table: {temp_sa_tbl_name}')
+ temp_md = sql.MetaData()
+ temp_sa_tbl = sql.Table(temp_sa_tbl_name, temp_md, *temp_cols.values(), prefixes=['TEMPORARY'])
+ temp_sa_tbl.create(conn)
+
+ # Populate the temporary table with data from the Parquet file.
+ _logger.debug(f'Loading {parquet_table.num_rows} row(s) into temporary table: {temp_sa_tbl_name}')
+ for batch in parquet_table.to_batches(max_chunksize=10_000):
+ pydict = batch.to_pydict()
+ rows = self.__from_pa_pydict(tv, pydict)
+ conn.execute(sql.insert(temp_sa_tbl), rows)
+
+ # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
+ # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
+ # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
+ # row id, or MAX_VERSION if no such row instance exists. But in the replica, we need to be careful, since
+ # we might see only a subset of the original table's versions, and we might see them out of order.
+
+ # We'll adjust the v_max values according to the principle of "latest provable v_max":
+ # they will always correspond to the latest version for which we can prove the row instance was alive. This
+ # will enable us to maintain consistency of the v_max values if additional table versions are later imported,
+ # regardless of the order in which they are seen. It also means that replica tables (unlike original tables)
+ # may have gaps in their row version histories, but this is fine; the gaps simply correspond to table versions
+ # that have never been observed.
+
+ pk_predicates = [col == temp_cols[col.name] for col in tv.store_tbl.pk_columns()]
+ pk_clause = sql.and_(*pk_predicates)
+
+ # If the same pk exists in both the temporary table and the existing table, then the corresponding row data
+ # must be identical; the rows can differ only in their v_max value. As a sanity check, we go through the
+ # motion of verifying this; a failure implies data corruption in either the replica being imported or in a
+ # previously imported replica.
+
+ system_col_names = {col.name for col in tv.store_tbl.system_columns()}
+ media_col_names = {col.store_name() for col in tv.cols if col.col_type.is_media_type() and col.is_stored}
+ value_store_cols = [
+ store_sa_tbl.c[col_name]
+ for col_name in temp_cols
+ if col_name not in system_col_names and col_name not in media_col_names
+ ]
+ value_temp_cols = [
+ col
+ for col_name, col in temp_cols.items()
+ if col_name not in system_col_names and col_name not in media_col_names
+ ]
+ mismatch_predicates = [store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)]
+ mismatch_clause = sql.or_(*mismatch_predicates)
+
+ # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+ # one value column. Pseudo-SQL:
+ #
+ # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+ # FROM store_tbl, temp_tbl
+ # WHERE store_tbl.rowid = temp_tbl.rowid
+ # AND store_tbl.pos_0 = temp_tbl.pos_0
+ # AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+ # AND store_tbl.v_min = temp_tbl.v_min
+ # AND (
+ # store_tbl.col_0 != temp_tbl.col_0
+ # OR store_tbl.col_1 != temp_tbl.col_1
+ # OR ... OR store_tbl.col_n != temp_tbl.col_n
+ # )
+ #
+ # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+ # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+ # but not the other.
+ q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+ _logger.debug(q.compile())
+ result = conn.execute(q)
+ if result.rowcount > 0:
+ _logger.debug(
+ f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+ f'{result.rowcount} inconsistent row(s).'
+ )
+ row = result.first()
+ _logger.debug('Example mismatch:')
+ _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+ _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+ raise excs.Error(
+ 'Data corruption error: the replica data are inconsistent with data retrieved from a previous replica.'
+ )
+ _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
+
+ # Now rectify the v_max values in the temporary table.
+ # If a row instance has a concrete v_max value, then we know it's genuine: it's the unique and immutable
+ # version when the row was deleted. (This can only happen if later versions of the base table already
+ # existed at the time this replica was published.)
+ # But if a row instance has a v_max value of MAX_VERSION, then we don't know anything about its future.
+ # It might live indefinitely, or it might be deleted as early as version `n + 1`. Following the principle
+ # of "latest provable v_max", we simply set v_max equal to `n + 1`.
+ q = (
+ temp_sa_tbl.update()
+ .values(v_max=(replica_version + 1))
+ .where(temp_sa_tbl.c.v_max == schema.Table.MAX_VERSION)
+ )
+ _logger.debug(q.compile())
+ result = conn.execute(q)
+ _logger.debug(f'Rectified {result.rowcount} row(s) in {temp_sa_tbl_name!r}.')
+
+ # Now rectify the v_max values in the existing table. This is done by simply taking the later of the two v_max
+ # values (the existing one and the new one) for each row instance, following the "latest provable v_max"
+ # principle. Obviously we only need to do this for rows that exist in both tables (it's a simple join).
+ q = (
+ store_sa_tbl.update()
+ .values(v_max=sql.func.greatest(store_sa_tbl.c.v_max, temp_sa_tbl.c.v_max))
+ .where(pk_clause)
+ )
+ _logger.debug(q.compile())
+ result = conn.execute(q)
+ _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
+
+ # Now we need to update rows in the existing table that are also present in the temporary table. This is to
+ # account for the scenario where the temporary table has columns that are not present in the existing table.
+ # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
+ # might also occur; there may be columns in the existing table that are not present in the temporary table.)
+ value_update_clauses: dict[str, sql.ColumnElement] = {}
+ for temp_col in temp_cols.values():
+ if temp_col.name not in system_col_names:
+ store_col = store_sa_tbl.c[temp_col.name]
+ # Prefer the value from the existing table, substituting the value from the temporary table if it's
+ # NULL. This works in all cases (including media columns, where we prefer the existing media file).
+ clause = sql.case((store_col == None, temp_col), else_=store_col)
+ value_update_clauses[temp_col.name] = clause
+ if len(value_update_clauses) > 0:
+ q = store_sa_tbl.update().values(**value_update_clauses).where(pk_clause)
+ _logger.debug(q.compile())
+ result = conn.execute(q)
+ _logger.debug(
+ f'Merged values from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r} for {result.rowcount} row(s).'
+ )
+
+ # Now drop any rows from the temporary table that are also present in the existing table.
+ # The v_max values have been rectified, data has been merged into NULL cells, and all other row values have
+ # been verified identical.
+ # TODO: Delete any media files that were orphaned by this operation (they're necessarily duplicates of media
+ # files that are already present in the existing table).
+ q = temp_sa_tbl.delete().where(pk_clause)
+ _logger.debug(q.compile())
+ result = conn.execute(q)
+ _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
+
+ # Finally, copy the remaining data (consisting entirely of new row instances) from the temporary table into
+ # the actual table.
+ q = store_sa_tbl.insert().from_select(
+ [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
+ )
+ _logger.debug(q.compile())
+ result = conn.execute(q)
+ _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')
+
+ def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
+ # Data conversions from pyarrow to Pixeltable
+ sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
+ for col_name in pydict:
+ assert col_name in tv.store_tbl.sa_tbl.columns
+ sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
+ media_col_ids: dict[str, int] = {}
+ for col in tv.cols:
+ if col.is_stored and col.col_type.is_media_type():
+ media_col_ids[col.store_name()] = col.id
+
+ row_count = len(next(iter(pydict.values())))
+ rows: list[dict[str, Any]] = []
+ for i in range(row_count):
+ row = {
+ col_name: self.__from_pa_value(tv, col_vals[i], sql_types[col_name], media_col_ids.get(col_name))
+ for col_name, col_vals in pydict.items()
+ }
+ rows.append(row)
+
+ return rows
+
+ def __from_pa_value(
+ self, tv: catalog.TableVersion, val: Any, sql_type: sql.types.TypeEngine[Any], media_col_id: Optional[int]
+ ) -> Any:
+ if val is None:
+ return None
+ if isinstance(sql_type, sql.JSON):
+ return json.loads(val)
+ if media_col_id is not None:
+ assert isinstance(val, str)
+ return self.__relocate_media_file(tv, media_col_id, val)
+ return val
+
+ def __relocate_media_file(self, tv: catalog.TableVersion, media_col_id: int, url: str) -> str:
+ # If this is a pxtmedia:// URL, relocate it
+ parsed_url = urllib.parse.urlparse(url)
+ assert parsed_url.scheme != 'file' # These should all have been converted to pxtmedia:// URLs
+ if parsed_url.scheme == 'pxtmedia':
+ if url not in self.media_files:
+ # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
+ # in self.media_files.
+ src_path = self.tmp_dir / 'media' / parsed_url.netloc
+ dest_path = MediaStore.prepare_media_path(tv.id, media_col_id, tv.version, ext=src_path.suffix)
+ src_path.rename(dest_path)
+ self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+ return self.media_files[url]
+ # For any type of URL other than a local file, just return the URL as-is.
+ return url
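
Taken together, this diff replaces the Iceberg-based export path with a bundle of Parquet files plus a metadata.json, and adds a TableRestorer that turns such a bundle back into a replica table. As an orientation aid only (not part of the diff), here is a minimal sketch of how the two classes appear intended to fit together, based on the signatures visible above; the TablePackager constructor is not shown in these hunks, so the single-table argument is an assumption, and the table paths are placeholders.

# Illustrative sketch only; not part of the packaged source.
# Assumption: TablePackager is constructed from the catalog.Table to be packaged
# (its __init__ is not included in the hunks above).
import pixeltable as pxt
from pixeltable.share.packager import TablePackager, TableRestorer

tbl = pxt.get_table('my_dir.my_table')

# Package: writes metadata.json, tables/tbl_{tbl_id.hex}/*.parquet, and media/*
# into a bzip2-compressed tarball and returns its path.
bundle_path = TablePackager(tbl).package()

# Restore: materializes a replica table at the given Pixeltable path from the bundle.
replica = TableRestorer('my_dir.my_table_replica').restore(bundle_path)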