pixeltable 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic.

Files changed (51)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +3 -10
  4. pixeltable/catalog/catalog.py +139 -59
  5. pixeltable/catalog/column.py +32 -23
  6. pixeltable/catalog/globals.py +2 -45
  7. pixeltable/catalog/insertable_table.py +5 -2
  8. pixeltable/catalog/path.py +6 -0
  9. pixeltable/catalog/table.py +173 -23
  10. pixeltable/catalog/table_version.py +156 -92
  11. pixeltable/catalog/table_version_handle.py +26 -1
  12. pixeltable/catalog/update_status.py +179 -0
  13. pixeltable/catalog/view.py +12 -3
  14. pixeltable/config.py +76 -12
  15. pixeltable/dataframe.py +1 -1
  16. pixeltable/env.py +29 -0
  17. pixeltable/exec/exec_node.py +7 -24
  18. pixeltable/exec/expr_eval/schedulers.py +134 -7
  19. pixeltable/exprs/column_property_ref.py +23 -20
  20. pixeltable/exprs/column_ref.py +24 -18
  21. pixeltable/exprs/data_row.py +9 -0
  22. pixeltable/exprs/function_call.py +2 -2
  23. pixeltable/exprs/row_builder.py +46 -14
  24. pixeltable/exprs/rowid_ref.py +0 -4
  25. pixeltable/func/function.py +3 -3
  26. pixeltable/functions/audio.py +36 -9
  27. pixeltable/functions/video.py +57 -10
  28. pixeltable/globals.py +61 -1
  29. pixeltable/io/__init__.py +1 -1
  30. pixeltable/io/external_store.py +39 -64
  31. pixeltable/io/globals.py +4 -4
  32. pixeltable/io/hf_datasets.py +10 -2
  33. pixeltable/io/label_studio.py +52 -48
  34. pixeltable/metadata/__init__.py +1 -1
  35. pixeltable/metadata/converters/convert_38.py +39 -0
  36. pixeltable/metadata/converters/convert_39.py +125 -0
  37. pixeltable/metadata/converters/util.py +3 -0
  38. pixeltable/metadata/notes.py +2 -0
  39. pixeltable/metadata/schema.py +14 -2
  40. pixeltable/metadata/utils.py +78 -0
  41. pixeltable/plan.py +26 -18
  42. pixeltable/share/packager.py +20 -38
  43. pixeltable/store.py +121 -142
  44. pixeltable/type_system.py +2 -2
  45. pixeltable/utils/coroutine.py +6 -23
  46. pixeltable/utils/media_store.py +39 -0
  47. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
  48. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
  49. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
  50. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
  51. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/metadata/converters/convert_39.py ADDED
@@ -0,0 +1,125 @@
+ import logging
+ from typing import Optional
+ from uuid import UUID
+
+ import sqlalchemy as sql
+
+ from pixeltable.metadata import register_converter
+ from pixeltable.metadata.converters.util import convert_table_md
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ @register_converter(version=39)
+ def _(engine: sql.engine.Engine) -> None:
+     convert_table_md(engine, table_modifier=__table_modifier)
+
+
+ def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+     store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+     store_name = f'{store_prefix}_{tbl_id.hex}'
+
+     # Get the list of column names that need to be migrated
+     col_names = find_error_columns(conn=conn, store_name=store_name)
+     if len(col_names) == 0:
+         _logger.info(f'No error columns found in table {store_name}. Skipping migration.')
+         return
+
+     # Check if the table exists, outside of the metadata we were given.
+     # There seem to be cases where the metadata is present in the catalog,
+     # but the table itself is not in the database.
+     check_table_sql = sql.text(f"""
+         SELECT EXISTS (
+             SELECT 1
+             FROM information_schema.tables
+             WHERE table_name = '{store_name}'
+         )
+     """)
+     table_exists = conn.execute(check_table_sql).scalar()
+     if not table_exists:
+         _logger.warning(f'Table {store_name} does not exist. Skipping migration.')
+         return
+
+     return migrate_error_to_cellmd_columns(conn, store_name, col_names)
+
+
+ def find_error_columns(conn: sql.Connection, store_name: str) -> list[str]:
+     """
+     Return the name roots of any errormsg or errortype columns in the given table.
+
+     Args:
+         conn: SQLAlchemy connection
+         store_name: Name of the table to check
+
+     Returns:
+         List of column name roots (root_errormsg, root_errortype)
+     """
+     check_columns_sql = sql.text(f"""
+         SELECT column_name
+         FROM information_schema.columns
+         WHERE table_name = '{store_name}'
+     """)
+     found_columns = [
+         row[0]
+         for row in conn.execute(check_columns_sql)
+         if row[0].endswith('_errormsg') or row[0].endswith('_errortype')
+     ]
+     column_roots = {s.removesuffix('_errormsg').removesuffix('_errortype') for s in found_columns}
+     return [*column_roots]
+
+
+ def migrate_error_to_cellmd_columns(
+     conn: sql.Connection, store_name: str, col_names: list[str], backup_table: Optional[str] = None
+ ) -> None:
+     """
+     Safe version with error handling and optional backup.
+
+     Args:
+         conn: SQLAlchemy connection
+         store_name: Name of the table to modify
+         col_names: List of column name prefixes
+         backup_table: Optional name for backup table
+
+     Usage:
+         migrate_error_to_cellmd_columns(conn, 'my_table', ['columnname'], 'my_table_backup')
+     """
+
+     try:
+         # Optional: Create backup
+         if backup_table:
+             backup_sql = sql.text(f"""
+                 CREATE TABLE {backup_table} AS SELECT * FROM {store_name}
+             """)
+             conn.execute(backup_sql)
+             _logger.info(f'Backup created: {backup_table}')
+
+         # Step 1: Add new columns
+         add_column_str = ', '.join(f'ADD COLUMN {col}_cellmd JSONB DEFAULT NULL' for col in col_names)
+         add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+         conn.execute(add_column_sql)
+         _logger.info(f'Added columns: {", ".join(f"{col}_cellmd" for col in col_names)}')
+
+         # Step 2: Populate new columns
+         set_column_str = ', '.join(
+             [
+                 f'{col}_cellmd = CASE WHEN {col}_errormsg IS NULL OR {col}_errortype IS NULL '
+                 f"THEN NULL ELSE jsonb_build_object('errormsg', {col}_errormsg, 'errortype', {col}_errortype) END"
+                 for col in col_names
+             ]
+         )
+         populate_sql = sql.text(f'UPDATE {store_name} SET {set_column_str}')
+         result = conn.execute(populate_sql)
+         _logger.info(f'Updated {result.rowcount} rows')
+
+         # Step 3: Drop old columns
+         drop_columns_str = ', '.join(
+             [f'DROP COLUMN IF EXISTS {col}_errormsg, DROP COLUMN IF EXISTS {col}_errortype' for col in col_names]
+         )
+         drop_columns_sql = sql.text(f'ALTER TABLE {store_name} {drop_columns_str}')
+         conn.execute(drop_columns_sql)
+         _logger.info(f'Dropped columns: {", ".join(f"{col}_errormsg, {col}_errortype" for col in col_names)}')
+         _logger.info(f'Migration completed successfully for table: {store_name}')
+
+     except sql.exc.SQLAlchemyError as e:
+         _logger.error(f'Migration for table {store_name} failed: {e}')
+         raise
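For orientation, here is a rough sketch of driving the migration helpers above directly against a single store table. The engine URL, table name, and column root are made up for illustration; in the released package these functions are only invoked by the registered converter via `__table_modifier()`.

# Hypothetical manual invocation of the version-39 migration helpers shown above.
# 'tbl_0123abcd' and 'frame' are made-up names; the converter normally derives them
# from the catalog metadata.
import sqlalchemy as sql

from pixeltable.metadata.converters.convert_39 import find_error_columns, migrate_error_to_cellmd_columns

engine = sql.create_engine('postgresql+psycopg://localhost/pixeltable')  # assumed connection URL
with engine.begin() as conn:  # one transaction; rolled back if the migration raises
    store_name = 'tbl_0123abcd'
    col_names = find_error_columns(conn=conn, store_name=store_name)
    if col_names:
        # e.g. col_names == ['frame'] folds frame_errormsg/frame_errortype into one frame_cellmd JSONB column
        migrate_error_to_cellmd_columns(conn, store_name, col_names, backup_table=f'{store_name}_backup')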
pixeltable/metadata/converters/util.py CHANGED
@@ -16,6 +16,7 @@ def convert_table_md(
      column_md_updater: Optional[Callable[[dict], None]] = None,
      external_store_md_updater: Optional[Callable[[dict], None]] = None,
      substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None,
+     table_modifier: Optional[Callable[[sql.Connection, UUID, dict, dict], None]] = None,
  ) -> None:
      """
      Converts schema.TableMd dicts based on the specified conversion functions.
@@ -50,6 +51,8 @@ def convert_table_md(
              if updated_table_md != table_md:
                  __logger.info(f'Updating schema for table: {tbl_id}')
                  conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
+             if table_modifier is not None:
+                 table_modifier(conn, tbl_id, table_md, updated_table_md)
 
          for row in conn.execute(sql.select(Function)):
              fn_id = row[0]
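The new `table_modifier` hook hands a converter the live connection together with the original and updated metadata for each table; convert_39.py above is its first user. A converter skeleton using the hook would look roughly like this (the version number and body are placeholders):

# Skeleton of a metadata converter that uses the new table_modifier hook.
# version=99 is a placeholder; real converters live in pixeltable/metadata/converters/.
from uuid import UUID

import sqlalchemy as sql

from pixeltable.metadata import register_converter
from pixeltable.metadata.converters.util import convert_table_md


def _table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
    # Runs once per table, inside the same transaction, after the updated TableMd has been written.
    ...


@register_converter(version=99)  # placeholder version number
def _(engine: sql.engine.Engine) -> None:
    convert_table_md(engine, table_modifier=_table_modifier)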
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,8 @@
  # rather than as a comment, so that the existence of a description can be enforced by
  # the unit tests when new versions are added.
  VERSION_NOTES = {
+     40: 'Convert error property columns to cellmd columns',
+     39: 'ColumnHandles in external stores',
      38: 'Added TableMd.view_sn',
      37: 'Add support for the sample() method on DataFrames',
      36: 'Added Table.lock_dummy',
pixeltable/metadata/schema.py CHANGED
@@ -8,6 +8,8 @@ from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
  from sqlalchemy.dialects.postgresql import JSONB, UUID
  from sqlalchemy.orm.decl_api import DeclarativeMeta
 
+ from ..catalog.update_status import UpdateStatus
+
  # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
  # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
  # outside of the module in a typesafe way.
@@ -213,13 +215,15 @@ class Table(Base):
      lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
 
 
- @dataclasses.dataclass
+ @dataclasses.dataclass(frozen=True)
  class TableVersionMd:
      tbl_id: str  # uuid.UUID
      created_at: float  # time.time()
      version: int
      schema_version: int
-     additional_md: dict[str, Any]
+     user: Optional[str] = None  # User that created this version
+     update_status: Optional[UpdateStatus] = None  # UpdateStatus of the change that created this version
+     additional_md: dict[str, Any] = dataclasses.field(default_factory=dict)
 
 
  class TableVersion(Base):
@@ -308,6 +312,14 @@ class FullTableMd(NamedTuple):
      version_md: TableVersionMd
      schema_version_md: TableSchemaVersionMd
 
+     @property
+     def is_pure_snapshot(self) -> bool:
+         return (
+             self.tbl_md.view_md is not None
+             and self.tbl_md.view_md.predicate is None
+             and len(self.schema_version_md.columns) == 0
+         )
+
      def as_dict(self) -> dict[str, Any]:
          return {
              'table_id': self.tbl_md.tbl_id,
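Because the two new fields and `additional_md` now have defaults, call sites that only supply the four original fields keep working, and the frozen dataclass forces copy-on-write updates. A small illustration, assuming the field set shown in this hunk is complete:

# Illustration only: assumes TableVersionMd has exactly the fields shown in this diff.
import dataclasses
import time
import uuid

from pixeltable.metadata.schema import TableVersionMd

md = TableVersionMd(tbl_id=str(uuid.uuid4()), created_at=time.time(), version=1, schema_version=1)
assert md.user is None and md.update_status is None and md.additional_md == {}

# frozen=True means attribute assignment raises; derive modified copies instead
md2 = dataclasses.replace(md, user='alice')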
pixeltable/metadata/utils.py ADDED
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+
+ from typing import Optional
+
+ from pixeltable.metadata import schema
+
+
+ class MetadataUtils:
+     @classmethod
+     def _diff_md(
+         cls, old_md: Optional[dict[int, schema.SchemaColumn]], new_md: Optional[dict[int, schema.SchemaColumn]]
+     ) -> str:
+         """Return a string reporting the differences between two column metadata dictionaries.
+
+         Results are formatted as follows:
+         - If `old_md` is `None`, returns 'Initial Version'.
+         - If `old_md` and `new_md` are the same, returns an empty string.
+         - If there are additions, changes, or deletions, returns a string summarizing the changes.
+         """
+         assert new_md is not None
+         if old_md is None:
+             return 'Initial Version'
+         if old_md == new_md:
+             return ''
+         added = {k: v.name for k, v in new_md.items() if k not in old_md}
+         changed = {
+             k: f'{old_md[k].name!r} to {v.name!r}'
+             for k, v in new_md.items()
+             if k in old_md and old_md[k].name != v.name
+         }
+         deleted = {k: v.name for k, v in old_md.items() if k not in new_md}
+         if len(added) == 0 and len(changed) == 0 and len(deleted) == 0:
+             return ''
+         # Format the result
+         t = []
+         if len(added) > 0:
+             t.append('Added: ' + ', '.join(added.values()))
+         if len(changed) > 0:
+             t.append('Renamed: ' + ', '.join(changed.values()))
+         if len(deleted) > 0:
+             t.append('Deleted: ' + ', '.join(deleted.values()))
+         r = ', '.join(t)
+         return r
+
+     @classmethod
+     def _create_md_change_dict(
+         cls, md_list: Optional[list[tuple[int, dict[int, schema.SchemaColumn]]]]
+     ) -> dict[int, str]:
+         """Return a dictionary of schema changes by version
+         Args:
+             md_list: a list of tuples, each containing a version number and a metadata dictionary.
+         """
+         r: dict[int, str] = {}
+         if md_list is None or len(md_list) == 0:
+             return r
+
+         # Sort the list in place by version number
+         md_list.sort()
+
+         first_retrieved_version = md_list[0][0]
+         if first_retrieved_version == 0:
+             prev_md = None
+             prev_ver = -1
+             start = 0
+         else:
+             prev_md = md_list[0][1]
+             prev_ver = first_retrieved_version
+             start = 1
+
+         for ver, curr_md in md_list[start:]:
+             if ver == prev_ver:
+                 continue
+             assert ver > prev_ver
+             tf = cls._diff_md(prev_md, curr_md)
+             if tf != '':
+                 r[ver] = tf
+             prev_md = curr_md
+         return r
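`_diff_md()` only reads the `.name` attribute of the SchemaColumn values, so its behavior can be sketched with a simple stand-in (SimpleNamespace below is not the real `schema.SchemaColumn`):

# Stand-in illustration of MetadataUtils._diff_md() and _create_md_change_dict();
# SimpleNamespace replaces schema.SchemaColumn, since only .name is accessed.
from types import SimpleNamespace

from pixeltable.metadata.utils import MetadataUtils


def col(name: str) -> SimpleNamespace:
    return SimpleNamespace(name=name)


v0 = {0: col('id'), 1: col('img')}
v1 = {0: col('id'), 1: col('frame'), 2: col('label')}  # 'img' renamed, 'label' added

print(MetadataUtils._diff_md(None, v0))  # 'Initial Version'
print(MetadataUtils._diff_md(v0, v0))    # ''
print(MetadataUtils._diff_md(v0, v1))    # "Added: label, Renamed: 'img' to 'frame'"

# Summary of changes keyed by version number
print(MetadataUtils._create_md_change_dict([(0, v0), (1, v1)]))
# {0: 'Initial Version', 1: "Added: label, Renamed: 'img' to 'frame'"}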
pixeltable/plan.py CHANGED
@@ -378,7 +378,7 @@ class Planner:
 
          cls.__check_valid_columns(tbl, stored_cols, 'inserted into')
 
-         row_builder = exprs.RowBuilder([], stored_cols, [])
+         row_builder = exprs.RowBuilder([], stored_cols, [], tbl)
 
          # create InMemoryDataNode for 'rows'
          plan: exec.ExecNode = exec.InMemoryDataNode(
@@ -473,15 +473,19 @@ class Planner:
          assert isinstance(tbl, catalog.TableVersionPath)
          target = tbl.tbl_version.get()  # the one we need to update
          updated_cols = list(update_targets.keys())
+         recomputed_cols: set[Column]
          if len(recompute_targets) > 0:
-             recomputed_cols = set(recompute_targets)
+             assert len(update_targets) == 0
+             recomputed_cols = {*recompute_targets}
+             if cascade:
+                 recomputed_cols |= target.get_dependent_columns(recomputed_cols)
          else:
              recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
-             # regardless of cascade, we need to update all indices on any updated column
-             idx_val_cols = target.get_idx_val_columns(updated_cols)
-             recomputed_cols.update(idx_val_cols)
-             # we only need to recompute stored columns (unstored ones are substituted away)
-             recomputed_cols = {c for c in recomputed_cols if c.is_stored}
+         # regardless of cascade, we need to update all indices on any updated/recomputed column
+         idx_val_cols = target.get_idx_val_columns(set(updated_cols) | recomputed_cols)
+         recomputed_cols.update(idx_val_cols)
+         # we only need to recompute stored columns (unstored ones are substituted away)
+         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
 
          cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
@@ -508,6 +512,7 @@ class Planner:
          # update row builder with column information
          for i, col in enumerate(all_base_cols):
              plan.row_builder.add_table_column(col, select_list[i].slot_idx)
+         plan.ctx.num_computed_exprs = len(recomputed_exprs)
          recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
          return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
@@ -588,7 +593,7 @@ class Planner:
          sql_exprs = list(
              exprs.Expr.list_subexprs(analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False)
          )
-         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs)
+         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs, target)
          analyzer.finalize(row_builder)
          sql_lookup_node = exec.SqlLookupNode(tbl, row_builder, sql_exprs, sa_key_cols, key_vals)
          col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
@@ -602,8 +607,7 @@ class Planner:
          row_builder.set_slot_idxs(select_list, remove_duplicates=False)
          for i, col in enumerate(all_base_cols):
              plan.row_builder.add_table_column(col, select_list[i].slot_idx)
-
-         ctx = exec.ExecContext(row_builder)
+         ctx = exec.ExecContext(row_builder, num_computed_exprs=len(recomputed_exprs))
          # we're returning everything to the user, so we might as well do it in a single batch
          ctx.batch_size = 0
          plan.set_ctx(ctx)
@@ -656,6 +660,7 @@ class Planner:
              ignore_errors=True,
              exact_version_only=view.get_bases(),
          )
+         plan.ctx.num_computed_exprs = len(recomputed_exprs)
          for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
              plan.row_builder.add_table_column(col, select_list[i].slot_idx)
          # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
@@ -695,7 +700,7 @@ class Planner:
          base_analyzer = Analyzer(
              from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
          )
-         row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
+         row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [], target)
 
          # if we're propagating an insert, we only want to see those base rows that were created for the current version
          # execution plan:
@@ -832,7 +837,11 @@ class Planner:
              order_by_clause=order_by_clause,
              sample_clause=sample_clause,
          )
-         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
+         # If the from_clause has a single table, we can use it as the context table for the RowBuilder.
+         # Otherwise there is no context table, but that's ok, because the context table is only needed for
+         # table mutations, which can't happen during a join.
+         context_tbl = from_clause.tbls[0].tbl_version.get() if len(from_clause.tbls) == 1 else None
+         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [], context_tbl)
 
          analyzer.finalize(row_builder)
          # select_list: we need to materialize everything that's been collected
@@ -1035,16 +1044,14 @@ class Planner:
          return Analyzer(FromClause(tbls=[tbl]), [], where_clause=where_clause)
 
      @classmethod
-     def create_add_column_plan(
-         cls, tbl: catalog.TableVersionPath, col: catalog.Column
-     ) -> tuple[exec.ExecNode, Optional[int]]:
+     def create_add_column_plan(cls, tbl: catalog.TableVersionPath, col: catalog.Column) -> exec.ExecNode:
          """Creates a plan for InsertableTable.add_column()
          Returns:
              plan: the plan to execute
              value_expr slot idx for the plan output (for computed cols)
          """
          assert isinstance(tbl, catalog.TableVersionPath)
-         row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[])
+         row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[], tbl=tbl.tbl_version.get())
          analyzer = Analyzer(FromClause(tbls=[tbl]), row_builder.default_eval_ctx.target_exprs)
          plan = cls._create_query_plan(
              row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True
@@ -1052,9 +1059,10 @@ class Planner:
          plan.ctx.batch_size = 16
          plan.ctx.show_pbar = True
          plan.ctx.ignore_errors = True
+         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
+         plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
 
          # we want to flush images
          if col.is_computed and col.is_stored and col.col_type.is_image_type():
              plan.set_stored_img_cols(row_builder.output_slot_idxs())
-         value_expr_slot_idx = row_builder.output_slot_idxs()[0].slot_idx if col.is_computed else None
-         return plan, value_expr_slot_idx
+         return plan
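The create_update_plan() change above reworks how the set of columns to recompute is derived: explicit recompute targets now honor `cascade`, and index-value columns are refreshed for recomputed columns as well as updated ones. A standalone sketch of that resolution logic, using plain string column names and hypothetical `dependents` / `index_value_cols` maps in place of TableVersion.get_dependent_columns() / get_idx_val_columns():

# Sketch of the recompute-column resolution in the create_update_plan() diff above.
# 'dependents' and 'index_value_cols' are hypothetical stand-ins for the TableVersion lookups.
def resolve_recompute_cols(
    updated_cols: set[str],
    recompute_targets: set[str],
    cascade: bool,
    dependents: dict[str, set[str]],        # column -> dependent computed columns
    index_value_cols: dict[str, set[str]],  # column -> index value columns to refresh
    stored_cols: set[str],
) -> set[str]:
    if recompute_targets:
        assert not updated_cols  # explicit recompute requests carry no update targets
        recomputed = set(recompute_targets)
        if cascade:
            recomputed |= {d for c in recomputed for d in dependents.get(c, set())}
    else:
        recomputed = {d for c in updated_cols for d in dependents.get(c, set())} if cascade else set()
    # regardless of cascade, indices over updated or recomputed columns must be refreshed
    recomputed |= {i for c in updated_cols | recomputed for i in index_value_cols.get(c, set())}
    # only stored columns are recomputed; unstored ones are substituted away
    return recomputed & stored_cols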
pixeltable/share/packager.py CHANGED
@@ -361,49 +361,32 @@ class TableRestorer:
          )
 
          tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
+         for md in tbl_md:
+             md.tbl_md.is_replica = True
 
-         # Create the replica table
-         # The logic here needs to be completely restructured in order to make it concurrency-safe.
-         # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
-         #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
-         #   an actual table)
-         # - this could be done one replica at a time (instead of the entire hierarchy)
          cat = catalog.Catalog.get()
-         cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
-         # don't call get_table() until after the calls to create_replica() and __import_table() below;
-         # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
-         # TV instances for the same replica version, which then leads to failures when constructing queries
-
-         # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
-         # replica_tbl itself if it's a pure snapshot.
-         target_md = tbl_md[0]
-         is_pure_snapshot = (
-             target_md.tbl_md.view_md is not None
-             and target_md.tbl_md.view_md.predicate is None
-             and len(target_md.schema_version_md.columns) == 0
-         )
-         if is_pure_snapshot:
-             ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
-         else:
-             ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
-
-         # Instantiate data from the Parquet tables.
-         with Env.get().begin_xact():
-             for md in ancestor_md[::-1]:  # Base table first
-                 # Create a TableVersion instance (and a store table) for this ancestor.
-                 tv = catalog.TableVersion.create_replica(md)
-                 # Now import data from Parquet.
-                 _logger.info(f'Importing table {tv.name!r}.')
-                 self.__import_table(self.tmp_dir, tv, md)
-
-         with cat.begin_xact(for_write=False):
+
+         with cat.begin_xact(for_write=True):
+             # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
+             # versions that have not been seen before.
+             cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+
+             # Now we need to load data for replica_tbl and its ancestors, except that we skip
+             # replica_tbl itself if it's a pure snapshot.
+             for md in tbl_md[::-1]:  # Base table first
+                 if not md.is_pure_snapshot:
+                     tv = cat.get_tbl_version(UUID(md.tbl_md.tbl_id), md.version_md.version)
+                     # Import data from Parquet.
+                     _logger.info(f'Importing table {tv.name!r}.')
+                     self.__import_table(self.tmp_dir, tv, md)
+
              return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
      def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
          """
          Import the Parquet table into the Pixeltable catalog.
          """
-         tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
+         tbl_id = UUID(tbl_md.tbl_md.tbl_id)
          parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
          parquet_table = pq.read_table(str(parquet_dir))
          replica_version = tv.version
@@ -626,9 +609,8 @@ class TableRestorer:
              # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
              # in self.media_files.
              src_path = self.tmp_dir / 'media' / parsed_url.netloc
-             dest_path = MediaStore.prepare_media_path(tv.id, media_col_id, tv.version, ext=src_path.suffix)
-             src_path.rename(dest_path)
-             self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+             # Move the file to the media store and update the URL.
+             self.media_files[url] = MediaStore.relocate_local_media_file(src_path, tv.id, media_col_id, tv.version)
              return self.media_files[url]
          # For any type of URL other than a local file, just return the URL as-is.
          return url
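Based on the three removed lines, `MediaStore.relocate_local_media_file()` (added in `pixeltable/utils/media_store.py`, not shown in this diff) appears to bundle the prepare/rename/urljoin sequence it replaces. Roughly equivalent behavior, as a sketch rather than the actual implementation:

# Rough equivalent of the removed inline logic; not the actual implementation in media_store.py.
import urllib.parse
import urllib.request
from pathlib import Path
from uuid import UUID

from pixeltable.utils.media_store import MediaStore


def relocate_local_media_file(src_path: Path, tbl_id: UUID, col_id: int, version: int) -> str:
    # pick a destination inside the media store for this (table, column, version)
    dest_path = MediaStore.prepare_media_path(tbl_id, col_id, version, ext=src_path.suffix)
    # move the file and return a file:// URL pointing at its new location
    src_path.rename(dest_path)
    return urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))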