pixeltable-0.4.2-py3-none-any.whl → pixeltable-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +3 -11
  4. pixeltable/catalog/catalog.py +575 -220
  5. pixeltable/catalog/column.py +22 -23
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +2 -148
  8. pixeltable/catalog/insertable_table.py +15 -13
  9. pixeltable/catalog/path.py +6 -0
  10. pixeltable/catalog/schema_object.py +9 -4
  11. pixeltable/catalog/table.py +96 -85
  12. pixeltable/catalog/table_version.py +257 -174
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/tbl_ops.py +44 -0
  15. pixeltable/catalog/update_status.py +179 -0
  16. pixeltable/catalog/view.py +50 -56
  17. pixeltable/config.py +76 -12
  18. pixeltable/dataframe.py +19 -6
  19. pixeltable/env.py +50 -4
  20. pixeltable/exec/data_row_batch.py +3 -1
  21. pixeltable/exec/exec_node.py +7 -24
  22. pixeltable/exec/expr_eval/schedulers.py +134 -7
  23. pixeltable/exec/in_memory_data_node.py +6 -7
  24. pixeltable/exprs/column_property_ref.py +21 -9
  25. pixeltable/exprs/column_ref.py +7 -2
  26. pixeltable/exprs/function_call.py +2 -2
  27. pixeltable/exprs/row_builder.py +10 -9
  28. pixeltable/exprs/rowid_ref.py +0 -4
  29. pixeltable/func/function.py +3 -3
  30. pixeltable/functions/audio.py +36 -9
  31. pixeltable/functions/gemini.py +4 -4
  32. pixeltable/functions/openai.py +1 -2
  33. pixeltable/functions/video.py +59 -16
  34. pixeltable/globals.py +109 -24
  35. pixeltable/io/__init__.py +1 -1
  36. pixeltable/io/datarows.py +2 -1
  37. pixeltable/io/external_store.py +3 -55
  38. pixeltable/io/globals.py +4 -4
  39. pixeltable/io/hf_datasets.py +10 -2
  40. pixeltable/io/label_studio.py +16 -16
  41. pixeltable/io/pandas.py +1 -0
  42. pixeltable/io/table_data_conduit.py +12 -13
  43. pixeltable/iterators/audio.py +17 -8
  44. pixeltable/iterators/image.py +5 -2
  45. pixeltable/metadata/__init__.py +1 -1
  46. pixeltable/metadata/converters/convert_39.py +125 -0
  47. pixeltable/metadata/converters/util.py +3 -0
  48. pixeltable/metadata/notes.py +1 -0
  49. pixeltable/metadata/schema.py +50 -1
  50. pixeltable/plan.py +4 -0
  51. pixeltable/share/packager.py +20 -38
  52. pixeltable/store.py +40 -51
  53. pixeltable/type_system.py +2 -2
  54. pixeltable/utils/coroutine.py +6 -23
  55. pixeltable/utils/media_store.py +50 -0
  56. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
  57. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/RECORD +60 -57
  58. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
  59. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
  60. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0
pixeltable/io/table_data_conduit.py CHANGED
@@ -47,13 +47,13 @@ class TableDataConduitFormat(str, enum.Enum):
 
 @dataclass
 class TableDataConduit:
-    source: TableDataSource
+    source: 'TableDataSource'
     source_format: Optional[str] = None
     source_column_map: Optional[dict[str, str]] = None
     if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
-    pxt_schema: Optional[dict[str, Any]] = None
-    src_schema_overrides: Optional[dict[str, Any]] = None
-    src_schema: Optional[dict[str, Any]] = None
+    pxt_schema: Optional[dict[str, ts.ColumnType]] = None
+    src_schema_overrides: Optional[dict[str, ts.ColumnType]] = None
+    src_schema: Optional[dict[str, ts.ColumnType]] = None
     pxt_pk: Optional[list[str]] = None
     src_pk: Optional[list[str]] = None
     valid_rows: Optional[RowData] = None
@@ -87,7 +87,7 @@ class TableDataConduit:
         for name, coltype in self.pxt_schema.items():
             self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         raise NotImplementedError
 
     def valid_row_batch(self) -> Iterator[RowData]:
@@ -137,7 +137,7 @@ class DFTableDataConduit(TableDataConduit):
         t.pxt_df = tds.source
         return t
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema = self.pxt_df.schema
         self.pxt_pk = self.src_pk
         return self.pxt_schema
@@ -168,7 +168,7 @@ class RowDataTableDataConduit(TableDataConduit):
         t.batch_count = 0
         return t
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         from .datarows import _infer_schema_from_rows
 
         if self.source_column_map is None:
@@ -239,7 +239,7 @@ class PandasTableDataConduit(TableDataConduit):
         t.batch_count = 0
         return t
 
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         """Return inferred schema, inferred primary key, and source column map"""
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
@@ -252,7 +252,7 @@ class PandasTableDataConduit(TableDataConduit):
         else:
             raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
         self.normalize_pxt_schema_types()
         _df_check_primary_key_values(self.pd_df, self.src_pk)
@@ -328,7 +328,6 @@ class HFTableDataConduit(TableDataConduit):
     hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
     column_name_for_split: Optional[str] = None
     categorical_features: dict[str, dict[int, str]]
-    hf_schema: dict[str, Any] = None
     dataset_dict: dict[str, datasets.Dataset] = None
     hf_schema_source: dict[str, Any] = None
 
@@ -356,7 +355,7 @@ class HFTableDataConduit(TableDataConduit):
         except ImportError:
             return False
 
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         from pixeltable.io.hf_datasets import _get_hf_schema, huggingface_schema_to_pxt_schema
 
         if self.source_column_map is None:
@@ -469,7 +468,7 @@ class ParquetTableDataConduit(TableDataConduit):
         t.pq_ds = parquet.ParquetDataset(str(input_path))
         return t
 
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         from pixeltable.utils.arrow import ar_infer_schema
 
         if self.source_column_map is None:
@@ -483,7 +482,7 @@ class ParquetTableDataConduit(TableDataConduit):
         else:
            raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
        self.normalize_pxt_schema_types()
        self.prepare_insert()
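
Note on the hunks above: the change is mostly mechanical, tightening the schema annotations from dict[str, Any] to dict[str, ts.ColumnType]. A minimal sketch of a dict with that shape (the StringType constructor here is an assumption for illustration; only FloatType, ImageType, and JsonType appear verbatim in this diff):

import pixeltable.type_system as ts

# illustrative only: the value type that infer_schema() now advertises
schema: dict[str, ts.ColumnType] = {
    'title': ts.StringType(),
    'duration_sec': ts.FloatType(),
}
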
pixeltable/iterators/audio.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-import uuid
 from fractions import Fraction
 from pathlib import Path
 from typing import Any, ClassVar, Optional
@@ -55,12 +54,9 @@ class AudioSplitter(ComponentIterator):
     def __init__(
         self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
     ):
-        if chunk_duration_sec <= 0.0:
-            raise excs.Error('chunk_duration_sec must be a positive number')
-        if chunk_duration_sec < min_chunk_duration_sec:
-            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
-        if overlap_sec >= chunk_duration_sec:
-            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
+        assert chunk_duration_sec > 0.0
+        assert chunk_duration_sec >= min_chunk_duration_sec
+        assert overlap_sec < chunk_duration_sec
         audio_path = Path(audio)
         assert audio_path.exists() and audio_path.is_file()
         self.audio_path = audio_path
@@ -128,6 +124,19 @@ class AudioSplitter(ComponentIterator):
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
+        params = dict(zip(param_names, args))
+        params.update(kwargs)
+
+        chunk_duration_sec = params['chunk_duration_sec']
+        min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
+        overlap_sec = params.get('overlap_sec', 0.0)
+        if chunk_duration_sec <= 0.0:
+            raise excs.Error('chunk_duration_sec must be a positive number')
+        if chunk_duration_sec < min_chunk_duration_sec:
+            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
+        if overlap_sec >= chunk_duration_sec:
+            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
         return {
             'start_time_sec': ts.FloatType(),
             'end_time_sec': ts.FloatType(),
@@ -140,7 +149,7 @@ class AudioSplitter(ComponentIterator):
         target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
         chunk_start_pts = 0
         chunk_end_pts = 0
-        chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
+        chunk_file = str(env.Env.get().create_tmp_path(self.audio_path.suffix))
         output_container = av.open(chunk_file, mode='w')
         input_stream = self.container.streams.audio[0]
         codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
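
The AudioSplitter change moves user-facing argument validation out of __init__ (now plain asserts) and into output_schema(), which is evaluated once when the iterator is bound to a view. A sketch of where that validation now surfaces, assuming the usual ComponentIterator pattern; the table and column names are hypothetical and not taken from this release:

import pixeltable as pxt
from pixeltable.iterators import AudioSplitter

audio_t = pxt.get_table('podcasts')  # hypothetical existing table with an 'audio' column
# output_schema() now checks chunk_duration_sec / overlap_sec here, at view-creation time,
# so an invalid argument raises excs.Error once instead of per input row
chunks = pxt.create_view(
    'podcast_chunks',
    audio_t,
    iterator=AudioSplitter.create(audio=audio_t.audio, chunk_duration_sec=30.0, overlap_sec=2.0),
)
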
pixeltable/iterators/image.py CHANGED
@@ -31,8 +31,7 @@ class TileIterator(ComponentIterator):
     __j: int
 
     def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
-        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
+        assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
 
         self.__image = image
         self.__image.load()
@@ -79,4 +78,8 @@ class TileIterator(ComponentIterator):
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        tile_size = kwargs.get('tile_size')
+        overlap = kwargs.get('overlap', (0, 0))
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
         return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
pixeltable/metadata/__init__.py CHANGED
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 _logger = logging.getLogger('pixeltable')
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 39
+VERSION = 40
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_39.py ADDED
@@ -0,0 +1,125 @@
+import logging
+from typing import Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=39)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_modifier=__table_modifier)
+
+
+def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+    store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+    store_name = f'{store_prefix}_{tbl_id.hex}'
+
+    # Get the list of column names that need to be migrated
+    col_names = find_error_columns(conn=conn, store_name=store_name)
+    if len(col_names) == 0:
+        _logger.info(f'No error columns found in table {store_name}. Skipping migration.')
+        return
+
+    # Check if the table exists, outside of the metadata we were given
+    # There seem to be cases where the metadata is present in the catalog,
+    # but the table itself is not in the database.
+    check_table_sql = sql.text(f"""
+        SELECT EXISTS (
+            SELECT 1
+            FROM information_schema.tables
+            WHERE table_name = '{store_name}'
+        )
+    """)
+    table_exists = conn.execute(check_table_sql).scalar()
+    if not table_exists:
+        _logger.warning(f'Table {store_name} does not exist. Skipping migration.')
+        return
+
+    return migrate_error_to_cellmd_columns(conn, store_name, col_names)
+
+
+def find_error_columns(conn: sql.Connection, store_name: str) -> list[str]:
+    """
+    Return and errormsg or errortype columns in the given table
+
+    Args:
+        conn: SQLAlchemy connection
+        store_name: Name of the table to check
+
+    Returns:
+        List of column name roots (root_errormsg, root_errortype)
+    """
+    check_columns_sql = sql.text(f"""
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = '{store_name}'
+    """)
+    found_columns = [
+        row[0]
+        for row in conn.execute(check_columns_sql)
+        if row[0].endswith('_errormsg') or row[0].endswith('_errortype')
+    ]
+    column_roots = {s.removesuffix('_errormsg').removesuffix('_errortype') for s in found_columns}
+    return [*column_roots]
+
+
+def migrate_error_to_cellmd_columns(
+    conn: sql.Connection, store_name: str, col_names: list[str], backup_table: Optional[str] = None
+) -> None:
+    """
+    Safe version with error handling and optional backup.
+
+    Args:
+        engine: SQLAlchemy engine
+        store_name: Name of the table to modify
+        col_names: List of column name prefixes
+        backup_table: Optional name for backup table
+
+    Usage:
+        migrate_error_to_cellmd_columns(engine, 'my_table', ['columnname'], 'my_table_backup')
+    """
+
+    try:
+        # Optional: Create backup
+        if backup_table:
+            backup_sql = sql.text(f"""
+                CREATE TABLE {backup_table} AS SELECT * FROM {store_name}
+            """)
+            conn.execute(backup_sql)
+            _logger.info(f'Backup created: {backup_table}')
+
+        # Step 1: Add new columns
+        add_column_str = ', '.join(f'ADD COLUMN {col}_cellmd JSONB DEFAULT NULL' for col in col_names)
+        add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+        conn.execute(add_column_sql)
+        _logger.info(f'Added columns: {", ".join(f"{col}_cellmd" for col in col_names)}')
+
+        # Step 2: Populate new columns
+        set_column_str = ', '.join(
+            [
+                f'{col}_cellmd = CASE WHEN {col}_errormsg IS NULL OR {col}_errortype IS NULL '
+                f"THEN NULL ELSE jsonb_build_object('errormsg', {col}_errormsg, 'errortype', {col}_errortype) END"
+                for col in col_names
+            ]
+        )
+        populate_sql = sql.text(f'UPDATE {store_name} SET {set_column_str}')
+        result = conn.execute(populate_sql)
+        _logger.info(f'Updated {result.rowcount} rows')
+
+        # Step 3: Drop old columns
+        drop_columns_str = ', '.join(
+            [f'DROP COLUMN IF EXISTS {col}_errormsg, DROP COLUMN IF EXISTS {col}_errortype' for col in col_names]
+        )
+        drop_columns_sql = sql.text(f'ALTER TABLE {store_name} {drop_columns_str}')
+        conn.execute(drop_columns_sql)
+        _logger.info(f'Dropped columns: {", ".join(f"{col}_errormsg, {col}_errortype" for col in col_names)}')
+        _logger.info(f'Migration completed successfully for table: {store_name}')
+
+    except sql.exc.SQLAlchemyError as e:
+        _logger.error(f'Migration for table {store_name} failed: {e}')
+        raise
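
The converter above collapses each pair of {col}_errormsg / {col}_errortype store columns into a single JSONB {col}_cellmd column. A row-level illustration, with hypothetical values, of what the CASE expression in step 2 computes:

from typing import Optional

def to_cellmd(errormsg: Optional[str], errortype: Optional[str]) -> Optional[dict]:
    # mirrors the CASE expression: NULL unless both error fields are present
    if errormsg is None or errortype is None:
        return None
    return {'errormsg': errormsg, 'errortype': errortype}

assert to_cellmd('decode failed', 'ValueError') == {'errormsg': 'decode failed', 'errortype': 'ValueError'}
assert to_cellmd(None, None) is None
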
pixeltable/metadata/converters/util.py CHANGED
@@ -16,6 +16,7 @@ def convert_table_md(
     column_md_updater: Optional[Callable[[dict], None]] = None,
     external_store_md_updater: Optional[Callable[[dict], None]] = None,
     substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None,
+    table_modifier: Optional[Callable[[sql.Connection, UUID, dict, dict], None]] = None,
 ) -> None:
     """
     Converts schema.TableMd dicts based on the specified conversion functions.
@@ -50,6 +51,8 @@ def convert_table_md(
         if updated_table_md != table_md:
             __logger.info(f'Updating schema for table: {tbl_id}')
             conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
+        if table_modifier is not None:
+            table_modifier(conn, tbl_id, table_md, updated_table_md)
 
     for row in conn.execute(sql.select(Function)):
         fn_id = row[0]
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    40: 'Convert error property columns to cellmd columns',
     39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',
     37: 'Add support for the sample() method on DataFrames',
pixeltable/metadata/schema.py CHANGED
@@ -8,6 +8,8 @@ from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
 from sqlalchemy.dialects.postgresql import JSONB, UUID
 from sqlalchemy.orm.decl_api import DeclarativeMeta
 
+from ..catalog.update_status import UpdateStatus
+
 # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
 # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
 # outside of the module in a typesafe way.
@@ -180,6 +182,7 @@ class TableMd:
     # sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
     # - incremented for each add/drop of a mutable view
     # - only maintained for mutable tables
+    # TODO: replace with mutable_views: list[UUID] to help with debugging
     view_sn: int
 
     # Metadata format for external stores:
@@ -191,6 +194,26 @@ class TableMd:
     view_md: Optional[ViewMd]
     additional_md: dict[str, Any]
 
+    has_pending_ops: bool = False
+
+    @property
+    def is_snapshot(self) -> bool:
+        return self.view_md is not None and self.view_md.is_snapshot
+
+    @property
+    def is_mutable(self) -> bool:
+        return not self.is_snapshot and not self.is_replica
+
+    @property
+    def is_pure_snapshot(self) -> bool:
+        return (
+            self.view_md is not None
+            and self.view_md.is_snapshot
+            and self.view_md.sample_clause is None
+            and self.view_md.predicate is None
+            and len(self.column_md) == 0
+        )
+
 
 class Table(Base):
     """
@@ -219,7 +242,9 @@ class TableVersionMd:
     created_at: float  # time.time()
     version: int
     schema_version: int
-    additional_md: dict[str, Any]
+    user: Optional[str] = None  # User that created this version
+    update_status: Optional[UpdateStatus] = None  # UpdateStatus of the change that created this version
+    additional_md: dict[str, Any] = dataclasses.field(default_factory=dict)
 
 
 class TableVersion(Base):
@@ -275,6 +300,22 @@ class TableSchemaVersion(Base):
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableSchemaVersionMd
 
 
+class PendingTableOp(Base):
+    """
+    Table operation that needs to be completed before the table can be used.
+
+    Operations need to be completed in order of increasing seq_num.
+    """
+
+    __tablename__ = 'pendingtableops'
+
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    op_sn: orm.Mapped[int] = orm.mapped_column(Integer, primary_key=True, nullable=False)  # catalog.TableOp.op_sn
+    op: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # catalog.TableOp
+
+
 @dataclasses.dataclass
 class FunctionMd:
     name: str
@@ -308,6 +349,14 @@ class FullTableMd(NamedTuple):
     version_md: TableVersionMd
     schema_version_md: TableSchemaVersionMd
 
+    @property
+    def is_pure_snapshot(self) -> bool:
+        return (
+            self.tbl_md.view_md is not None
+            and self.tbl_md.view_md.predicate is None
+            and len(self.schema_version_md.columns) == 0
+        )
+
     def as_dict(self) -> dict[str, Any]:
         return {
             'table_id': self.tbl_md.tbl_id,
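
The new PendingTableOp table keys pending operations by (tbl_id, op_sn), and per its docstring they must be applied in increasing sequence-number (op_sn) order. A hypothetical sketch, not taken from this release, of reading them back in that order with plain SQLAlchemy:

import uuid
from typing import Any

import sqlalchemy as sql

from pixeltable.metadata.schema import PendingTableOp

def load_pending_ops(conn: sql.Connection, tbl_id: uuid.UUID) -> list[dict[str, Any]]:
    # pending ops for one table, ordered the way they must be executed
    stmt = (
        sql.select(PendingTableOp.op)
        .where(PendingTableOp.tbl_id == tbl_id)
        .order_by(PendingTableOp.op_sn)
    )
    return [row[0] for row in conn.execute(stmt)]
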
pixeltable/plan.py CHANGED
@@ -512,6 +512,7 @@ class Planner:
         # update row builder with column information
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
+        plan.ctx.num_computed_exprs = len(recomputed_exprs)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
@@ -659,6 +660,7 @@ class Planner:
             ignore_errors=True,
             exact_version_only=view.get_bases(),
         )
+        plan.ctx.num_computed_exprs = len(recomputed_exprs)
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
@@ -1057,6 +1059,8 @@ class Planner:
         plan.ctx.batch_size = 16
         plan.ctx.show_pbar = True
         plan.ctx.ignore_errors = True
+        computed_exprs = row_builder.output_exprs - row_builder.input_exprs
+        plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
 
         # we want to flush images
         if col.is_computed and col.is_stored and col.col_type.is_image_type():
pixeltable/share/packager.py CHANGED
@@ -361,49 +361,32 @@ class TableRestorer:
         )
 
         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
+        for md in tbl_md:
+            md.tbl_md.is_replica = True
 
-        # Create the replica table
-        # The logic here needs to be completely restructured in order to make it concurrency-safe.
-        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
-        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
-        #   an actual table)
-        # - this could be done one replica at a time (instead of the entire hierarchy)
         cat = catalog.Catalog.get()
-        cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
-        # don't call get_table() until after the calls to create_replica() and __import_table() below;
-        # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
-        # TV instances for the same replica version, which then leads to failures when constructing queries
-
-        # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
-        # replica_tbl itself if it's a pure snapshot.
-        target_md = tbl_md[0]
-        is_pure_snapshot = (
-            target_md.tbl_md.view_md is not None
-            and target_md.tbl_md.view_md.predicate is None
-            and len(target_md.schema_version_md.columns) == 0
-        )
-        if is_pure_snapshot:
-            ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
-        else:
-            ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
-
-        # Instantiate data from the Parquet tables.
-        with Env.get().begin_xact():
-            for md in ancestor_md[::-1]:  # Base table first
-                # Create a TableVersion instance (and a store table) for this ancestor.
-                tv = catalog.TableVersion.create_replica(md)
-                # Now import data from Parquet.
-                _logger.info(f'Importing table {tv.name!r}.')
-                self.__import_table(self.tmp_dir, tv, md)
-
-        with cat.begin_xact(for_write=False):
+
+        with cat.begin_xact(for_write=True):
+            # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
+            # versions that have not been seen before.
+            cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+
+            # Now we need to load data for replica_tbl and its ancestors, except that we skip
+            # replica_tbl itself if it's a pure snapshot.
+            for md in tbl_md[::-1]:  # Base table first
+                if not md.is_pure_snapshot:
+                    tv = cat.get_tbl_version(UUID(md.tbl_md.tbl_id), md.version_md.version)
+                    # Import data from Parquet.
+                    _logger.info(f'Importing table {tv.name!r}.')
+                    self.__import_table(self.tmp_dir, tv, md)
+
             return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
         """
         Import the Parquet table into the Pixeltable catalog.
         """
-        tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
+        tbl_id = UUID(tbl_md.tbl_md.tbl_id)
         parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
         parquet_table = pq.read_table(str(parquet_dir))
         replica_version = tv.version
@@ -626,9 +609,8 @@ class TableRestorer:
             # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
-            dest_path = MediaStore.prepare_media_path(tv.id, media_col_id, tv.version, ext=src_path.suffix)
-            src_path.rename(dest_path)
-            self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+            # Move the file to the media store and update the URL.
+            self.media_files[url] = MediaStore.relocate_local_media_file(src_path, tv.id, media_col_id, tv.version)
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url