pixeltable 0.4.13__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (46) hide show
  1. pixeltable/catalog/catalog.py +179 -63
  2. pixeltable/catalog/column.py +24 -20
  3. pixeltable/catalog/table.py +24 -8
  4. pixeltable/catalog/table_version.py +15 -6
  5. pixeltable/catalog/view.py +22 -22
  6. pixeltable/config.py +2 -0
  7. pixeltable/dataframe.py +3 -2
  8. pixeltable/env.py +42 -21
  9. pixeltable/exec/__init__.py +1 -0
  10. pixeltable/exec/aggregation_node.py +0 -1
  11. pixeltable/exec/cache_prefetch_node.py +74 -98
  12. pixeltable/exec/data_row_batch.py +2 -18
  13. pixeltable/exec/in_memory_data_node.py +1 -1
  14. pixeltable/exec/object_store_save_node.py +299 -0
  15. pixeltable/exec/sql_node.py +28 -33
  16. pixeltable/exprs/data_row.py +31 -25
  17. pixeltable/exprs/json_path.py +6 -5
  18. pixeltable/exprs/row_builder.py +6 -12
  19. pixeltable/functions/gemini.py +1 -1
  20. pixeltable/functions/openai.py +1 -1
  21. pixeltable/functions/video.py +5 -6
  22. pixeltable/globals.py +3 -3
  23. pixeltable/index/embedding_index.py +5 -8
  24. pixeltable/io/fiftyone.py +1 -1
  25. pixeltable/io/label_studio.py +4 -5
  26. pixeltable/iterators/audio.py +1 -1
  27. pixeltable/iterators/document.py +10 -12
  28. pixeltable/iterators/video.py +1 -1
  29. pixeltable/metadata/schema.py +7 -0
  30. pixeltable/plan.py +26 -1
  31. pixeltable/share/packager.py +8 -2
  32. pixeltable/share/publish.py +3 -9
  33. pixeltable/type_system.py +1 -3
  34. pixeltable/utils/dbms.py +31 -5
  35. pixeltable/utils/gcs_store.py +283 -0
  36. pixeltable/utils/local_store.py +316 -0
  37. pixeltable/utils/object_stores.py +497 -0
  38. pixeltable/utils/pytorch.py +5 -6
  39. pixeltable/utils/s3_store.py +354 -0
  40. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/METADATA +1 -1
  41. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/RECORD +44 -41
  42. pixeltable/utils/media_store.py +0 -248
  43. pixeltable/utils/s3.py +0 -17
  44. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  45. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  46. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
@@ -14,8 +14,6 @@ import psycopg
14
14
  import sqlalchemy as sql
15
15
 
16
16
  from pixeltable import exceptions as excs
17
-
18
- # from pixeltable import exceptions as excs, UpdateStatus
19
17
  from pixeltable.env import Env
20
18
  from pixeltable.iterators import ComponentIterator
21
19
  from pixeltable.metadata import schema
@@ -906,9 +904,9 @@ class Catalog:
906
904
  """Must be executed inside a transaction. Might raise PendingTableOpsError."""
907
905
  if (tbl_id, version) not in self._tbls:
908
906
  if version is None:
909
- self._load_tbl(tbl_id)
907
+ return self._load_tbl(tbl_id)
910
908
  else:
911
- self._load_tbl_at_version(tbl_id, version)
909
+ return self._load_tbl_at_version(tbl_id, version)
912
910
  return self._tbls.get((tbl_id, version))
913
911
 
914
912
  @retry_loop(for_write=True)
@@ -1040,23 +1038,18 @@ class Catalog:
1040
1038
  )
1041
1039
 
1042
1040
  # Ensure that the system directory exists.
1043
- self._create_dir(Path.parse('_system', allow_system_path=True), if_exists=IfExistsParam.IGNORE, parents=False)
1041
+ self.__ensure_system_dir_exists()
1044
1042
 
1045
1043
  # Now check to see if this table already exists in the catalog.
1046
1044
  existing = self.get_table_by_id(tbl_id)
1047
1045
  if existing is not None:
1048
1046
  existing_path = Path.parse(existing._path(), allow_system_path=True)
1049
- if existing_path != path:
1047
+ if existing_path != path and not existing_path.is_system_path:
1050
1048
  # It does exist, under a different path from the specified one.
1051
- if not existing_path.is_system_path:
1052
- raise excs.Error(
1053
- f'That table has already been replicated as {existing_path!r}.\n'
1054
- f'Drop the existing replica if you wish to re-create it.'
1055
- )
1056
- # If it's a system table, then this means it was created at some point as the ancestor of some other
1057
- # table (a snapshot-over-snapshot scenario). In that case, we simply move it to the new (named)
1058
- # location.
1059
- self._move(existing_path, path)
1049
+ raise excs.Error(
1050
+ f'That table has already been replicated as {existing_path!r}.\n'
1051
+ f'Drop the existing replica if you wish to re-create it.'
1052
+ )
1060
1053
 
1061
1054
  # Now store the metadata for this replica's proper ancestors. If one or more proper ancestors
1062
1055
  # do not yet exist in the store, they will be created as anonymous system tables.
@@ -1084,14 +1077,31 @@ class Catalog:
1084
1077
  # the new TableVersion instance. This is necessary because computed columns of descendant tables might
1085
1078
  # reference columns of the ancestor table that only exist in the new version.
1086
1079
  replica = Catalog.get().get_table_by_id(ancestor_id)
1087
- assert replica is not None # If it didn't exist before, it must have been created by now.
1088
- replica._tbl_version_path.clear_cached_md()
1080
+ # assert replica is not None # If it didn't exist before, it must have been created by now.
1081
+ if replica is not None:
1082
+ replica._tbl_version_path.clear_cached_md()
1089
1083
 
1090
- # Finally, store the metadata for the table being replicated; as before, it could be a new version or a known
1091
- # version. If it's a new version, then a TableVersion record will be created, unless the table being replicated
1084
+ # Store the metadata for the table being replicated; as before, it could be a new version or a known version.
1085
+ # If it's a new version, then a TableVersion record will be created, unless the table being replicated
1092
1086
  # is a pure snapshot.
1093
1087
  self.__store_replica_md(path, md[0])
1094
1088
 
1089
+ # Finally, it's possible that the table already exists in the catalog, but as an anonymous system table that
1090
+ # was hidden the last time we checked (and that just became visible when the replica was imported). In this
1091
+ # case, we need to make the existing table visible by moving it to the specified path.
1092
+ # We need to do this at the end, since `existing_path` needs to first have a non-fragment table version in
1093
+ # order to be instantiated as a schema object.
1094
+ existing = self.get_table_by_id(tbl_id)
1095
+ if existing is not None:
1096
+ existing_path = Path.parse(existing._path(), allow_system_path=True)
1097
+ if existing_path != path:
1098
+ assert existing_path.is_system_path
1099
+ self._move(existing_path, path)
1100
+
1101
+ def __ensure_system_dir_exists(self) -> Dir:
1102
+ system_path = Path.parse('_system', allow_system_path=True)
1103
+ return self._create_dir(system_path, if_exists=IfExistsParam.IGNORE, parents=False)
1104
+
1095
1105
  def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
1096
1106
  _logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
1097
1107
  dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
@@ -1104,6 +1114,7 @@ class Catalog:
1104
1114
  new_tbl_md: Optional[schema.TableMd] = None
1105
1115
  new_version_md: Optional[schema.TableVersionMd] = None
1106
1116
  new_schema_version_md: Optional[schema.TableSchemaVersionMd] = None
1117
+ is_new_tbl_version: bool = False
1107
1118
 
1108
1119
  # We need to ensure that the table metadata in the catalog always reflects the latest observed version of
1109
1120
  # this table. (In particular, if this is a base table, then its table metadata need to be consistent
@@ -1138,14 +1149,21 @@ class Catalog:
1138
1149
  existing_version_md_row = conn.execute(q).one_or_none()
1139
1150
  if existing_version_md_row is None:
1140
1151
  new_version_md = md.version_md
1152
+ is_new_tbl_version = True
1141
1153
  else:
1142
1154
  existing_version_md = schema.md_from_dict(schema.TableVersionMd, existing_version_md_row.md)
1143
- if existing_version_md != md.version_md:
1155
+ # Validate that the existing metadata are identical to the new metadata, except that their is_fragment
1156
+ # flags may differ.
1157
+ if dataclasses.replace(existing_version_md, is_fragment=md.version_md.is_fragment) != md.version_md:
1144
1158
  raise excs.Error(
1145
1159
  f'The version metadata for the replica {path!r}:{md.version_md.version} is inconsistent with '
1146
1160
  'the metadata recorded from a prior replica.\n'
1147
1161
  'This is likely due to data corruption in the replicated table.'
1148
1162
  )
1163
+ if existing_version_md.is_fragment and not md.version_md.is_fragment:
1164
+ # This version exists in the DB as a fragment, but we're importing a complete copy of the same version;
1165
+ # set the is_fragment flag to False in the DB.
1166
+ new_version_md = md.version_md
1149
1167
 
1150
1168
  # Do the same thing for TableSchemaVersion.
1151
1169
  q = (
@@ -1162,6 +1180,7 @@ class Catalog:
1162
1180
  existing_schema_version_md = schema.md_from_dict(
1163
1181
  schema.TableSchemaVersionMd, existing_schema_version_md_row.md
1164
1182
  )
1183
+ # Validate that the existing metadata are identical to the new metadata.
1165
1184
  if existing_schema_version_md != md.schema_version_md:
1166
1185
  raise excs.Error(
1167
1186
  f'The schema version metadata for the replica {path!r}:{md.schema_version_md.schema_version} '
@@ -1171,7 +1190,7 @@ class Catalog:
1171
1190
 
1172
1191
  self.store_tbl_md(UUID(tbl_id), None, new_tbl_md, new_version_md, new_schema_version_md)
1173
1192
 
1174
- if new_version_md is not None and not md.is_pure_snapshot:
1193
+ if is_new_tbl_version and not md.is_pure_snapshot:
1175
1194
  # It's a new version of a table that has a physical store, so we need to create a TableVersion instance.
1176
1195
  TableVersion.create_replica(md)
1177
1196
 
@@ -1206,41 +1225,72 @@ class Catalog:
1206
1225
 
1207
1226
  self._drop_tbl(tbl, force=force, is_replace=False)
1208
1227
 
1209
- def _drop_tbl(self, tbl: Table, force: bool, is_replace: bool) -> None:
1228
+ def _drop_tbl(self, tbl: Table | TableVersionPath, force: bool, is_replace: bool) -> None:
1210
1229
  """
1211
1230
  Drop the table (and recursively its views, if force == True).
1212
1231
 
1232
+ `tbl` can be an instance of `Table` for a user table, or `TableVersionPath` for a hidden (system) table.
1233
+
1213
1234
  Locking protocol:
1214
1235
  - X-lock base before X-locking any view
1215
1236
  - deadlock-free wrt to TableVersion.insert() (insert propagation also proceeds top-down)
1216
1237
  - X-locks parent dir prior to calling TableVersion.drop(): prevent concurrent creation of another SchemaObject
1217
1238
  in the same directory with the same name (which could lead to duplicate names if we get aborted)
1218
1239
  """
1219
- self._acquire_dir_xlock(dir_id=tbl._dir_id)
1220
- self._acquire_tbl_lock(tbl_id=tbl._id, for_write=True, lock_mutable_tree=False)
1240
+ is_pure_snapshot: bool
1241
+ if isinstance(tbl, TableVersionPath):
1242
+ tvp = tbl
1243
+ tbl_id = tvp.tbl_id
1244
+ tbl = None
1245
+ is_pure_snapshot = False
1246
+ else:
1247
+ tvp = tbl._tbl_version_path
1248
+ tbl_id = tbl._id
1249
+ is_pure_snapshot = tbl._tbl_version is None
1250
+
1251
+ if tbl is not None:
1252
+ self._acquire_dir_xlock(dir_id=tbl._dir_id)
1253
+ self._acquire_tbl_lock(tbl_id=tbl_id, for_write=True, lock_mutable_tree=False)
1254
+
1255
+ view_ids = self.get_view_ids(tbl_id, for_update=True)
1256
+ is_replica = tvp.is_replica()
1257
+ do_drop = True
1258
+
1259
+ _logger.debug(f'Preparing to drop table {tbl_id} (force={force!r}, is_replica={is_replica}).')
1221
1260
 
1222
- view_ids = self.get_view_ids(tbl._id, for_update=True)
1223
1261
  if len(view_ids) > 0:
1224
- if not force:
1225
- is_snapshot = tbl._tbl_version_path.is_snapshot()
1226
- obj_type_str = 'Snapshot' if is_snapshot else tbl._display_name().capitalize()
1262
+ if force:
1263
+ # recursively drop views first
1264
+ for view_id in view_ids:
1265
+ view = self.get_table_by_id(view_id)
1266
+ self._drop_tbl(view, force=force, is_replace=is_replace)
1267
+
1268
+ elif is_replica:
1269
+ # Dropping a replica with dependents and no 'force': just rename it to be a hidden table;
1270
+ # the actual table will not be dropped.
1271
+ assert tbl is not None # can only occur for a user table
1272
+ system_dir = self.__ensure_system_dir_exists()
1273
+ new_name = f'replica_{tbl_id.hex}'
1274
+ _logger.debug(f'{tbl._path()!r} is a replica with dependents; renaming to {new_name!r}.')
1275
+ tbl._move(new_name, system_dir._id)
1276
+ do_drop = False # don't actually clear the catalog for this table
1277
+
1278
+ else:
1279
+ # It has dependents but is not a replica and no 'force', so it's an error to drop it.
1280
+ assert tbl is not None # can only occur for a user table
1227
1281
  msg: str
1228
1282
  if is_replace:
1229
1283
  msg = (
1230
- f'{obj_type_str} {tbl._path()} already exists and has dependents. '
1284
+ f'{tbl._display_name()} {tbl._path()!r} already exists and has dependents. '
1231
1285
  "Use `if_exists='replace_force'` to replace it."
1232
1286
  )
1233
1287
  else:
1234
- msg = f'{obj_type_str} {tbl._path()} has dependents.'
1288
+ msg = f'{tbl._display_name()} {tbl._path()!r} has dependents.'
1235
1289
  raise excs.Error(msg)
1236
1290
 
1237
- for view_id in view_ids:
1238
- view = self.get_table_by_id(view_id)
1239
- self._drop_tbl(view, force=force, is_replace=is_replace)
1240
-
1241
1291
  # if this is a mutable view of a mutable base, advance the base's view_sn
1242
- if isinstance(tbl, View) and tbl._tbl_version_path.is_mutable() and tbl._tbl_version_path.base.is_mutable():
1243
- base_id = tbl._tbl_version_path.base.tbl_id
1292
+ if isinstance(tbl, View) and tvp.is_mutable() and tvp.base.is_mutable():
1293
+ base_id = tvp.base.tbl_id
1244
1294
  base_tv = self.get_tbl_version(base_id, None, validate_initialized=True)
1245
1295
  base_tv.tbl_md.view_sn += 1
1246
1296
  self._modified_tvs.add(base_tv.handle)
@@ -1251,26 +1301,46 @@ class Catalog:
1251
1301
  )
1252
1302
  assert result.rowcount == 1, result.rowcount
1253
1303
 
1254
- if tbl._tbl_version is not None:
1255
- # invalidate the TableVersion instance when we're done so that existing references to it can find out it
1256
- # has been dropped
1257
- self._modified_tvs.add(tbl._tbl_version)
1258
- tv = tbl._tbl_version.get() if tbl._tbl_version is not None else None
1259
- # if tv is not None:
1260
- # tv = tbl._tbl_version.get()
1261
- # # invalidate the TableVersion instance so that existing references to it can find out it has been dropped
1262
- # tv.is_validated = False
1263
- if tbl._tbl_version is not None:
1264
- # drop the store table before deleting the Table record
1265
- tv = tbl._tbl_version.get()
1266
- tv.drop()
1267
-
1268
- self.delete_tbl_md(tbl._id)
1269
- assert (tbl._id, None) in self._tbls
1270
- versions = [k[1] for k in self._tbls if k[0] == tbl._id]
1304
+ if do_drop:
1305
+ if not is_pure_snapshot:
1306
+ # invalidate the TableVersion instance when we're done so that existing references to it can find out it
1307
+ # has been dropped
1308
+ self._modified_tvs.add(tvp.tbl_version)
1309
+ tv = tvp.tbl_version.get() if tvp.tbl_version is not None else None
1310
+ if not is_pure_snapshot:
1311
+ # drop the store table before deleting the Table record
1312
+ tv = tvp.tbl_version.get()
1313
+ tv.drop()
1314
+
1315
+ self.delete_tbl_md(tbl_id)
1316
+ tvp.clear_cached_md()
1317
+
1318
+ assert (
1319
+ is_replica
1320
+ or (tbl_id, None) in self._tbls # non-replica tables must have an entry with effective_version=None
1321
+ )
1322
+
1323
+ # Remove visible Table references (we do this even for a replica that was just renamed).
1324
+ versions = [version for id, version in self._tbls if id == tbl_id]
1271
1325
  for version in versions:
1272
- del self._tbls[tbl._id, version]
1273
- _logger.info(f'Dropped table `{tbl._path()}`.')
1326
+ del self._tbls[tbl_id, version]
1327
+
1328
+ _logger.info(f'Dropped table {tbl_id if tbl is None else repr(tbl._path())}.')
1329
+
1330
+ if (
1331
+ is_replica # if this is a replica,
1332
+ and do_drop # and it was actually dropped (not just renamed),
1333
+ and tvp.base is not None # and it has a base table,
1334
+ ):
1335
+ base_tbl = self.get_table_by_id(tvp.base.tbl_id)
1336
+ base_tbl_path = None if base_tbl is None else Path.parse(base_tbl._path(), allow_system_path=True)
1337
+ if (
1338
+ (base_tbl_path is None or base_tbl_path.is_system_path) # and the base table is hidden,
1339
+ and len(self.get_view_ids(tvp.base.tbl_id, for_update=True)) == 0 # and has no other dependents,
1340
+ ):
1341
+ # then drop the base table as well (possibly recursively).
1342
+ _logger.debug(f'Dropping hidden base table {tvp.base.tbl_id} of dropped replica {tbl_id}.')
1343
+ self._drop_tbl(tvp.base, force=False, is_replace=False)
1274
1344
 
1275
1345
  @retry_loop(for_write=True)
1276
1346
  def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
@@ -1456,7 +1526,7 @@ class Catalog:
1456
1526
  row = conn.execute(q).one_or_none()
1457
1527
  return schema.Dir(**row._mapping) if row is not None else None
1458
1528
 
1459
- def _load_tbl(self, tbl_id: UUID) -> None:
1529
+ def _load_tbl(self, tbl_id: UUID) -> Optional[Table]:
1460
1530
  """Loads metadata for the table with the given id and caches it."""
1461
1531
  _logger.info(f'Loading table {tbl_id}')
1462
1532
  from .insertable_table import InsertableTable
@@ -1470,7 +1540,7 @@ class Catalog:
1470
1540
  if has_pending_ops:
1471
1541
  raise PendingTableOpsError(tbl_id)
1472
1542
 
1473
- q = (
1543
+ q: sql.Executable = (
1474
1544
  sql.select(schema.Table, schema.TableSchemaVersion)
1475
1545
  .join(schema.TableSchemaVersion)
1476
1546
  .where(schema.Table.id == schema.TableSchemaVersion.tbl_id)
@@ -1486,13 +1556,34 @@ class Catalog:
1486
1556
 
1487
1557
  tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
1488
1558
  view_md = tbl_md.view_md
1559
+
1560
+ if tbl_md.is_replica and not tbl_md.is_snapshot:
1561
+ # If this is a non-snapshot replica, we have to load it as a specific version handle. This is because:
1562
+ # (1) the head version might be a version fragment that isn't user-accessible, and
1563
+ # (2) the cached data in view_md.base_versions is not reliable, since the replicated version does not
1564
+ # necessarily track the head version of the originally shared table.
1565
+
1566
+ # Query for the latest non-fragment table version.
1567
+ q = (
1568
+ sql.select(schema.TableVersion.version)
1569
+ .where(schema.TableVersion.tbl_id == tbl_id)
1570
+ .where(schema.TableVersion.md['is_fragment'].astext == 'false')
1571
+ .order_by(schema.TableVersion.md['version'].cast(sql.Integer).desc())
1572
+ .limit(1)
1573
+ )
1574
+ row = conn.execute(q).one_or_none()
1575
+ if row is not None:
1576
+ version = row[0]
1577
+ return self._load_tbl_at_version(tbl_id, version)
1578
+ return None
1579
+
1489
1580
  if view_md is None and not tbl_md.is_replica:
1490
- # this is a base table
1581
+ # this is a base, non-replica table
1491
1582
  if (tbl_id, None) not in self._tbl_versions:
1492
1583
  _ = self._load_tbl_version(tbl_id, None)
1493
1584
  tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
1494
1585
  self._tbls[tbl_id, None] = tbl
1495
- return
1586
+ return tbl
1496
1587
 
1497
1588
  # this is a view; determine the sequence of TableVersions to load
1498
1589
  tbl_version_path: list[tuple[UUID, Optional[int]]] = []
@@ -1517,8 +1608,9 @@ class Catalog:
1517
1608
  base_path = view_path
1518
1609
  view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=tbl_md.is_pure_snapshot)
1519
1610
  self._tbls[tbl_id, None] = view
1611
+ return view
1520
1612
 
1521
- def _load_tbl_at_version(self, tbl_id: UUID, version: int) -> None:
1613
+ def _load_tbl_at_version(self, tbl_id: UUID, version: int) -> Optional[Table]:
1522
1614
  from .view import View
1523
1615
 
1524
1616
  # Load the specified TableMd and TableVersionMd records from the db.
@@ -1578,6 +1670,7 @@ class Catalog:
1578
1670
 
1579
1671
  view = View(tbl_id, tbl_record.dir_id, tbl_md.name, tvp, snapshot_only=True)
1580
1672
  self._tbls[tbl_id, version] = view
1673
+ return view
1581
1674
 
1582
1675
  @retry_loop(for_write=False)
1583
1676
  def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
@@ -1724,10 +1817,29 @@ class Catalog:
1724
1817
  assert version_md.tbl_id == str(tbl_id)
1725
1818
  if schema_version_md is not None:
1726
1819
  assert version_md.schema_version == schema_version_md.schema_version
1727
- tbl_version_record = schema.TableVersion(
1728
- tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
1820
+ tv_rows = (
1821
+ session.query(schema.TableVersion)
1822
+ .filter(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version_md.version)
1823
+ .all()
1729
1824
  )
1730
- session.add(tbl_version_record)
1825
+ if len(tv_rows) == 0:
1826
+ # It's a new table version; insert a new record in the DB for it.
1827
+ tbl_version_record = schema.TableVersion(
1828
+ tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
1829
+ )
1830
+ session.add(tbl_version_record)
1831
+ else:
1832
+ # This table version already exists; update it.
1833
+ assert len(tv_rows) == 1 # must be unique
1834
+ tv = tv_rows[0]
1835
+ # Validate that the only field that can change is 'is_fragment'.
1836
+ assert tv.md == dataclasses.asdict(dataclasses.replace(version_md, is_fragment=tv.md['is_fragment']))
1837
+ result = session.execute(
1838
+ sql.update(schema.TableVersion.__table__)
1839
+ .values({schema.TableVersion.md: dataclasses.asdict(version_md)})
1840
+ .where(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version_md.version)
1841
+ )
1842
+ assert result.rowcount == 1, result.rowcount
1731
1843
 
1732
1844
  # Construct and insert a new schema version record if requested.
1733
1845
  if schema_version_md is not None:
@@ -1796,6 +1908,10 @@ class Catalog:
1796
1908
  # destination catalog.
1797
1909
  ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
1798
1910
  ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
1911
+ # Also, the table version of every proper ancestor is ephemeral; it does not represent a queryable
1912
+ # table version (the data might be incomplete, since we have only retrieved one of its views, not
1913
+ # the table itself).
1914
+ ancestor_md.version_md.is_fragment = True
1799
1915
 
1800
1916
  return md
1801
1917
 
@@ -27,6 +27,25 @@ class Column:
27
27
 
28
28
  A Column contains all the metadata necessary for executing queries and updates against a particular version of a
29
29
  table/view.
30
+
31
+ Args:
32
+ name: column name; None for system columns (eg, index columns)
33
+ col_type: column type; can be None if the type can be derived from ``computed_with``
34
+ computed_with: an Expr that computes the column value
35
+ is_pk: if True, this column is part of the primary key
36
+ stored: determines whether a computed column is present in the stored table or recomputed on demand
37
+ destination: An object store reference for persisting computed files
38
+ col_id: column ID (only used internally)
39
+
40
+ Computed columns: those have a non-None ``computed_with`` argument
41
+ - when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
42
+ col_type is None
43
+ - when loaded from md store: ``computed_with`` is set and col_type is set
44
+
45
+ ``stored`` (only valid for computed columns):
46
+ - if True: the column is present in the stored table
47
+ - if False: the column is not present in the stored table and recomputed during a query
48
+ - if None: the system chooses for you (at present, this is always False, but this may change in the future)
30
49
  """
31
50
 
32
51
  name: str
@@ -34,6 +53,7 @@ class Column:
34
53
  col_type: ts.ColumnType
35
54
  stored: bool
36
55
  is_pk: bool
56
+ destination: Optional[str] # An object store reference for computed files
37
57
  _media_validation: Optional[MediaValidation] # if not set, TableVersion.media_validation applies
38
58
  schema_version_add: Optional[int]
39
59
  schema_version_drop: Optional[int]
@@ -62,27 +82,8 @@ class Column:
62
82
  stores_cellmd: Optional[bool] = None,
63
83
  value_expr_dict: Optional[dict[str, Any]] = None,
64
84
  tbl: Optional[TableVersion] = None,
85
+ destination: Optional[str] = None,
65
86
  ):
66
- """Column constructor.
67
-
68
- Args:
69
- name: column name; None for system columns (eg, index columns)
70
- col_type: column type; can be None if the type can be derived from ``computed_with``
71
- computed_with: an Expr that computes the column value
72
- is_pk: if True, this column is part of the primary key
73
- stored: determines whether a computed column is present in the stored table or recomputed on demand
74
- col_id: column ID (only used internally)
75
-
76
- Computed columns: those have a non-None ``computed_with`` argument
77
- - when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
78
- col_type is None
79
- - when loaded from md store: ``computed_with`` is set and col_type is set
80
-
81
- ``stored`` (only valid for computed columns):
82
- - if True: the column is present in the stored table
83
- - if False: the column is not present in the stored table and recomputed during a query
84
- - if None: the system chooses for you (at present, this is always False, but this may change in the future)
85
- """
86
87
  if name is not None and not is_valid_identifier(name):
87
88
  raise excs.Error(f"Invalid column name: '{name}'")
88
89
  self.name = name
@@ -126,6 +127,7 @@ class Column:
126
127
 
127
128
  # computed cols also have storage columns for the exception string and type
128
129
  self.sa_cellmd_col = None
130
+ self.destination = destination
129
131
 
130
132
  def to_md(self, pos: Optional[int] = None) -> tuple[schema.ColumnMd, Optional[schema.SchemaColumn]]:
131
133
  """Returns the Column and optional SchemaColumn metadata for this Column."""
@@ -138,6 +140,7 @@ class Column:
138
140
  schema_version_drop=self.schema_version_drop,
139
141
  value_expr=self.value_expr.as_dict() if self.value_expr is not None else None,
140
142
  stored=self.stored,
143
+ destination=self.destination,
141
144
  )
142
145
  if pos is None:
143
146
  return col_md, None
@@ -172,6 +175,7 @@ class Column:
172
175
  schema_version_drop=col_md.schema_version_drop,
173
176
  value_expr_dict=col_md.value_expr,
174
177
  tbl=tbl,
178
+ destination=col_md.destination,
175
179
  )
176
180
  return col
177
181
 
@@ -24,6 +24,7 @@ from pixeltable.catalog.table_metadata import (
24
24
  )
25
25
  from pixeltable.metadata import schema
26
26
  from pixeltable.metadata.utils import MetadataUtils
27
+ from pixeltable.utils.object_stores import ObjectOps
27
28
 
28
29
  from ..exprs import ColumnRef
29
30
  from ..utils.description_helper import DescriptionHelper
@@ -51,6 +52,7 @@ if TYPE_CHECKING:
51
52
  import pixeltable.plan
52
53
  from pixeltable.globals import TableDataSource
53
54
 
55
+
54
56
  _logger = logging.getLogger('pixeltable')
55
57
 
56
58
 
@@ -489,8 +491,7 @@ class Table(SchemaObject):
489
491
  Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed
490
492
  columns, use [`add_computed_column()`][pixeltable.catalog.Table.add_computed_column] instead.
491
493
 
492
- The format of the `schema` argument is identical to the format of the schema in a call to
493
- [`create_table()`][pixeltable.globals.create_table].
494
+ The format of the `schema` argument is a dict mapping column names to their types.
494
495
 
495
496
  Args:
496
497
  schema: A dictionary mapping column names to types.
@@ -603,6 +604,7 @@ class Table(SchemaObject):
603
604
  self,
604
605
  *,
605
606
  stored: Optional[bool] = None,
607
+ destination: Optional[str | Path] = None,
606
608
  print_stats: bool = False,
607
609
  on_error: Literal['abort', 'ignore'] = 'abort',
608
610
  if_exists: Literal['error', 'ignore', 'replace'] = 'error',
@@ -614,6 +616,7 @@ class Table(SchemaObject):
614
616
  Args:
615
617
  kwargs: Exactly one keyword argument of the form `col_name=expression`.
616
618
  stored: Whether the column is materialized and stored or computed on demand.
619
+ destination: An object store reference for persisting computed files.
617
620
  print_stats: If `True`, print execution metrics during evaluation.
618
621
  on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
619
622
  row.
@@ -664,6 +667,9 @@ class Table(SchemaObject):
664
667
  if stored is not None:
665
668
  col_schema['stored'] = stored
666
669
 
670
+ if destination is not None:
671
+ col_schema['destination'] = destination
672
+
667
673
  # Raise an error if the column expression refers to a column error property
668
674
  if isinstance(spec, exprs.Expr):
669
675
  for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
@@ -678,7 +684,7 @@ class Table(SchemaObject):
678
684
  [col_name], IfExistsParam.validated(if_exists, 'if_exists')
679
685
  )
680
686
  # if the column to add already exists and user asked to ignore
681
- # exiting column, there's nothing to do.
687
+ # existing column, there's nothing to do.
682
688
  result = UpdateStatus()
683
689
  if len(cols_to_ignore) != 0:
684
690
  assert cols_to_ignore[0] == col_name
@@ -699,7 +705,7 @@ class Table(SchemaObject):
699
705
  (on account of containing Python Callables or Exprs).
700
706
  """
701
707
  assert isinstance(spec, dict)
702
- valid_keys = {'type', 'value', 'stored', 'media_validation'}
708
+ valid_keys = {'type', 'value', 'stored', 'media_validation', 'destination'}
703
709
  for k in spec:
704
710
  if k not in valid_keys:
705
711
  raise excs.Error(f'Column {name}: invalid key {k!r}')
@@ -723,6 +729,10 @@ class Table(SchemaObject):
723
729
  if 'stored' in spec and not isinstance(spec['stored'], bool):
724
730
  raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
725
731
 
732
+ d = spec.get('destination')
733
+ if d is not None and not isinstance(d, (str, Path)):
734
+ raise excs.Error(f'Column {name}: `destination` must be a string or path, got {d}')
735
+
726
736
  @classmethod
727
737
  def _create_columns(cls, schema: dict[str, Any]) -> list[Column]:
728
738
  """Construct list of Columns, given schema"""
@@ -733,6 +743,7 @@ class Table(SchemaObject):
733
743
  primary_key: bool = False
734
744
  media_validation: Optional[catalog.MediaValidation] = None
735
745
  stored = True
746
+ destination: Optional[str] = None
736
747
 
737
748
  if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
738
749
  col_type = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
@@ -757,6 +768,8 @@ class Table(SchemaObject):
757
768
  media_validation = (
758
769
  catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None else None
759
770
  )
771
+ if 'destination' in spec:
772
+ destination = ObjectOps.validate_destination(spec['destination'], name)
760
773
  else:
761
774
  raise excs.Error(f'Invalid value for column {name!r}')
762
775
 
@@ -767,6 +780,7 @@ class Table(SchemaObject):
767
780
  stored=stored,
768
781
  is_pk=primary_key,
769
782
  media_validation=media_validation,
783
+ destination=destination,
770
784
  )
771
785
  columns.append(column)
772
786
  return columns
@@ -792,14 +806,16 @@ class Table(SchemaObject):
792
806
  f'streaming function'
793
807
  )
794
808
  )
809
+ if col.destination is not None and not (col.stored and col.is_computed):
810
+ raise excs.Error(
811
+ f'Column {col.name!r}: destination={col.destination} only applies to stored computed columns'
812
+ )
795
813
 
796
814
  @classmethod
797
815
  def _verify_schema(cls, schema: list[Column]) -> None:
798
816
  """Check integrity of user-supplied schema and set defaults"""
799
- column_names: set[str] = set()
800
817
  for col in schema:
801
818
  cls._verify_column(col)
802
- column_names.add(col.name)
803
819
 
804
820
  def drop_column(self, column: str | ColumnRef, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
805
821
  """Drop a column from the table.
@@ -1797,7 +1813,7 @@ class Table(SchemaObject):
1797
1813
  return pd.DataFrame([list(v.values()) for v in versions], columns=list(versions[0].keys()))
1798
1814
 
1799
1815
  def __check_mutable(self, op_descr: str) -> None:
1816
+ if self._tbl_version_path.is_replica():
1817
+ raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a replica.')
1800
1818
  if self._tbl_version_path.is_snapshot():
1801
1819
  raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a snapshot.')
1802
- if self._tbl_version_path.is_replica():
1803
- raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a {self._display_name()}.')