pixeltable 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable has been flagged as potentially problematic; consult the package registry advisory for more details.

Files changed (55)
  1. pixeltable/__init__.py +2 -1
  2. pixeltable/catalog/catalog.py +187 -63
  3. pixeltable/catalog/column.py +24 -20
  4. pixeltable/catalog/table.py +24 -8
  5. pixeltable/catalog/table_metadata.py +1 -0
  6. pixeltable/catalog/table_version.py +16 -34
  7. pixeltable/catalog/update_status.py +12 -0
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +4 -2
  11. pixeltable/env.py +46 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/expr_eval/expr_eval_node.py +11 -0
  17. pixeltable/exec/in_memory_data_node.py +1 -1
  18. pixeltable/exec/object_store_save_node.py +299 -0
  19. pixeltable/exec/sql_node.py +28 -33
  20. pixeltable/exprs/data_row.py +31 -25
  21. pixeltable/exprs/json_path.py +6 -5
  22. pixeltable/exprs/row_builder.py +6 -12
  23. pixeltable/functions/gemini.py +1 -1
  24. pixeltable/functions/openai.py +1 -1
  25. pixeltable/functions/video.py +128 -15
  26. pixeltable/functions/whisperx.py +2 -0
  27. pixeltable/functions/yolox.py +2 -0
  28. pixeltable/globals.py +49 -30
  29. pixeltable/index/embedding_index.py +5 -8
  30. pixeltable/io/__init__.py +1 -0
  31. pixeltable/io/fiftyone.py +1 -1
  32. pixeltable/io/label_studio.py +4 -5
  33. pixeltable/iterators/__init__.py +1 -0
  34. pixeltable/iterators/audio.py +1 -1
  35. pixeltable/iterators/document.py +10 -12
  36. pixeltable/iterators/video.py +1 -1
  37. pixeltable/metadata/schema.py +7 -0
  38. pixeltable/plan.py +26 -1
  39. pixeltable/share/packager.py +8 -2
  40. pixeltable/share/publish.py +3 -10
  41. pixeltable/store.py +1 -1
  42. pixeltable/type_system.py +1 -3
  43. pixeltable/utils/dbms.py +31 -5
  44. pixeltable/utils/gcs_store.py +283 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/METADATA +1 -1
  50. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/RECORD +53 -50
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/licenses/LICENSE +0 -0
pixeltable/__init__.py CHANGED
@@ -20,7 +20,6 @@ from .globals import (
20
20
  array,
21
21
  configure_logging,
22
22
  create_dir,
23
- create_replica,
24
23
  create_snapshot,
25
24
  create_table,
26
25
  create_view,
@@ -34,6 +33,8 @@ from .globals import (
34
33
  list_tables,
35
34
  ls,
36
35
  move,
36
+ publish,
37
+ replicate,
37
38
  tool,
38
39
  tools,
39
40
  )
@@ -14,8 +14,6 @@ import psycopg
14
14
  import sqlalchemy as sql
15
15
 
16
16
  from pixeltable import exceptions as excs
17
-
18
- # from pixeltable import exceptions as excs, UpdateStatus
19
17
  from pixeltable.env import Env
20
18
  from pixeltable.iterators import ComponentIterator
21
19
  from pixeltable.metadata import schema
@@ -409,6 +407,11 @@ class Catalog:
409
407
  else:
410
408
  raise
411
409
 
410
+ except KeyboardInterrupt:
411
+ has_exc = True
412
+ _logger.debug('Caught KeyboardInterrupt')
413
+ raise
414
+
412
415
  except:
413
416
  has_exc = True
414
417
  raise
@@ -429,6 +432,9 @@ class Catalog:
429
432
  # stored metadata
430
433
  for handle in self._modified_tvs:
431
434
  self._clear_tv_cache(handle.id, handle.effective_version)
435
+ # Clear potentially corrupted cached metadata after error
436
+ if tbl is not None:
437
+ tbl.clear_cached_md()
432
438
  self._modified_tvs = set()
433
439
 
434
440
  @property
@@ -906,9 +912,9 @@ class Catalog:
906
912
  """Must be executed inside a transaction. Might raise PendingTableOpsError."""
907
913
  if (tbl_id, version) not in self._tbls:
908
914
  if version is None:
909
- self._load_tbl(tbl_id)
915
+ return self._load_tbl(tbl_id)
910
916
  else:
911
- self._load_tbl_at_version(tbl_id, version)
917
+ return self._load_tbl_at_version(tbl_id, version)
912
918
  return self._tbls.get((tbl_id, version))
913
919
 
914
920
  @retry_loop(for_write=True)
@@ -1040,23 +1046,18 @@ class Catalog:
1040
1046
  )
1041
1047
 
1042
1048
  # Ensure that the system directory exists.
1043
- self._create_dir(Path.parse('_system', allow_system_path=True), if_exists=IfExistsParam.IGNORE, parents=False)
1049
+ self.__ensure_system_dir_exists()
1044
1050
 
1045
1051
  # Now check to see if this table already exists in the catalog.
1046
1052
  existing = self.get_table_by_id(tbl_id)
1047
1053
  if existing is not None:
1048
1054
  existing_path = Path.parse(existing._path(), allow_system_path=True)
1049
- if existing_path != path:
1055
+ if existing_path != path and not existing_path.is_system_path:
1050
1056
  # It does exist, under a different path from the specified one.
1051
- if not existing_path.is_system_path:
1052
- raise excs.Error(
1053
- f'That table has already been replicated as {existing_path!r}.\n'
1054
- f'Drop the existing replica if you wish to re-create it.'
1055
- )
1056
- # If it's a system table, then this means it was created at some point as the ancestor of some other
1057
- # table (a snapshot-over-snapshot scenario). In that case, we simply move it to the new (named)
1058
- # location.
1059
- self._move(existing_path, path)
1057
+ raise excs.Error(
1058
+ f'That table has already been replicated as {existing_path!r}.\n'
1059
+ f'Drop the existing replica if you wish to re-create it.'
1060
+ )
1060
1061
 
1061
1062
  # Now store the metadata for this replica's proper ancestors. If one or more proper ancestors
1062
1063
  # do not yet exist in the store, they will be created as anonymous system tables.
@@ -1084,14 +1085,31 @@ class Catalog:
1084
1085
  # the new TableVersion instance. This is necessary because computed columns of descendant tables might
1085
1086
  # reference columns of the ancestor table that only exist in the new version.
1086
1087
  replica = Catalog.get().get_table_by_id(ancestor_id)
1087
- assert replica is not None # If it didn't exist before, it must have been created by now.
1088
- replica._tbl_version_path.clear_cached_md()
1088
+ # assert replica is not None # If it didn't exist before, it must have been created by now.
1089
+ if replica is not None:
1090
+ replica._tbl_version_path.clear_cached_md()
1089
1091
 
1090
- # Finally, store the metadata for the table being replicated; as before, it could be a new version or a known
1091
- # version. If it's a new version, then a TableVersion record will be created, unless the table being replicated
1092
+ # Store the metadata for the table being replicated; as before, it could be a new version or a known version.
1093
+ # If it's a new version, then a TableVersion record will be created, unless the table being replicated
1092
1094
  # is a pure snapshot.
1093
1095
  self.__store_replica_md(path, md[0])
1094
1096
 
1097
+ # Finally, it's possible that the table already exists in the catalog, but as an anonymous system table that
1098
+ # was hidden the last time we checked (and that just became visible when the replica was imported). In this
1099
+ # case, we need to make the existing table visible by moving it to the specified path.
1100
+ # We need to do this at the end, since `existing_path` needs to first have a non-fragment table version in
1101
+ # order to be instantiated as a schema object.
1102
+ existing = self.get_table_by_id(tbl_id)
1103
+ if existing is not None:
1104
+ existing_path = Path.parse(existing._path(), allow_system_path=True)
1105
+ if existing_path != path:
1106
+ assert existing_path.is_system_path
1107
+ self._move(existing_path, path)
1108
+
1109
+ def __ensure_system_dir_exists(self) -> Dir:
1110
+ system_path = Path.parse('_system', allow_system_path=True)
1111
+ return self._create_dir(system_path, if_exists=IfExistsParam.IGNORE, parents=False)
1112
+
1095
1113
  def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
1096
1114
  _logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
1097
1115
  dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
@@ -1104,6 +1122,7 @@ class Catalog:
1104
1122
  new_tbl_md: Optional[schema.TableMd] = None
1105
1123
  new_version_md: Optional[schema.TableVersionMd] = None
1106
1124
  new_schema_version_md: Optional[schema.TableSchemaVersionMd] = None
1125
+ is_new_tbl_version: bool = False
1107
1126
 
1108
1127
  # We need to ensure that the table metadata in the catalog always reflects the latest observed version of
1109
1128
  # this table. (In particular, if this is a base table, then its table metadata need to be consistent
@@ -1138,14 +1157,21 @@ class Catalog:
1138
1157
  existing_version_md_row = conn.execute(q).one_or_none()
1139
1158
  if existing_version_md_row is None:
1140
1159
  new_version_md = md.version_md
1160
+ is_new_tbl_version = True
1141
1161
  else:
1142
1162
  existing_version_md = schema.md_from_dict(schema.TableVersionMd, existing_version_md_row.md)
1143
- if existing_version_md != md.version_md:
1163
+ # Validate that the existing metadata are identical to the new metadata, except that their is_fragment
1164
+ # flags may differ.
1165
+ if dataclasses.replace(existing_version_md, is_fragment=md.version_md.is_fragment) != md.version_md:
1144
1166
  raise excs.Error(
1145
1167
  f'The version metadata for the replica {path!r}:{md.version_md.version} is inconsistent with '
1146
1168
  'the metadata recorded from a prior replica.\n'
1147
1169
  'This is likely due to data corruption in the replicated table.'
1148
1170
  )
1171
+ if existing_version_md.is_fragment and not md.version_md.is_fragment:
1172
+ # This version exists in the DB as a fragment, but we're importing a complete copy of the same version;
1173
+ # set the is_fragment flag to False in the DB.
1174
+ new_version_md = md.version_md
1149
1175
 
1150
1176
  # Do the same thing for TableSchemaVersion.
1151
1177
  q = (
@@ -1162,6 +1188,7 @@ class Catalog:
1162
1188
  existing_schema_version_md = schema.md_from_dict(
1163
1189
  schema.TableSchemaVersionMd, existing_schema_version_md_row.md
1164
1190
  )
1191
+ # Validate that the existing metadata are identical to the new metadata.
1165
1192
  if existing_schema_version_md != md.schema_version_md:
1166
1193
  raise excs.Error(
1167
1194
  f'The schema version metadata for the replica {path!r}:{md.schema_version_md.schema_version} '
@@ -1171,7 +1198,7 @@ class Catalog:
1171
1198
 
1172
1199
  self.store_tbl_md(UUID(tbl_id), None, new_tbl_md, new_version_md, new_schema_version_md)
1173
1200
 
1174
- if new_version_md is not None and not md.is_pure_snapshot:
1201
+ if is_new_tbl_version and not md.is_pure_snapshot:
1175
1202
  # It's a new version of a table that has a physical store, so we need to create a TableVersion instance.
1176
1203
  TableVersion.create_replica(md)
1177
1204
 
@@ -1206,41 +1233,72 @@ class Catalog:
1206
1233
 
1207
1234
  self._drop_tbl(tbl, force=force, is_replace=False)
1208
1235
 
1209
- def _drop_tbl(self, tbl: Table, force: bool, is_replace: bool) -> None:
1236
+ def _drop_tbl(self, tbl: Table | TableVersionPath, force: bool, is_replace: bool) -> None:
1210
1237
  """
1211
1238
  Drop the table (and recursively its views, if force == True).
1212
1239
 
1240
+ `tbl` can be an instance of `Table` for a user table, or `TableVersionPath` for a hidden (system) table.
1241
+
1213
1242
  Locking protocol:
1214
1243
  - X-lock base before X-locking any view
1215
1244
  - deadlock-free wrt to TableVersion.insert() (insert propagation also proceeds top-down)
1216
1245
  - X-locks parent dir prior to calling TableVersion.drop(): prevent concurrent creation of another SchemaObject
1217
1246
  in the same directory with the same name (which could lead to duplicate names if we get aborted)
1218
1247
  """
1219
- self._acquire_dir_xlock(dir_id=tbl._dir_id)
1220
- self._acquire_tbl_lock(tbl_id=tbl._id, for_write=True, lock_mutable_tree=False)
1248
+ is_pure_snapshot: bool
1249
+ if isinstance(tbl, TableVersionPath):
1250
+ tvp = tbl
1251
+ tbl_id = tvp.tbl_id
1252
+ tbl = None
1253
+ is_pure_snapshot = False
1254
+ else:
1255
+ tvp = tbl._tbl_version_path
1256
+ tbl_id = tbl._id
1257
+ is_pure_snapshot = tbl._tbl_version is None
1258
+
1259
+ if tbl is not None:
1260
+ self._acquire_dir_xlock(dir_id=tbl._dir_id)
1261
+ self._acquire_tbl_lock(tbl_id=tbl_id, for_write=True, lock_mutable_tree=False)
1262
+
1263
+ view_ids = self.get_view_ids(tbl_id, for_update=True)
1264
+ is_replica = tvp.is_replica()
1265
+ do_drop = True
1266
+
1267
+ _logger.debug(f'Preparing to drop table {tbl_id} (force={force!r}, is_replica={is_replica}).')
1221
1268
 
1222
- view_ids = self.get_view_ids(tbl._id, for_update=True)
1223
1269
  if len(view_ids) > 0:
1224
- if not force:
1225
- is_snapshot = tbl._tbl_version_path.is_snapshot()
1226
- obj_type_str = 'Snapshot' if is_snapshot else tbl._display_name().capitalize()
1270
+ if force:
1271
+ # recursively drop views first
1272
+ for view_id in view_ids:
1273
+ view = self.get_table_by_id(view_id)
1274
+ self._drop_tbl(view, force=force, is_replace=is_replace)
1275
+
1276
+ elif is_replica:
1277
+ # Dropping a replica with dependents and no 'force': just rename it to be a hidden table;
1278
+ # the actual table will not be dropped.
1279
+ assert tbl is not None # can only occur for a user table
1280
+ system_dir = self.__ensure_system_dir_exists()
1281
+ new_name = f'replica_{tbl_id.hex}'
1282
+ _logger.debug(f'{tbl._path()!r} is a replica with dependents; renaming to {new_name!r}.')
1283
+ tbl._move(new_name, system_dir._id)
1284
+ do_drop = False # don't actually clear the catalog for this table
1285
+
1286
+ else:
1287
+ # It has dependents but is not a replica and no 'force', so it's an error to drop it.
1288
+ assert tbl is not None # can only occur for a user table
1227
1289
  msg: str
1228
1290
  if is_replace:
1229
1291
  msg = (
1230
- f'{obj_type_str} {tbl._path()} already exists and has dependents. '
1292
+ f'{tbl._display_name()} {tbl._path()!r} already exists and has dependents. '
1231
1293
  "Use `if_exists='replace_force'` to replace it."
1232
1294
  )
1233
1295
  else:
1234
- msg = f'{obj_type_str} {tbl._path()} has dependents.'
1296
+ msg = f'{tbl._display_name()} {tbl._path()!r} has dependents.'
1235
1297
  raise excs.Error(msg)
1236
1298
 
1237
- for view_id in view_ids:
1238
- view = self.get_table_by_id(view_id)
1239
- self._drop_tbl(view, force=force, is_replace=is_replace)
1240
-
1241
1299
  # if this is a mutable view of a mutable base, advance the base's view_sn
1242
- if isinstance(tbl, View) and tbl._tbl_version_path.is_mutable() and tbl._tbl_version_path.base.is_mutable():
1243
- base_id = tbl._tbl_version_path.base.tbl_id
1300
+ if isinstance(tbl, View) and tvp.is_mutable() and tvp.base.is_mutable():
1301
+ base_id = tvp.base.tbl_id
1244
1302
  base_tv = self.get_tbl_version(base_id, None, validate_initialized=True)
1245
1303
  base_tv.tbl_md.view_sn += 1
1246
1304
  self._modified_tvs.add(base_tv.handle)
@@ -1251,26 +1309,46 @@ class Catalog:
1251
1309
  )
1252
1310
  assert result.rowcount == 1, result.rowcount
1253
1311
 
1254
- if tbl._tbl_version is not None:
1255
- # invalidate the TableVersion instance when we're done so that existing references to it can find out it
1256
- # has been dropped
1257
- self._modified_tvs.add(tbl._tbl_version)
1258
- tv = tbl._tbl_version.get() if tbl._tbl_version is not None else None
1259
- # if tv is not None:
1260
- # tv = tbl._tbl_version.get()
1261
- # # invalidate the TableVersion instance so that existing references to it can find out it has been dropped
1262
- # tv.is_validated = False
1263
- if tbl._tbl_version is not None:
1264
- # drop the store table before deleting the Table record
1265
- tv = tbl._tbl_version.get()
1266
- tv.drop()
1267
-
1268
- self.delete_tbl_md(tbl._id)
1269
- assert (tbl._id, None) in self._tbls
1270
- versions = [k[1] for k in self._tbls if k[0] == tbl._id]
1312
+ if do_drop:
1313
+ if not is_pure_snapshot:
1314
+ # invalidate the TableVersion instance when we're done so that existing references to it can find out it
1315
+ # has been dropped
1316
+ self._modified_tvs.add(tvp.tbl_version)
1317
+ tv = tvp.tbl_version.get() if tvp.tbl_version is not None else None
1318
+ if not is_pure_snapshot:
1319
+ # drop the store table before deleting the Table record
1320
+ tv = tvp.tbl_version.get()
1321
+ tv.drop()
1322
+
1323
+ self.delete_tbl_md(tbl_id)
1324
+ tvp.clear_cached_md()
1325
+
1326
+ assert (
1327
+ is_replica
1328
+ or (tbl_id, None) in self._tbls # non-replica tables must have an entry with effective_version=None
1329
+ )
1330
+
1331
+ # Remove visible Table references (we do this even for a replica that was just renamed).
1332
+ versions = [version for id, version in self._tbls if id == tbl_id]
1271
1333
  for version in versions:
1272
- del self._tbls[tbl._id, version]
1273
- _logger.info(f'Dropped table `{tbl._path()}`.')
1334
+ del self._tbls[tbl_id, version]
1335
+
1336
+ _logger.info(f'Dropped table {tbl_id if tbl is None else repr(tbl._path())}.')
1337
+
1338
+ if (
1339
+ is_replica # if this is a replica,
1340
+ and do_drop # and it was actually dropped (not just renamed),
1341
+ and tvp.base is not None # and it has a base table,
1342
+ ):
1343
+ base_tbl = self.get_table_by_id(tvp.base.tbl_id)
1344
+ base_tbl_path = None if base_tbl is None else Path.parse(base_tbl._path(), allow_system_path=True)
1345
+ if (
1346
+ (base_tbl_path is None or base_tbl_path.is_system_path) # and the base table is hidden,
1347
+ and len(self.get_view_ids(tvp.base.tbl_id, for_update=True)) == 0 # and has no other dependents,
1348
+ ):
1349
+ # then drop the base table as well (possibly recursively).
1350
+ _logger.debug(f'Dropping hidden base table {tvp.base.tbl_id} of dropped replica {tbl_id}.')
1351
+ self._drop_tbl(tvp.base, force=False, is_replace=False)
1274
1352
 
1275
1353
  @retry_loop(for_write=True)
1276
1354
  def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
@@ -1456,7 +1534,7 @@ class Catalog:
1456
1534
  row = conn.execute(q).one_or_none()
1457
1535
  return schema.Dir(**row._mapping) if row is not None else None
1458
1536
 
1459
- def _load_tbl(self, tbl_id: UUID) -> None:
1537
+ def _load_tbl(self, tbl_id: UUID) -> Optional[Table]:
1460
1538
  """Loads metadata for the table with the given id and caches it."""
1461
1539
  _logger.info(f'Loading table {tbl_id}')
1462
1540
  from .insertable_table import InsertableTable
@@ -1470,7 +1548,7 @@ class Catalog:
1470
1548
  if has_pending_ops:
1471
1549
  raise PendingTableOpsError(tbl_id)
1472
1550
 
1473
- q = (
1551
+ q: sql.Executable = (
1474
1552
  sql.select(schema.Table, schema.TableSchemaVersion)
1475
1553
  .join(schema.TableSchemaVersion)
1476
1554
  .where(schema.Table.id == schema.TableSchemaVersion.tbl_id)
@@ -1486,13 +1564,34 @@ class Catalog:
1486
1564
 
1487
1565
  tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
1488
1566
  view_md = tbl_md.view_md
1567
+
1568
+ if tbl_md.is_replica and not tbl_md.is_snapshot:
1569
+ # If this is a non-snapshot replica, we have to load it as a specific version handle. This is because:
1570
+ # (1) the head version might be a version fragment that isn't user-accessible, and
1571
+ # (2) the cached data in view_md.base_versions is not reliable, since the replicated version does not
1572
+ # necessarily track the head version of the originally shared table.
1573
+
1574
+ # Query for the latest non-fragment table version.
1575
+ q = (
1576
+ sql.select(schema.TableVersion.version)
1577
+ .where(schema.TableVersion.tbl_id == tbl_id)
1578
+ .where(schema.TableVersion.md['is_fragment'].astext == 'false')
1579
+ .order_by(schema.TableVersion.md['version'].cast(sql.Integer).desc())
1580
+ .limit(1)
1581
+ )
1582
+ row = conn.execute(q).one_or_none()
1583
+ if row is not None:
1584
+ version = row[0]
1585
+ return self._load_tbl_at_version(tbl_id, version)
1586
+ return None
1587
+
1489
1588
  if view_md is None and not tbl_md.is_replica:
1490
- # this is a base table
1589
+ # this is a base, non-replica table
1491
1590
  if (tbl_id, None) not in self._tbl_versions:
1492
1591
  _ = self._load_tbl_version(tbl_id, None)
1493
1592
  tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
1494
1593
  self._tbls[tbl_id, None] = tbl
1495
- return
1594
+ return tbl
1496
1595
 
1497
1596
  # this is a view; determine the sequence of TableVersions to load
1498
1597
  tbl_version_path: list[tuple[UUID, Optional[int]]] = []
@@ -1517,8 +1616,9 @@ class Catalog:
1517
1616
  base_path = view_path
1518
1617
  view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=tbl_md.is_pure_snapshot)
1519
1618
  self._tbls[tbl_id, None] = view
1619
+ return view
1520
1620
 
1521
- def _load_tbl_at_version(self, tbl_id: UUID, version: int) -> None:
1621
+ def _load_tbl_at_version(self, tbl_id: UUID, version: int) -> Optional[Table]:
1522
1622
  from .view import View
1523
1623
 
1524
1624
  # Load the specified TableMd and TableVersionMd records from the db.
@@ -1578,6 +1678,7 @@ class Catalog:
1578
1678
 
1579
1679
  view = View(tbl_id, tbl_record.dir_id, tbl_md.name, tvp, snapshot_only=True)
1580
1680
  self._tbls[tbl_id, version] = view
1681
+ return view
1581
1682
 
1582
1683
  @retry_loop(for_write=False)
1583
1684
  def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
@@ -1724,10 +1825,29 @@ class Catalog:
1724
1825
  assert version_md.tbl_id == str(tbl_id)
1725
1826
  if schema_version_md is not None:
1726
1827
  assert version_md.schema_version == schema_version_md.schema_version
1727
- tbl_version_record = schema.TableVersion(
1728
- tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
1828
+ tv_rows = (
1829
+ session.query(schema.TableVersion)
1830
+ .filter(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version_md.version)
1831
+ .all()
1729
1832
  )
1730
- session.add(tbl_version_record)
1833
+ if len(tv_rows) == 0:
1834
+ # It's a new table version; insert a new record in the DB for it.
1835
+ tbl_version_record = schema.TableVersion(
1836
+ tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
1837
+ )
1838
+ session.add(tbl_version_record)
1839
+ else:
1840
+ # This table version already exists; update it.
1841
+ assert len(tv_rows) == 1 # must be unique
1842
+ tv = tv_rows[0]
1843
+ # Validate that the only field that can change is 'is_fragment'.
1844
+ assert tv.md == dataclasses.asdict(dataclasses.replace(version_md, is_fragment=tv.md['is_fragment']))
1845
+ result = session.execute(
1846
+ sql.update(schema.TableVersion.__table__)
1847
+ .values({schema.TableVersion.md: dataclasses.asdict(version_md)})
1848
+ .where(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version_md.version)
1849
+ )
1850
+ assert result.rowcount == 1, result.rowcount
1731
1851
 
1732
1852
  # Construct and insert a new schema version record if requested.
1733
1853
  if schema_version_md is not None:
@@ -1796,6 +1916,10 @@ class Catalog:
1796
1916
  # destination catalog.
1797
1917
  ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
1798
1918
  ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
1919
+ # Also, the table version of every proper ancestor is ephemeral; it does not represent a queryable
1920
+ # table version (the data might be incomplete, since we have only retrieved one of its views, not
1921
+ # the table itself).
1922
+ ancestor_md.version_md.is_fragment = True
1799
1923
 
1800
1924
  return md
1801
1925
 
@@ -27,6 +27,25 @@ class Column:
27
27
 
28
28
  A Column contains all the metadata necessary for executing queries and updates against a particular version of a
29
29
  table/view.
30
+
31
+ Args:
32
+ name: column name; None for system columns (eg, index columns)
33
+ col_type: column type; can be None if the type can be derived from ``computed_with``
34
+ computed_with: an Expr that computes the column value
35
+ is_pk: if True, this column is part of the primary key
36
+ stored: determines whether a computed column is present in the stored table or recomputed on demand
37
+ destination: An object store reference for persisting computed files
38
+ col_id: column ID (only used internally)
39
+
40
+ Computed columns: those have a non-None ``computed_with`` argument
41
+ - when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
42
+ col_type is None
43
+ - when loaded from md store: ``computed_with`` is set and col_type is set
44
+
45
+ ``stored`` (only valid for computed columns):
46
+ - if True: the column is present in the stored table
47
+ - if False: the column is not present in the stored table and recomputed during a query
48
+ - if None: the system chooses for you (at present, this is always False, but this may change in the future)
30
49
  """
31
50
 
32
51
  name: str
@@ -34,6 +53,7 @@ class Column:
34
53
  col_type: ts.ColumnType
35
54
  stored: bool
36
55
  is_pk: bool
56
+ destination: Optional[str] # An object store reference for computed files
37
57
  _media_validation: Optional[MediaValidation] # if not set, TableVersion.media_validation applies
38
58
  schema_version_add: Optional[int]
39
59
  schema_version_drop: Optional[int]
@@ -62,27 +82,8 @@ class Column:
62
82
  stores_cellmd: Optional[bool] = None,
63
83
  value_expr_dict: Optional[dict[str, Any]] = None,
64
84
  tbl: Optional[TableVersion] = None,
85
+ destination: Optional[str] = None,
65
86
  ):
66
- """Column constructor.
67
-
68
- Args:
69
- name: column name; None for system columns (eg, index columns)
70
- col_type: column type; can be None if the type can be derived from ``computed_with``
71
- computed_with: an Expr that computes the column value
72
- is_pk: if True, this column is part of the primary key
73
- stored: determines whether a computed column is present in the stored table or recomputed on demand
74
- col_id: column ID (only used internally)
75
-
76
- Computed columns: those have a non-None ``computed_with`` argument
77
- - when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
78
- col_type is None
79
- - when loaded from md store: ``computed_with`` is set and col_type is set
80
-
81
- ``stored`` (only valid for computed columns):
82
- - if True: the column is present in the stored table
83
- - if False: the column is not present in the stored table and recomputed during a query
84
- - if None: the system chooses for you (at present, this is always False, but this may change in the future)
85
- """
86
87
  if name is not None and not is_valid_identifier(name):
87
88
  raise excs.Error(f"Invalid column name: '{name}'")
88
89
  self.name = name
@@ -126,6 +127,7 @@ class Column:
126
127
 
127
128
  # computed cols also have storage columns for the exception string and type
128
129
  self.sa_cellmd_col = None
130
+ self.destination = destination
129
131
 
130
132
  def to_md(self, pos: Optional[int] = None) -> tuple[schema.ColumnMd, Optional[schema.SchemaColumn]]:
131
133
  """Returns the Column and optional SchemaColumn metadata for this Column."""
@@ -138,6 +140,7 @@ class Column:
138
140
  schema_version_drop=self.schema_version_drop,
139
141
  value_expr=self.value_expr.as_dict() if self.value_expr is not None else None,
140
142
  stored=self.stored,
143
+ destination=self.destination,
141
144
  )
142
145
  if pos is None:
143
146
  return col_md, None
@@ -172,6 +175,7 @@ class Column:
172
175
  schema_version_drop=col_md.schema_version_drop,
173
176
  value_expr_dict=col_md.value_expr,
174
177
  tbl=tbl,
178
+ destination=col_md.destination,
175
179
  )
176
180
  return col
177
181
 
@@ -24,6 +24,7 @@ from pixeltable.catalog.table_metadata import (
24
24
  )
25
25
  from pixeltable.metadata import schema
26
26
  from pixeltable.metadata.utils import MetadataUtils
27
+ from pixeltable.utils.object_stores import ObjectOps
27
28
 
28
29
  from ..exprs import ColumnRef
29
30
  from ..utils.description_helper import DescriptionHelper
@@ -51,6 +52,7 @@ if TYPE_CHECKING:
51
52
  import pixeltable.plan
52
53
  from pixeltable.globals import TableDataSource
53
54
 
55
+
54
56
  _logger = logging.getLogger('pixeltable')
55
57
 
56
58
 
@@ -489,8 +491,7 @@ class Table(SchemaObject):
489
491
  Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed
490
492
  columns, use [`add_computed_column()`][pixeltable.catalog.Table.add_computed_column] instead.
491
493
 
492
- The format of the `schema` argument is identical to the format of the schema in a call to
493
- [`create_table()`][pixeltable.globals.create_table].
494
+ The format of the `schema` argument is a dict mapping column names to their types.
494
495
 
495
496
  Args:
496
497
  schema: A dictionary mapping column names to types.
@@ -603,6 +604,7 @@ class Table(SchemaObject):
603
604
  self,
604
605
  *,
605
606
  stored: Optional[bool] = None,
607
+ destination: Optional[str | Path] = None,
606
608
  print_stats: bool = False,
607
609
  on_error: Literal['abort', 'ignore'] = 'abort',
608
610
  if_exists: Literal['error', 'ignore', 'replace'] = 'error',
@@ -614,6 +616,7 @@ class Table(SchemaObject):
614
616
  Args:
615
617
  kwargs: Exactly one keyword argument of the form `col_name=expression`.
616
618
  stored: Whether the column is materialized and stored or computed on demand.
619
+ destination: An object store reference for persisting computed files.
617
620
  print_stats: If `True`, print execution metrics during evaluation.
618
621
  on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
619
622
  row.
@@ -664,6 +667,9 @@ class Table(SchemaObject):
664
667
  if stored is not None:
665
668
  col_schema['stored'] = stored
666
669
 
670
+ if destination is not None:
671
+ col_schema['destination'] = destination
672
+
667
673
  # Raise an error if the column expression refers to a column error property
668
674
  if isinstance(spec, exprs.Expr):
669
675
  for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
@@ -678,7 +684,7 @@ class Table(SchemaObject):
678
684
  [col_name], IfExistsParam.validated(if_exists, 'if_exists')
679
685
  )
680
686
  # if the column to add already exists and user asked to ignore
681
- # exiting column, there's nothing to do.
687
+ # existing column, there's nothing to do.
682
688
  result = UpdateStatus()
683
689
  if len(cols_to_ignore) != 0:
684
690
  assert cols_to_ignore[0] == col_name
@@ -699,7 +705,7 @@ class Table(SchemaObject):
699
705
  (on account of containing Python Callables or Exprs).
700
706
  """
701
707
  assert isinstance(spec, dict)
702
- valid_keys = {'type', 'value', 'stored', 'media_validation'}
708
+ valid_keys = {'type', 'value', 'stored', 'media_validation', 'destination'}
703
709
  for k in spec:
704
710
  if k not in valid_keys:
705
711
  raise excs.Error(f'Column {name}: invalid key {k!r}')
@@ -723,6 +729,10 @@ class Table(SchemaObject):
723
729
  if 'stored' in spec and not isinstance(spec['stored'], bool):
724
730
  raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
725
731
 
732
+ d = spec.get('destination')
733
+ if d is not None and not isinstance(d, (str, Path)):
734
+ raise excs.Error(f'Column {name}: `destination` must be a string or path, got {d}')
735
+
726
736
  @classmethod
727
737
  def _create_columns(cls, schema: dict[str, Any]) -> list[Column]:
728
738
  """Construct list of Columns, given schema"""
@@ -733,6 +743,7 @@ class Table(SchemaObject):
733
743
  primary_key: bool = False
734
744
  media_validation: Optional[catalog.MediaValidation] = None
735
745
  stored = True
746
+ destination: Optional[str] = None
736
747
 
737
748
  if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
738
749
  col_type = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
@@ -757,6 +768,8 @@ class Table(SchemaObject):
757
768
  media_validation = (
758
769
  catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None else None
759
770
  )
771
+ if 'destination' in spec:
772
+ destination = ObjectOps.validate_destination(spec['destination'], name)
760
773
  else:
761
774
  raise excs.Error(f'Invalid value for column {name!r}')
762
775
 
@@ -767,6 +780,7 @@ class Table(SchemaObject):
767
780
  stored=stored,
768
781
  is_pk=primary_key,
769
782
  media_validation=media_validation,
783
+ destination=destination,
770
784
  )
771
785
  columns.append(column)
772
786
  return columns
@@ -792,14 +806,16 @@ class Table(SchemaObject):
792
806
  f'streaming function'
793
807
  )
794
808
  )
809
+ if col.destination is not None and not (col.stored and col.is_computed):
810
+ raise excs.Error(
811
+ f'Column {col.name!r}: destination={col.destination} only applies to stored computed columns'
812
+ )
795
813
 
796
814
  @classmethod
797
815
  def _verify_schema(cls, schema: list[Column]) -> None:
798
816
  """Check integrity of user-supplied schema and set defaults"""
799
- column_names: set[str] = set()
800
817
  for col in schema:
801
818
  cls._verify_column(col)
802
- column_names.add(col.name)
803
819
 
804
820
  def drop_column(self, column: str | ColumnRef, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
805
821
  """Drop a column from the table.
@@ -1797,7 +1813,7 @@ class Table(SchemaObject):
1797
1813
  return pd.DataFrame([list(v.values()) for v in versions], columns=list(versions[0].keys()))
1798
1814
 
1799
1815
  def __check_mutable(self, op_descr: str) -> None:
1816
+ if self._tbl_version_path.is_replica():
1817
+ raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a replica.')
1800
1818
  if self._tbl_version_path.is_snapshot():
1801
1819
  raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a snapshot.')
1802
- if self._tbl_version_path.is_replica():
1803
- raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a {self._display_name()}.')
@@ -38,6 +38,7 @@ class IndexMetadata(TypedDict):
38
38
  index_type: Literal['embedding']
39
39
  """The type of index (currently only `'embedding'` is supported, but others will be added in the future)."""
40
40
  parameters: EmbeddingIndexParams
41
+ """Parameters specific to the index type."""
41
42
 
42
43
 
43
44
  class TableMetadata(TypedDict):