pixeltable 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +2 -1
- pixeltable/catalog/catalog.py +187 -63
- pixeltable/catalog/column.py +24 -20
- pixeltable/catalog/table.py +24 -8
- pixeltable/catalog/table_metadata.py +1 -0
- pixeltable/catalog/table_version.py +16 -34
- pixeltable/catalog/update_status.py +12 -0
- pixeltable/catalog/view.py +22 -22
- pixeltable/config.py +2 -0
- pixeltable/dataframe.py +4 -2
- pixeltable/env.py +46 -21
- pixeltable/exec/__init__.py +1 -0
- pixeltable/exec/aggregation_node.py +0 -1
- pixeltable/exec/cache_prefetch_node.py +74 -98
- pixeltable/exec/data_row_batch.py +2 -18
- pixeltable/exec/expr_eval/expr_eval_node.py +11 -0
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/object_store_save_node.py +299 -0
- pixeltable/exec/sql_node.py +28 -33
- pixeltable/exprs/data_row.py +31 -25
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/row_builder.py +6 -12
- pixeltable/functions/gemini.py +1 -1
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/video.py +128 -15
- pixeltable/functions/whisperx.py +2 -0
- pixeltable/functions/yolox.py +2 -0
- pixeltable/globals.py +49 -30
- pixeltable/index/embedding_index.py +5 -8
- pixeltable/io/__init__.py +1 -0
- pixeltable/io/fiftyone.py +1 -1
- pixeltable/io/label_studio.py +4 -5
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +1 -1
- pixeltable/iterators/document.py +10 -12
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/schema.py +7 -0
- pixeltable/plan.py +26 -1
- pixeltable/share/packager.py +8 -2
- pixeltable/share/publish.py +3 -10
- pixeltable/store.py +1 -1
- pixeltable/type_system.py +1 -3
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/object_stores.py +497 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +354 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/METADATA +1 -1
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/RECORD +53 -50
- pixeltable/utils/media_store.py +0 -248
- pixeltable/utils/s3.py +0 -17
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/licenses/LICENSE +0 -0
pixeltable/__init__.py
CHANGED
|
@@ -20,7 +20,6 @@ from .globals import (
|
|
|
20
20
|
array,
|
|
21
21
|
configure_logging,
|
|
22
22
|
create_dir,
|
|
23
|
-
create_replica,
|
|
24
23
|
create_snapshot,
|
|
25
24
|
create_table,
|
|
26
25
|
create_view,
|
|
@@ -34,6 +33,8 @@ from .globals import (
|
|
|
34
33
|
list_tables,
|
|
35
34
|
ls,
|
|
36
35
|
move,
|
|
36
|
+
publish,
|
|
37
|
+
replicate,
|
|
37
38
|
tool,
|
|
38
39
|
tools,
|
|
39
40
|
)
|
pixeltable/catalog/catalog.py
CHANGED
|
@@ -14,8 +14,6 @@ import psycopg
|
|
|
14
14
|
import sqlalchemy as sql
|
|
15
15
|
|
|
16
16
|
from pixeltable import exceptions as excs
|
|
17
|
-
|
|
18
|
-
# from pixeltable import exceptions as excs, UpdateStatus
|
|
19
17
|
from pixeltable.env import Env
|
|
20
18
|
from pixeltable.iterators import ComponentIterator
|
|
21
19
|
from pixeltable.metadata import schema
|
|
@@ -409,6 +407,11 @@ class Catalog:
|
|
|
409
407
|
else:
|
|
410
408
|
raise
|
|
411
409
|
|
|
410
|
+
except KeyboardInterrupt:
|
|
411
|
+
has_exc = True
|
|
412
|
+
_logger.debug('Caught KeyboardInterrupt')
|
|
413
|
+
raise
|
|
414
|
+
|
|
412
415
|
except:
|
|
413
416
|
has_exc = True
|
|
414
417
|
raise
|
|
@@ -429,6 +432,9 @@ class Catalog:
|
|
|
429
432
|
# stored metadata
|
|
430
433
|
for handle in self._modified_tvs:
|
|
431
434
|
self._clear_tv_cache(handle.id, handle.effective_version)
|
|
435
|
+
# Clear potentially corrupted cached metadata after error
|
|
436
|
+
if tbl is not None:
|
|
437
|
+
tbl.clear_cached_md()
|
|
432
438
|
self._modified_tvs = set()
|
|
433
439
|
|
|
434
440
|
@property
|
|
@@ -906,9 +912,9 @@ class Catalog:
|
|
|
906
912
|
"""Must be executed inside a transaction. Might raise PendingTableOpsError."""
|
|
907
913
|
if (tbl_id, version) not in self._tbls:
|
|
908
914
|
if version is None:
|
|
909
|
-
self._load_tbl(tbl_id)
|
|
915
|
+
return self._load_tbl(tbl_id)
|
|
910
916
|
else:
|
|
911
|
-
self._load_tbl_at_version(tbl_id, version)
|
|
917
|
+
return self._load_tbl_at_version(tbl_id, version)
|
|
912
918
|
return self._tbls.get((tbl_id, version))
|
|
913
919
|
|
|
914
920
|
@retry_loop(for_write=True)
|
|
@@ -1040,23 +1046,18 @@ class Catalog:
|
|
|
1040
1046
|
)
|
|
1041
1047
|
|
|
1042
1048
|
# Ensure that the system directory exists.
|
|
1043
|
-
self.
|
|
1049
|
+
self.__ensure_system_dir_exists()
|
|
1044
1050
|
|
|
1045
1051
|
# Now check to see if this table already exists in the catalog.
|
|
1046
1052
|
existing = self.get_table_by_id(tbl_id)
|
|
1047
1053
|
if existing is not None:
|
|
1048
1054
|
existing_path = Path.parse(existing._path(), allow_system_path=True)
|
|
1049
|
-
if existing_path != path:
|
|
1055
|
+
if existing_path != path and not existing_path.is_system_path:
|
|
1050
1056
|
# It does exist, under a different path from the specified one.
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
)
|
|
1056
|
-
# If it's a system table, then this means it was created at some point as the ancestor of some other
|
|
1057
|
-
# table (a snapshot-over-snapshot scenario). In that case, we simply move it to the new (named)
|
|
1058
|
-
# location.
|
|
1059
|
-
self._move(existing_path, path)
|
|
1057
|
+
raise excs.Error(
|
|
1058
|
+
f'That table has already been replicated as {existing_path!r}.\n'
|
|
1059
|
+
f'Drop the existing replica if you wish to re-create it.'
|
|
1060
|
+
)
|
|
1060
1061
|
|
|
1061
1062
|
# Now store the metadata for this replica's proper ancestors. If one or more proper ancestors
|
|
1062
1063
|
# do not yet exist in the store, they will be created as anonymous system tables.
|
|
@@ -1084,14 +1085,31 @@ class Catalog:
|
|
|
1084
1085
|
# the new TableVersion instance. This is necessary because computed columns of descendant tables might
|
|
1085
1086
|
# reference columns of the ancestor table that only exist in the new version.
|
|
1086
1087
|
replica = Catalog.get().get_table_by_id(ancestor_id)
|
|
1087
|
-
assert replica is not None # If it didn't exist before, it must have been created by now.
|
|
1088
|
-
replica
|
|
1088
|
+
# assert replica is not None # If it didn't exist before, it must have been created by now.
|
|
1089
|
+
if replica is not None:
|
|
1090
|
+
replica._tbl_version_path.clear_cached_md()
|
|
1089
1091
|
|
|
1090
|
-
#
|
|
1091
|
-
#
|
|
1092
|
+
# Store the metadata for the table being replicated; as before, it could be a new version or a known version.
|
|
1093
|
+
# If it's a new version, then a TableVersion record will be created, unless the table being replicated
|
|
1092
1094
|
# is a pure snapshot.
|
|
1093
1095
|
self.__store_replica_md(path, md[0])
|
|
1094
1096
|
|
|
1097
|
+
# Finally, it's possible that the table already exists in the catalog, but as an anonymous system table that
|
|
1098
|
+
# was hidden the last time we checked (and that just became visible when the replica was imported). In this
|
|
1099
|
+
# case, we need to make the existing table visible by moving it to the specified path.
|
|
1100
|
+
# We need to do this at the end, since `existing_path` needs to first have a non-fragment table version in
|
|
1101
|
+
# order to be instantiated as a schema object.
|
|
1102
|
+
existing = self.get_table_by_id(tbl_id)
|
|
1103
|
+
if existing is not None:
|
|
1104
|
+
existing_path = Path.parse(existing._path(), allow_system_path=True)
|
|
1105
|
+
if existing_path != path:
|
|
1106
|
+
assert existing_path.is_system_path
|
|
1107
|
+
self._move(existing_path, path)
|
|
1108
|
+
|
|
1109
|
+
def __ensure_system_dir_exists(self) -> Dir:
|
|
1110
|
+
system_path = Path.parse('_system', allow_system_path=True)
|
|
1111
|
+
return self._create_dir(system_path, if_exists=IfExistsParam.IGNORE, parents=False)
|
|
1112
|
+
|
|
1095
1113
|
def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
|
|
1096
1114
|
_logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
|
|
1097
1115
|
dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
|
|
@@ -1104,6 +1122,7 @@ class Catalog:
|
|
|
1104
1122
|
new_tbl_md: Optional[schema.TableMd] = None
|
|
1105
1123
|
new_version_md: Optional[schema.TableVersionMd] = None
|
|
1106
1124
|
new_schema_version_md: Optional[schema.TableSchemaVersionMd] = None
|
|
1125
|
+
is_new_tbl_version: bool = False
|
|
1107
1126
|
|
|
1108
1127
|
# We need to ensure that the table metadata in the catalog always reflects the latest observed version of
|
|
1109
1128
|
# this table. (In particular, if this is a base table, then its table metadata need to be consistent
|
|
@@ -1138,14 +1157,21 @@ class Catalog:
|
|
|
1138
1157
|
existing_version_md_row = conn.execute(q).one_or_none()
|
|
1139
1158
|
if existing_version_md_row is None:
|
|
1140
1159
|
new_version_md = md.version_md
|
|
1160
|
+
is_new_tbl_version = True
|
|
1141
1161
|
else:
|
|
1142
1162
|
existing_version_md = schema.md_from_dict(schema.TableVersionMd, existing_version_md_row.md)
|
|
1143
|
-
|
|
1163
|
+
# Validate that the existing metadata are identical to the new metadata, except that their is_fragment
|
|
1164
|
+
# flags may differ.
|
|
1165
|
+
if dataclasses.replace(existing_version_md, is_fragment=md.version_md.is_fragment) != md.version_md:
|
|
1144
1166
|
raise excs.Error(
|
|
1145
1167
|
f'The version metadata for the replica {path!r}:{md.version_md.version} is inconsistent with '
|
|
1146
1168
|
'the metadata recorded from a prior replica.\n'
|
|
1147
1169
|
'This is likely due to data corruption in the replicated table.'
|
|
1148
1170
|
)
|
|
1171
|
+
if existing_version_md.is_fragment and not md.version_md.is_fragment:
|
|
1172
|
+
# This version exists in the DB as a fragment, but we're importing a complete copy of the same version;
|
|
1173
|
+
# set the is_fragment flag to False in the DB.
|
|
1174
|
+
new_version_md = md.version_md
|
|
1149
1175
|
|
|
1150
1176
|
# Do the same thing for TableSchemaVersion.
|
|
1151
1177
|
q = (
|
|
@@ -1162,6 +1188,7 @@ class Catalog:
|
|
|
1162
1188
|
existing_schema_version_md = schema.md_from_dict(
|
|
1163
1189
|
schema.TableSchemaVersionMd, existing_schema_version_md_row.md
|
|
1164
1190
|
)
|
|
1191
|
+
# Validate that the existing metadata are identical to the new metadata.
|
|
1165
1192
|
if existing_schema_version_md != md.schema_version_md:
|
|
1166
1193
|
raise excs.Error(
|
|
1167
1194
|
f'The schema version metadata for the replica {path!r}:{md.schema_version_md.schema_version} '
|
|
@@ -1171,7 +1198,7 @@ class Catalog:
|
|
|
1171
1198
|
|
|
1172
1199
|
self.store_tbl_md(UUID(tbl_id), None, new_tbl_md, new_version_md, new_schema_version_md)
|
|
1173
1200
|
|
|
1174
|
-
if
|
|
1201
|
+
if is_new_tbl_version and not md.is_pure_snapshot:
|
|
1175
1202
|
# It's a new version of a table that has a physical store, so we need to create a TableVersion instance.
|
|
1176
1203
|
TableVersion.create_replica(md)
|
|
1177
1204
|
|
|
@@ -1206,41 +1233,72 @@ class Catalog:
|
|
|
1206
1233
|
|
|
1207
1234
|
self._drop_tbl(tbl, force=force, is_replace=False)
|
|
1208
1235
|
|
|
1209
|
-
def _drop_tbl(self, tbl: Table, force: bool, is_replace: bool) -> None:
|
|
1236
|
+
def _drop_tbl(self, tbl: Table | TableVersionPath, force: bool, is_replace: bool) -> None:
|
|
1210
1237
|
"""
|
|
1211
1238
|
Drop the table (and recursively its views, if force == True).
|
|
1212
1239
|
|
|
1240
|
+
`tbl` can be an instance of `Table` for a user table, or `TableVersionPath` for a hidden (system) table.
|
|
1241
|
+
|
|
1213
1242
|
Locking protocol:
|
|
1214
1243
|
- X-lock base before X-locking any view
|
|
1215
1244
|
- deadlock-free with respect to TableVersion.insert() (insert propagation also proceeds top-down)
|
|
1216
1245
|
- X-locks parent dir prior to calling TableVersion.drop(): prevent concurrent creation of another SchemaObject
|
|
1217
1246
|
in the same directory with the same name (which could lead to duplicate names if we get aborted)
|
|
1218
1247
|
"""
|
|
1219
|
-
|
|
1220
|
-
|
|
1248
|
+
is_pure_snapshot: bool
|
|
1249
|
+
if isinstance(tbl, TableVersionPath):
|
|
1250
|
+
tvp = tbl
|
|
1251
|
+
tbl_id = tvp.tbl_id
|
|
1252
|
+
tbl = None
|
|
1253
|
+
is_pure_snapshot = False
|
|
1254
|
+
else:
|
|
1255
|
+
tvp = tbl._tbl_version_path
|
|
1256
|
+
tbl_id = tbl._id
|
|
1257
|
+
is_pure_snapshot = tbl._tbl_version is None
|
|
1258
|
+
|
|
1259
|
+
if tbl is not None:
|
|
1260
|
+
self._acquire_dir_xlock(dir_id=tbl._dir_id)
|
|
1261
|
+
self._acquire_tbl_lock(tbl_id=tbl_id, for_write=True, lock_mutable_tree=False)
|
|
1262
|
+
|
|
1263
|
+
view_ids = self.get_view_ids(tbl_id, for_update=True)
|
|
1264
|
+
is_replica = tvp.is_replica()
|
|
1265
|
+
do_drop = True
|
|
1266
|
+
|
|
1267
|
+
_logger.debug(f'Preparing to drop table {tbl_id} (force={force!r}, is_replica={is_replica}).')
|
|
1221
1268
|
|
|
1222
|
-
view_ids = self.get_view_ids(tbl._id, for_update=True)
|
|
1223
1269
|
if len(view_ids) > 0:
|
|
1224
|
-
if
|
|
1225
|
-
|
|
1226
|
-
|
|
1270
|
+
if force:
|
|
1271
|
+
# recursively drop views first
|
|
1272
|
+
for view_id in view_ids:
|
|
1273
|
+
view = self.get_table_by_id(view_id)
|
|
1274
|
+
self._drop_tbl(view, force=force, is_replace=is_replace)
|
|
1275
|
+
|
|
1276
|
+
elif is_replica:
|
|
1277
|
+
# Dropping a replica with dependents and no 'force': just rename it to be a hidden table;
|
|
1278
|
+
# the actual table will not be dropped.
|
|
1279
|
+
assert tbl is not None # can only occur for a user table
|
|
1280
|
+
system_dir = self.__ensure_system_dir_exists()
|
|
1281
|
+
new_name = f'replica_{tbl_id.hex}'
|
|
1282
|
+
_logger.debug(f'{tbl._path()!r} is a replica with dependents; renaming to {new_name!r}.')
|
|
1283
|
+
tbl._move(new_name, system_dir._id)
|
|
1284
|
+
do_drop = False # don't actually clear the catalog for this table
|
|
1285
|
+
|
|
1286
|
+
else:
|
|
1287
|
+
# It has dependents but is not a replica and no 'force', so it's an error to drop it.
|
|
1288
|
+
assert tbl is not None # can only occur for a user table
|
|
1227
1289
|
msg: str
|
|
1228
1290
|
if is_replace:
|
|
1229
1291
|
msg = (
|
|
1230
|
-
f'{
|
|
1292
|
+
f'{tbl._display_name()} {tbl._path()!r} already exists and has dependents. '
|
|
1231
1293
|
"Use `if_exists='replace_force'` to replace it."
|
|
1232
1294
|
)
|
|
1233
1295
|
else:
|
|
1234
|
-
msg = f'{
|
|
1296
|
+
msg = f'{tbl._display_name()} {tbl._path()!r} has dependents.'
|
|
1235
1297
|
raise excs.Error(msg)
|
|
1236
1298
|
|
|
1237
|
-
for view_id in view_ids:
|
|
1238
|
-
view = self.get_table_by_id(view_id)
|
|
1239
|
-
self._drop_tbl(view, force=force, is_replace=is_replace)
|
|
1240
|
-
|
|
1241
1299
|
# if this is a mutable view of a mutable base, advance the base's view_sn
|
|
1242
|
-
if isinstance(tbl, View) and
|
|
1243
|
-
base_id =
|
|
1300
|
+
if isinstance(tbl, View) and tvp.is_mutable() and tvp.base.is_mutable():
|
|
1301
|
+
base_id = tvp.base.tbl_id
|
|
1244
1302
|
base_tv = self.get_tbl_version(base_id, None, validate_initialized=True)
|
|
1245
1303
|
base_tv.tbl_md.view_sn += 1
|
|
1246
1304
|
self._modified_tvs.add(base_tv.handle)
|
|
@@ -1251,26 +1309,46 @@ class Catalog:
|
|
|
1251
1309
|
)
|
|
1252
1310
|
assert result.rowcount == 1, result.rowcount
|
|
1253
1311
|
|
|
1254
|
-
if
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1312
|
+
if do_drop:
|
|
1313
|
+
if not is_pure_snapshot:
|
|
1314
|
+
# invalidate the TableVersion instance when we're done so that existing references to it can find out it
|
|
1315
|
+
# has been dropped
|
|
1316
|
+
self._modified_tvs.add(tvp.tbl_version)
|
|
1317
|
+
tv = tvp.tbl_version.get() if tvp.tbl_version is not None else None
|
|
1318
|
+
if not is_pure_snapshot:
|
|
1319
|
+
# drop the store table before deleting the Table record
|
|
1320
|
+
tv = tvp.tbl_version.get()
|
|
1321
|
+
tv.drop()
|
|
1322
|
+
|
|
1323
|
+
self.delete_tbl_md(tbl_id)
|
|
1324
|
+
tvp.clear_cached_md()
|
|
1325
|
+
|
|
1326
|
+
assert (
|
|
1327
|
+
is_replica
|
|
1328
|
+
or (tbl_id, None) in self._tbls # non-replica tables must have an entry with effective_version=None
|
|
1329
|
+
)
|
|
1330
|
+
|
|
1331
|
+
# Remove visible Table references (we do this even for a replica that was just renamed).
|
|
1332
|
+
versions = [version for id, version in self._tbls if id == tbl_id]
|
|
1271
1333
|
for version in versions:
|
|
1272
|
-
del self._tbls[
|
|
1273
|
-
|
|
1334
|
+
del self._tbls[tbl_id, version]
|
|
1335
|
+
|
|
1336
|
+
_logger.info(f'Dropped table {tbl_id if tbl is None else repr(tbl._path())}.')
|
|
1337
|
+
|
|
1338
|
+
if (
|
|
1339
|
+
is_replica # if this is a replica,
|
|
1340
|
+
and do_drop # and it was actually dropped (not just renamed),
|
|
1341
|
+
and tvp.base is not None # and it has a base table,
|
|
1342
|
+
):
|
|
1343
|
+
base_tbl = self.get_table_by_id(tvp.base.tbl_id)
|
|
1344
|
+
base_tbl_path = None if base_tbl is None else Path.parse(base_tbl._path(), allow_system_path=True)
|
|
1345
|
+
if (
|
|
1346
|
+
(base_tbl_path is None or base_tbl_path.is_system_path) # and the base table is hidden,
|
|
1347
|
+
and len(self.get_view_ids(tvp.base.tbl_id, for_update=True)) == 0 # and has no other dependents,
|
|
1348
|
+
):
|
|
1349
|
+
# then drop the base table as well (possibly recursively).
|
|
1350
|
+
_logger.debug(f'Dropping hidden base table {tvp.base.tbl_id} of dropped replica {tbl_id}.')
|
|
1351
|
+
self._drop_tbl(tvp.base, force=False, is_replace=False)
|
|
1274
1352
|
|
|
1275
1353
|
@retry_loop(for_write=True)
|
|
1276
1354
|
def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
|
|
@@ -1456,7 +1534,7 @@ class Catalog:
|
|
|
1456
1534
|
row = conn.execute(q).one_or_none()
|
|
1457
1535
|
return schema.Dir(**row._mapping) if row is not None else None
|
|
1458
1536
|
|
|
1459
|
-
def _load_tbl(self, tbl_id: UUID) ->
|
|
1537
|
+
def _load_tbl(self, tbl_id: UUID) -> Optional[Table]:
|
|
1460
1538
|
"""Loads metadata for the table with the given id and caches it."""
|
|
1461
1539
|
_logger.info(f'Loading table {tbl_id}')
|
|
1462
1540
|
from .insertable_table import InsertableTable
|
|
@@ -1470,7 +1548,7 @@ class Catalog:
|
|
|
1470
1548
|
if has_pending_ops:
|
|
1471
1549
|
raise PendingTableOpsError(tbl_id)
|
|
1472
1550
|
|
|
1473
|
-
q = (
|
|
1551
|
+
q: sql.Executable = (
|
|
1474
1552
|
sql.select(schema.Table, schema.TableSchemaVersion)
|
|
1475
1553
|
.join(schema.TableSchemaVersion)
|
|
1476
1554
|
.where(schema.Table.id == schema.TableSchemaVersion.tbl_id)
|
|
@@ -1486,13 +1564,34 @@ class Catalog:
|
|
|
1486
1564
|
|
|
1487
1565
|
tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
|
|
1488
1566
|
view_md = tbl_md.view_md
|
|
1567
|
+
|
|
1568
|
+
if tbl_md.is_replica and not tbl_md.is_snapshot:
|
|
1569
|
+
# If this is a non-snapshot replica, we have to load it as a specific version handle. This is because:
|
|
1570
|
+
# (1) the head version might be a version fragment that isn't user-accessible, and
|
|
1571
|
+
# (2) the cached data in view_md.base_versions is not reliable, since the replicated version does not
|
|
1572
|
+
# necessarily track the head version of the originally shared table.
|
|
1573
|
+
|
|
1574
|
+
# Query for the latest non-fragment table version.
|
|
1575
|
+
q = (
|
|
1576
|
+
sql.select(schema.TableVersion.version)
|
|
1577
|
+
.where(schema.TableVersion.tbl_id == tbl_id)
|
|
1578
|
+
.where(schema.TableVersion.md['is_fragment'].astext == 'false')
|
|
1579
|
+
.order_by(schema.TableVersion.md['version'].cast(sql.Integer).desc())
|
|
1580
|
+
.limit(1)
|
|
1581
|
+
)
|
|
1582
|
+
row = conn.execute(q).one_or_none()
|
|
1583
|
+
if row is not None:
|
|
1584
|
+
version = row[0]
|
|
1585
|
+
return self._load_tbl_at_version(tbl_id, version)
|
|
1586
|
+
return None
|
|
1587
|
+
|
|
1489
1588
|
if view_md is None and not tbl_md.is_replica:
|
|
1490
|
-
# this is a base table
|
|
1589
|
+
# this is a base, non-replica table
|
|
1491
1590
|
if (tbl_id, None) not in self._tbl_versions:
|
|
1492
1591
|
_ = self._load_tbl_version(tbl_id, None)
|
|
1493
1592
|
tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
|
|
1494
1593
|
self._tbls[tbl_id, None] = tbl
|
|
1495
|
-
return
|
|
1594
|
+
return tbl
|
|
1496
1595
|
|
|
1497
1596
|
# this is a view; determine the sequence of TableVersions to load
|
|
1498
1597
|
tbl_version_path: list[tuple[UUID, Optional[int]]] = []
|
|
@@ -1517,8 +1616,9 @@ class Catalog:
|
|
|
1517
1616
|
base_path = view_path
|
|
1518
1617
|
view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=tbl_md.is_pure_snapshot)
|
|
1519
1618
|
self._tbls[tbl_id, None] = view
|
|
1619
|
+
return view
|
|
1520
1620
|
|
|
1521
|
-
def _load_tbl_at_version(self, tbl_id: UUID, version: int) ->
|
|
1621
|
+
def _load_tbl_at_version(self, tbl_id: UUID, version: int) -> Optional[Table]:
|
|
1522
1622
|
from .view import View
|
|
1523
1623
|
|
|
1524
1624
|
# Load the specified TableMd and TableVersionMd records from the db.
|
|
@@ -1578,6 +1678,7 @@ class Catalog:
|
|
|
1578
1678
|
|
|
1579
1679
|
view = View(tbl_id, tbl_record.dir_id, tbl_md.name, tvp, snapshot_only=True)
|
|
1580
1680
|
self._tbls[tbl_id, version] = view
|
|
1681
|
+
return view
|
|
1581
1682
|
|
|
1582
1683
|
@retry_loop(for_write=False)
|
|
1583
1684
|
def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
|
|
@@ -1724,10 +1825,29 @@ class Catalog:
|
|
|
1724
1825
|
assert version_md.tbl_id == str(tbl_id)
|
|
1725
1826
|
if schema_version_md is not None:
|
|
1726
1827
|
assert version_md.schema_version == schema_version_md.schema_version
|
|
1727
|
-
|
|
1728
|
-
|
|
1828
|
+
tv_rows = (
|
|
1829
|
+
session.query(schema.TableVersion)
|
|
1830
|
+
.filter(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version_md.version)
|
|
1831
|
+
.all()
|
|
1729
1832
|
)
|
|
1730
|
-
|
|
1833
|
+
if len(tv_rows) == 0:
|
|
1834
|
+
# It's a new table version; insert a new record in the DB for it.
|
|
1835
|
+
tbl_version_record = schema.TableVersion(
|
|
1836
|
+
tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
|
|
1837
|
+
)
|
|
1838
|
+
session.add(tbl_version_record)
|
|
1839
|
+
else:
|
|
1840
|
+
# This table version already exists; update it.
|
|
1841
|
+
assert len(tv_rows) == 1 # must be unique
|
|
1842
|
+
tv = tv_rows[0]
|
|
1843
|
+
# Validate that the only field that can change is 'is_fragment'.
|
|
1844
|
+
assert tv.md == dataclasses.asdict(dataclasses.replace(version_md, is_fragment=tv.md['is_fragment']))
|
|
1845
|
+
result = session.execute(
|
|
1846
|
+
sql.update(schema.TableVersion.__table__)
|
|
1847
|
+
.values({schema.TableVersion.md: dataclasses.asdict(version_md)})
|
|
1848
|
+
.where(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version_md.version)
|
|
1849
|
+
)
|
|
1850
|
+
assert result.rowcount == 1, result.rowcount
|
|
1731
1851
|
|
|
1732
1852
|
# Construct and insert a new schema version record if requested.
|
|
1733
1853
|
if schema_version_md is not None:
|
|
@@ -1796,6 +1916,10 @@ class Catalog:
|
|
|
1796
1916
|
# destination catalog.
|
|
1797
1917
|
ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
|
|
1798
1918
|
ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
|
|
1919
|
+
# Also, the table version of every proper ancestor is ephemeral; it does not represent a queryable
|
|
1920
|
+
# table version (the data might be incomplete, since we have only retrieved one of its views, not
|
|
1921
|
+
# the table itself).
|
|
1922
|
+
ancestor_md.version_md.is_fragment = True
|
|
1799
1923
|
|
|
1800
1924
|
return md
|
|
1801
1925
|
|
pixeltable/catalog/column.py
CHANGED
|
@@ -27,6 +27,25 @@ class Column:
|
|
|
27
27
|
|
|
28
28
|
A Column contains all the metadata necessary for executing queries and updates against a particular version of a
|
|
29
29
|
table/view.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
name: column name; None for system columns (eg, index columns)
|
|
33
|
+
col_type: column type; can be None if the type can be derived from ``computed_with``
|
|
34
|
+
computed_with: an Expr that computes the column value
|
|
35
|
+
is_pk: if True, this column is part of the primary key
|
|
36
|
+
stored: determines whether a computed column is present in the stored table or recomputed on demand
|
|
37
|
+
destination: An object store reference for persisting computed files
|
|
38
|
+
col_id: column ID (only used internally)
|
|
39
|
+
|
|
40
|
+
Computed columns: those have a non-None ``computed_with`` argument
|
|
41
|
+
- when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
|
|
42
|
+
col_type is None
|
|
43
|
+
- when loaded from md store: ``computed_with`` is set and col_type is set
|
|
44
|
+
|
|
45
|
+
``stored`` (only valid for computed columns):
|
|
46
|
+
- if True: the column is present in the stored table
|
|
47
|
+
- if False: the column is not present in the stored table and recomputed during a query
|
|
48
|
+
- if None: the system chooses for you (at present, this is always False, but this may change in the future)
|
|
30
49
|
"""
|
|
31
50
|
|
|
32
51
|
name: str
|
|
@@ -34,6 +53,7 @@ class Column:
|
|
|
34
53
|
col_type: ts.ColumnType
|
|
35
54
|
stored: bool
|
|
36
55
|
is_pk: bool
|
|
56
|
+
destination: Optional[str] # An object store reference for computed files
|
|
37
57
|
_media_validation: Optional[MediaValidation] # if not set, TableVersion.media_validation applies
|
|
38
58
|
schema_version_add: Optional[int]
|
|
39
59
|
schema_version_drop: Optional[int]
|
|
@@ -62,27 +82,8 @@ class Column:
|
|
|
62
82
|
stores_cellmd: Optional[bool] = None,
|
|
63
83
|
value_expr_dict: Optional[dict[str, Any]] = None,
|
|
64
84
|
tbl: Optional[TableVersion] = None,
|
|
85
|
+
destination: Optional[str] = None,
|
|
65
86
|
):
|
|
66
|
-
"""Column constructor.
|
|
67
|
-
|
|
68
|
-
Args:
|
|
69
|
-
name: column name; None for system columns (eg, index columns)
|
|
70
|
-
col_type: column type; can be None if the type can be derived from ``computed_with``
|
|
71
|
-
computed_with: an Expr that computes the column value
|
|
72
|
-
is_pk: if True, this column is part of the primary key
|
|
73
|
-
stored: determines whether a computed column is present in the stored table or recomputed on demand
|
|
74
|
-
col_id: column ID (only used internally)
|
|
75
|
-
|
|
76
|
-
Computed columns: those have a non-None ``computed_with`` argument
|
|
77
|
-
- when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
|
|
78
|
-
col_type is None
|
|
79
|
-
- when loaded from md store: ``computed_with`` is set and col_type is set
|
|
80
|
-
|
|
81
|
-
``stored`` (only valid for computed columns):
|
|
82
|
-
- if True: the column is present in the stored table
|
|
83
|
-
- if False: the column is not present in the stored table and recomputed during a query
|
|
84
|
-
- if None: the system chooses for you (at present, this is always False, but this may change in the future)
|
|
85
|
-
"""
|
|
86
87
|
if name is not None and not is_valid_identifier(name):
|
|
87
88
|
raise excs.Error(f"Invalid column name: '{name}'")
|
|
88
89
|
self.name = name
|
|
@@ -126,6 +127,7 @@ class Column:
|
|
|
126
127
|
|
|
127
128
|
# computed cols also have storage columns for the exception string and type
|
|
128
129
|
self.sa_cellmd_col = None
|
|
130
|
+
self.destination = destination
|
|
129
131
|
|
|
130
132
|
def to_md(self, pos: Optional[int] = None) -> tuple[schema.ColumnMd, Optional[schema.SchemaColumn]]:
|
|
131
133
|
"""Returns the Column and optional SchemaColumn metadata for this Column."""
|
|
@@ -138,6 +140,7 @@ class Column:
|
|
|
138
140
|
schema_version_drop=self.schema_version_drop,
|
|
139
141
|
value_expr=self.value_expr.as_dict() if self.value_expr is not None else None,
|
|
140
142
|
stored=self.stored,
|
|
143
|
+
destination=self.destination,
|
|
141
144
|
)
|
|
142
145
|
if pos is None:
|
|
143
146
|
return col_md, None
|
|
@@ -172,6 +175,7 @@ class Column:
|
|
|
172
175
|
schema_version_drop=col_md.schema_version_drop,
|
|
173
176
|
value_expr_dict=col_md.value_expr,
|
|
174
177
|
tbl=tbl,
|
|
178
|
+
destination=col_md.destination,
|
|
175
179
|
)
|
|
176
180
|
return col
|
|
177
181
|
|
pixeltable/catalog/table.py
CHANGED
|
@@ -24,6 +24,7 @@ from pixeltable.catalog.table_metadata import (
|
|
|
24
24
|
)
|
|
25
25
|
from pixeltable.metadata import schema
|
|
26
26
|
from pixeltable.metadata.utils import MetadataUtils
|
|
27
|
+
from pixeltable.utils.object_stores import ObjectOps
|
|
27
28
|
|
|
28
29
|
from ..exprs import ColumnRef
|
|
29
30
|
from ..utils.description_helper import DescriptionHelper
|
|
@@ -51,6 +52,7 @@ if TYPE_CHECKING:
|
|
|
51
52
|
import pixeltable.plan
|
|
52
53
|
from pixeltable.globals import TableDataSource
|
|
53
54
|
|
|
55
|
+
|
|
54
56
|
_logger = logging.getLogger('pixeltable')
|
|
55
57
|
|
|
56
58
|
|
|
@@ -489,8 +491,7 @@ class Table(SchemaObject):
|
|
|
489
491
|
Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed
|
|
490
492
|
columns, use [`add_computed_column()`][pixeltable.catalog.Table.add_computed_column] instead.
|
|
491
493
|
|
|
492
|
-
The format of the `schema` argument is
|
|
493
|
-
[`create_table()`][pixeltable.globals.create_table].
|
|
494
|
+
The format of the `schema` argument is a dict mapping column names to their types.
|
|
494
495
|
|
|
495
496
|
Args:
|
|
496
497
|
schema: A dictionary mapping column names to types.
|
|
@@ -603,6 +604,7 @@ class Table(SchemaObject):
|
|
|
603
604
|
self,
|
|
604
605
|
*,
|
|
605
606
|
stored: Optional[bool] = None,
|
|
607
|
+
destination: Optional[str | Path] = None,
|
|
606
608
|
print_stats: bool = False,
|
|
607
609
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
608
610
|
if_exists: Literal['error', 'ignore', 'replace'] = 'error',
|
|
@@ -614,6 +616,7 @@ class Table(SchemaObject):
|
|
|
614
616
|
Args:
|
|
615
617
|
kwargs: Exactly one keyword argument of the form `col_name=expression`.
|
|
616
618
|
stored: Whether the column is materialized and stored or computed on demand.
|
|
619
|
+
destination: An object store reference for persisting computed files.
|
|
617
620
|
print_stats: If `True`, print execution metrics during evaluation.
|
|
618
621
|
on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
|
|
619
622
|
row.
|
|
@@ -664,6 +667,9 @@ class Table(SchemaObject):
|
|
|
664
667
|
if stored is not None:
|
|
665
668
|
col_schema['stored'] = stored
|
|
666
669
|
|
|
670
|
+
if destination is not None:
|
|
671
|
+
col_schema['destination'] = destination
|
|
672
|
+
|
|
667
673
|
# Raise an error if the column expression refers to a column error property
|
|
668
674
|
if isinstance(spec, exprs.Expr):
|
|
669
675
|
for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
|
|
@@ -678,7 +684,7 @@ class Table(SchemaObject):
|
|
|
678
684
|
[col_name], IfExistsParam.validated(if_exists, 'if_exists')
|
|
679
685
|
)
|
|
680
686
|
# if the column to add already exists and user asked to ignore
|
|
681
|
-
#
|
|
687
|
+
# existing column, there's nothing to do.
|
|
682
688
|
result = UpdateStatus()
|
|
683
689
|
if len(cols_to_ignore) != 0:
|
|
684
690
|
assert cols_to_ignore[0] == col_name
|
|
@@ -699,7 +705,7 @@ class Table(SchemaObject):
|
|
|
699
705
|
(on account of containing Python Callables or Exprs).
|
|
700
706
|
"""
|
|
701
707
|
assert isinstance(spec, dict)
|
|
702
|
-
valid_keys = {'type', 'value', 'stored', 'media_validation'}
|
|
708
|
+
valid_keys = {'type', 'value', 'stored', 'media_validation', 'destination'}
|
|
703
709
|
for k in spec:
|
|
704
710
|
if k not in valid_keys:
|
|
705
711
|
raise excs.Error(f'Column {name}: invalid key {k!r}')
|
|
@@ -723,6 +729,10 @@ class Table(SchemaObject):
|
|
|
723
729
|
if 'stored' in spec and not isinstance(spec['stored'], bool):
|
|
724
730
|
raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
|
|
725
731
|
|
|
732
|
+
d = spec.get('destination')
|
|
733
|
+
if d is not None and not isinstance(d, (str, Path)):
|
|
734
|
+
raise excs.Error(f'Column {name}: `destination` must be a string or path, got {d}')
|
|
735
|
+
|
|
726
736
|
@classmethod
|
|
727
737
|
def _create_columns(cls, schema: dict[str, Any]) -> list[Column]:
|
|
728
738
|
"""Construct list of Columns, given schema"""
|
|
@@ -733,6 +743,7 @@ class Table(SchemaObject):
|
|
|
733
743
|
primary_key: bool = False
|
|
734
744
|
media_validation: Optional[catalog.MediaValidation] = None
|
|
735
745
|
stored = True
|
|
746
|
+
destination: Optional[str] = None
|
|
736
747
|
|
|
737
748
|
if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
|
|
738
749
|
col_type = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
|
|
@@ -757,6 +768,8 @@ class Table(SchemaObject):
|
|
|
757
768
|
media_validation = (
|
|
758
769
|
catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None else None
|
|
759
770
|
)
|
|
771
|
+
if 'destination' in spec:
|
|
772
|
+
destination = ObjectOps.validate_destination(spec['destination'], name)
|
|
760
773
|
else:
|
|
761
774
|
raise excs.Error(f'Invalid value for column {name!r}')
|
|
762
775
|
|
|
@@ -767,6 +780,7 @@ class Table(SchemaObject):
|
|
|
767
780
|
stored=stored,
|
|
768
781
|
is_pk=primary_key,
|
|
769
782
|
media_validation=media_validation,
|
|
783
|
+
destination=destination,
|
|
770
784
|
)
|
|
771
785
|
columns.append(column)
|
|
772
786
|
return columns
|
|
@@ -792,14 +806,16 @@ class Table(SchemaObject):
|
|
|
792
806
|
f'streaming function'
|
|
793
807
|
)
|
|
794
808
|
)
|
|
809
|
+
if col.destination is not None and not (col.stored and col.is_computed):
|
|
810
|
+
raise excs.Error(
|
|
811
|
+
f'Column {col.name!r}: destination={col.destination} only applies to stored computed columns'
|
|
812
|
+
)
|
|
795
813
|
|
|
796
814
|
@classmethod
|
|
797
815
|
def _verify_schema(cls, schema: list[Column]) -> None:
|
|
798
816
|
"""Check integrity of user-supplied schema and set defaults"""
|
|
799
|
-
column_names: set[str] = set()
|
|
800
817
|
for col in schema:
|
|
801
818
|
cls._verify_column(col)
|
|
802
|
-
column_names.add(col.name)
|
|
803
819
|
|
|
804
820
|
def drop_column(self, column: str | ColumnRef, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
|
|
805
821
|
"""Drop a column from the table.
|
|
@@ -1797,7 +1813,7 @@ class Table(SchemaObject):
|
|
|
1797
1813
|
return pd.DataFrame([list(v.values()) for v in versions], columns=list(versions[0].keys()))
|
|
1798
1814
|
|
|
1799
1815
|
def __check_mutable(self, op_descr: str) -> None:
|
|
1816
|
+
if self._tbl_version_path.is_replica():
|
|
1817
|
+
raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a replica.')
|
|
1800
1818
|
if self._tbl_version_path.is_snapshot():
|
|
1801
1819
|
raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a snapshot.')
|
|
1802
|
-
if self._tbl_version_path.is_replica():
|
|
1803
|
-
raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a {self._display_name()}.')
|
|
@@ -38,6 +38,7 @@ class IndexMetadata(TypedDict):
|
|
|
38
38
|
index_type: Literal['embedding']
|
|
39
39
|
"""The type of index (currently only `'embedding'` is supported, but others will be added in the future)."""
|
|
40
40
|
parameters: EmbeddingIndexParams
|
|
41
|
+
"""Parameters specific to the index type."""
|
|
41
42
|
|
|
42
43
|
|
|
43
44
|
class TableMetadata(TypedDict):
|