pixeltable 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/catalog/catalog.py +47 -32
- pixeltable/catalog/table.py +33 -14
- pixeltable/catalog/table_version.py +86 -46
- pixeltable/catalog/table_version_path.py +0 -11
- pixeltable/catalog/view.py +6 -0
- pixeltable/config.py +1 -0
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +12 -0
- pixeltable/exec/exec_context.py +15 -2
- pixeltable/exec/sql_node.py +3 -2
- pixeltable/exprs/arithmetic_expr.py +13 -7
- pixeltable/functions/huggingface.py +1031 -2
- pixeltable/functions/video.py +140 -31
- pixeltable/globals.py +23 -4
- pixeltable/io/globals.py +2 -2
- pixeltable/io/parquet.py +1 -1
- pixeltable/io/table_data_conduit.py +1 -1
- pixeltable/iterators/document.py +111 -42
- pixeltable/iterators/video.py +169 -62
- pixeltable/plan.py +2 -6
- pixeltable/share/packager.py +155 -26
- pixeltable/store.py +25 -5
- pixeltable/utils/arrow.py +6 -6
- pixeltable/utils/av.py +104 -11
- pixeltable/utils/object_stores.py +16 -1
- pixeltable/utils/s3_store.py +44 -11
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/METADATA +30 -30
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/RECORD +31 -31
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/licenses/LICENSE +0 -0
pixeltable/catalog/catalog.py
CHANGED
|
@@ -280,7 +280,7 @@ class Catalog:
|
|
|
280
280
|
- this needs to be done in a retry loop, because Postgres can decide to abort the transaction
|
|
281
281
|
(SerializationFailure, LockNotAvailable)
|
|
282
282
|
- for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
|
|
283
|
-
to minimize the probability of
|
|
283
|
+
to minimize the probability of losing that work due to a forced abort
|
|
284
284
|
|
|
285
285
|
If convert_db_excs == True, converts DBAPIErrors into excs.Errors.
|
|
286
286
|
"""
|
|
@@ -433,7 +433,7 @@ class Catalog:
|
|
|
433
433
|
|
|
434
434
|
The function should not raise exceptions; if it does, they are logged and ignored.
|
|
435
435
|
"""
|
|
436
|
-
assert
|
|
436
|
+
assert self.in_write_xact
|
|
437
437
|
self._undo_actions.append(func)
|
|
438
438
|
return func
|
|
439
439
|
|
|
@@ -472,11 +472,13 @@ class Catalog:
|
|
|
472
472
|
else:
|
|
473
473
|
msg = ''
|
|
474
474
|
_logger.debug(f'Exception: {e.orig.__class__}: {msg} ({e})')
|
|
475
|
+
# Suppress the underlying SQL exception unless DEBUG is enabled
|
|
476
|
+
raise_from = e if _logger.isEnabledFor(logging.DEBUG) else None
|
|
475
477
|
raise excs.Error(
|
|
476
478
|
'That Pixeltable operation could not be completed because it conflicted with another '
|
|
477
479
|
'operation that was run on a different process.\n'
|
|
478
480
|
'Please re-run the operation.'
|
|
479
|
-
) from
|
|
481
|
+
) from raise_from
|
|
480
482
|
|
|
481
483
|
@property
|
|
482
484
|
def in_write_xact(self) -> bool:
|
|
@@ -790,19 +792,25 @@ class Catalog:
|
|
|
790
792
|
return result
|
|
791
793
|
|
|
792
794
|
@retry_loop(for_write=True)
|
|
793
|
-
def move(self, path: Path, new_path: Path) -> None:
|
|
794
|
-
self._move(path, new_path)
|
|
795
|
+
def move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
|
|
796
|
+
self._move(path, new_path, if_exists, if_not_exists)
|
|
795
797
|
|
|
796
|
-
def _move(self, path: Path, new_path: Path) -> None:
|
|
797
|
-
|
|
798
|
+
def _move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
|
|
799
|
+
dest_obj, dest_dir, src_obj = self._prepare_dir_op(
|
|
798
800
|
add_dir_path=new_path.parent,
|
|
799
801
|
add_name=new_path.name,
|
|
800
802
|
drop_dir_path=path.parent,
|
|
801
803
|
drop_name=path.name,
|
|
802
|
-
raise_if_exists=
|
|
803
|
-
raise_if_not_exists=
|
|
804
|
+
raise_if_exists=(if_exists == IfExistsParam.ERROR),
|
|
805
|
+
raise_if_not_exists=(if_not_exists == IfNotExistsParam.ERROR),
|
|
804
806
|
)
|
|
805
|
-
|
|
807
|
+
assert dest_obj is None or if_exists == IfExistsParam.IGNORE
|
|
808
|
+
assert src_obj is not None or if_not_exists == IfNotExistsParam.IGNORE
|
|
809
|
+
if dest_obj is None and src_obj is not None:
|
|
810
|
+
# If dest_obj is not None, it means `if_exists='ignore'` and the destination already exists.
|
|
811
|
+
# If src_obj is None, it means `if_not_exists='ignore'` and the source doesn't exist.
|
|
812
|
+
# If dest_obj is None and src_obj is not None, then we can proceed with the move.
|
|
813
|
+
src_obj._move(new_path.name, dest_dir._id)
|
|
806
814
|
|
|
807
815
|
def _prepare_dir_op(
|
|
808
816
|
self,
|
|
@@ -813,7 +821,7 @@ class Catalog:
|
|
|
813
821
|
drop_expected: Optional[type[SchemaObject]] = None,
|
|
814
822
|
raise_if_exists: bool = False,
|
|
815
823
|
raise_if_not_exists: bool = False,
|
|
816
|
-
) -> tuple[Optional[SchemaObject], Optional[
|
|
824
|
+
) -> tuple[Optional[SchemaObject], Optional[Dir], Optional[SchemaObject]]:
|
|
817
825
|
"""
|
|
818
826
|
Validates paths and acquires locks needed for a directory operation, ie, add/drop/rename (add + drop) of a
|
|
819
827
|
directory entry.
|
|
@@ -900,9 +908,10 @@ class Catalog:
|
|
|
900
908
|
schema.Table.md['name'].astext == name,
|
|
901
909
|
schema.Table.md['user'].astext == user,
|
|
902
910
|
)
|
|
903
|
-
tbl_id = conn.execute(q).
|
|
904
|
-
|
|
905
|
-
|
|
911
|
+
tbl_id = conn.execute(q).scalars().all()
|
|
912
|
+
assert len(tbl_id) <= 1, name
|
|
913
|
+
if len(tbl_id) == 1:
|
|
914
|
+
return self.get_table_by_id(tbl_id[0], version)
|
|
906
915
|
|
|
907
916
|
return None
|
|
908
917
|
|
|
@@ -1082,7 +1091,7 @@ class Catalog:
|
|
|
1082
1091
|
The metadata should be presented in standard "ancestor order", with the table being replicated at
|
|
1083
1092
|
list position 0 and the (root) base table at list position -1.
|
|
1084
1093
|
"""
|
|
1085
|
-
assert
|
|
1094
|
+
assert self.in_write_xact
|
|
1086
1095
|
|
|
1087
1096
|
tbl_id = UUID(md[0].tbl_md.tbl_id)
|
|
1088
1097
|
|
|
@@ -1148,11 +1157,11 @@ class Catalog:
|
|
|
1148
1157
|
# We need to do this at the end, since `existing_path` needs to first have a non-fragment table version in
|
|
1149
1158
|
# order to be instantiated as a schema object.
|
|
1150
1159
|
existing = self.get_table_by_id(tbl_id)
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1160
|
+
assert existing is not None
|
|
1161
|
+
existing_path = Path.parse(existing._path(), allow_system_path=True)
|
|
1162
|
+
if existing_path != path:
|
|
1163
|
+
assert existing_path.is_system_path
|
|
1164
|
+
self._move(existing_path, path, IfExistsParam.ERROR, IfNotExistsParam.ERROR)
|
|
1156
1165
|
|
|
1157
1166
|
def __ensure_system_dir_exists(self) -> Dir:
|
|
1158
1167
|
system_path = Path.parse('_system', allow_system_path=True)
|
|
@@ -1736,6 +1745,9 @@ class Catalog:
|
|
|
1736
1745
|
|
|
1737
1746
|
@retry_loop(for_write=False)
|
|
1738
1747
|
def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
|
|
1748
|
+
return self._collect_tbl_history(tbl_id, n)
|
|
1749
|
+
|
|
1750
|
+
def _collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
|
|
1739
1751
|
"""
|
|
1740
1752
|
Returns the history of up to n versions of the table with the given UUID.
|
|
1741
1753
|
|
|
@@ -1748,14 +1760,15 @@ class Catalog:
|
|
|
1748
1760
|
Each row contains a TableVersion and a TableSchemaVersion object.
|
|
1749
1761
|
"""
|
|
1750
1762
|
q = (
|
|
1751
|
-
sql.select(schema.TableVersion, schema.TableSchemaVersion)
|
|
1752
|
-
.
|
|
1753
|
-
.join(
|
|
1754
|
-
schema.TableSchemaVersion,
|
|
1755
|
-
schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version,
|
|
1756
|
-
)
|
|
1763
|
+
sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
|
|
1764
|
+
.where(schema.Table.id == tbl_id)
|
|
1765
|
+
.join(schema.TableVersion)
|
|
1757
1766
|
.where(schema.TableVersion.tbl_id == tbl_id)
|
|
1767
|
+
.join(schema.TableSchemaVersion)
|
|
1758
1768
|
.where(schema.TableSchemaVersion.tbl_id == tbl_id)
|
|
1769
|
+
.where(
|
|
1770
|
+
schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version
|
|
1771
|
+
)
|
|
1759
1772
|
.order_by(schema.TableVersion.version.desc())
|
|
1760
1773
|
)
|
|
1761
1774
|
if n is not None:
|
|
@@ -1763,7 +1776,7 @@ class Catalog:
|
|
|
1763
1776
|
src_rows = Env.get().session.execute(q).fetchall()
|
|
1764
1777
|
return [
|
|
1765
1778
|
schema.FullTableMd(
|
|
1766
|
-
|
|
1779
|
+
schema.md_from_dict(schema.TableMd, row.Table.md),
|
|
1767
1780
|
schema.md_from_dict(schema.TableVersionMd, row.TableVersion.md),
|
|
1768
1781
|
schema.md_from_dict(schema.TableSchemaVersionMd, row.TableSchemaVersion.md),
|
|
1769
1782
|
)
|
|
@@ -1958,11 +1971,13 @@ class Catalog:
|
|
|
1958
1971
|
|
|
1959
1972
|
# If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
|
|
1960
1973
|
# TableVersionPath. We need to prepend it separately.
|
|
1961
|
-
if isinstance(tbl, View) and tbl.
|
|
1974
|
+
if isinstance(tbl, View) and tbl._is_named_pure_snapshot():
|
|
1962
1975
|
snapshot_md = self.load_tbl_md(tbl._id, 0)
|
|
1963
1976
|
md = [snapshot_md, *md]
|
|
1964
1977
|
|
|
1965
|
-
for ancestor_md in md
|
|
1978
|
+
for ancestor_md in md:
|
|
1979
|
+
# Set the `is_replica` flag on every ancestor's TableMd.
|
|
1980
|
+
ancestor_md.tbl_md.is_replica = True
|
|
1966
1981
|
# For replica metadata, we guarantee that the current_version and current_schema_version of TableMd
|
|
1967
1982
|
# match the corresponding values in TableVersionMd and TableSchemaVersionMd. This is to ensure that,
|
|
1968
1983
|
# when the metadata is later stored in the catalog of a different Pixeltable instance, the values of
|
|
@@ -1970,6 +1985,8 @@ class Catalog:
|
|
|
1970
1985
|
# destination catalog.
|
|
1971
1986
|
ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
|
|
1972
1987
|
ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
|
|
1988
|
+
|
|
1989
|
+
for ancestor_md in md[1:]:
|
|
1973
1990
|
# Also, the table version of every proper ancestor is ephemeral; it does not represent a queryable
|
|
1974
1991
|
# table version (the data might be incomplete, since we have only retrieved one of its views, not
|
|
1975
1992
|
# the table itself).
|
|
@@ -2022,9 +2039,7 @@ class Catalog:
|
|
|
2022
2039
|
tbl_version: TableVersion
|
|
2023
2040
|
if view_md is None:
|
|
2024
2041
|
# this is a base table
|
|
2025
|
-
tbl_version = TableVersion(
|
|
2026
|
-
tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views=mutable_views
|
|
2027
|
-
)
|
|
2042
|
+
tbl_version = TableVersion(tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views)
|
|
2028
2043
|
else:
|
|
2029
2044
|
assert len(view_md.base_versions) > 0 # a view needs to have a base
|
|
2030
2045
|
# TODO: add TableVersionMd.is_pure_snapshot() and use that
|
pixeltable/catalog/table.py
CHANGED
|
@@ -77,6 +77,17 @@ class Table(SchemaObject):
|
|
|
77
77
|
self._tbl_version = None
|
|
78
78
|
|
|
79
79
|
def _move(self, new_name: str, new_dir_id: UUID) -> None:
|
|
80
|
+
old_name = self._name
|
|
81
|
+
old_dir_id = self._dir_id
|
|
82
|
+
|
|
83
|
+
cat = catalog.Catalog.get()
|
|
84
|
+
|
|
85
|
+
@cat.register_undo_action
|
|
86
|
+
def _() -> None:
|
|
87
|
+
# TODO: We should really be invalidating the Table instance and forcing a reload.
|
|
88
|
+
self._name = old_name
|
|
89
|
+
self._dir_id = old_dir_id
|
|
90
|
+
|
|
80
91
|
super()._move(new_name, new_dir_id)
|
|
81
92
|
conn = env.Env.get().conn
|
|
82
93
|
stmt = sql.text(
|
|
@@ -625,7 +636,7 @@ class Table(SchemaObject):
|
|
|
625
636
|
- `'abort'`: an exception will be raised and the column will not be added.
|
|
626
637
|
- `'ignore'`: execution will continue and the column will be added. Any rows
|
|
627
638
|
with errors will have a `None` value for the column, with information about the error stored in the
|
|
628
|
-
corresponding `tbl.col_name.errormsg` tbl.col_name.errortype` fields.
|
|
639
|
+
corresponding `tbl.col_name.errormsg` and `tbl.col_name.errortype` fields.
|
|
629
640
|
if_exists: Determines the behavior if the column already exists. Must be one of the following:
|
|
630
641
|
|
|
631
642
|
- `'error'`: an exception will be raised.
|
|
@@ -986,22 +997,28 @@ class Table(SchemaObject):
|
|
|
986
997
|
Only `String` and `Image` columns are currently supported. Here's an example that uses a
|
|
987
998
|
[CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:
|
|
988
999
|
|
|
1000
|
+
```
|
|
989
1001
|
>>> from pixeltable.functions.huggingface import clip
|
|
990
|
-
|
|
991
|
-
|
|
1002
|
+
>>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
|
|
1003
|
+
>>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
|
|
1004
|
+
```
|
|
992
1005
|
|
|
993
|
-
Once the index is created,
|
|
1006
|
+
Once the index is created, similarity lookups can be performed using the `similarity` pseudo-function:
|
|
994
1007
|
|
|
1008
|
+
```
|
|
995
1009
|
>>> reference_img = PIL.Image.open('my_image.jpg')
|
|
996
|
-
|
|
997
|
-
|
|
1010
|
+
>>> sim = tbl.img.similarity(reference_img)
|
|
1011
|
+
>>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
|
|
1012
|
+
```
|
|
998
1013
|
|
|
999
1014
|
If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
|
|
1000
1015
|
performed using any of its supported types. In our example, CLIP supports both text and images, so we can
|
|
1001
1016
|
also search for images using a text description:
|
|
1002
1017
|
|
|
1018
|
+
```
|
|
1003
1019
|
>>> sim = tbl.img.similarity('a picture of a train')
|
|
1004
|
-
|
|
1020
|
+
>>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
|
|
1021
|
+
```
|
|
1005
1022
|
|
|
1006
1023
|
Args:
|
|
1007
1024
|
column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
|
|
@@ -1032,9 +1049,9 @@ class Table(SchemaObject):
|
|
|
1032
1049
|
Add an index to the `img` column of the table `my_table`:
|
|
1033
1050
|
|
|
1034
1051
|
>>> from pixeltable.functions.huggingface import clip
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1052
|
+
>>> tbl = pxt.get_table('my_table')
|
|
1053
|
+
>>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
|
|
1054
|
+
>>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
|
|
1038
1055
|
|
|
1039
1056
|
Alternatively, the `img` column may be specified by name:
|
|
1040
1057
|
|
|
@@ -1328,7 +1345,8 @@ class Table(SchemaObject):
|
|
|
1328
1345
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1329
1346
|
print_stats: bool = False,
|
|
1330
1347
|
**kwargs: Any,
|
|
1331
|
-
)
|
|
1348
|
+
)
|
|
1349
|
+
```
|
|
1332
1350
|
|
|
1333
1351
|
To insert just a single row, you can use the more concise syntax:
|
|
1334
1352
|
|
|
@@ -1338,7 +1356,8 @@ class Table(SchemaObject):
|
|
|
1338
1356
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1339
1357
|
print_stats: bool = False,
|
|
1340
1358
|
**kwargs: Any
|
|
1341
|
-
)
|
|
1359
|
+
)
|
|
1360
|
+
```
|
|
1342
1361
|
|
|
1343
1362
|
Args:
|
|
1344
1363
|
source: A data source from which data can be imported.
|
|
@@ -1459,8 +1478,8 @@ class Table(SchemaObject):
|
|
|
1459
1478
|
the row with new `id` 3 (assuming this key does not exist):
|
|
1460
1479
|
|
|
1461
1480
|
>>> tbl.update(
|
|
1462
|
-
|
|
1463
|
-
|
|
1481
|
+
... [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
|
|
1482
|
+
... if_not_exists='insert')
|
|
1464
1483
|
"""
|
|
1465
1484
|
from pixeltable.catalog import Catalog
|
|
1466
1485
|
|
|
@@ -24,7 +24,7 @@ from pixeltable.utils.object_stores import ObjectOps
|
|
|
24
24
|
|
|
25
25
|
from ..func.globals import resolve_symbol
|
|
26
26
|
from .column import Column
|
|
27
|
-
from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, is_valid_identifier
|
|
27
|
+
from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, QColumnId, is_valid_identifier
|
|
28
28
|
from .tbl_ops import TableOp
|
|
29
29
|
from .update_status import RowCountStats, UpdateStatus
|
|
30
30
|
|
|
@@ -96,6 +96,8 @@ class TableVersion:
|
|
|
96
96
|
cols_by_name: dict[str, Column]
|
|
97
97
|
# contains only columns visible in this version, both system and user
|
|
98
98
|
cols_by_id: dict[int, Column]
|
|
99
|
+
# all indices defined on this table
|
|
100
|
+
all_idxs: dict[str, TableVersion.IndexInfo]
|
|
99
101
|
# contains only actively maintained indices
|
|
100
102
|
idxs_by_name: dict[str, TableVersion.IndexInfo]
|
|
101
103
|
|
|
@@ -129,6 +131,12 @@ class TableVersion:
|
|
|
129
131
|
base_path: Optional[pxt.catalog.TableVersionPath] = None,
|
|
130
132
|
base: Optional[TableVersionHandle] = None,
|
|
131
133
|
):
|
|
134
|
+
from pixeltable import exprs
|
|
135
|
+
from pixeltable.plan import SampleClause
|
|
136
|
+
|
|
137
|
+
from .table_version_handle import TableVersionHandle
|
|
138
|
+
from .table_version_path import TableVersionPath
|
|
139
|
+
|
|
132
140
|
self.is_validated = True # a freshly constructed instance is always valid
|
|
133
141
|
self.is_initialized = False
|
|
134
142
|
self.id = id
|
|
@@ -141,9 +149,6 @@ class TableVersion:
|
|
|
141
149
|
self.store_tbl = None
|
|
142
150
|
|
|
143
151
|
# mutable tables need their TableVersionPath for expr eval during updates
|
|
144
|
-
from .table_version_handle import TableVersionHandle
|
|
145
|
-
from .table_version_path import TableVersionPath
|
|
146
|
-
|
|
147
152
|
if self.is_snapshot:
|
|
148
153
|
self.path = None
|
|
149
154
|
else:
|
|
@@ -153,9 +158,6 @@ class TableVersion:
|
|
|
153
158
|
self.path = TableVersionPath(self_handle, base=base_path)
|
|
154
159
|
|
|
155
160
|
# view-specific initialization
|
|
156
|
-
from pixeltable import exprs
|
|
157
|
-
from pixeltable.plan import SampleClause
|
|
158
|
-
|
|
159
161
|
predicate_dict = None if self.view_md is None or self.view_md.predicate is None else self.view_md.predicate
|
|
160
162
|
self.predicate = exprs.Expr.from_dict(predicate_dict) if predicate_dict is not None else None
|
|
161
163
|
sample_dict = None if self.view_md is None or self.view_md.sample_clause is None else self.view_md.sample_clause
|
|
@@ -180,6 +182,7 @@ class TableVersion:
|
|
|
180
182
|
self.cols = []
|
|
181
183
|
self.cols_by_name = {}
|
|
182
184
|
self.cols_by_id = {}
|
|
185
|
+
self.all_idxs = {}
|
|
183
186
|
self.idxs_by_name = {}
|
|
184
187
|
self.external_stores = {}
|
|
185
188
|
|
|
@@ -190,9 +193,7 @@ class TableVersion:
|
|
|
190
193
|
"""Create a snapshot copy of this TableVersion"""
|
|
191
194
|
assert not self.is_snapshot
|
|
192
195
|
base = self.path.base.tbl_version if self.is_view else None
|
|
193
|
-
return TableVersion(
|
|
194
|
-
self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, mutable_views=[], base=base
|
|
195
|
-
)
|
|
196
|
+
return TableVersion(self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, [], base=base)
|
|
196
197
|
|
|
197
198
|
@property
|
|
198
199
|
def versioned_name(self) -> str:
|
|
@@ -201,6 +202,12 @@ class TableVersion:
|
|
|
201
202
|
else:
|
|
202
203
|
return f'{self.name}:{self.effective_version}'
|
|
203
204
|
|
|
205
|
+
def __repr__(self) -> str:
|
|
206
|
+
return (
|
|
207
|
+
f'TableVersion(id={self.id!r}, name={self.name!r}, '
|
|
208
|
+
f'version={self.version}, effective_version={self.effective_version})'
|
|
209
|
+
)
|
|
210
|
+
|
|
204
211
|
@property
|
|
205
212
|
def handle(self) -> 'TableVersionHandle':
|
|
206
213
|
from .table_version_handle import TableVersionHandle
|
|
@@ -287,12 +294,12 @@ class TableVersion:
|
|
|
287
294
|
comment: str,
|
|
288
295
|
media_validation: MediaValidation,
|
|
289
296
|
) -> tuple[UUID, Optional[TableVersion]]:
|
|
290
|
-
|
|
297
|
+
initial_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
|
|
291
298
|
cat = pxt.catalog.Catalog.get()
|
|
292
299
|
|
|
293
|
-
tbl_id = UUID(hex=
|
|
300
|
+
tbl_id = UUID(hex=initial_md.tbl_md.tbl_id)
|
|
294
301
|
assert (tbl_id, None) not in cat._tbl_versions
|
|
295
|
-
tbl_version = cls(tbl_id,
|
|
302
|
+
tbl_version = cls(tbl_id, initial_md.tbl_md, initial_md.version_md, None, initial_md.schema_version_md, [])
|
|
296
303
|
|
|
297
304
|
@cat.register_undo_action
|
|
298
305
|
def _() -> None:
|
|
@@ -312,8 +319,8 @@ class TableVersion:
|
|
|
312
319
|
tbl_id=tbl_id,
|
|
313
320
|
dir_id=dir_id,
|
|
314
321
|
tbl_md=tbl_version.tbl_md,
|
|
315
|
-
version_md=
|
|
316
|
-
schema_version_md=
|
|
322
|
+
version_md=initial_md.version_md,
|
|
323
|
+
schema_version_md=initial_md.schema_version_md,
|
|
317
324
|
)
|
|
318
325
|
return tbl_id, tbl_version
|
|
319
326
|
|
|
@@ -340,11 +347,14 @@ class TableVersion:
|
|
|
340
347
|
|
|
341
348
|
@classmethod
|
|
342
349
|
def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
|
|
350
|
+
from .catalog import TableVersionPath
|
|
351
|
+
|
|
343
352
|
assert Env.get().in_xact
|
|
353
|
+
assert md.tbl_md.is_replica
|
|
344
354
|
tbl_id = UUID(md.tbl_md.tbl_id)
|
|
345
355
|
_logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
|
|
346
356
|
view_md = md.tbl_md.view_md
|
|
347
|
-
base_path =
|
|
357
|
+
base_path = TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
|
|
348
358
|
base = base_path.tbl_version if base_path is not None else None
|
|
349
359
|
tbl_version = cls(
|
|
350
360
|
tbl_id,
|
|
@@ -366,7 +376,7 @@ class TableVersion:
|
|
|
366
376
|
cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
|
|
367
377
|
tbl_version.init()
|
|
368
378
|
tbl_version.store_tbl.create()
|
|
369
|
-
tbl_version.store_tbl.
|
|
379
|
+
tbl_version.store_tbl.ensure_updated_schema()
|
|
370
380
|
return tbl_version
|
|
371
381
|
|
|
372
382
|
def delete_media(self, tbl_version: Optional[int] = None) -> None:
|
|
@@ -409,8 +419,8 @@ class TableVersion:
|
|
|
409
419
|
def _init_schema(self) -> None:
|
|
410
420
|
# create columns first, so the indices can reference them
|
|
411
421
|
self._init_cols()
|
|
412
|
-
|
|
413
|
-
|
|
422
|
+
self._init_idxs()
|
|
423
|
+
|
|
414
424
|
# create the sa schema only after creating the columns and indices
|
|
415
425
|
self._init_sa_schema()
|
|
416
426
|
|
|
@@ -448,39 +458,71 @@ class TableVersion:
|
|
|
448
458
|
# self._record_refd_columns(col)
|
|
449
459
|
|
|
450
460
|
def _init_idxs(self) -> None:
|
|
451
|
-
# self.idx_md = tbl_md.index_md
|
|
452
|
-
self.idxs_by_name = {}
|
|
453
|
-
import pixeltable.index as index_module
|
|
454
|
-
|
|
455
461
|
for md in self.tbl_md.index_md.values():
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
):
|
|
459
|
-
# index not visible in this schema version
|
|
460
|
-
continue
|
|
461
|
-
|
|
462
|
-
# instantiate index object
|
|
462
|
+
# Instantiate index object. This needs to be done for all indices, even those that are not active in this
|
|
463
|
+
# TableVersion, so that we can make appropriate adjustments to the SA schema.
|
|
463
464
|
cls_name = md.class_fqn.rsplit('.', 1)[-1]
|
|
464
|
-
cls = getattr(
|
|
465
|
-
idx_col
|
|
466
|
-
|
|
467
|
-
# this is a reference to one of our columns: avoid TVP.get_column_by_id() here, because we're not fully
|
|
468
|
-
# initialized yet
|
|
469
|
-
idx_col = self.cols_by_id[md.indexed_col_id]
|
|
470
|
-
else:
|
|
471
|
-
assert self.path.base is not None
|
|
472
|
-
idx_col = self.path.base.get_column_by_id(UUID(md.indexed_col_tbl_id), md.indexed_col_id)
|
|
465
|
+
cls = getattr(index, cls_name)
|
|
466
|
+
idx_col = self._lookup_column(QColumnId(UUID(md.indexed_col_tbl_id), md.indexed_col_id))
|
|
467
|
+
assert idx_col is not None
|
|
473
468
|
idx = cls.from_dict(idx_col, md.init_args)
|
|
469
|
+
assert isinstance(idx, index.IndexBase)
|
|
470
|
+
|
|
471
|
+
val_col = next(col for col in self.cols if col.id == md.index_val_col_id)
|
|
472
|
+
undo_col = next(col for col in self.cols if col.id == md.index_val_undo_col_id)
|
|
473
|
+
idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
|
|
474
|
+
self.all_idxs[md.name] = idx_info
|
|
474
475
|
|
|
475
476
|
# fix up the sa column type of the index value and undo columns
|
|
476
|
-
|
|
477
|
+
# we need to do this for all indices, not just those that are active in this TableVersion, to ensure we get
|
|
478
|
+
# the correct SA schema in the StoreTable.
|
|
477
479
|
val_col.sa_col_type = idx.index_sa_type()
|
|
478
|
-
val_col._stores_cellmd = False
|
|
479
|
-
undo_col = self.cols_by_id[md.index_val_undo_col_id]
|
|
480
480
|
undo_col.sa_col_type = idx.index_sa_type()
|
|
481
|
+
if not isinstance(idx, index.EmbeddingIndex):
|
|
482
|
+
# Historically, the intent has been not to store cellmd data, even for embedding indices. However,
|
|
483
|
+
# the cellmd columns get created anyway, even if stores_cellmd is set to `False` here, due to the
|
|
484
|
+
# timing of index column creation. In order to ensure that SA schemas align with what is actually in
|
|
485
|
+
# the physical tables, we keep this `True` for embedding indices.
|
|
486
|
+
# TODO: Decide whether index columns should store cellmd data.
|
|
487
|
+
# - If not, set to `False`, fix the column creation timing issue, and add a migration script to
|
|
488
|
+
# remedy existing cellmd columns.
|
|
489
|
+
# - If so, remove this TODO.
|
|
490
|
+
val_col._stores_cellmd = False
|
|
481
491
|
undo_col._stores_cellmd = False
|
|
482
|
-
|
|
483
|
-
|
|
492
|
+
|
|
493
|
+
# The index is active in this TableVersion provided that:
|
|
494
|
+
# (i) the TableVersion supports indices (either it's not a snapshot, or it's a replica at
|
|
495
|
+
# the head version); and
|
|
496
|
+
# (ii) the index was created on or before the schema version of this TableVersion; and
|
|
497
|
+
# (iii) the index was not dropped on or before the schema version of this TableVersion.
|
|
498
|
+
supports_idxs = self.effective_version is None or (
|
|
499
|
+
self.tbl_md.is_replica and self.effective_version == self.tbl_md.current_version
|
|
500
|
+
)
|
|
501
|
+
if (
|
|
502
|
+
supports_idxs
|
|
503
|
+
and md.schema_version_add <= self.schema_version
|
|
504
|
+
and (md.schema_version_drop is None or md.schema_version_drop > self.schema_version)
|
|
505
|
+
):
|
|
506
|
+
# Since the index is present in this TableVersion, its associated columns must be as well.
|
|
507
|
+
# Sanity-check this.
|
|
508
|
+
assert md.indexed_col_id in self.cols_by_id
|
|
509
|
+
assert md.index_val_col_id in self.cols_by_id
|
|
510
|
+
assert md.index_val_undo_col_id in self.cols_by_id
|
|
511
|
+
self.idxs_by_name[md.name] = idx_info
|
|
512
|
+
|
|
513
|
+
def _lookup_column(self, id: QColumnId) -> Column | None:
|
|
514
|
+
"""
|
|
515
|
+
Look up the column with the given table id and column id, searching through the ancestors of this TableVersion
|
|
516
|
+
to find it. We avoid referencing TableVersionPath in order to work properly with snapshots as well.
|
|
517
|
+
|
|
518
|
+
This will search through *all* known columns, including columns that are not visible in this TableVersion.
|
|
519
|
+
"""
|
|
520
|
+
if id.tbl_id == self.id:
|
|
521
|
+
return next(col for col in self.cols if col.id == id.col_id)
|
|
522
|
+
elif self.base is not None:
|
|
523
|
+
return self.base.get()._lookup_column(id)
|
|
524
|
+
else:
|
|
525
|
+
return None
|
|
484
526
|
|
|
485
527
|
def _init_sa_schema(self) -> None:
|
|
486
528
|
# create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
|
|
@@ -1286,8 +1328,6 @@ class TableVersion:
|
|
|
1286
1328
|
self._write_md(new_version=False, new_schema_version=False)
|
|
1287
1329
|
|
|
1288
1330
|
# propagate to views
|
|
1289
|
-
views_str = ', '.join([str(v.id) for v in self.mutable_views])
|
|
1290
|
-
print(f'revert(): mutable_views={views_str}')
|
|
1291
1331
|
for view in self.mutable_views:
|
|
1292
1332
|
view.get()._revert()
|
|
1293
1333
|
|
|
@@ -195,17 +195,6 @@ class TableVersionPath:
|
|
|
195
195
|
else:
|
|
196
196
|
return None
|
|
197
197
|
|
|
198
|
-
def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
|
|
199
|
-
"""Return the column for the given tbl/col id"""
|
|
200
|
-
self.refresh_cached_md()
|
|
201
|
-
if self.tbl_version.id == tbl_id:
|
|
202
|
-
assert col_id in self._cached_tbl_version.cols_by_id
|
|
203
|
-
return self._cached_tbl_version.cols_by_id[col_id]
|
|
204
|
-
elif self.base is not None:
|
|
205
|
-
return self.base.get_column_by_id(tbl_id, col_id)
|
|
206
|
-
else:
|
|
207
|
-
return None
|
|
208
|
-
|
|
209
198
|
def has_column(self, col: Column) -> bool:
|
|
210
199
|
"""Return True if this table has the given column."""
|
|
211
200
|
assert col.tbl is not None
|
pixeltable/catalog/view.py
CHANGED
|
@@ -252,6 +252,12 @@ class View(Table):
|
|
|
252
252
|
base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
|
|
253
253
|
)
|
|
254
254
|
|
|
255
|
+
def _is_named_pure_snapshot(self) -> bool:
|
|
256
|
+
"""
|
|
257
|
+
Returns True if this is a named pure snapshot (i.e., a pure snapshot that is a separate schema object).
|
|
258
|
+
"""
|
|
259
|
+
return self._id != self._tbl_version_path.tbl_id
|
|
260
|
+
|
|
255
261
|
def _is_anonymous_snapshot(self) -> bool:
|
|
256
262
|
"""
|
|
257
263
|
Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).
|
pixeltable/config.py
CHANGED
|
@@ -163,6 +163,7 @@ KNOWN_CONFIG_OPTIONS = {
|
|
|
163
163
|
'api_key': 'API key for Pixeltable cloud',
|
|
164
164
|
'r2_profile': 'AWS config profile name used to access R2 storage',
|
|
165
165
|
's3_profile': 'AWS config profile name used to access S3 storage',
|
|
166
|
+
'b2_profile': 'S3-compatible profile name used to access Backblaze B2 storage',
|
|
166
167
|
},
|
|
167
168
|
'anthropic': {'api_key': 'Anthropic API key'},
|
|
168
169
|
'bedrock': {'api_key': 'AWS Bedrock API key'},
|
pixeltable/dataframe.py
CHANGED
|
@@ -1039,7 +1039,7 @@ class DataFrame:
|
|
|
1039
1039
|
>>> df = book.order_by(t.price, asc=False).order_by(t.pages)
|
|
1040
1040
|
"""
|
|
1041
1041
|
if self.sample_clause is not None:
|
|
1042
|
-
raise excs.Error('
|
|
1042
|
+
raise excs.Error('order_by() cannot be used with sample()')
|
|
1043
1043
|
for e in expr_list:
|
|
1044
1044
|
if not isinstance(e, exprs.Expr):
|
|
1045
1045
|
raise excs.Error(f'Invalid expression in order_by(): {e}')
|
pixeltable/env.py
CHANGED
|
@@ -355,6 +355,8 @@ class Env:
|
|
|
355
355
|
# accept log messages from a configured pixeltable module (at any level of the module hierarchy)
|
|
356
356
|
path_parts = list(Path(record.pathname).parts)
|
|
357
357
|
path_parts.reverse()
|
|
358
|
+
if 'pixeltable' not in path_parts:
|
|
359
|
+
return False
|
|
358
360
|
max_idx = path_parts.index('pixeltable')
|
|
359
361
|
for module_name in path_parts[:max_idx]:
|
|
360
362
|
if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
|
|
@@ -576,6 +578,12 @@ class Env:
|
|
|
576
578
|
assert isinstance(tz_name, str)
|
|
577
579
|
self._logger.info(f'Database time zone is now: {tz_name}')
|
|
578
580
|
self._default_time_zone = ZoneInfo(tz_name)
|
|
581
|
+
if self.is_using_cockroachdb:
|
|
582
|
+
# This could be set when the database is created, but we set it now
|
|
583
|
+
conn.execute(sql.text('SET null_ordered_last = true;'))
|
|
584
|
+
null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
|
|
585
|
+
assert isinstance(null_ordered_last, str)
|
|
586
|
+
self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
|
|
579
587
|
|
|
580
588
|
def _store_db_exists(self) -> bool:
|
|
581
589
|
assert self._db_name is not None
|
|
@@ -752,10 +760,12 @@ class Env:
|
|
|
752
760
|
|
|
753
761
|
def __register_packages(self) -> None:
|
|
754
762
|
"""Declare optional packages that are utilized by some parts of the code."""
|
|
763
|
+
self.__register_package('accelerate')
|
|
755
764
|
self.__register_package('anthropic')
|
|
756
765
|
self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
|
|
757
766
|
self.__register_package('boto3')
|
|
758
767
|
self.__register_package('datasets')
|
|
768
|
+
self.__register_package('diffusers')
|
|
759
769
|
self.__register_package('fiftyone')
|
|
760
770
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
761
771
|
self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
|
|
@@ -763,6 +773,7 @@ class Env:
|
|
|
763
773
|
self.__register_package('groq')
|
|
764
774
|
self.__register_package('huggingface_hub', library_name='huggingface-hub')
|
|
765
775
|
self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
|
|
776
|
+
self.__register_package('librosa')
|
|
766
777
|
self.__register_package('llama_cpp', library_name='llama-cpp-python')
|
|
767
778
|
self.__register_package('mcp')
|
|
768
779
|
self.__register_package('mistralai')
|
|
@@ -775,6 +786,7 @@ class Env:
|
|
|
775
786
|
self.__register_package('replicate')
|
|
776
787
|
self.__register_package('sentencepiece')
|
|
777
788
|
self.__register_package('sentence_transformers', library_name='sentence-transformers')
|
|
789
|
+
self.__register_package('soundfile')
|
|
778
790
|
self.__register_package('spacy')
|
|
779
791
|
self.__register_package('tiktoken')
|
|
780
792
|
self.__register_package('together')
|