pixeltable 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

@@ -280,7 +280,7 @@ class Catalog:
280
280
  - this needs to be done in a retry loop, because Postgres can decide to abort the transaction
281
281
  (SerializationFailure, LockNotAvailable)
282
282
  - for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
283
- to minimize the probability of loosing that work due to a forced abort
283
+ to minimize the probability of losing that work due to a forced abort
284
284
 
285
285
  If convert_db_excs == True, converts DBAPIErrors into excs.Errors.
286
286
  """
@@ -433,7 +433,7 @@ class Catalog:
433
433
 
434
434
  The function should not raise exceptions; if it does, they are logged and ignored.
435
435
  """
436
- assert Env.get().in_xact
436
+ assert self.in_write_xact
437
437
  self._undo_actions.append(func)
438
438
  return func
439
439
 
@@ -472,11 +472,13 @@ class Catalog:
472
472
  else:
473
473
  msg = ''
474
474
  _logger.debug(f'Exception: {e.orig.__class__}: {msg} ({e})')
475
+ # Suppress the underlying SQL exception unless DEBUG is enabled
476
+ raise_from = e if _logger.isEnabledFor(logging.DEBUG) else None
475
477
  raise excs.Error(
476
478
  'That Pixeltable operation could not be completed because it conflicted with another '
477
479
  'operation that was run on a different process.\n'
478
480
  'Please re-run the operation.'
479
- ) from None
481
+ ) from raise_from
480
482
 
481
483
  @property
482
484
  def in_write_xact(self) -> bool:
@@ -790,19 +792,25 @@ class Catalog:
790
792
  return result
791
793
 
792
794
  @retry_loop(for_write=True)
793
- def move(self, path: Path, new_path: Path) -> None:
794
- self._move(path, new_path)
795
+ def move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
796
+ self._move(path, new_path, if_exists, if_not_exists)
795
797
 
796
- def _move(self, path: Path, new_path: Path) -> None:
797
- _, dest_dir, src_obj = self._prepare_dir_op(
798
+ def _move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
799
+ dest_obj, dest_dir, src_obj = self._prepare_dir_op(
798
800
  add_dir_path=new_path.parent,
799
801
  add_name=new_path.name,
800
802
  drop_dir_path=path.parent,
801
803
  drop_name=path.name,
802
- raise_if_exists=True,
803
- raise_if_not_exists=True,
804
+ raise_if_exists=(if_exists == IfExistsParam.ERROR),
805
+ raise_if_not_exists=(if_not_exists == IfNotExistsParam.ERROR),
804
806
  )
805
- src_obj._move(new_path.name, dest_dir._id)
807
+ assert dest_obj is None or if_exists == IfExistsParam.IGNORE
808
+ assert src_obj is not None or if_not_exists == IfNotExistsParam.IGNORE
809
+ if dest_obj is None and src_obj is not None:
810
+ # If dest_obj is not None, it means `if_exists='ignore'` and the destination already exists.
811
+ # If src_obj is None, it means `if_not_exists='ignore'` and the source doesn't exist.
812
+ # If dest_obj is None and src_obj is not None, then we can proceed with the move.
813
+ src_obj._move(new_path.name, dest_dir._id)
806
814
 
807
815
  def _prepare_dir_op(
808
816
  self,
@@ -813,7 +821,7 @@ class Catalog:
813
821
  drop_expected: Optional[type[SchemaObject]] = None,
814
822
  raise_if_exists: bool = False,
815
823
  raise_if_not_exists: bool = False,
816
- ) -> tuple[Optional[SchemaObject], Optional[SchemaObject], Optional[SchemaObject]]:
824
+ ) -> tuple[Optional[SchemaObject], Optional[Dir], Optional[SchemaObject]]:
817
825
  """
818
826
  Validates paths and acquires locks needed for a directory operation, ie, add/drop/rename (add + drop) of a
819
827
  directory entry.
@@ -900,9 +908,10 @@ class Catalog:
900
908
  schema.Table.md['name'].astext == name,
901
909
  schema.Table.md['user'].astext == user,
902
910
  )
903
- tbl_id = conn.execute(q).scalar_one_or_none()
904
- if tbl_id is not None:
905
- return self.get_table_by_id(tbl_id, version)
911
+ tbl_id = conn.execute(q).scalars().all()
912
+ assert len(tbl_id) <= 1, name
913
+ if len(tbl_id) == 1:
914
+ return self.get_table_by_id(tbl_id[0], version)
906
915
 
907
916
  return None
908
917
 
@@ -1082,7 +1091,7 @@ class Catalog:
1082
1091
  The metadata should be presented in standard "ancestor order", with the table being replicated at
1083
1092
  list position 0 and the (root) base table at list position -1.
1084
1093
  """
1085
- assert Env.get().in_xact
1094
+ assert self.in_write_xact
1086
1095
 
1087
1096
  tbl_id = UUID(md[0].tbl_md.tbl_id)
1088
1097
 
@@ -1148,11 +1157,11 @@ class Catalog:
1148
1157
  # We need to do this at the end, since `existing_path` needs to first have a non-fragment table version in
1149
1158
  # order to be instantiated as a schema object.
1150
1159
  existing = self.get_table_by_id(tbl_id)
1151
- if existing is not None:
1152
- existing_path = Path.parse(existing._path(), allow_system_path=True)
1153
- if existing_path != path:
1154
- assert existing_path.is_system_path
1155
- self._move(existing_path, path)
1160
+ assert existing is not None
1161
+ existing_path = Path.parse(existing._path(), allow_system_path=True)
1162
+ if existing_path != path:
1163
+ assert existing_path.is_system_path
1164
+ self._move(existing_path, path, IfExistsParam.ERROR, IfNotExistsParam.ERROR)
1156
1165
 
1157
1166
  def __ensure_system_dir_exists(self) -> Dir:
1158
1167
  system_path = Path.parse('_system', allow_system_path=True)
@@ -1736,6 +1745,9 @@ class Catalog:
1736
1745
 
1737
1746
  @retry_loop(for_write=False)
1738
1747
  def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
1748
+ return self._collect_tbl_history(tbl_id, n)
1749
+
1750
+ def _collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
1739
1751
  """
1740
1752
  Returns the history of up to n versions of the table with the given UUID.
1741
1753
 
@@ -1748,14 +1760,15 @@ class Catalog:
1748
1760
  Each row contains a TableVersion and a TableSchemaVersion object.
1749
1761
  """
1750
1762
  q = (
1751
- sql.select(schema.TableVersion, schema.TableSchemaVersion)
1752
- .select_from(schema.TableVersion)
1753
- .join(
1754
- schema.TableSchemaVersion,
1755
- schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version,
1756
- )
1763
+ sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
1764
+ .where(schema.Table.id == tbl_id)
1765
+ .join(schema.TableVersion)
1757
1766
  .where(schema.TableVersion.tbl_id == tbl_id)
1767
+ .join(schema.TableSchemaVersion)
1758
1768
  .where(schema.TableSchemaVersion.tbl_id == tbl_id)
1769
+ .where(
1770
+ schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version
1771
+ )
1759
1772
  .order_by(schema.TableVersion.version.desc())
1760
1773
  )
1761
1774
  if n is not None:
@@ -1763,7 +1776,7 @@ class Catalog:
1763
1776
  src_rows = Env.get().session.execute(q).fetchall()
1764
1777
  return [
1765
1778
  schema.FullTableMd(
1766
- None,
1779
+ schema.md_from_dict(schema.TableMd, row.Table.md),
1767
1780
  schema.md_from_dict(schema.TableVersionMd, row.TableVersion.md),
1768
1781
  schema.md_from_dict(schema.TableSchemaVersionMd, row.TableSchemaVersion.md),
1769
1782
  )
@@ -1958,11 +1971,13 @@ class Catalog:
1958
1971
 
1959
1972
  # If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
1960
1973
  # TableVersionPath. We need to prepend it separately.
1961
- if isinstance(tbl, View) and tbl._snapshot_only:
1974
+ if isinstance(tbl, View) and tbl._is_named_pure_snapshot():
1962
1975
  snapshot_md = self.load_tbl_md(tbl._id, 0)
1963
1976
  md = [snapshot_md, *md]
1964
1977
 
1965
- for ancestor_md in md[1:]:
1978
+ for ancestor_md in md:
1979
+ # Set the `is_replica` flag on every ancestor's TableMd.
1980
+ ancestor_md.tbl_md.is_replica = True
1966
1981
  # For replica metadata, we guarantee that the current_version and current_schema_version of TableMd
1967
1982
  # match the corresponding values in TableVersionMd and TableSchemaVersionMd. This is to ensure that,
1968
1983
  # when the metadata is later stored in the catalog of a different Pixeltable instance, the values of
@@ -1970,6 +1985,8 @@ class Catalog:
1970
1985
  # destination catalog.
1971
1986
  ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
1972
1987
  ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
1988
+
1989
+ for ancestor_md in md[1:]:
1973
1990
  # Also, the table version of every proper ancestor is emphemeral; it does not represent a queryable
1974
1991
  # table version (the data might be incomplete, since we have only retrieved one of its views, not
1975
1992
  # the table itself).
@@ -2022,9 +2039,7 @@ class Catalog:
2022
2039
  tbl_version: TableVersion
2023
2040
  if view_md is None:
2024
2041
  # this is a base table
2025
- tbl_version = TableVersion(
2026
- tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views=mutable_views
2027
- )
2042
+ tbl_version = TableVersion(tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views)
2028
2043
  else:
2029
2044
  assert len(view_md.base_versions) > 0 # a view needs to have a base
2030
2045
  # TODO: add TableVersionMd.is_pure_snapshot() and use that
@@ -77,6 +77,17 @@ class Table(SchemaObject):
77
77
  self._tbl_version = None
78
78
 
79
79
  def _move(self, new_name: str, new_dir_id: UUID) -> None:
80
+ old_name = self._name
81
+ old_dir_id = self._dir_id
82
+
83
+ cat = catalog.Catalog.get()
84
+
85
+ @cat.register_undo_action
86
+ def _() -> None:
87
+ # TODO: We should really be invalidating the Table instance and forcing a reload.
88
+ self._name = old_name
89
+ self._dir_id = old_dir_id
90
+
80
91
  super()._move(new_name, new_dir_id)
81
92
  conn = env.Env.get().conn
82
93
  stmt = sql.text(
@@ -625,7 +636,7 @@ class Table(SchemaObject):
625
636
  - `'abort'`: an exception will be raised and the column will not be added.
626
637
  - `'ignore'`: execution will continue and the column will be added. Any rows
627
638
  with errors will have a `None` value for the column, with information about the error stored in the
628
- corresponding `tbl.col_name.errormsg` tbl.col_name.errortype` fields.
639
+ corresponding `tbl.col_name.errormsg` and `tbl.col_name.errortype` fields.
629
640
  if_exists: Determines the behavior if the column already exists. Must be one of the following:
630
641
 
631
642
  - `'error'`: an exception will be raised.
@@ -986,22 +997,28 @@ class Table(SchemaObject):
986
997
  Only `String` and `Image` columns are currently supported. Here's an example that uses a
987
998
  [CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:
988
999
 
1000
+ ```
989
1001
  >>> from pixeltable.functions.huggingface import clip
990
- ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
991
- ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
1002
+ >>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
1003
+ >>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
1004
+ ```
992
1005
 
993
- Once the index is created, similiarity lookups can be performed using the `similarity` pseudo-function.
1006
+ Once the index is created, similarity lookups can be performed using the `similarity` pseudo-function:
994
1007
 
1008
+ ```
995
1009
  >>> reference_img = PIL.Image.open('my_image.jpg')
996
- ... sim = tbl.img.similarity(reference_img)
997
- ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
1010
+ >>> sim = tbl.img.similarity(reference_img)
1011
+ >>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
1012
+ ```
998
1013
 
999
1014
  If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
1000
1015
  performed using any of its supported types. In our example, CLIP supports both text and images, so we can
1001
1016
  also search for images using a text description:
1002
1017
 
1018
+ ```
1003
1019
  >>> sim = tbl.img.similarity('a picture of a train')
1004
- ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
1020
+ >>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
1021
+ ```
1005
1022
 
1006
1023
  Args:
1007
1024
  column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
@@ -1032,9 +1049,9 @@ class Table(SchemaObject):
1032
1049
  Add an index to the `img` column of the table `my_table`:
1033
1050
 
1034
1051
  >>> from pixeltable.functions.huggingface import clip
1035
- ... tbl = pxt.get_table('my_table')
1036
- ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
1037
- ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
1052
+ >>> tbl = pxt.get_table('my_table')
1053
+ >>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
1054
+ >>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
1038
1055
 
1039
1056
  Alternatively, the `img` column may be specified by name:
1040
1057
 
@@ -1328,7 +1345,8 @@ class Table(SchemaObject):
1328
1345
  on_error: Literal['abort', 'ignore'] = 'abort',
1329
1346
  print_stats: bool = False,
1330
1347
  **kwargs: Any,
1331
- )```
1348
+ )
1349
+ ```
1332
1350
 
1333
1351
  To insert just a single row, you can use the more concise syntax:
1334
1352
 
@@ -1338,7 +1356,8 @@ class Table(SchemaObject):
1338
1356
  on_error: Literal['abort', 'ignore'] = 'abort',
1339
1357
  print_stats: bool = False,
1340
1358
  **kwargs: Any
1341
- )```
1359
+ )
1360
+ ```
1342
1361
 
1343
1362
  Args:
1344
1363
  source: A data source from which data can be imported.
@@ -1459,8 +1478,8 @@ class Table(SchemaObject):
1459
1478
  the row with new `id` 3 (assuming this key does not exist):
1460
1479
 
1461
1480
  >>> tbl.update(
1462
- [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
1463
- if_not_exists='insert')
1481
+ ... [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
1482
+ ... if_not_exists='insert')
1464
1483
  """
1465
1484
  from pixeltable.catalog import Catalog
1466
1485
 
@@ -24,7 +24,7 @@ from pixeltable.utils.object_stores import ObjectOps
24
24
 
25
25
  from ..func.globals import resolve_symbol
26
26
  from .column import Column
27
- from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, is_valid_identifier
27
+ from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, QColumnId, is_valid_identifier
28
28
  from .tbl_ops import TableOp
29
29
  from .update_status import RowCountStats, UpdateStatus
30
30
 
@@ -96,6 +96,8 @@ class TableVersion:
96
96
  cols_by_name: dict[str, Column]
97
97
  # contains only columns visible in this version, both system and user
98
98
  cols_by_id: dict[int, Column]
99
+ # all indices defined on this table
100
+ all_idxs: dict[str, TableVersion.IndexInfo]
99
101
  # contains only actively maintained indices
100
102
  idxs_by_name: dict[str, TableVersion.IndexInfo]
101
103
 
@@ -129,6 +131,12 @@ class TableVersion:
129
131
  base_path: Optional[pxt.catalog.TableVersionPath] = None,
130
132
  base: Optional[TableVersionHandle] = None,
131
133
  ):
134
+ from pixeltable import exprs
135
+ from pixeltable.plan import SampleClause
136
+
137
+ from .table_version_handle import TableVersionHandle
138
+ from .table_version_path import TableVersionPath
139
+
132
140
  self.is_validated = True # a freshly constructed instance is always valid
133
141
  self.is_initialized = False
134
142
  self.id = id
@@ -141,9 +149,6 @@ class TableVersion:
141
149
  self.store_tbl = None
142
150
 
143
151
  # mutable tables need their TableVersionPath for expr eval during updates
144
- from .table_version_handle import TableVersionHandle
145
- from .table_version_path import TableVersionPath
146
-
147
152
  if self.is_snapshot:
148
153
  self.path = None
149
154
  else:
@@ -153,9 +158,6 @@ class TableVersion:
153
158
  self.path = TableVersionPath(self_handle, base=base_path)
154
159
 
155
160
  # view-specific initialization
156
- from pixeltable import exprs
157
- from pixeltable.plan import SampleClause
158
-
159
161
  predicate_dict = None if self.view_md is None or self.view_md.predicate is None else self.view_md.predicate
160
162
  self.predicate = exprs.Expr.from_dict(predicate_dict) if predicate_dict is not None else None
161
163
  sample_dict = None if self.view_md is None or self.view_md.sample_clause is None else self.view_md.sample_clause
@@ -180,6 +182,7 @@ class TableVersion:
180
182
  self.cols = []
181
183
  self.cols_by_name = {}
182
184
  self.cols_by_id = {}
185
+ self.all_idxs = {}
183
186
  self.idxs_by_name = {}
184
187
  self.external_stores = {}
185
188
 
@@ -190,9 +193,7 @@ class TableVersion:
190
193
  """Create a snapshot copy of this TableVersion"""
191
194
  assert not self.is_snapshot
192
195
  base = self.path.base.tbl_version if self.is_view else None
193
- return TableVersion(
194
- self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, mutable_views=[], base=base
195
- )
196
+ return TableVersion(self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, [], base=base)
196
197
 
197
198
  @property
198
199
  def versioned_name(self) -> str:
@@ -201,6 +202,12 @@ class TableVersion:
201
202
  else:
202
203
  return f'{self.name}:{self.effective_version}'
203
204
 
205
+ def __repr__(self) -> str:
206
+ return (
207
+ f'TableVersion(id={self.id!r}, name={self.name!r}, '
208
+ f'version={self.version}, effective_version={self.effective_version})'
209
+ )
210
+
204
211
  @property
205
212
  def handle(self) -> 'TableVersionHandle':
206
213
  from .table_version_handle import TableVersionHandle
@@ -287,12 +294,12 @@ class TableVersion:
287
294
  comment: str,
288
295
  media_validation: MediaValidation,
289
296
  ) -> tuple[UUID, Optional[TableVersion]]:
290
- inital_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
297
+ initial_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
291
298
  cat = pxt.catalog.Catalog.get()
292
299
 
293
- tbl_id = UUID(hex=inital_md.tbl_md.tbl_id)
300
+ tbl_id = UUID(hex=initial_md.tbl_md.tbl_id)
294
301
  assert (tbl_id, None) not in cat._tbl_versions
295
- tbl_version = cls(tbl_id, inital_md.tbl_md, inital_md.version_md, None, inital_md.schema_version_md, [])
302
+ tbl_version = cls(tbl_id, initial_md.tbl_md, initial_md.version_md, None, initial_md.schema_version_md, [])
296
303
 
297
304
  @cat.register_undo_action
298
305
  def _() -> None:
@@ -312,8 +319,8 @@ class TableVersion:
312
319
  tbl_id=tbl_id,
313
320
  dir_id=dir_id,
314
321
  tbl_md=tbl_version.tbl_md,
315
- version_md=inital_md.version_md,
316
- schema_version_md=inital_md.schema_version_md,
322
+ version_md=initial_md.version_md,
323
+ schema_version_md=initial_md.schema_version_md,
317
324
  )
318
325
  return tbl_id, tbl_version
319
326
 
@@ -340,11 +347,14 @@ class TableVersion:
340
347
 
341
348
  @classmethod
342
349
  def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
350
+ from .catalog import TableVersionPath
351
+
343
352
  assert Env.get().in_xact
353
+ assert md.tbl_md.is_replica
344
354
  tbl_id = UUID(md.tbl_md.tbl_id)
345
355
  _logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
346
356
  view_md = md.tbl_md.view_md
347
- base_path = pxt.catalog.TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
357
+ base_path = TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
348
358
  base = base_path.tbl_version if base_path is not None else None
349
359
  tbl_version = cls(
350
360
  tbl_id,
@@ -366,7 +376,7 @@ class TableVersion:
366
376
  cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
367
377
  tbl_version.init()
368
378
  tbl_version.store_tbl.create()
369
- tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
379
+ tbl_version.store_tbl.ensure_updated_schema()
370
380
  return tbl_version
371
381
 
372
382
  def delete_media(self, tbl_version: Optional[int] = None) -> None:
@@ -409,8 +419,8 @@ class TableVersion:
409
419
  def _init_schema(self) -> None:
410
420
  # create columns first, so the indices can reference them
411
421
  self._init_cols()
412
- if not self.is_snapshot:
413
- self._init_idxs()
422
+ self._init_idxs()
423
+
414
424
  # create the sa schema only after creating the columns and indices
415
425
  self._init_sa_schema()
416
426
 
@@ -448,39 +458,71 @@ class TableVersion:
448
458
  # self._record_refd_columns(col)
449
459
 
450
460
  def _init_idxs(self) -> None:
451
- # self.idx_md = tbl_md.index_md
452
- self.idxs_by_name = {}
453
- import pixeltable.index as index_module
454
-
455
461
  for md in self.tbl_md.index_md.values():
456
- if md.schema_version_add > self.schema_version or (
457
- md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version
458
- ):
459
- # index not visible in this schema version
460
- continue
461
-
462
- # instantiate index object
462
+ # Instantiate index object. This needs to be done for all indices, even those that are not active in this
463
+ # TableVersion, so that we can make appropriate adjustments to the SA schema.
463
464
  cls_name = md.class_fqn.rsplit('.', 1)[-1]
464
- cls = getattr(index_module, cls_name)
465
- idx_col: Column
466
- if md.indexed_col_tbl_id == str(self.id):
467
- # this is a reference to one of our columns: avoid TVP.get_column_by_id() here, because we're not fully
468
- # initialized yet
469
- idx_col = self.cols_by_id[md.indexed_col_id]
470
- else:
471
- assert self.path.base is not None
472
- idx_col = self.path.base.get_column_by_id(UUID(md.indexed_col_tbl_id), md.indexed_col_id)
465
+ cls = getattr(index, cls_name)
466
+ idx_col = self._lookup_column(QColumnId(UUID(md.indexed_col_tbl_id), md.indexed_col_id))
467
+ assert idx_col is not None
473
468
  idx = cls.from_dict(idx_col, md.init_args)
469
+ assert isinstance(idx, index.IndexBase)
470
+
471
+ val_col = next(col for col in self.cols if col.id == md.index_val_col_id)
472
+ undo_col = next(col for col in self.cols if col.id == md.index_val_undo_col_id)
473
+ idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
474
+ self.all_idxs[md.name] = idx_info
474
475
 
475
476
  # fix up the sa column type of the index value and undo columns
476
- val_col = self.cols_by_id[md.index_val_col_id]
477
+ # we need to do this for all indices, not just those that are active in this TableVersion, to ensure we get
478
+ # the correct SA schema in the StoreTable.
477
479
  val_col.sa_col_type = idx.index_sa_type()
478
- val_col._stores_cellmd = False
479
- undo_col = self.cols_by_id[md.index_val_undo_col_id]
480
480
  undo_col.sa_col_type = idx.index_sa_type()
481
+ if not isinstance(idx, index.EmbeddingIndex):
482
+ # Historically, the intent has been not to store cellmd data, even for embedding indices. However,
483
+ # the cellmd columns get created anyway, even if stores_cellmd is set to `False` here, due to the
484
+ # timing of index column creation. In order to ensure that SA schemas align with what is actually in
485
+ # the physical tables, we keep this `True` for embedding indices.
486
+ # TODO: Decide whether index columns should store cellmd data.
487
+ # - If not, set to `False`, fix the column creation timing issue, and add a migration script to
488
+ # remedy existing cellmd columns.
489
+ # - If so, remove this TODO.
490
+ val_col._stores_cellmd = False
481
491
  undo_col._stores_cellmd = False
482
- idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
483
- self.idxs_by_name[md.name] = idx_info
492
+
493
+ # The index is active in this TableVersion provided that:
494
+ # (i) the TableVersion supports indices (either it's not a snapshot, or it's a replica at
495
+ # the head version); and
496
+ # (ii) the index was created on or before the schema version of this TableVersion; and
497
+ # (iii) the index was not dropped on or before the schema version of this TableVersion.
498
+ supports_idxs = self.effective_version is None or (
499
+ self.tbl_md.is_replica and self.effective_version == self.tbl_md.current_version
500
+ )
501
+ if (
502
+ supports_idxs
503
+ and md.schema_version_add <= self.schema_version
504
+ and (md.schema_version_drop is None or md.schema_version_drop > self.schema_version)
505
+ ):
506
+ # Since the index is present in this TableVersion, its associated columns must be as well.
507
+ # Sanity-check this.
508
+ assert md.indexed_col_id in self.cols_by_id
509
+ assert md.index_val_col_id in self.cols_by_id
510
+ assert md.index_val_undo_col_id in self.cols_by_id
511
+ self.idxs_by_name[md.name] = idx_info
512
+
513
+ def _lookup_column(self, id: QColumnId) -> Column | None:
514
+ """
515
+ Look up the column with the given table id and column id, searching through the ancestors of this TableVersion
516
+ to find it. We avoid referencing TableVersionPath in order to work properly with snapshots as well.
517
+
518
+ This will search through *all* known columns, including columns that are not visible in this TableVersion.
519
+ """
520
+ if id.tbl_id == self.id:
521
+ return next(col for col in self.cols if col.id == id.col_id)
522
+ elif self.base is not None:
523
+ return self.base.get()._lookup_column(id)
524
+ else:
525
+ return None
484
526
 
485
527
  def _init_sa_schema(self) -> None:
486
528
  # create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
@@ -1286,8 +1328,6 @@ class TableVersion:
1286
1328
  self._write_md(new_version=False, new_schema_version=False)
1287
1329
 
1288
1330
  # propagate to views
1289
- views_str = ', '.join([str(v.id) for v in self.mutable_views])
1290
- print(f'revert(): mutable_views={views_str}')
1291
1331
  for view in self.mutable_views:
1292
1332
  view.get()._revert()
1293
1333
 
@@ -195,17 +195,6 @@ class TableVersionPath:
195
195
  else:
196
196
  return None
197
197
 
198
- def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
199
- """Return the column for the given tbl/col id"""
200
- self.refresh_cached_md()
201
- if self.tbl_version.id == tbl_id:
202
- assert col_id in self._cached_tbl_version.cols_by_id
203
- return self._cached_tbl_version.cols_by_id[col_id]
204
- elif self.base is not None:
205
- return self.base.get_column_by_id(tbl_id, col_id)
206
- else:
207
- return None
208
-
209
198
  def has_column(self, col: Column) -> bool:
210
199
  """Return True if this table has the given column."""
211
200
  assert col.tbl is not None
@@ -252,6 +252,12 @@ class View(Table):
252
252
  base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
253
253
  )
254
254
 
255
+ def _is_named_pure_snapshot(self) -> bool:
256
+ """
257
+ Returns True if this is a named pure snapshot (i.e., a pure snapshot that is a separate schema object).
258
+ """
259
+ return self._id != self._tbl_version_path.tbl_id
260
+
255
261
  def _is_anonymous_snapshot(self) -> bool:
256
262
  """
257
263
  Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).
pixeltable/config.py CHANGED
@@ -163,6 +163,7 @@ KNOWN_CONFIG_OPTIONS = {
163
163
  'api_key': 'API key for Pixeltable cloud',
164
164
  'r2_profile': 'AWS config profile name used to access R2 storage',
165
165
  's3_profile': 'AWS config profile name used to access S3 storage',
166
+ 'b2_profile': 'S3-compatible profile name used to access Backblaze B2 storage',
166
167
  },
167
168
  'anthropic': {'api_key': 'Anthropic API key'},
168
169
  'bedrock': {'api_key': 'AWS Bedrock API key'},
pixeltable/dataframe.py CHANGED
@@ -1039,7 +1039,7 @@ class DataFrame:
1039
1039
  >>> df = book.order_by(t.price, asc=False).order_by(t.pages)
1040
1040
  """
1041
1041
  if self.sample_clause is not None:
1042
- raise excs.Error('group_by() cannot be used with sample()')
1042
+ raise excs.Error('order_by() cannot be used with sample()')
1043
1043
  for e in expr_list:
1044
1044
  if not isinstance(e, exprs.Expr):
1045
1045
  raise excs.Error(f'Invalid expression in order_by(): {e}')
pixeltable/env.py CHANGED
@@ -355,6 +355,8 @@ class Env:
355
355
  # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
356
356
  path_parts = list(Path(record.pathname).parts)
357
357
  path_parts.reverse()
358
+ if 'pixeltable' not in path_parts:
359
+ return False
358
360
  max_idx = path_parts.index('pixeltable')
359
361
  for module_name in path_parts[:max_idx]:
360
362
  if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
@@ -576,6 +578,12 @@ class Env:
576
578
  assert isinstance(tz_name, str)
577
579
  self._logger.info(f'Database time zone is now: {tz_name}')
578
580
  self._default_time_zone = ZoneInfo(tz_name)
581
+ if self.is_using_cockroachdb:
582
+ # This could be set when the database is created, but we set it now
583
+ conn.execute(sql.text('SET null_ordered_last = true;'))
584
+ null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
585
+ assert isinstance(null_ordered_last, str)
586
+ self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
579
587
 
580
588
  def _store_db_exists(self) -> bool:
581
589
  assert self._db_name is not None
@@ -752,10 +760,12 @@ class Env:
752
760
 
753
761
  def __register_packages(self) -> None:
754
762
  """Declare optional packages that are utilized by some parts of the code."""
763
+ self.__register_package('accelerate')
755
764
  self.__register_package('anthropic')
756
765
  self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
757
766
  self.__register_package('boto3')
758
767
  self.__register_package('datasets')
768
+ self.__register_package('diffusers')
759
769
  self.__register_package('fiftyone')
760
770
  self.__register_package('fireworks', library_name='fireworks-ai')
761
771
  self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
@@ -763,6 +773,7 @@ class Env:
763
773
  self.__register_package('groq')
764
774
  self.__register_package('huggingface_hub', library_name='huggingface-hub')
765
775
  self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
776
+ self.__register_package('librosa')
766
777
  self.__register_package('llama_cpp', library_name='llama-cpp-python')
767
778
  self.__register_package('mcp')
768
779
  self.__register_package('mistralai')
@@ -775,6 +786,7 @@ class Env:
775
786
  self.__register_package('replicate')
776
787
  self.__register_package('sentencepiece')
777
788
  self.__register_package('sentence_transformers', library_name='sentence-transformers')
789
+ self.__register_package('soundfile')
778
790
  self.__register_package('spacy')
779
791
  self.__register_package('tiktoken')
780
792
  self.__register_package('together')