PyPI - pixeltable - Versions diffs - 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl - Mend

pixeltable 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (31)

pixeltable/catalog/catalog.py +47 -32
pixeltable/catalog/table.py +33 -14
pixeltable/catalog/table_version.py +86 -46
pixeltable/catalog/table_version_path.py +0 -11
pixeltable/catalog/view.py +6 -0
pixeltable/config.py +1 -0
pixeltable/dataframe.py +1 -1
pixeltable/env.py +12 -0
pixeltable/exec/exec_context.py +15 -2
pixeltable/exec/sql_node.py +3 -2
pixeltable/exprs/arithmetic_expr.py +13 -7
pixeltable/functions/huggingface.py +1031 -2
pixeltable/functions/video.py +140 -31
pixeltable/globals.py +23 -4
pixeltable/io/globals.py +2 -2
pixeltable/io/parquet.py +1 -1
pixeltable/io/table_data_conduit.py +1 -1
pixeltable/iterators/document.py +111 -42
pixeltable/iterators/video.py +169 -62
pixeltable/plan.py +2 -6
pixeltable/share/packager.py +155 -26
pixeltable/store.py +25 -5
pixeltable/utils/arrow.py +6 -6
pixeltable/utils/av.py +104 -11
pixeltable/utils/object_stores.py +16 -1
pixeltable/utils/s3_store.py +44 -11
{pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/METADATA +30 -30
{pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/RECORD +31 -31
{pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/WHEEL +0 -0
{pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/entry_points.txt +0 -0
{pixeltable-0.4.16.dist-info → pixeltable-0.4.18.dist-info}/licenses/LICENSE +0 -0

pixeltable/catalog/catalog.py CHANGED Viewed

@@ -280,7 +280,7 @@ class Catalog:
         - this needs to be done in a retry loop, because Postgres can decide to abort the transaction
           (SerializationFailure, LockNotAvailable)
         - for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
-          to minimize the probability of loosing that work due to a forced abort
+          to minimize the probability of losing that work due to a forced abort
         If convert_db_excs == True, converts DBAPIErrors into excs.Errors.
         """
@@ -433,7 +433,7 @@ class Catalog:
         The function should not raise exceptions; if it does, they are logged and ignored.
         """
-        assert Env.get().in_xact
+        assert self.in_write_xact
         self._undo_actions.append(func)
         return func
@@ -472,11 +472,13 @@ class Catalog:
             else:
                 msg = ''
             _logger.debug(f'Exception: {e.orig.__class__}: {msg} ({e})')
+            # Suppress the underlying SQL exception unless DEBUG is enabled
+            raise_from = e if _logger.isEnabledFor(logging.DEBUG) else None
             raise excs.Error(
                 'That Pixeltable operation could not be completed because it conflicted with another '
                 'operation that was run on a different process.\n'
                 'Please re-run the operation.'
-            ) from None
+            ) from raise_from
     @property
     def in_write_xact(self) -> bool:
@@ -790,19 +792,25 @@ class Catalog:
         return result
     @retry_loop(for_write=True)
-    def move(self, path: Path, new_path: Path) -> None:
-        self._move(path, new_path)
+    def move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
+        self._move(path, new_path, if_exists, if_not_exists)
-    def _move(self, path: Path, new_path: Path) -> None:
-        _, dest_dir, src_obj = self._prepare_dir_op(
+    def _move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
+        dest_obj, dest_dir, src_obj = self._prepare_dir_op(
             add_dir_path=new_path.parent,
             add_name=new_path.name,
             drop_dir_path=path.parent,
             drop_name=path.name,
-            raise_if_exists=True,
-            raise_if_not_exists=True,
+            raise_if_exists=(if_exists == IfExistsParam.ERROR),
+            raise_if_not_exists=(if_not_exists == IfNotExistsParam.ERROR),
         )
-        src_obj._move(new_path.name, dest_dir._id)
+        assert dest_obj is None or if_exists == IfExistsParam.IGNORE
+        assert src_obj is not None or if_not_exists == IfNotExistsParam.IGNORE
+        if dest_obj is None and src_obj is not None:
+            # If dest_obj is not None, it means `if_exists='ignore'` and the destination already exists.
+            # If src_obj is None, it means `if_not_exists='ignore'` and the source doesn't exist.
+            # If dest_obj is None and src_obj is not None, then we can proceed with the move.
+            src_obj._move(new_path.name, dest_dir._id)
     def _prepare_dir_op(
         self,
@@ -813,7 +821,7 @@ class Catalog:
         drop_expected: Optional[type[SchemaObject]] = None,
         raise_if_exists: bool = False,
         raise_if_not_exists: bool = False,
-    ) -> tuple[Optional[SchemaObject], Optional[SchemaObject], Optional[SchemaObject]]:
+    ) -> tuple[Optional[SchemaObject], Optional[Dir], Optional[SchemaObject]]:
         """
         Validates paths and acquires locks needed for a directory operation, ie, add/drop/rename (add + drop) of a
         directory entry.
@@ -900,9 +908,10 @@ class Catalog:
             schema.Table.md['name'].astext == name,
             schema.Table.md['user'].astext == user,
         )
-        tbl_id = conn.execute(q).scalar_one_or_none()
-        if tbl_id is not None:
-            return self.get_table_by_id(tbl_id, version)
+        tbl_id = conn.execute(q).scalars().all()
+        assert len(tbl_id) <= 1, name
+        if len(tbl_id) == 1:
+            return self.get_table_by_id(tbl_id[0], version)
         return None
@@ -1082,7 +1091,7 @@ class Catalog:
         The metadata should be presented in standard "ancestor order", with the table being replicated at
         list position 0 and the (root) base table at list position -1.
         """
-        assert Env.get().in_xact
+        assert self.in_write_xact
         tbl_id = UUID(md[0].tbl_md.tbl_id)
@@ -1148,11 +1157,11 @@ class Catalog:
         # We need to do this at the end, since `existing_path` needs to first have a non-fragment table version in
         # order to be instantiated as a schema object.
         existing = self.get_table_by_id(tbl_id)
-        if existing is not None:
-            existing_path = Path.parse(existing._path(), allow_system_path=True)
-            if existing_path != path:
-                assert existing_path.is_system_path
-                self._move(existing_path, path)
+        assert existing is not None
+        existing_path = Path.parse(existing._path(), allow_system_path=True)
+        if existing_path != path:
+            assert existing_path.is_system_path
+            self._move(existing_path, path, IfExistsParam.ERROR, IfNotExistsParam.ERROR)
     def __ensure_system_dir_exists(self) -> Dir:
         system_path = Path.parse('_system', allow_system_path=True)
@@ -1736,6 +1745,9 @@ class Catalog:
     @retry_loop(for_write=False)
     def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
+        return self._collect_tbl_history(tbl_id, n)
+    def _collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
         """
         Returns the history of up to n versions of the table with the given UUID.
@@ -1748,14 +1760,15 @@ class Catalog:
             Each row contains a TableVersion and a TableSchemaVersion object.
         """
         q = (
-            sql.select(schema.TableVersion, schema.TableSchemaVersion)
-            .select_from(schema.TableVersion)
-            .join(
-                schema.TableSchemaVersion,
-                schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version,
-            )
+            sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
+            .where(schema.Table.id == tbl_id)
+            .join(schema.TableVersion)
             .where(schema.TableVersion.tbl_id == tbl_id)
+            .join(schema.TableSchemaVersion)
             .where(schema.TableSchemaVersion.tbl_id == tbl_id)
+            .where(
+                schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version
+            )
             .order_by(schema.TableVersion.version.desc())
         )
         if n is not None:
@@ -1763,7 +1776,7 @@ class Catalog:
         src_rows = Env.get().session.execute(q).fetchall()
         return [
             schema.FullTableMd(
-                None,
+                schema.md_from_dict(schema.TableMd, row.Table.md),
                 schema.md_from_dict(schema.TableVersionMd, row.TableVersion.md),
                 schema.md_from_dict(schema.TableSchemaVersionMd, row.TableSchemaVersion.md),
             )
@@ -1958,11 +1971,13 @@ class Catalog:
         # If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
         # TableVersionPath. We need to prepend it separately.
-        if isinstance(tbl, View) and tbl._snapshot_only:
+        if isinstance(tbl, View) and tbl._is_named_pure_snapshot():
             snapshot_md = self.load_tbl_md(tbl._id, 0)
             md = [snapshot_md, *md]
-        for ancestor_md in md[1:]:
+        for ancestor_md in md:
+            # Set the `is_replica` flag on every ancestor's TableMd.
+            ancestor_md.tbl_md.is_replica = True
             # For replica metadata, we guarantee that the current_version and current_schema_version of TableMd
             # match the corresponding values in TableVersionMd and TableSchemaVersionMd. This is to ensure that,
             # when the metadata is later stored in the catalog of a different Pixeltable instance, the values of
@@ -1970,6 +1985,8 @@ class Catalog:
             # destination catalog.
             ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
             ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
+        for ancestor_md in md[1:]:
             # Also, the table version of every proper ancestor is emphemeral; it does not represent a queryable
             # table version (the data might be incomplete, since we have only retrieved one of its views, not
             # the table itself).
@@ -2022,9 +2039,7 @@ class Catalog:
         tbl_version: TableVersion
         if view_md is None:
             # this is a base table
-            tbl_version = TableVersion(
-                tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views=mutable_views
-            )
+            tbl_version = TableVersion(tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views)
         else:
             assert len(view_md.base_versions) > 0  # a view needs to have a base
             # TODO: add TableVersionMd.is_pure_snapshot() and use that

pixeltable/catalog/table.py CHANGED Viewed

@@ -77,6 +77,17 @@ class Table(SchemaObject):
         self._tbl_version = None
     def _move(self, new_name: str, new_dir_id: UUID) -> None:
+        old_name = self._name
+        old_dir_id = self._dir_id
+        cat = catalog.Catalog.get()
+        @cat.register_undo_action
+        def _() -> None:
+            # TODO: We should really be invalidating the Table instance and forcing a reload.
+            self._name = old_name
+            self._dir_id = old_dir_id
         super()._move(new_name, new_dir_id)
         conn = env.Env.get().conn
         stmt = sql.text(
@@ -625,7 +636,7 @@ class Table(SchemaObject):
                 - `'abort'`: an exception will be raised and the column will not be added.
                 - `'ignore'`: execution will continue and the column will be added. Any rows
                     with errors will have a `None` value for the column, with information about the error stored in the
-                    corresponding `tbl.col_name.errormsg` tbl.col_name.errortype` fields.
+                    corresponding `tbl.col_name.errormsg` and `tbl.col_name.errortype` fields.
             if_exists: Determines the behavior if the column already exists. Must be one of the following:
                 - `'error'`: an exception will be raised.
@@ -986,22 +997,28 @@ class Table(SchemaObject):
         Only `String` and `Image` columns are currently supported. Here's an example that uses a
         [CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:
+        ```
         >>> from pixeltable.functions.huggingface import clip
-        ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
-        ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
+        >>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
+        >>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
+        ```
-        Once the index is created, similiarity lookups can be performed using the `similarity` pseudo-function.
+        Once the index is created, similarity lookups can be performed using the `similarity` pseudo-function:
+        ```
         >>> reference_img = PIL.Image.open('my_image.jpg')
-        ... sim = tbl.img.similarity(reference_img)
-        ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+        >>> sim = tbl.img.similarity(reference_img)
+        >>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+        ```
         If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
         performed using any of its supported types. In our example, CLIP supports both text and images, so we can
         also search for images using a text description:
+        ```
         >>> sim = tbl.img.similarity('a picture of a train')
-        ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+        >>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+        ```
         Args:
             column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
@@ -1032,9 +1049,9 @@ class Table(SchemaObject):
             Add an index to the `img` column of the table `my_table`:
             >>> from pixeltable.functions.huggingface import clip
-            ... tbl = pxt.get_table('my_table')
-            ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
-            ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
+            >>> tbl = pxt.get_table('my_table')
+            >>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
+            >>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
             Alternatively, the `img` column may be specified by name:
@@ -1328,7 +1345,8 @@ class Table(SchemaObject):
             on_error: Literal['abort', 'ignore'] = 'abort',
             print_stats: bool = False,
             **kwargs: Any,
-        )```
+        )
+        ```
         To insert just a single row, you can use the more concise syntax:
@@ -1338,7 +1356,8 @@ class Table(SchemaObject):
             on_error: Literal['abort', 'ignore'] = 'abort',
             print_stats: bool = False,
             **kwargs: Any
-        )```
+        )
+        ```
         Args:
             source: A data source from which data can be imported.
@@ -1459,8 +1478,8 @@ class Table(SchemaObject):
             the row with new `id` 3 (assuming this key does not exist):
             >>> tbl.update(
-                [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
-                if_not_exists='insert')
+            ...     [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
+            ...     if_not_exists='insert')
         """
         from pixeltable.catalog import Catalog

pixeltable/catalog/table_version.py CHANGED Viewed

@@ -24,7 +24,7 @@ from pixeltable.utils.object_stores import ObjectOps
 from ..func.globals import resolve_symbol
 from .column import Column
-from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, is_valid_identifier
+from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, QColumnId, is_valid_identifier
 from .tbl_ops import TableOp
 from .update_status import RowCountStats, UpdateStatus
@@ -96,6 +96,8 @@ class TableVersion:
     cols_by_name: dict[str, Column]
     # contains only columns visible in this version, both system and user
     cols_by_id: dict[int, Column]
+    # all indices defined on this table
+    all_idxs: dict[str, TableVersion.IndexInfo]
     # contains only actively maintained indices
     idxs_by_name: dict[str, TableVersion.IndexInfo]
@@ -129,6 +131,12 @@ class TableVersion:
         base_path: Optional[pxt.catalog.TableVersionPath] = None,
         base: Optional[TableVersionHandle] = None,
     ):
+        from pixeltable import exprs
+        from pixeltable.plan import SampleClause
+        from .table_version_handle import TableVersionHandle
+        from .table_version_path import TableVersionPath
         self.is_validated = True  # a freshly constructed instance is always valid
         self.is_initialized = False
         self.id = id
@@ -141,9 +149,6 @@ class TableVersion:
         self.store_tbl = None
         # mutable tables need their TableVersionPath for expr eval during updates
-        from .table_version_handle import TableVersionHandle
-        from .table_version_path import TableVersionPath
         if self.is_snapshot:
             self.path = None
         else:
@@ -153,9 +158,6 @@ class TableVersion:
             self.path = TableVersionPath(self_handle, base=base_path)
         # view-specific initialization
-        from pixeltable import exprs
-        from pixeltable.plan import SampleClause
         predicate_dict = None if self.view_md is None or self.view_md.predicate is None else self.view_md.predicate
         self.predicate = exprs.Expr.from_dict(predicate_dict) if predicate_dict is not None else None
         sample_dict = None if self.view_md is None or self.view_md.sample_clause is None else self.view_md.sample_clause
@@ -180,6 +182,7 @@ class TableVersion:
         self.cols = []
         self.cols_by_name = {}
         self.cols_by_id = {}
+        self.all_idxs = {}
         self.idxs_by_name = {}
         self.external_stores = {}
@@ -190,9 +193,7 @@ class TableVersion:
         """Create a snapshot copy of this TableVersion"""
         assert not self.is_snapshot
         base = self.path.base.tbl_version if self.is_view else None
-        return TableVersion(
-            self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, mutable_views=[], base=base
-        )
+        return TableVersion(self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, [], base=base)
     @property
     def versioned_name(self) -> str:
@@ -201,6 +202,12 @@ class TableVersion:
         else:
             return f'{self.name}:{self.effective_version}'
+    def __repr__(self) -> str:
+        return (
+            f'TableVersion(id={self.id!r}, name={self.name!r}, '
+            f'version={self.version}, effective_version={self.effective_version})'
+        )
     @property
     def handle(self) -> 'TableVersionHandle':
         from .table_version_handle import TableVersionHandle
@@ -287,12 +294,12 @@ class TableVersion:
         comment: str,
         media_validation: MediaValidation,
     ) -> tuple[UUID, Optional[TableVersion]]:
-        inital_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
+        initial_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
         cat = pxt.catalog.Catalog.get()
-        tbl_id = UUID(hex=inital_md.tbl_md.tbl_id)
+        tbl_id = UUID(hex=initial_md.tbl_md.tbl_id)
         assert (tbl_id, None) not in cat._tbl_versions
-        tbl_version = cls(tbl_id, inital_md.tbl_md, inital_md.version_md, None, inital_md.schema_version_md, [])
+        tbl_version = cls(tbl_id, initial_md.tbl_md, initial_md.version_md, None, initial_md.schema_version_md, [])
         @cat.register_undo_action
         def _() -> None:
@@ -312,8 +319,8 @@ class TableVersion:
             tbl_id=tbl_id,
             dir_id=dir_id,
             tbl_md=tbl_version.tbl_md,
-            version_md=inital_md.version_md,
-            schema_version_md=inital_md.schema_version_md,
+            version_md=initial_md.version_md,
+            schema_version_md=initial_md.schema_version_md,
         )
         return tbl_id, tbl_version
@@ -340,11 +347,14 @@ class TableVersion:
     @classmethod
     def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
+        from .catalog import TableVersionPath
         assert Env.get().in_xact
+        assert md.tbl_md.is_replica
         tbl_id = UUID(md.tbl_md.tbl_id)
         _logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
         view_md = md.tbl_md.view_md
-        base_path = pxt.catalog.TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
+        base_path = TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
         base = base_path.tbl_version if base_path is not None else None
         tbl_version = cls(
             tbl_id,
@@ -366,7 +376,7 @@ class TableVersion:
         cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
         tbl_version.init()
         tbl_version.store_tbl.create()
-        tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
+        tbl_version.store_tbl.ensure_updated_schema()
         return tbl_version
     def delete_media(self, tbl_version: Optional[int] = None) -> None:
@@ -409,8 +419,8 @@ class TableVersion:
     def _init_schema(self) -> None:
         # create columns first, so the indices can reference them
         self._init_cols()
-        if not self.is_snapshot:
-            self._init_idxs()
+        self._init_idxs()
         # create the sa schema only after creating the columns and indices
         self._init_sa_schema()
@@ -448,39 +458,71 @@ class TableVersion:
             #     self._record_refd_columns(col)
     def _init_idxs(self) -> None:
-        # self.idx_md = tbl_md.index_md
-        self.idxs_by_name = {}
-        import pixeltable.index as index_module
         for md in self.tbl_md.index_md.values():
-            if md.schema_version_add > self.schema_version or (
-                md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version
-            ):
-                # index not visible in this schema version
-                continue
-            # instantiate index object
+            # Instantiate index object. This needs to be done for all indices, even those that are not active in this
+            # TableVersion, so that we can make appropriate adjustments to the SA schema.
             cls_name = md.class_fqn.rsplit('.', 1)[-1]
-            cls = getattr(index_module, cls_name)
-            idx_col: Column
-            if md.indexed_col_tbl_id == str(self.id):
-                # this is a reference to one of our columns: avoid TVP.get_column_by_id() here, because we're not fully
-                # initialized yet
-                idx_col = self.cols_by_id[md.indexed_col_id]
-            else:
-                assert self.path.base is not None
-                idx_col = self.path.base.get_column_by_id(UUID(md.indexed_col_tbl_id), md.indexed_col_id)
+            cls = getattr(index, cls_name)
+            idx_col = self._lookup_column(QColumnId(UUID(md.indexed_col_tbl_id), md.indexed_col_id))
+            assert idx_col is not None
             idx = cls.from_dict(idx_col, md.init_args)
+            assert isinstance(idx, index.IndexBase)
+            val_col = next(col for col in self.cols if col.id == md.index_val_col_id)
+            undo_col = next(col for col in self.cols if col.id == md.index_val_undo_col_id)
+            idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
+            self.all_idxs[md.name] = idx_info
             # fix up the sa column type of the index value and undo columns
-            val_col = self.cols_by_id[md.index_val_col_id]
+            # we need to do this for all indices, not just those that are active in this TableVersion, to ensure we get
+            # the correct SA schema in the StoreTable.
             val_col.sa_col_type = idx.index_sa_type()
-            val_col._stores_cellmd = False
-            undo_col = self.cols_by_id[md.index_val_undo_col_id]
             undo_col.sa_col_type = idx.index_sa_type()
+            if not isinstance(idx, index.EmbeddingIndex):
+                # Historically, the intent has been not to store cellmd data, even for embedding indices. However,
+                # the cellmd columns get created anyway, even if stores_cellmd is set to `False` here, due to the
+                # timing of index column creation. In order to ensure that SA schemas align with what is actually in
+                # the physical tables, we keep this `True` for embedding indices.
+                # TODO: Decide whether index columns should store cellmd data.
+                #     - If not, set to `False`, fix the column creation timing issue, and add a migration script to
+                #       remedy existing cellmd columns.
+                #     - If so, remove this TODO.
+                val_col._stores_cellmd = False
             undo_col._stores_cellmd = False
-            idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
-            self.idxs_by_name[md.name] = idx_info
+            # The index is active in this TableVersion provided that:
+            # (i) the TableVersion supports indices (either it's not a snapshot, or it's a replica at
+            #     the head version); and
+            # (ii) the index was created on or before the schema version of this TableVersion; and
+            # (iii) the index was not dropped on or before the schema version of this TableVersion.
+            supports_idxs = self.effective_version is None or (
+                self.tbl_md.is_replica and self.effective_version == self.tbl_md.current_version
+            )
+            if (
+                supports_idxs
+                and md.schema_version_add <= self.schema_version
+                and (md.schema_version_drop is None or md.schema_version_drop > self.schema_version)
+            ):
+                # Since the index is present in this TableVersion, its associated columns must be as well.
+                # Sanity-check this.
+                assert md.indexed_col_id in self.cols_by_id
+                assert md.index_val_col_id in self.cols_by_id
+                assert md.index_val_undo_col_id in self.cols_by_id
+                self.idxs_by_name[md.name] = idx_info
+    def _lookup_column(self, id: QColumnId) -> Column | None:
+        """
+        Look up the column with the given table id and column id, searching through the ancestors of this TableVersion
+        to find it. We avoid referencing TableVersionPath in order to work properly with snapshots as well.
+        This will search through *all* known columns, including columns that are not visible in this TableVersion.
+        """
+        if id.tbl_id == self.id:
+            return next(col for col in self.cols if col.id == id.col_id)
+        elif self.base is not None:
+            return self.base.get()._lookup_column(id)
+        else:
+            return None
     def _init_sa_schema(self) -> None:
         # create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
@@ -1286,8 +1328,6 @@ class TableVersion:
         self._write_md(new_version=False, new_schema_version=False)
         # propagate to views
-        views_str = ', '.join([str(v.id) for v in self.mutable_views])
-        print(f'revert(): mutable_views={views_str}')
         for view in self.mutable_views:
             view.get()._revert()

pixeltable/catalog/table_version_path.py CHANGED Viewed

@@ -195,17 +195,6 @@ class TableVersionPath:
         else:
             return None
-    def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
-        """Return the column for the given tbl/col id"""
-        self.refresh_cached_md()
-        if self.tbl_version.id == tbl_id:
-            assert col_id in self._cached_tbl_version.cols_by_id
-            return self._cached_tbl_version.cols_by_id[col_id]
-        elif self.base is not None:
-            return self.base.get_column_by_id(tbl_id, col_id)
-        else:
-            return None
     def has_column(self, col: Column) -> bool:
         """Return True if this table has the given column."""
         assert col.tbl is not None

pixeltable/catalog/view.py CHANGED Viewed

@@ -252,6 +252,12 @@ class View(Table):
             base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
         )
+    def _is_named_pure_snapshot(self) -> bool:
+        """
+        Returns True if this is a named pure snapshot (i.e., a pure snapshot that is a separate schema object).
+        """
+        return self._id != self._tbl_version_path.tbl_id
     def _is_anonymous_snapshot(self) -> bool:
         """
         Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).

pixeltable/config.py CHANGED Viewed

@@ -163,6 +163,7 @@ KNOWN_CONFIG_OPTIONS = {
         'api_key': 'API key for Pixeltable cloud',
         'r2_profile': 'AWS config profile name used to access R2 storage',
         's3_profile': 'AWS config profile name used to access S3 storage',
+        'b2_profile': 'S3-compatible profile name used to access Backblaze B2 storage',
     },
     'anthropic': {'api_key': 'Anthropic API key'},
     'bedrock': {'api_key': 'AWS Bedrock API key'},

pixeltable/dataframe.py CHANGED Viewed

@@ -1039,7 +1039,7 @@ class DataFrame:
             >>> df = book.order_by(t.price, asc=False).order_by(t.pages)
         """
         if self.sample_clause is not None:
-            raise excs.Error('group_by() cannot be used with sample()')
+            raise excs.Error('order_by() cannot be used with sample()')
         for e in expr_list:
             if not isinstance(e, exprs.Expr):
                 raise excs.Error(f'Invalid expression in order_by(): {e}')

pixeltable/env.py CHANGED Viewed

@@ -355,6 +355,8 @@ class Env:
             # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
             path_parts = list(Path(record.pathname).parts)
             path_parts.reverse()
+            if 'pixeltable' not in path_parts:
+                return False
             max_idx = path_parts.index('pixeltable')
             for module_name in path_parts[:max_idx]:
                 if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
@@ -576,6 +578,12 @@ class Env:
             assert isinstance(tz_name, str)
             self._logger.info(f'Database time zone is now: {tz_name}')
             self._default_time_zone = ZoneInfo(tz_name)
+            if self.is_using_cockroachdb:
+                # This could be set when the database is created, but we set it now
+                conn.execute(sql.text('SET null_ordered_last = true;'))
+                null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
+                assert isinstance(null_ordered_last, str)
+                self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
     def _store_db_exists(self) -> bool:
         assert self._db_name is not None
@@ -752,10 +760,12 @@ class Env:
     def __register_packages(self) -> None:
         """Declare optional packages that are utilized by some parts of the code."""
+        self.__register_package('accelerate')
         self.__register_package('anthropic')
         self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
         self.__register_package('boto3')
         self.__register_package('datasets')
+        self.__register_package('diffusers')
         self.__register_package('fiftyone')
         self.__register_package('fireworks', library_name='fireworks-ai')
         self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
@@ -763,6 +773,7 @@ class Env:
         self.__register_package('groq')
         self.__register_package('huggingface_hub', library_name='huggingface-hub')
         self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
+        self.__register_package('librosa')
         self.__register_package('llama_cpp', library_name='llama-cpp-python')
         self.__register_package('mcp')
         self.__register_package('mistralai')
@@ -775,6 +786,7 @@ class Env:
         self.__register_package('replicate')
         self.__register_package('sentencepiece')
         self.__register_package('sentence_transformers', library_name='sentence-transformers')
+        self.__register_package('soundfile')
         self.__register_package('spacy')
         self.__register_package('tiktoken')
         self.__register_package('together')

pixeltable 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl

Potentially problematic release.

pixeltable 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl