pixeltable 0.4.16__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -472,11 +472,13 @@ class Catalog:
             else:
                 msg = ''
             _logger.debug(f'Exception: {e.orig.__class__}: {msg} ({e})')
+            # Suppress the underlying SQL exception unless DEBUG is enabled
+            raise_from = e if _logger.isEnabledFor(logging.DEBUG) else None
             raise excs.Error(
                 'That Pixeltable operation could not be completed because it conflicted with another '
                 'operation that was run on a different process.\n'
                 'Please re-run the operation.'
-            ) from None
+            ) from raise_from

     @property
     def in_write_xact(self) -> bool:
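The `raise_from` pattern above relies on standard Python exception chaining: `raise X from Y` records `Y` as `X.__cause__`, while `from None` suppresses chaining entirely. A minimal standalone sketch of the same idea (all names here are illustrative, not Pixeltable APIs):

    import logging

    logger = logging.getLogger('demo')

    def translate(e: Exception) -> None:
        # Chain the original exception only when DEBUG logging is enabled;
        # 'from None' always hides it, 'from e' always surfaces it.
        raise_from = e if logger.isEnabledFor(logging.DEBUG) else None
        raise RuntimeError('operation conflicted; please re-run') from raise_from

    try:
        translate(ValueError('serialization failure'))
    except RuntimeError as exc:
        print(exc.__cause__)  # None unless DEBUG was enabled when translate() raised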
@@ -1736,6 +1738,9 @@ class Catalog:

     @retry_loop(for_write=False)
     def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
+        return self._collect_tbl_history(tbl_id, n)
+
+    def _collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
         """
         Returns the history of up to n versions of the table with the given UUID.

@@ -1748,14 +1753,15 @@
         Each row contains a TableVersion and a TableSchemaVersion object.
         """
         q = (
-            sql.select(schema.TableVersion, schema.TableSchemaVersion)
-            .select_from(schema.TableVersion)
-            .join(
-                schema.TableSchemaVersion,
-                schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version,
-            )
+            sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
+            .where(schema.Table.id == tbl_id)
+            .join(schema.TableVersion)
             .where(schema.TableVersion.tbl_id == tbl_id)
+            .join(schema.TableSchemaVersion)
             .where(schema.TableSchemaVersion.tbl_id == tbl_id)
+            .where(
+                schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version
+            )
             .order_by(schema.TableVersion.version.desc())
         )
         if n is not None:
@@ -1763,7 +1769,7 @@
         src_rows = Env.get().session.execute(q).fetchall()
         return [
             schema.FullTableMd(
-                None,
+                schema.md_from_dict(schema.TableMd, row.Table.md),
                 schema.md_from_dict(schema.TableVersionMd, row.TableVersion.md),
                 schema.md_from_dict(schema.TableSchemaVersionMd, row.TableSchemaVersion.md),
             )
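The rewritten query additionally selects `schema.Table`, so each returned `FullTableMd` now carries the table's `TableMd` instead of `None`. The chained `select().where().join()` style used here can be sanity-checked with plain SQLAlchemy; a toy sketch (these tables are stand-ins, not Pixeltable's schema):

    import sqlalchemy as sql

    md = sql.MetaData()
    tbls = sql.Table('tbls', md, sql.Column('id', sql.Integer, primary_key=True))
    vers = sql.Table(
        'vers', md,
        sql.Column('tbl_id', sql.Integer, sql.ForeignKey('tbls.id')),
        sql.Column('version', sql.Integer),
    )

    # select() establishes the FROM list; join() infers the ON clause from the FK
    q = (
        sql.select(tbls, vers)
        .where(tbls.c.id == 1)
        .join(vers)
        .order_by(vers.c.version.desc())
    )
    print(q.compile(compile_kwargs={'literal_binds': True}))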
@@ -1958,11 +1964,13 @@ class Catalog:

         # If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
         # TableVersionPath. We need to prepend it separately.
-        if isinstance(tbl, View) and tbl._snapshot_only:
+        if isinstance(tbl, View) and tbl._is_named_pure_snapshot():
             snapshot_md = self.load_tbl_md(tbl._id, 0)
             md = [snapshot_md, *md]

-        for ancestor_md in md[1:]:
+        for ancestor_md in md:
+            # Set the `is_replica` flag on every ancestor's TableMd.
+            ancestor_md.tbl_md.is_replica = True
             # For replica metadata, we guarantee that the current_version and current_schema_version of TableMd
             # match the corresponding values in TableVersionMd and TableSchemaVersionMd. This is to ensure that,
             # when the metadata is later stored in the catalog of a different Pixeltable instance, the values of
@@ -1970,6 +1978,8 @@
             # destination catalog.
             ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
             ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
+
+        for ancestor_md in md[1:]:
             # Also, the table version of every proper ancestor is emphemeral; it does not represent a queryable
             # table version (the data might be incomplete, since we have only retrieved one of its views, not
             # the table itself).
@@ -2022,9 +2032,7 @@
         tbl_version: TableVersion
         if view_md is None:
             # this is a base table
-            tbl_version = TableVersion(
-                tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views=mutable_views
-            )
+            tbl_version = TableVersion(tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views)
         else:
             assert len(view_md.base_versions) > 0  # a view needs to have a base
             # TODO: add TableVersionMd.is_pure_snapshot() and use that
@@ -24,7 +24,7 @@ from pixeltable.utils.object_stores import ObjectOps

 from ..func.globals import resolve_symbol
 from .column import Column
-from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, is_valid_identifier
+from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, QColumnId, is_valid_identifier
 from .tbl_ops import TableOp
 from .update_status import RowCountStats, UpdateStatus

@@ -190,9 +190,7 @@ class TableVersion:
         """Create a snapshot copy of this TableVersion"""
         assert not self.is_snapshot
         base = self.path.base.tbl_version if self.is_view else None
-        return TableVersion(
-            self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, mutable_views=[], base=base
-        )
+        return TableVersion(self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, [], base=base)

     @property
     def versioned_name(self) -> str:
@@ -201,6 +199,12 @@ class TableVersion:
         else:
             return f'{self.name}:{self.effective_version}'

+    def __repr__(self) -> str:
+        return (
+            f'TableVersion(id={self.id!r}, name={self.name!r}, '
+            f'version={self.version}, effective_version={self.effective_version})'
+        )
+
     @property
     def handle(self) -> 'TableVersionHandle':
         from .table_version_handle import TableVersionHandle
@@ -287,12 +291,12 @@ class TableVersion:
         comment: str,
         media_validation: MediaValidation,
     ) -> tuple[UUID, Optional[TableVersion]]:
-        inital_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
+        initial_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
         cat = pxt.catalog.Catalog.get()

-        tbl_id = UUID(hex=inital_md.tbl_md.tbl_id)
+        tbl_id = UUID(hex=initial_md.tbl_md.tbl_id)
         assert (tbl_id, None) not in cat._tbl_versions
-        tbl_version = cls(tbl_id, inital_md.tbl_md, inital_md.version_md, None, inital_md.schema_version_md, [])
+        tbl_version = cls(tbl_id, initial_md.tbl_md, initial_md.version_md, None, initial_md.schema_version_md, [])

         @cat.register_undo_action
         def _() -> None:
@@ -312,8 +316,8 @@ class TableVersion:
             tbl_id=tbl_id,
             dir_id=dir_id,
             tbl_md=tbl_version.tbl_md,
-            version_md=inital_md.version_md,
-            schema_version_md=inital_md.schema_version_md,
+            version_md=initial_md.version_md,
+            schema_version_md=initial_md.schema_version_md,
         )
         return tbl_id, tbl_version

@@ -340,11 +344,14 @@ class TableVersion:

     @classmethod
     def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
+        from .catalog import TableVersionPath
+
         assert Env.get().in_xact
+        assert md.tbl_md.is_replica
         tbl_id = UUID(md.tbl_md.tbl_id)
         _logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
         view_md = md.tbl_md.view_md
-        base_path = pxt.catalog.TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
+        base_path = TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
         base = base_path.tbl_version if base_path is not None else None
         tbl_version = cls(
             tbl_id,
@@ -409,8 +416,8 @@ class TableVersion:
     def _init_schema(self) -> None:
         # create columns first, so the indices can reference them
         self._init_cols()
-        if not self.is_snapshot:
-            self._init_idxs()
+        self._init_idxs()
+
         # create the sa schema only after creating the columns and indices
         self._init_sa_schema()

@@ -448,39 +455,70 @@ class TableVersion:
         # self._record_refd_columns(col)

     def _init_idxs(self) -> None:
-        # self.idx_md = tbl_md.index_md
-        self.idxs_by_name = {}
-        import pixeltable.index as index_module
-
         for md in self.tbl_md.index_md.values():
-            if md.schema_version_add > self.schema_version or (
-                md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version
-            ):
-                # index not visible in this schema version
-                continue
-
-            # instantiate index object
+            # Instantiate index object. This needs to be done for all indices, even those that are not active in this
+            # TableVersion, so that we can make appropriate adjustments to the SA schema.
             cls_name = md.class_fqn.rsplit('.', 1)[-1]
-            cls = getattr(index_module, cls_name)
-            idx_col: Column
-            if md.indexed_col_tbl_id == str(self.id):
-                # this is a reference to one of our columns: avoid TVP.get_column_by_id() here, because we're not fully
-                # initialized yet
-                idx_col = self.cols_by_id[md.indexed_col_id]
-            else:
-                assert self.path.base is not None
-                idx_col = self.path.base.get_column_by_id(UUID(md.indexed_col_tbl_id), md.indexed_col_id)
+            cls = getattr(index, cls_name)
+            idx_col = self._lookup_column(QColumnId(UUID(md.indexed_col_tbl_id), md.indexed_col_id))
+            assert idx_col is not None
             idx = cls.from_dict(idx_col, md.init_args)

             # fix up the sa column type of the index value and undo columns
-            val_col = self.cols_by_id[md.index_val_col_id]
+            # we need to do this for all indices, not just those that are active in this TableVersion, to ensure we get
+            # the correct SA schema in the StoreTable.
+            val_col = next(col for col in self.cols if col.id == md.index_val_col_id)
             val_col.sa_col_type = idx.index_sa_type()
-            val_col._stores_cellmd = False
-            undo_col = self.cols_by_id[md.index_val_undo_col_id]
+            undo_col = next(col for col in self.cols if col.id == md.index_val_undo_col_id)
             undo_col.sa_col_type = idx.index_sa_type()
+            if not isinstance(idx, index.EmbeddingIndex):
+                # Historically, the intent has been not to store cellmd data, even for embedding indices. However,
+                # the cellmd columns get created anyway, even if stores_cellmd is set to `False` here, due to the
+                # timing of index column creation. In order to ensure that SA schemas align with what is actually in
+                # the physical tables, we keep this `True` for embedding indices.
+                # TODO: Decide whether index columns should store cellmd data.
+                #   - If not, set to `False`, fix the column creation timing issue, and add a migration script to
+                #     remedy existing cellmd columns.
+                #   - If so, remove this TODO.
+                val_col._stores_cellmd = False
                 undo_col._stores_cellmd = False
-            idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
-            self.idxs_by_name[md.name] = idx_info
+
+            # The index is active in this TableVersion provided that:
+            # (i) the TableVersion supports indices (either it's not a snapshot, or it's a replica at
+            #     the head version); and
+            # (ii) the index was created on or before the schema version of this TableVersion; and
+            # (iii) the index was not dropped on or before the schema version of this TableVersion.
+            supports_idxs = self.effective_version is None or (
+                self.tbl_md.is_replica and self.effective_version == self.tbl_md.current_version
+            )
+            if (
+                supports_idxs
+                and md.schema_version_add <= self.schema_version
+                and (md.schema_version_drop is None or md.schema_version_drop > self.schema_version)
+            ):
+                # Since the index is present in this TableVersion, its associated columns must be as well.
+                # Sanity-check this.
+                assert md.indexed_col_id in self.cols_by_id
+                assert md.index_val_col_id in self.cols_by_id
+                assert md.index_val_undo_col_id in self.cols_by_id
+                idx_info = self.IndexInfo(
+                    id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col
+                )
+                self.idxs_by_name[md.name] = idx_info
+
+    def _lookup_column(self, id: QColumnId) -> Column | None:
+        """
+        Look up the column with the given table id and column id, searching through the ancestors of this TableVersion
+        to find it. We avoid referencing TableVersionPath in order to work properly with snapshots as well.
+
+        This will search through *all* known columns, including columns that are not visible in this TableVersion.
+        """
+        if id.tbl_id == self.id:
+            return next(col for col in self.cols if col.id == id.col_id)
+        elif self.base is not None:
+            return self.base.get()._lookup_column(id)
+        else:
+            return None

     def _init_sa_schema(self) -> None:
         # create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
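The new `_lookup_column` walks the base-table chain directly rather than going through `TableVersionPath.get_column_by_id` (which is removed below). A standalone sketch of the same recursion over a simplified structure (stand-in types, not Pixeltable's):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Col:
        id: int

    @dataclass
    class TV:  # simplified stand-in for TableVersion
        id: int
        cols: list[Col]
        base: Optional['TV'] = None

        def lookup(self, tbl_id: int, col_id: int) -> Optional[Col]:
            # check our own columns first, then recurse into the base chain
            if tbl_id == self.id:
                return next((c for c in self.cols if c.id == col_id), None)
            if self.base is not None:
                return self.base.lookup(tbl_id, col_id)
            return None

    base = TV(id=1, cols=[Col(id=7)])
    view = TV(id=2, cols=[Col(id=3)], base=base)
    assert view.lookup(1, 7) is base.cols[0]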
@@ -1286,8 +1324,6 @@ class TableVersion:
         self._write_md(new_version=False, new_schema_version=False)

         # propagate to views
-        views_str = ', '.join([str(v.id) for v in self.mutable_views])
-        print(f'revert(): mutable_views={views_str}')
         for view in self.mutable_views:
             view.get()._revert()

@@ -195,17 +195,6 @@ class TableVersionPath:
         else:
             return None

-    def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
-        """Return the column for the given tbl/col id"""
-        self.refresh_cached_md()
-        if self.tbl_version.id == tbl_id:
-            assert col_id in self._cached_tbl_version.cols_by_id
-            return self._cached_tbl_version.cols_by_id[col_id]
-        elif self.base is not None:
-            return self.base.get_column_by_id(tbl_id, col_id)
-        else:
-            return None
-
     def has_column(self, col: Column) -> bool:
         """Return True if this table has the given column."""
         assert col.tbl is not None
@@ -252,6 +252,12 @@ class View(Table):
             base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
         )

+    def _is_named_pure_snapshot(self) -> bool:
+        """
+        Returns True if this is a named pure snapshot (i.e., a pure snapshot that is a separate schema object).
+        """
+        return self._id != self._tbl_version_path.tbl_id
+
     def _is_anonymous_snapshot(self) -> bool:
         """
         Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).
pixeltable/config.py CHANGED
@@ -163,6 +163,7 @@ KNOWN_CONFIG_OPTIONS = {
         'api_key': 'API key for Pixeltable cloud',
         'r2_profile': 'AWS config profile name used to access R2 storage',
         's3_profile': 'AWS config profile name used to access S3 storage',
+        'b2_profile': 'S3-compatible profile name used to access Backblaze B2 storage',
     },
     'anthropic': {'api_key': 'Anthropic API key'},
     'bedrock': {'api_key': 'AWS Bedrock API key'},
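Backblaze B2 exposes an S3-compatible API, so `b2_profile` presumably works like the existing `r2_profile`/`s3_profile` options: a named profile in the AWS config, combined with a custom endpoint. A hedged boto3 illustration (profile name and endpoint URL are placeholders; this is not Pixeltable's internal resolution code):

    import boto3

    # hypothetical profile 'b2' defined in ~/.aws/credentials
    session = boto3.Session(profile_name='b2')
    s3 = session.client('s3', endpoint_url='https://s3.us-west-004.backblazeb2.com')
    print([b['Name'] for b in s3.list_buckets()['Buckets']])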
pixeltable/env.py CHANGED
@@ -355,6 +355,8 @@ class Env:
         # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
         path_parts = list(Path(record.pathname).parts)
         path_parts.reverse()
+        if 'pixeltable' not in path_parts:
+            return False
         max_idx = path_parts.index('pixeltable')
         for module_name in path_parts[:max_idx]:
             if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
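This guard fixes a real failure mode: `list.index()` raises `ValueError` when the element is absent, so a log record whose path contains no `pixeltable` component would previously have crashed the filter. In isolation:

    path_parts = ['myproject', 'src', 'app.py']  # no 'pixeltable' component
    try:
        path_parts.index('pixeltable')
    except ValueError:
        # without the early 'return False', this exception would escape the
        # logging filter for records emitted outside the pixeltable package
        pass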
@@ -576,6 +578,12 @@ class Env:
         assert isinstance(tz_name, str)
         self._logger.info(f'Database time zone is now: {tz_name}')
         self._default_time_zone = ZoneInfo(tz_name)
+        if self.is_using_cockroachdb:
+            # This could be set when the database is created, but we set it now
+            conn.execute(sql.text('SET null_ordered_last = true;'))
+            null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
+            assert isinstance(null_ordered_last, str)
+            self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')

     def _store_db_exists(self) -> bool:
         assert self._db_name is not None
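Context for the setting: CockroachDB sorts NULLs first in ascending order by default, whereas PostgreSQL sorts them last; `SET null_ordered_last = true` aligns CockroachDB with the PostgreSQL behavior the rest of the code assumes. For comparison, the portable per-query alternative (generic SQLAlchemy, not Pixeltable code):

    import sqlalchemy as sql

    t = sql.table('t', sql.column('x'))
    # make NULL placement explicit instead of relying on a session variable
    q = sql.select(t).order_by(sql.nulls_last(t.c.x))
    print(q)  # SELECT t.x FROM t ORDER BY t.x NULLS LAST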
@@ -4,7 +4,7 @@ from typing import Any, Optional

 import sqlalchemy as sql

-from pixeltable import exceptions as excs, type_system as ts
+from pixeltable import env, exceptions as excs, type_system as ts

 from .data_row import DataRow
 from .expr import Expr
@@ -64,12 +64,18 @@ class ArithmeticExpr(Expr):
         right = sql_elements.get(self._op2)
         if left is None or right is None:
             return None
-        if self.operator == ArithmeticOperator.ADD:
-            return left + right
-        if self.operator == ArithmeticOperator.SUB:
-            return left - right
-        if self.operator == ArithmeticOperator.MUL:
-            return left * right
+        if self.operator in (ArithmeticOperator.ADD, ArithmeticOperator.SUB, ArithmeticOperator.MUL):
+            if env.Env.get().is_using_cockroachdb and self._op1.col_type != self._op2.col_type:
+                if self._op1.col_type != self.col_type:
+                    left = sql.cast(left, self.col_type.to_sa_type())
+                if self._op2.col_type != self.col_type:
+                    right = sql.cast(right, self.col_type.to_sa_type())
+            if self.operator == ArithmeticOperator.ADD:
+                return left + right
+            if self.operator == ArithmeticOperator.SUB:
+                return left - right
+            if self.operator == ArithmeticOperator.MUL:
+                return left * right
         if self.operator == ArithmeticOperator.DIV:
             assert self.col_type.is_float_type()
             # Avoid division by zero errors by converting any zero divisor to NULL.
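The CockroachDB branch widens mismatched operand types to the expression's result type before applying the operator, since CockroachDB rejects some mixed-type arithmetic that PostgreSQL coerces implicitly. The cast pattern in isolation (toy columns, generic SQLAlchemy):

    import sqlalchemy as sql

    t = sql.table('t', sql.column('i'), sql.column('f'))
    left = sql.cast(t.c.i, sql.Float)  # lift the integer operand to the result type
    print(sql.select(left + t.c.f))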
@@ -2,10 +2,11 @@
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
 """

+import glob
 import logging
 import pathlib
 import subprocess
-from typing import Literal, NoReturn
+from typing import Any, Literal, NoReturn

 import av
 import av.stream
@@ -358,9 +359,17 @@ def clip(


 @pxt.udf(is_method=True)
-def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
+def segment_video(
+    video: pxt.Video,
+    *,
+    duration: float | None = None,
+    segment_times: list[float] | None = None,
+    mode: Literal['fast', 'accurate'] = 'fast',
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
     """
-    Split a video into fixed-size segments.
+    Split a video into segments.

     __Requirements:__

@@ -368,7 +377,19 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:

     Args:
         video: Input video file to segment
-        duration: Approximate duration of each segment (in seconds).
+        duration: Duration of each segment (in seconds). For `mode='fast'`, this is approximate;
+            for `mode='accurate'`, segments will have exact durations. Cannot be specified together with
+            `segment_times`.
+        segment_times: List of timestamps (in seconds) in video where segments should be split. Note that these are not
+            segment durations. If all segment times are less than the duration of the video, produces exactly
+            `len(segment_times) + 1` segments. Cannot be empty or be specified together with `duration`.
+        mode: Segmentation mode:
+
+            - `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
+            - `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
+        video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+            Only available for `mode='accurate'`.
+        video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.

     Returns:
         List of file paths for the generated video segments.
@@ -377,45 +398,106 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
         pxt.Error: If the video is missing timing information.

     Examples:
-        Split a video at 1 minute intervals
+        Split a video at 1 minute intervals using fast mode:

         >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()

+        Split video into exact 10-second segments with accurate mode, using the libx264 encoder with a CRF of 23 and
+        slow preset (for smaller output files):
+
+        >>> tbl.select(
+        ...     segment_paths=tbl.video.segment_video(
+        ...         duration=10,
+        ...         mode='accurate',
+        ...         video_encoder='libx264',
+        ...         video_encoder_args={'crf': 23, 'preset': 'slow'}
+        ...     )
+        ... ).collect()
+
         Split video into two parts at the midpoint:

         >>> duration = tbl.video.get_duration()
-        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=duration / 2 + 1)).collect()
+        >>> tbl.select(segment_paths=tbl.video.segment_video(segment_times=[duration / 2])).collect()
     """
     Env.get().require_binary('ffmpeg')
-    if duration <= 0:
+    if duration is not None and segment_times is not None:
+        raise pxt.Error('duration and segment_times cannot both be specified')
+    if duration is not None and duration <= 0:
         raise pxt.Error(f'duration must be positive, got {duration}')
+    if segment_times is not None and len(segment_times) == 0:
+        raise pxt.Error('segment_times cannot be empty')
+    if mode == 'fast':
+        if video_encoder is not None:
+            raise pxt.Error("video_encoder is not supported for mode='fast'")
+        if video_encoder_args is not None:
+            raise pxt.Error("video_encoder_args is not supported for mode='fast'")

     base_path = TempStore.create_path(extension='')

-    # we extract consecutive clips instead of running ffmpeg -f segment, which is inexplicably much slower
-    start_time = 0.0
-    result: list[str] = []
-    try:
-        while True:
-            segment_path = f'{base_path}_segment_{len(result)}.mp4'
-            cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, duration)
+    output_paths: list[str] = []
+    if mode == 'accurate':
+        # Use ffmpeg -f segment for accurate segmentation with re-encoding
+        output_pattern = f'{base_path}_segment_%04d.mp4'
+        cmd = av_utils.ffmpeg_segment_cmd(
+            str(video),
+            output_pattern,
+            segment_duration=duration,
+            segment_times=segment_times,
+            video_encoder=video_encoder,
+            video_encoder_args=video_encoder_args,
+        )

+        try:
             _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
-            segment_duration = av_utils.get_video_duration(segment_path)
-            if segment_duration == 0.0:
-                # we're done
-                pathlib.Path(segment_path).unlink()
-                return result
-            result.append(segment_path)
-            start_time += segment_duration  # use the actual segment duration here, it won't match duration exactly
+            output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
+            # TODO: is this actually an error?
+            # if len(output_paths) == 0:
+            #     stderr_output = result.stderr.strip() if result.stderr is not None else ''
+            #     raise pxt.Error(
+            #         f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
+            #     )
+            return output_paths
+
+        except subprocess.CalledProcessError as e:
+            _handle_ffmpeg_error(e)

-        return result
-
-    except subprocess.CalledProcessError as e:
-        # clean up partial results
-        for segment_path in result:
-            pathlib.Path(segment_path).unlink()
-        _handle_ffmpeg_error(e)
+    else:
+        # Fast mode: extract consecutive clips using stream copy (no re-encoding)
+        # This is faster but can only split at keyframes, leading to approximate durations
+        start_time = 0.0
+        segment_idx = 0
+        try:
+            while True:
+                target_duration: float | None
+                if duration is not None:
+                    target_duration = duration
+                elif segment_idx < len(segment_times):
+                    target_duration = segment_times[segment_idx] - start_time
+                else:
+                    target_duration = None  # the rest
+                segment_path = f'{base_path}_segment_{len(output_paths)}.mp4'
+                cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, target_duration)
+
+                _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+                segment_duration = av_utils.get_video_duration(segment_path)
+                if segment_duration == 0.0:
+                    # we're done
+                    pathlib.Path(segment_path).unlink()
+                    return output_paths
+                output_paths.append(segment_path)
+                start_time += segment_duration  # use the actual segment duration here, it won't match duration exactly
+
+                segment_idx += 1
+                if segment_times is not None and segment_idx > len(segment_times):
+                    break
+
+            return output_paths
+
+        except subprocess.CalledProcessError as e:
+            # clean up partial results
+            for segment_path in output_paths:
+                pathlib.Path(segment_path).unlink()
+            _handle_ffmpeg_error(e)


 @pxt.udf(is_method=True)
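The two modes correspond to two different ffmpeg invocation styles: fast mode repeatedly clips with stream copy, while accurate mode re-encodes through the segment muxer. `av_utils.ffmpeg_segment_cmd` itself is not shown in this diff, so the command shapes below are a hedged guess at what the helpers produce, not the actual implementation:

    # approximate command shapes only; the real commands come from
    # av_utils.ffmpeg_clip_cmd / av_utils.ffmpeg_segment_cmd (not in this diff)
    fast_cmd = [
        'ffmpeg', '-ss', '0.0', '-i', 'in.mp4',
        '-t', '60', '-c', 'copy',  # stream copy: fast, but cuts land on keyframes
        'out_segment_0.mp4',
    ]
    accurate_cmd = [
        'ffmpeg', '-i', 'in.mp4',
        '-f', 'segment', '-segment_time', '60',  # re-encode: exact cut points
        '-c:v', 'libx264', '-crf', '23',
        'out_segment_%04d.mp4',
    ]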
pixeltable/io/globals.py CHANGED
@@ -152,7 +152,7 @@ def export_images_as_fo_dataset(
     (or expression) containing image data, along with optional additional columns containing labels. Currently, only
     classification and detection labels are supported.

-    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
+    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial contains a
     fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.

     Images in the dataset that already exist on disk will be exported directly, in whatever format they
@@ -211,7 +211,7 @@ def export_images_as_fo_dataset(
         ...     classifications=tbl.classifications
         ... )

-    See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
+    See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial
     for a fully worked example.
     """
     Env.get().require_package('fiftyone')
pixeltable/io/parquet.py CHANGED
@@ -62,7 +62,7 @@ def export_parquet(
     with Catalog.get().begin_xact(for_write=False):
         for record_batch in to_record_batches(df, partition_size_bytes):
             output_path = temp_path / f'part-{batch_num:05d}.parquet'
-            arrow_tbl = pa.Table.from_batches([record_batch])  # type: ignore
+            arrow_tbl = pa.Table.from_batches([record_batch])
             pa.parquet.write_table(arrow_tbl, str(output_path))
             batch_num += 1

@@ -528,7 +528,7 @@ class ParquetTableDataConduit(TableDataConduit):
         from pixeltable.utils.arrow import iter_tuples2

         try:
-            for fragment in self.pq_ds.fragments:  # type: ignore[attr-defined]
+            for fragment in self.pq_ds.fragments:
                 for batch in fragment.to_batches():
                     dict_batch = list(iter_tuples2(batch, self.source_column_map, self.pxt_schema))
                     self.total_rows += len(dict_batch)