pixeltable 0.4.16__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -472,11 +472,13 @@ class Catalog:
             else:
                 msg = ''
             _logger.debug(f'Exception: {e.orig.__class__}: {msg} ({e})')
+            # Suppress the underlying SQL exception unless DEBUG is enabled
+            raise_from = e if _logger.isEnabledFor(logging.DEBUG) else None
             raise excs.Error(
                 'That Pixeltable operation could not be completed because it conflicted with another '
                 'operation that was run on a different process.\n'
                 'Please re-run the operation.'
-            ) from None
+            ) from raise_from

     @property
     def in_write_xact(self) -> bool:
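The `raise_from` pattern above relies on standard Python exception chaining: `raise X from Y` records `Y` as `X.__cause__`, while `from None` suppresses chaining entirely. A minimal standalone sketch of the same idea (all names here are illustrative, not Pixeltable APIs):

    import logging

    logger = logging.getLogger('demo')

    def translate(e: Exception) -> None:
        # Chain the original exception only when DEBUG logging is enabled;
        # 'from None' always hides it, 'from e' always surfaces it.
        raise_from = e if logger.isEnabledFor(logging.DEBUG) else None
        raise RuntimeError('operation conflicted; please re-run') from raise_from

    try:
        translate(ValueError('serialization failure'))
    except RuntimeError as exc:
        print(exc.__cause__)  # None unless DEBUG was enabled when translate() raised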
@@ -1736,6 +1738,9 @@ class Catalog:

     @retry_loop(for_write=False)
     def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
+        return self._collect_tbl_history(tbl_id, n)
+
+    def _collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
         """
         Returns the history of up to n versions of the table with the given UUID.

@@ -1748,14 +1753,15 @@
         Each row contains a TableVersion and a TableSchemaVersion object.
         """
         q = (
-            sql.select(schema.TableVersion, schema.TableSchemaVersion)
-            .select_from(schema.TableVersion)
-            .join(
-                schema.TableSchemaVersion,
-                schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version,
-            )
+            sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
+            .where(schema.Table.id == tbl_id)
+            .join(schema.TableVersion)
             .where(schema.TableVersion.tbl_id == tbl_id)
+            .join(schema.TableSchemaVersion)
             .where(schema.TableSchemaVersion.tbl_id == tbl_id)
+            .where(
+                schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version
+            )
             .order_by(schema.TableVersion.version.desc())
         )
         if n is not None:
@@ -1763,7 +1769,7 @@
         src_rows = Env.get().session.execute(q).fetchall()
         return [
             schema.FullTableMd(
-                None,
+                schema.md_from_dict(schema.TableMd, row.Table.md),
                 schema.md_from_dict(schema.TableVersionMd, row.TableVersion.md),
                 schema.md_from_dict(schema.TableSchemaVersionMd, row.TableSchemaVersion.md),
             )
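The rewritten query additionally selects `schema.Table`, so each returned `FullTableMd` now carries the table's `TableMd` instead of `None`. The chained `select().where().join()` style used here can be sanity-checked with plain SQLAlchemy; a toy sketch (these tables are stand-ins, not Pixeltable's schema):

    import sqlalchemy as sql

    md = sql.MetaData()
    tbls = sql.Table('tbls', md, sql.Column('id', sql.Integer, primary_key=True))
    vers = sql.Table(
        'vers', md,
        sql.Column('tbl_id', sql.Integer, sql.ForeignKey('tbls.id')),
        sql.Column('version', sql.Integer),
    )

    # select() establishes the FROM list; join() infers the ON clause from the FK
    q = (
        sql.select(tbls, vers)
        .where(tbls.c.id == 1)
        .join(vers)
        .order_by(vers.c.version.desc())
    )
    print(q.compile(compile_kwargs={'literal_binds': True}))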
@@ -1958,11 +1964,13 @@ class Catalog:

         # If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
         # TableVersionPath. We need to prepend it separately.
-        if isinstance(tbl, View) and tbl._snapshot_only:
+        if isinstance(tbl, View) and tbl._is_named_pure_snapshot():
             snapshot_md = self.load_tbl_md(tbl._id, 0)
             md = [snapshot_md, *md]

-        for ancestor_md in md[1:]:
+        for ancestor_md in md:
+            # Set the `is_replica` flag on every ancestor's TableMd.
+            ancestor_md.tbl_md.is_replica = True
             # For replica metadata, we guarantee that the current_version and current_schema_version of TableMd
             # match the corresponding values in TableVersionMd and TableSchemaVersionMd. This is to ensure that,
             # when the metadata is later stored in the catalog of a different Pixeltable instance, the values of
@@ -1970,6 +1978,8 @@
             # destination catalog.
             ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
             ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
+
+        for ancestor_md in md[1:]:
             # Also, the table version of every proper ancestor is emphemeral; it does not represent a queryable
             # table version (the data might be incomplete, since we have only retrieved one of its views, not
             # the table itself).
@@ -2022,9 +2032,7 @@
         tbl_version: TableVersion
         if view_md is None:
             # this is a base table
-            tbl_version = TableVersion(
-                tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views=mutable_views
-            )
+            tbl_version = TableVersion(tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views)
         else:
             assert len(view_md.base_versions) > 0  # a view needs to have a base
             # TODO: add TableVersionMd.is_pure_snapshot() and use that
@@ -24,7 +24,7 @@ from pixeltable.utils.object_stores import ObjectOps

 from ..func.globals import resolve_symbol
 from .column import Column
-from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, is_valid_identifier
+from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, QColumnId, is_valid_identifier
 from .tbl_ops import TableOp
 from .update_status import RowCountStats, UpdateStatus

@@ -190,9 +190,7 @@ class TableVersion:
         """Create a snapshot copy of this TableVersion"""
         assert not self.is_snapshot
         base = self.path.base.tbl_version if self.is_view else None
-        return TableVersion(
-            self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, mutable_views=[], base=base
-        )
+        return TableVersion(self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, [], base=base)

     @property
     def versioned_name(self) -> str:
@@ -201,6 +199,12 @@ class TableVersion:
         else:
             return f'{self.name}:{self.effective_version}'

+    def __repr__(self) -> str:
+        return (
+            f'TableVersion(id={self.id!r}, name={self.name!r}, '
+            f'version={self.version}, effective_version={self.effective_version})'
+        )
+
     @property
     def handle(self) -> 'TableVersionHandle':
         from .table_version_handle import TableVersionHandle
@@ -287,12 +291,12 @@ class TableVersion:
         comment: str,
         media_validation: MediaValidation,
     ) -> tuple[UUID, Optional[TableVersion]]:
-        inital_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
+        initial_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
         cat = pxt.catalog.Catalog.get()

-        tbl_id = UUID(hex=inital_md.tbl_md.tbl_id)
+        tbl_id = UUID(hex=initial_md.tbl_md.tbl_id)
         assert (tbl_id, None) not in cat._tbl_versions
-        tbl_version = cls(tbl_id, inital_md.tbl_md, inital_md.version_md, None, inital_md.schema_version_md, [])
+        tbl_version = cls(tbl_id, initial_md.tbl_md, initial_md.version_md, None, initial_md.schema_version_md, [])

         @cat.register_undo_action
         def _() -> None:
@@ -312,8 +316,8 @@ class TableVersion:
             tbl_id=tbl_id,
             dir_id=dir_id,
             tbl_md=tbl_version.tbl_md,
-            version_md=inital_md.version_md,
-            schema_version_md=inital_md.schema_version_md,
+            version_md=initial_md.version_md,
+            schema_version_md=initial_md.schema_version_md,
         )
         return tbl_id, tbl_version

@@ -340,11 +344,14 @@ class TableVersion:

     @classmethod
     def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
+        from .catalog import TableVersionPath
+
         assert Env.get().in_xact
+        assert md.tbl_md.is_replica
         tbl_id = UUID(md.tbl_md.tbl_id)
         _logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
         view_md = md.tbl_md.view_md
-        base_path = pxt.catalog.TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
+        base_path = TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
         base = base_path.tbl_version if base_path is not None else None
         tbl_version = cls(
             tbl_id,
@@ -409,8 +416,8 @@ class TableVersion:
     def _init_schema(self) -> None:
         # create columns first, so the indices can reference them
         self._init_cols()
-        if not self.is_snapshot:
-            self._init_idxs()
+        self._init_idxs()
+
         # create the sa schema only after creating the columns and indices
         self._init_sa_schema()

@@ -448,39 +455,70 @@ class TableVersion:
         # self._record_refd_columns(col)

     def _init_idxs(self) -> None:
-        # self.idx_md = tbl_md.index_md
-        self.idxs_by_name = {}
-        import pixeltable.index as index_module
-
         for md in self.tbl_md.index_md.values():
-            if md.schema_version_add > self.schema_version or (
-                md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version
-            ):
-                # index not visible in this schema version
-                continue
-
-            # instantiate index object
+            # Instantiate index object. This needs to be done for all indices, even those that are not active in this
+            # TableVersion, so that we can make appropriate adjustments to the SA schema.
             cls_name = md.class_fqn.rsplit('.', 1)[-1]
-            cls = getattr(index_module, cls_name)
-            idx_col: Column
-            if md.indexed_col_tbl_id == str(self.id):
-                # this is a reference to one of our columns: avoid TVP.get_column_by_id() here, because we're not fully
-                # initialized yet
-                idx_col = self.cols_by_id[md.indexed_col_id]
-            else:
-                assert self.path.base is not None
-                idx_col = self.path.base.get_column_by_id(UUID(md.indexed_col_tbl_id), md.indexed_col_id)
+            cls = getattr(index, cls_name)
+            idx_col = self._lookup_column(QColumnId(UUID(md.indexed_col_tbl_id), md.indexed_col_id))
+            assert idx_col is not None
             idx = cls.from_dict(idx_col, md.init_args)

             # fix up the sa column type of the index value and undo columns
-            val_col = self.cols_by_id[md.index_val_col_id]
+            # we need to do this for all indices, not just those that are active in this TableVersion, to ensure we get
+            # the correct SA schema in the StoreTable.
+            val_col = next(col for col in self.cols if col.id == md.index_val_col_id)
             val_col.sa_col_type = idx.index_sa_type()
-            val_col._stores_cellmd = False
-            undo_col = self.cols_by_id[md.index_val_undo_col_id]
+            undo_col = next(col for col in self.cols if col.id == md.index_val_undo_col_id)
             undo_col.sa_col_type = idx.index_sa_type()
+            if not isinstance(idx, index.EmbeddingIndex):
+                # Historically, the intent has been not to store cellmd data, even for embedding indices. However,
+                # the cellmd columns get created anyway, even if stores_cellmd is set to `False` here, due to the
+                # timing of index column creation. In order to ensure that SA schemas align with what is actually in
+                # the physical tables, we keep this `True` for embedding indices.
+                # TODO: Decide whether index columns should store cellmd data.
+                #   - If not, set to `False`, fix the column creation timing issue, and add a migration script to
+                #     remedy existing cellmd columns.
+                #   - If so, remove this TODO.
+                val_col._stores_cellmd = False
                 undo_col._stores_cellmd = False
-            idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
-            self.idxs_by_name[md.name] = idx_info
+
+            # The index is active in this TableVersion provided that:
+            # (i) the TableVersion supports indices (either it's not a snapshot, or it's a replica at
+            #     the head version); and
+            # (ii) the index was created on or before the schema version of this TableVersion; and
+            # (iii) the index was not dropped on or before the schema version of this TableVersion.
+            supports_idxs = self.effective_version is None or (
+                self.tbl_md.is_replica and self.effective_version == self.tbl_md.current_version
+            )
+            if (
+                supports_idxs
+                and md.schema_version_add <= self.schema_version
+                and (md.schema_version_drop is None or md.schema_version_drop > self.schema_version)
+            ):
+                # Since the index is present in this TableVersion, its associated columns must be as well.
+                # Sanity-check this.
+                assert md.indexed_col_id in self.cols_by_id
+                assert md.index_val_col_id in self.cols_by_id
+                assert md.index_val_undo_col_id in self.cols_by_id
+                idx_info = self.IndexInfo(
+                    id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col
+                )
+                self.idxs_by_name[md.name] = idx_info
+
+    def _lookup_column(self, id: QColumnId) -> Column | None:
+        """
+        Look up the column with the given table id and column id, searching through the ancestors of this TableVersion
+        to find it. We avoid referencing TableVersionPath in order to work properly with snapshots as well.
+
+        This will search through *all* known columns, including columns that are not visible in this TableVersion.
+        """
+        if id.tbl_id == self.id:
+            return next(col for col in self.cols if col.id == id.col_id)
+        elif self.base is not None:
+            return self.base.get()._lookup_column(id)
+        else:
+            return None

     def _init_sa_schema(self) -> None:
         # create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
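The new `_lookup_column` walks the base-table chain directly rather than going through `TableVersionPath.get_column_by_id` (which is removed below). A standalone sketch of the same recursion over a simplified structure (stand-in types, not Pixeltable's):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Col:
        id: int

    @dataclass
    class TV:  # simplified stand-in for TableVersion
        id: int
        cols: list[Col]
        base: Optional['TV'] = None

        def lookup(self, tbl_id: int, col_id: int) -> Optional[Col]:
            # check our own columns first, then recurse into the base chain
            if tbl_id == self.id:
                return next((c for c in self.cols if c.id == col_id), None)
            if self.base is not None:
                return self.base.lookup(tbl_id, col_id)
            return None

    base = TV(id=1, cols=[Col(id=7)])
    view = TV(id=2, cols=[Col(id=3)], base=base)
    assert view.lookup(1, 7) is base.cols[0]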
@@ -1286,8 +1324,6 @@ class TableVersion:
         self._write_md(new_version=False, new_schema_version=False)

         # propagate to views
-        views_str = ', '.join([str(v.id) for v in self.mutable_views])
-        print(f'revert(): mutable_views={views_str}')
         for view in self.mutable_views:
             view.get()._revert()

@@ -195,17 +195,6 @@ class TableVersionPath:
         else:
             return None

-    def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
-        """Return the column for the given tbl/col id"""
-        self.refresh_cached_md()
-        if self.tbl_version.id == tbl_id:
-            assert col_id in self._cached_tbl_version.cols_by_id
-            return self._cached_tbl_version.cols_by_id[col_id]
-        elif self.base is not None:
-            return self.base.get_column_by_id(tbl_id, col_id)
-        else:
-            return None
-
     def has_column(self, col: Column) -> bool:
         """Return True if this table has the given column."""
         assert col.tbl is not None
@@ -252,6 +252,12 @@ class View(Table):
             base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
         )

+    def _is_named_pure_snapshot(self) -> bool:
+        """
+        Returns True if this is a named pure snapshot (i.e., a pure snapshot that is a separate schema object).
+        """
+        return self._id != self._tbl_version_path.tbl_id
+
     def _is_anonymous_snapshot(self) -> bool:
         """
         Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).
pixeltable/config.py CHANGED
@@ -163,6 +163,7 @@ KNOWN_CONFIG_OPTIONS = {
         'api_key': 'API key for Pixeltable cloud',
         'r2_profile': 'AWS config profile name used to access R2 storage',
         's3_profile': 'AWS config profile name used to access S3 storage',
+        'b2_profile': 'S3-compatible profile name used to access Backblaze B2 storage',
     },
     'anthropic': {'api_key': 'Anthropic API key'},
     'bedrock': {'api_key': 'AWS Bedrock API key'},
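Backblaze B2 exposes an S3-compatible API, so `b2_profile` presumably works like the existing `r2_profile`/`s3_profile` options: a named profile in the AWS config, combined with a custom endpoint. A hedged boto3 illustration (profile name and endpoint URL are placeholders; this is not Pixeltable's internal resolution code):

    import boto3

    # hypothetical profile 'b2' defined in ~/.aws/credentials
    session = boto3.Session(profile_name='b2')
    s3 = session.client('s3', endpoint_url='https://s3.us-west-004.backblazeb2.com')
    print([b['Name'] for b in s3.list_buckets()['Buckets']])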
pixeltable/env.py CHANGED
@@ -355,6 +355,8 @@ class Env:
         # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
         path_parts = list(Path(record.pathname).parts)
         path_parts.reverse()
+        if 'pixeltable' not in path_parts:
+            return False
         max_idx = path_parts.index('pixeltable')
         for module_name in path_parts[:max_idx]:
             if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
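This guard fixes a real failure mode: `list.index()` raises `ValueError` when the element is absent, so a log record whose path contains no `pixeltable` component would previously have crashed the filter. In isolation:

    path_parts = ['myproject', 'src', 'app.py']  # no 'pixeltable' component
    try:
        path_parts.index('pixeltable')
    except ValueError:
        # without the early 'return False', this exception would escape the
        # logging filter for records emitted outside the pixeltable package
        pass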
@@ -576,6 +578,12 @@ class Env:
         assert isinstance(tz_name, str)
         self._logger.info(f'Database time zone is now: {tz_name}')
         self._default_time_zone = ZoneInfo(tz_name)
+        if self.is_using_cockroachdb:
+            # This could be set when the database is created, but we set it now
+            conn.execute(sql.text('SET null_ordered_last = true;'))
+            null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
+            assert isinstance(null_ordered_last, str)
+            self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')

     def _store_db_exists(self) -> bool:
         assert self._db_name is not None
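Context for the setting: CockroachDB sorts NULLs first in ascending order by default, whereas PostgreSQL sorts them last; `SET null_ordered_last = true` aligns CockroachDB with the PostgreSQL behavior the rest of the code assumes. For comparison, the portable per-query alternative (generic SQLAlchemy, not Pixeltable code):

    import sqlalchemy as sql

    t = sql.table('t', sql.column('x'))
    # make NULL placement explicit instead of relying on a session variable
    q = sql.select(t).order_by(sql.nulls_last(t.c.x))
    print(q)  # SELECT t.x FROM t ORDER BY t.x NULLS LAST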
@@ -4,7 +4,7 @@ from typing import Any, Optional

 import sqlalchemy as sql

-from pixeltable import exceptions as excs, type_system as ts
+from pixeltable import env, exceptions as excs, type_system as ts

 from .data_row import DataRow
 from .expr import Expr
@@ -64,12 +64,18 @@ class ArithmeticExpr(Expr):
         right = sql_elements.get(self._op2)
         if left is None or right is None:
             return None
-        if self.operator == ArithmeticOperator.ADD:
-            return left + right
-        if self.operator == ArithmeticOperator.SUB:
-            return left - right
-        if self.operator == ArithmeticOperator.MUL:
-            return left * right
+        if self.operator in (ArithmeticOperator.ADD, ArithmeticOperator.SUB, ArithmeticOperator.MUL):
+            if env.Env.get().is_using_cockroachdb and self._op1.col_type != self._op2.col_type:
+                if self._op1.col_type != self.col_type:
+                    left = sql.cast(left, self.col_type.to_sa_type())
+                if self._op2.col_type != self.col_type:
+                    right = sql.cast(right, self.col_type.to_sa_type())
+            if self.operator == ArithmeticOperator.ADD:
+                return left + right
+            if self.operator == ArithmeticOperator.SUB:
+                return left - right
+            if self.operator == ArithmeticOperator.MUL:
+                return left * right
         if self.operator == ArithmeticOperator.DIV:
             assert self.col_type.is_float_type()
             # Avoid division by zero errors by converting any zero divisor to NULL.
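The CockroachDB branch widens mismatched operand types to the expression's result type before applying the operator, since CockroachDB rejects some mixed-type arithmetic that PostgreSQL coerces implicitly. The cast pattern in isolation (toy columns, generic SQLAlchemy):

    import sqlalchemy as sql

    t = sql.table('t', sql.column('i'), sql.column('f'))
    left = sql.cast(t.c.i, sql.Float)  # lift the integer operand to the result type
    print(sql.select(left + t.c.f))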
@@ -2,10 +2,11 @@
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
 """

+import glob
 import logging
 import pathlib
 import subprocess
-from typing import Literal, NoReturn
+from typing import Any, Literal, NoReturn

 import av
 import av.stream
@@ -358,9 +359,17 @@ def clip(


 @pxt.udf(is_method=True)
-def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
+def segment_video(
+    video: pxt.Video,
+    *,
+    duration: float | None = None,
+    segment_times: list[float] | None = None,
+    mode: Literal['fast', 'accurate'] = 'fast',
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
     """
-    Split a video into fixed-size segments.
+    Split a video into segments.

     __Requirements:__

@@ -368,7 +377,19 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:

     Args:
         video: Input video file to segment
-        duration: Approximate duration of each segment (in seconds).
+        duration: Duration of each segment (in seconds). For `mode='fast'`, this is approximate;
+            for `mode='accurate'`, segments will have exact durations. Cannot be specified together with
+            `segment_times`.
+        segment_times: List of timestamps (in seconds) in video where segments should be split. Note that these are not
+            segment durations. If all segment times are less than the duration of the video, produces exactly
+            `len(segment_times) + 1` segments. Cannot be empty or be specified together with `duration`.
+        mode: Segmentation mode:
+
+            - `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
+            - `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
+        video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+            Only available for `mode='accurate'`.
+        video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.

     Returns:
         List of file paths for the generated video segments.
@@ -377,45 +398,106 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
         pxt.Error: If the video is missing timing information.

     Examples:
-        Split a video at 1 minute intervals
+        Split a video at 1 minute intervals using fast mode:

         >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()

+        Split video into exact 10-second segments with accurate mode, using the libx264 encoder with a CRF of 23 and
+        slow preset (for smaller output files):
+
+        >>> tbl.select(
+        ...     segment_paths=tbl.video.segment_video(
+        ...         duration=10,
+        ...         mode='accurate',
+        ...         video_encoder='libx264',
+        ...         video_encoder_args={'crf': 23, 'preset': 'slow'}
+        ...     )
+        ... ).collect()
+
         Split video into two parts at the midpoint:

         >>> duration = tbl.video.get_duration()
-        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=duration / 2 + 1)).collect()
+        >>> tbl.select(segment_paths=tbl.video.segment_video(segment_times=[duration / 2])).collect()
     """
     Env.get().require_binary('ffmpeg')
-    if duration <= 0:
+    if duration is not None and segment_times is not None:
+        raise pxt.Error('duration and segment_times cannot both be specified')
+    if duration is not None and duration <= 0:
         raise pxt.Error(f'duration must be positive, got {duration}')
+    if segment_times is not None and len(segment_times) == 0:
+        raise pxt.Error('segment_times cannot be empty')
+    if mode == 'fast':
+        if video_encoder is not None:
+            raise pxt.Error("video_encoder is not supported for mode='fast'")
+        if video_encoder_args is not None:
+            raise pxt.Error("video_encoder_args is not supported for mode='fast'")

     base_path = TempStore.create_path(extension='')

-    # we extract consecutive clips instead of running ffmpeg -f segment, which is inexplicably much slower
-    start_time = 0.0
-    result: list[str] = []
-    try:
-        while True:
-            segment_path = f'{base_path}_segment_{len(result)}.mp4'
-            cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, duration)
+    output_paths: list[str] = []
+    if mode == 'accurate':
+        # Use ffmpeg -f segment for accurate segmentation with re-encoding
+        output_pattern = f'{base_path}_segment_%04d.mp4'
+        cmd = av_utils.ffmpeg_segment_cmd(
+            str(video),
+            output_pattern,
+            segment_duration=duration,
+            segment_times=segment_times,
+            video_encoder=video_encoder,
+            video_encoder_args=video_encoder_args,
+        )

+        try:
             _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
-            segment_duration = av_utils.get_video_duration(segment_path)
-            if segment_duration == 0.0:
-                # we're done
-                pathlib.Path(segment_path).unlink()
-                return result
-            result.append(segment_path)
-            start_time += segment_duration  # use the actual segment duration here, it won't match duration exactly
+            output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
+            # TODO: is this actually an error?
+            # if len(output_paths) == 0:
+            #     stderr_output = result.stderr.strip() if result.stderr is not None else ''
+            #     raise pxt.Error(
+            #         f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
+            #     )
+            return output_paths
+
+        except subprocess.CalledProcessError as e:
+            _handle_ffmpeg_error(e)

-        return result
-
-    except subprocess.CalledProcessError as e:
-        # clean up partial results
-        for segment_path in result:
-            pathlib.Path(segment_path).unlink()
-        _handle_ffmpeg_error(e)
+    else:
+        # Fast mode: extract consecutive clips using stream copy (no re-encoding)
+        # This is faster but can only split at keyframes, leading to approximate durations
+        start_time = 0.0
+        segment_idx = 0
+        try:
+            while True:
+                target_duration: float | None
+                if duration is not None:
+                    target_duration = duration
+                elif segment_idx < len(segment_times):
+                    target_duration = segment_times[segment_idx] - start_time
+                else:
+                    target_duration = None  # the rest
+                segment_path = f'{base_path}_segment_{len(output_paths)}.mp4'
+                cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, target_duration)
+
+                _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+                segment_duration = av_utils.get_video_duration(segment_path)
+                if segment_duration == 0.0:
+                    # we're done
+                    pathlib.Path(segment_path).unlink()
+                    return output_paths
+                output_paths.append(segment_path)
+                start_time += segment_duration  # use the actual segment duration here, it won't match duration exactly
+
+                segment_idx += 1
+                if segment_times is not None and segment_idx > len(segment_times):
+                    break
+
+            return output_paths
+
+        except subprocess.CalledProcessError as e:
+            # clean up partial results
+            for segment_path in output_paths:
+                pathlib.Path(segment_path).unlink()
+            _handle_ffmpeg_error(e)


 @pxt.udf(is_method=True)
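The two modes correspond to two different ffmpeg invocation styles: fast mode repeatedly clips with stream copy, while accurate mode re-encodes through the segment muxer. `av_utils.ffmpeg_segment_cmd` itself is not shown in this diff, so the command shapes below are a hedged guess at what the helpers produce, not the actual implementation:

    # approximate command shapes only; the real commands come from
    # av_utils.ffmpeg_clip_cmd / av_utils.ffmpeg_segment_cmd (not in this diff)
    fast_cmd = [
        'ffmpeg', '-ss', '0.0', '-i', 'in.mp4',
        '-t', '60', '-c', 'copy',  # stream copy: fast, but cuts land on keyframes
        'out_segment_0.mp4',
    ]
    accurate_cmd = [
        'ffmpeg', '-i', 'in.mp4',
        '-f', 'segment', '-segment_time', '60',  # re-encode: exact cut points
        '-c:v', 'libx264', '-crf', '23',
        'out_segment_%04d.mp4',
    ]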
pixeltable/io/globals.py CHANGED
@@ -152,7 +152,7 @@ def export_images_as_fo_dataset(
     (or expression) containing image data, along with optional additional columns containing labels. Currently, only
     classification and detection labels are supported.

-    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
+    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial contains a
     fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.

     Images in the dataset that already exist on disk will be exported directly, in whatever format they
@@ -211,7 +211,7 @@ def export_images_as_fo_dataset(
         ...     classifications=tbl.classifications
         ... )

-    See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
+    See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial
     for a fully worked example.
     """
     Env.get().require_package('fiftyone')
pixeltable/io/parquet.py CHANGED
@@ -62,7 +62,7 @@ def export_parquet(
     with Catalog.get().begin_xact(for_write=False):
         for record_batch in to_record_batches(df, partition_size_bytes):
             output_path = temp_path / f'part-{batch_num:05d}.parquet'
-            arrow_tbl = pa.Table.from_batches([record_batch])  # type: ignore
+            arrow_tbl = pa.Table.from_batches([record_batch])
             pa.parquet.write_table(arrow_tbl, str(output_path))
             batch_num += 1

@@ -528,7 +528,7 @@ class ParquetTableDataConduit(TableDataConduit):
         from pixeltable.utils.arrow import iter_tuples2

         try:
-            for fragment in self.pq_ds.fragments:  # type: ignore[attr-defined]
+            for fragment in self.pq_ds.fragments:
                 for batch in fragment.to_batches():
                     dict_batch = list(iter_tuples2(batch, self.source_column_map, self.pxt_schema))
                     self.total_rows += len(dict_batch)