pixeltable 0.4.16__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/catalog/catalog.py +21 -13
- pixeltable/catalog/table_version.py +75 -39
- pixeltable/catalog/table_version_path.py +0 -11
- pixeltable/catalog/view.py +6 -0
- pixeltable/config.py +1 -0
- pixeltable/env.py +8 -0
- pixeltable/exprs/arithmetic_expr.py +13 -7
- pixeltable/functions/video.py +110 -28
- pixeltable/io/globals.py +2 -2
- pixeltable/io/parquet.py +1 -1
- pixeltable/io/table_data_conduit.py +1 -1
- pixeltable/iterators/document.py +61 -23
- pixeltable/iterators/video.py +126 -53
- pixeltable/share/packager.py +155 -26
- pixeltable/store.py +1 -2
- pixeltable/utils/arrow.py +6 -6
- pixeltable/utils/av.py +65 -0
- pixeltable/utils/object_stores.py +16 -1
- pixeltable/utils/s3_store.py +44 -11
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.17.dist-info}/METADATA +28 -28
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.17.dist-info}/RECORD +24 -24
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.16.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/catalog/catalog.py
CHANGED
@@ -472,11 +472,13 @@ class Catalog:
         else:
             msg = ''
         _logger.debug(f'Exception: {e.orig.__class__}: {msg} ({e})')
+        # Suppress the underlying SQL exception unless DEBUG is enabled
+        raise_from = e if _logger.isEnabledFor(logging.DEBUG) else None
         raise excs.Error(
             'That Pixeltable operation could not be completed because it conflicted with another '
             'operation that was run on a different process.\n'
             'Please re-run the operation.'
-        ) from e
+        ) from raise_from

     @property
     def in_write_xact(self) -> bool:
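Note on the change above: it relies on Python's exception-chaining semantics. `raise ... from e` preserves the original exception as `__cause__` (and prints it in the traceback), while `raise ... from None` suppresses both the cause and the implicit context. A standalone sketch of the pattern (toy Error class, not Pixeltable's):

import logging

logging.basicConfig(level=logging.INFO)
_logger = logging.getLogger('pixeltable')

class Error(Exception):
    pass

def translate(e: Exception) -> None:
    # chain from the original exception only when DEBUG logging is enabled
    raise_from = e if _logger.isEnabledFor(logging.DEBUG) else None
    raise Error('Please re-run the operation.') from raise_from

try:
    try:
        raise RuntimeError('serialization conflict')
    except RuntimeError as e:
        translate(e)
except Error as err:
    print(err.__cause__)  # None at INFO level; the original RuntimeError if DEBUG logging is enabled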
@@ -1736,6 +1738,9 @@ class Catalog:

     @retry_loop(for_write=False)
     def collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
+        return self._collect_tbl_history(tbl_id, n)
+
+    def _collect_tbl_history(self, tbl_id: UUID, n: Optional[int]) -> list[schema.FullTableMd]:
         """
         Returns the history of up to n versions of the table with the given UUID.

@@ -1748,14 +1753,15 @@ class Catalog:
         Each row contains a TableVersion and a TableSchemaVersion object.
         """
         q = (
-            sql.select(schema.TableVersion, schema.TableSchemaVersion)
-            .
-            .join(
-                schema.TableSchemaVersion,
-                schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version,
-            )
+            sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
+            .where(schema.Table.id == tbl_id)
+            .join(schema.TableVersion)
             .where(schema.TableVersion.tbl_id == tbl_id)
+            .join(schema.TableSchemaVersion)
             .where(schema.TableSchemaVersion.tbl_id == tbl_id)
+            .where(
+                schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version
+            )
             .order_by(schema.TableVersion.version.desc())
         )
         if n is not None:
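The reworked query now also pulls the Table row, producing a three-way join in a single statement. A freestanding SQLAlchemy sketch of the same shape, using toy Core tables (the real query compares a JSON field cast to Integer; a plain integer column is used here for brevity):

import sqlalchemy as sql

md = sql.MetaData()
tables = sql.Table('tables', md, sql.Column('id', sql.Integer, primary_key=True))
versions = sql.Table(
    'tableversions', md,
    sql.Column('tbl_id', sql.Integer), sql.Column('version', sql.Integer), sql.Column('schema_version', sql.Integer),
)
schema_versions = sql.Table(
    'tableschemaversions', md,
    sql.Column('tbl_id', sql.Integer), sql.Column('schema_version', sql.Integer),
)

tbl_id = 1
q = (
    sql.select(tables, versions, schema_versions)
    .join_from(tables, versions, versions.c.tbl_id == tables.c.id)
    .join(schema_versions, schema_versions.c.tbl_id == tables.c.id)
    .where(tables.c.id == tbl_id)
    # pair each table version with the schema version that was current when it was written
    .where(versions.c.schema_version == schema_versions.c.schema_version)
    .order_by(versions.c.version.desc())
)
print(q)  # renders the three-way join as a single SELECT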
@@ -1763,7 +1769,7 @@ class Catalog:
         src_rows = Env.get().session.execute(q).fetchall()
         return [
             schema.FullTableMd(
-
+                schema.md_from_dict(schema.TableMd, row.Table.md),
                 schema.md_from_dict(schema.TableVersionMd, row.TableVersion.md),
                 schema.md_from_dict(schema.TableSchemaVersionMd, row.TableSchemaVersion.md),
             )
@@ -1958,11 +1964,13 @@ class Catalog:

         # If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
         # TableVersionPath. We need to prepend it separately.
-        if isinstance(tbl, View) and tbl.
+        if isinstance(tbl, View) and tbl._is_named_pure_snapshot():
             snapshot_md = self.load_tbl_md(tbl._id, 0)
             md = [snapshot_md, *md]

-        for ancestor_md in md
+        for ancestor_md in md:
+            # Set the `is_replica` flag on every ancestor's TableMd.
+            ancestor_md.tbl_md.is_replica = True
             # For replica metadata, we guarantee that the current_version and current_schema_version of TableMd
             # match the corresponding values in TableVersionMd and TableSchemaVersionMd. This is to ensure that,
             # when the metadata is later stored in the catalog of a different Pixeltable instance, the values of
@@ -1970,6 +1978,8 @@ class Catalog:
             # destination catalog.
             ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
             ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
+
+        for ancestor_md in md[1:]:
             # Also, the table version of every proper ancestor is emphemeral; it does not represent a queryable
             # table version (the data might be incomplete, since we have only retrieved one of its views, not
             # the table itself).
@@ -2022,9 +2032,7 @@ class Catalog:
         tbl_version: TableVersion
         if view_md is None:
             # this is a base table
-            tbl_version = TableVersion(
-                tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views=mutable_views
-            )
+            tbl_version = TableVersion(tbl_id, tbl_md, version_md, effective_version, schema_version_md, mutable_views)
         else:
             assert len(view_md.base_versions) > 0  # a view needs to have a base
             # TODO: add TableVersionMd.is_pure_snapshot() and use that
pixeltable/catalog/table_version.py
CHANGED

@@ -24,7 +24,7 @@ from pixeltable.utils.object_stores import ObjectOps

 from ..func.globals import resolve_symbol
 from .column import Column
-from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, is_valid_identifier
+from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, MediaValidation, QColumnId, is_valid_identifier
 from .tbl_ops import TableOp
 from .update_status import RowCountStats, UpdateStatus

@@ -190,9 +190,7 @@ class TableVersion:
         """Create a snapshot copy of this TableVersion"""
         assert not self.is_snapshot
         base = self.path.base.tbl_version if self.is_view else None
-        return TableVersion(
-            self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, mutable_views=[], base=base
-        )
+        return TableVersion(self.id, self.tbl_md, self.version_md, self.version, self.schema_version_md, [], base=base)

     @property
     def versioned_name(self) -> str:
@@ -201,6 +199,12 @@ class TableVersion:
         else:
             return f'{self.name}:{self.effective_version}'

+    def __repr__(self) -> str:
+        return (
+            f'TableVersion(id={self.id!r}, name={self.name!r}, '
+            f'version={self.version}, effective_version={self.effective_version})'
+        )
+
     @property
     def handle(self) -> 'TableVersionHandle':
         from .table_version_handle import TableVersionHandle
@@ -287,12 +291,12 @@ class TableVersion:
         comment: str,
         media_validation: MediaValidation,
     ) -> tuple[UUID, Optional[TableVersion]]:
-
+        initial_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
         cat = pxt.catalog.Catalog.get()

-        tbl_id = UUID(hex=
+        tbl_id = UUID(hex=initial_md.tbl_md.tbl_id)
         assert (tbl_id, None) not in cat._tbl_versions
-        tbl_version = cls(tbl_id,
+        tbl_version = cls(tbl_id, initial_md.tbl_md, initial_md.version_md, None, initial_md.schema_version_md, [])

         @cat.register_undo_action
         def _() -> None:
@@ -312,8 +316,8 @@ class TableVersion:
             tbl_id=tbl_id,
             dir_id=dir_id,
             tbl_md=tbl_version.tbl_md,
-            version_md=tbl_version.version_md,
-            schema_version_md=tbl_version.schema_version_md,
+            version_md=initial_md.version_md,
+            schema_version_md=initial_md.schema_version_md,
         )
         return tbl_id, tbl_version
@@ -340,11 +344,14 @@ class TableVersion:

     @classmethod
     def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
+        from .catalog import TableVersionPath
+
         assert Env.get().in_xact
+        assert md.tbl_md.is_replica
         tbl_id = UUID(md.tbl_md.tbl_id)
         _logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
         view_md = md.tbl_md.view_md
-        base_path =
+        base_path = TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
         base = base_path.tbl_version if base_path is not None else None
         tbl_version = cls(
             tbl_id,
@@ -409,8 +416,8 @@ class TableVersion:
     def _init_schema(self) -> None:
         # create columns first, so the indices can reference them
         self._init_cols()
-
-        self._init_idxs()
+        self._init_idxs()
+
         # create the sa schema only after creating the columns and indices
         self._init_sa_schema()

@@ -448,39 +455,70 @@ class TableVersion:
             # self._record_refd_columns(col)

     def _init_idxs(self) -> None:
-        # self.idx_md = tbl_md.index_md
-        self.idxs_by_name = {}
-        import pixeltable.index as index_module
-
         for md in self.tbl_md.index_md.values():
-            if (
-                md.schema_version_add > self.schema_version
-            ):
-                # index not visible in this schema version
-                continue
-
-            # instantiate index object
+            # Instantiate index object. This needs to be done for all indices, even those that are not active in this
+            # TableVersion, so that we can make appropriate adjustments to the SA schema.
             cls_name = md.class_fqn.rsplit('.', 1)[-1]
-            cls = getattr(index_module, cls_name)
-            idx_col
-
-                # this is a reference to one of our columns: avoid TVP.get_column_by_id() here, because we're not fully
-                # initialized yet
-                idx_col = self.cols_by_id[md.indexed_col_id]
-            else:
-                assert self.path.base is not None
-                idx_col = self.path.base.get_column_by_id(UUID(md.indexed_col_tbl_id), md.indexed_col_id)
+            cls = getattr(index, cls_name)
+            idx_col = self._lookup_column(QColumnId(UUID(md.indexed_col_tbl_id), md.indexed_col_id))
+            assert idx_col is not None
             idx = cls.from_dict(idx_col, md.init_args)

             # fix up the sa column type of the index value and undo columns
-            val_col = self.cols_by_id[md.index_val_col_id]
+            # we need to do this for all indices, not just those that are active in this TableVersion, to ensure we get
+            # the correct SA schema in the StoreTable.
+            val_col = next(col for col in self.cols if col.id == md.index_val_col_id)
             val_col.sa_col_type = idx.index_sa_type()
-
-            undo_col = self.cols_by_id[md.index_val_undo_col_id]
+            undo_col = next(col for col in self.cols if col.id == md.index_val_undo_col_id)
             undo_col.sa_col_type = idx.index_sa_type()
+            if not isinstance(idx, index.EmbeddingIndex):
+                # Historically, the intent has been not to store cellmd data, even for embedding indices. However,
+                # the cellmd columns get created anyway, even if stores_cellmd is set to `False` here, due to the
+                # timing of index column creation. In order to ensure that SA schemas align with what is actually in
+                # the physical tables, we keep this `True` for embedding indices.
+                # TODO: Decide whether index columns should store cellmd data.
+                # - If not, set to `False`, fix the column creation timing issue, and add a migration script to
+                #   remedy existing cellmd columns.
+                # - If so, remove this TODO.
+                val_col._stores_cellmd = False
             undo_col._stores_cellmd = False
-            idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
-            self.idxs_by_name[md.name] = idx_info
+
+            # The index is active in this TableVersion provided that:
+            # (i) the TableVersion supports indices (either it's not a snapshot, or it's a replica at
+            #     the head version); and
+            # (ii) the index was created on or before the schema version of this TableVersion; and
+            # (iii) the index was not dropped on or before the schema version of this TableVersion.
+            supports_idxs = self.effective_version is None or (
+                self.tbl_md.is_replica and self.effective_version == self.tbl_md.current_version
+            )
+            if (
+                supports_idxs
+                and md.schema_version_add <= self.schema_version
+                and (md.schema_version_drop is None or md.schema_version_drop > self.schema_version)
+            ):
+                # Since the index is present in this TableVersion, its associated columns must be as well.
+                # Sanity-check this.
+                assert md.indexed_col_id in self.cols_by_id
+                assert md.index_val_col_id in self.cols_by_id
+                assert md.index_val_undo_col_id in self.cols_by_id
+                idx_info = self.IndexInfo(
+                    id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col
+                )
+                self.idxs_by_name[md.name] = idx_info
+
+    def _lookup_column(self, id: QColumnId) -> Column | None:
+        """
+        Look up the column with the given table id and column id, searching through the ancestors of this TableVersion
+        to find it. We avoid referencing TableVersionPath in order to work properly with snapshots as well.
+
+        This will search through *all* known columns, including columns that are not visible in this TableVersion.
+        """
+        if id.tbl_id == self.id:
+            return next(col for col in self.cols if col.id == id.col_id)
+        elif self.base is not None:
+            return self.base.get()._lookup_column(id)
+        else:
+            return None

     def _init_sa_schema(self) -> None:
         # create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
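The "active index" condition introduced above can be restated as a small pure predicate; a sketch with simplified stand-ins for the metadata fields named in the hunk:

from dataclasses import dataclass
from typing import Optional

@dataclass
class IndexMd:  # simplified stand-in for the index metadata in the diff
    schema_version_add: int
    schema_version_drop: Optional[int]

def index_is_active(
    md: IndexMd, schema_version: int, effective_version: Optional[int], is_replica: bool, current_version: int
) -> bool:
    # (i) indices are supported if this isn't a snapshot, or it's a replica at the head version
    supports_idxs = effective_version is None or (is_replica and effective_version == current_version)
    # (ii) created on or before this schema version, and (iii) not dropped on or before it
    return (
        supports_idxs
        and md.schema_version_add <= schema_version
        and (md.schema_version_drop is None or md.schema_version_drop > schema_version)
    )

assert index_is_active(IndexMd(3, None), schema_version=5, effective_version=None, is_replica=False, current_version=7)
assert not index_is_active(IndexMd(3, 5), schema_version=5, effective_version=None, is_replica=False, current_version=7)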
@@ -1286,8 +1324,6 @@ class TableVersion:
         self._write_md(new_version=False, new_schema_version=False)

         # propagate to views
-        views_str = ', '.join([str(v.id) for v in self.mutable_views])
-        print(f'revert(): mutable_views={views_str}')
         for view in self.mutable_views:
             view.get()._revert()

pixeltable/catalog/table_version_path.py
CHANGED

@@ -195,17 +195,6 @@ class TableVersionPath:
         else:
             return None

-    def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
-        """Return the column for the given tbl/col id"""
-        self.refresh_cached_md()
-        if self.tbl_version.id == tbl_id:
-            assert col_id in self._cached_tbl_version.cols_by_id
-            return self._cached_tbl_version.cols_by_id[col_id]
-        elif self.base is not None:
-            return self.base.get_column_by_id(tbl_id, col_id)
-        else:
-            return None
-
     def has_column(self, col: Column) -> bool:
         """Return True if this table has the given column."""
         assert col.tbl is not None
pixeltable/catalog/view.py
CHANGED
@@ -252,6 +252,12 @@ class View(Table):
             base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
         )

+    def _is_named_pure_snapshot(self) -> bool:
+        """
+        Returns True if this is a named pure snapshot (i.e., a pure snapshot that is a separate schema object).
+        """
+        return self._id != self._tbl_version_path.tbl_id
+
     def _is_anonymous_snapshot(self) -> bool:
         """
         Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).
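For context, a hedged sketch of how a named pure snapshot arises through the public API (assumes a working Pixeltable environment and the documented create_snapshot entry point):

import pixeltable as pxt

films = pxt.create_table('films', {'title': pxt.String})
snap = pxt.create_snapshot('films_snap', films)  # named: 'films_snap' is its own schema object,
                                                 # with an id distinct from the underlying table's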
pixeltable/config.py
CHANGED
@@ -163,6 +163,7 @@ KNOWN_CONFIG_OPTIONS = {
         'api_key': 'API key for Pixeltable cloud',
         'r2_profile': 'AWS config profile name used to access R2 storage',
         's3_profile': 'AWS config profile name used to access S3 storage',
+        'b2_profile': 'S3-compatible profile name used to access Backblaze B2 storage',
     },
     'anthropic': {'api_key': 'Anthropic API key'},
     'bedrock': {'api_key': 'AWS Bedrock API key'},
pixeltable/env.py
CHANGED
@@ -355,6 +355,8 @@ class Env:
             # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
             path_parts = list(Path(record.pathname).parts)
             path_parts.reverse()
+            if 'pixeltable' not in path_parts:
+                return False
             max_idx = path_parts.index('pixeltable')
             for module_name in path_parts[:max_idx]:
                 if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
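The added guard matters because list.index() raises ValueError when the element is absent, so a log record originating outside a pixeltable source tree would previously crash the filter. A minimal standalone filter of the same shape (the class wrapper and the trailing return are assumptions; the diff shows only the guard and the loop):

import logging
from pathlib import Path

class ModuleLevelFilter(logging.Filter):
    def __init__(self, module_log_level: dict[str, int]) -> None:
        super().__init__()
        self._module_log_level = module_log_level

    def filter(self, record: logging.LogRecord) -> bool:
        path_parts = list(Path(record.pathname).parts)
        path_parts.reverse()
        if 'pixeltable' not in path_parts:
            return False  # without this guard, index() below raises ValueError for third-party records
        max_idx = path_parts.index('pixeltable')
        for module_name in path_parts[:max_idx]:
            if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
                return True
        return False  # assumed default; not shown in the diff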
@@ -576,6 +578,12 @@ class Env:
         assert isinstance(tz_name, str)
         self._logger.info(f'Database time zone is now: {tz_name}')
         self._default_time_zone = ZoneInfo(tz_name)
+        if self.is_using_cockroachdb:
+            # This could be set when the database is created, but we set it now
+            conn.execute(sql.text('SET null_ordered_last = true;'))
+            null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
+            assert isinstance(null_ordered_last, str)
+            self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')

     def _store_db_exists(self) -> bool:
         assert self._db_name is not None
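Background: CockroachDB orders NULLs first by default, while Postgres orders them last; the session variable aligns the two. Per query, SQLAlchemy can pin the same behavior with nullslast(). A runnable sketch against in-memory SQLite (needs SQLite 3.30+ for NULLS LAST support):

import sqlalchemy as sql

engine = sql.create_engine('sqlite://')
with engine.connect() as conn:
    conn.execute(sql.text('CREATE TABLE t (v INTEGER)'))
    conn.execute(sql.text('INSERT INTO t VALUES (2), (NULL), (1)'))
    t = sql.table('t', sql.column('v'))
    # nullslast() pins NULL placement per query; SET null_ordered_last does it session-wide on CockroachDB
    rows = conn.execute(sql.select(t.c.v).order_by(sql.nullslast(t.c.v.asc()))).fetchall()
    print(rows)  # [(1,), (2,), (None,)]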
pixeltable/exprs/arithmetic_expr.py
CHANGED

@@ -4,7 +4,7 @@ from typing import Any, Optional

 import sqlalchemy as sql

-from pixeltable import exceptions as excs, type_system as ts
+from pixeltable import env, exceptions as excs, type_system as ts

 from .data_row import DataRow
 from .expr import Expr
@@ -64,12 +64,18 @@ class ArithmeticExpr(Expr):
         right = sql_elements.get(self._op2)
         if left is None or right is None:
             return None
-        if self.operator == ArithmeticOperator.ADD:
-            return left + right
-        if self.operator == ArithmeticOperator.SUB:
-            return left - right
-        if self.operator == ArithmeticOperator.MUL:
-            return left * right
+        if self.operator in (ArithmeticOperator.ADD, ArithmeticOperator.SUB, ArithmeticOperator.MUL):
+            if env.Env.get().is_using_cockroachdb and self._op1.col_type != self._op2.col_type:
+                if self._op1.col_type != self.col_type:
+                    left = sql.cast(left, self.col_type.to_sa_type())
+                if self._op2.col_type != self.col_type:
+                    right = sql.cast(right, self.col_type.to_sa_type())
+            if self.operator == ArithmeticOperator.ADD:
+                return left + right
+            if self.operator == ArithmeticOperator.SUB:
+                return left - right
+            if self.operator == ArithmeticOperator.MUL:
+                return left * right
         if self.operator == ArithmeticOperator.DIV:
             assert self.col_type.is_float_type()
             # Avoid division by zero errors by converting any zero divisor to NULL.
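The casts matter because CockroachDB rejects some mixed-type arithmetic that Postgres coerces implicitly. A freestanding illustration of sql.cast on toy columns:

import sqlalchemy as sql

t = sql.table('t', sql.column('i', sql.Integer), sql.column('f', sql.Float))
# cast the integer operand to the float result type before adding, mirroring the CockroachDB branch above
expr = sql.cast(t.c.i, sql.Float) + t.c.f
print(sql.select(expr))  # SELECT CAST(t.i AS FLOAT) + t.f AS anon_1 FROM t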
pixeltable/functions/video.py
CHANGED
@@ -2,10 +2,11 @@
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
 """

+import glob
 import logging
 import pathlib
 import subprocess
-from typing import Literal, NoReturn
+from typing import Any, Literal, NoReturn

 import av
 import av.stream
@@ -358,9 +359,17 @@ def clip(


 @pxt.udf(is_method=True)
-def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
+def segment_video(
+    video: pxt.Video,
+    *,
+    duration: float | None = None,
+    segment_times: list[float] | None = None,
+    mode: Literal['fast', 'accurate'] = 'fast',
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
     """
-    Split a video into
+    Split a video into segments.

     __Requirements:__

@@ -368,7 +377,19 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:

     Args:
         video: Input video file to segment
-        duration:
+        duration: Duration of each segment (in seconds). For `mode='fast'`, this is approximate;
+            for `mode='accurate'`, segments will have exact durations. Cannot be specified together with
+            `segment_times`.
+        segment_times: List of timestamps (in seconds) in video where segments should be split. Note that these are not
+            segment durations. If all segment times are less than the duration of the video, produces exactly
+            `len(segment_times) + 1` segments. Cannot be empty or be specified together with `duration`.
+        mode: Segmentation mode:
+
+            - `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
+            - `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
+        video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+            Only available for `mode='accurate'`.
+        video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.

     Returns:
         List of file paths for the generated video segments.
@@ -377,45 +398,106 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
         pxt.Error: If the video is missing timing information.

     Examples:
-        Split a video at 1 minute intervals
+        Split a video at 1 minute intervals using fast mode:

         >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()

+        Split video into exact 10-second segments with accurate mode, using the libx264 encoder with a CRF of 23 and
+        slow preset (for smaller output files):
+
+        >>> tbl.select(
+        ...     segment_paths=tbl.video.segment_video(
+        ...         duration=10,
+        ...         mode='accurate',
+        ...         video_encoder='libx264',
+        ...         video_encoder_args={'crf': 23, 'preset': 'slow'}
+        ...     )
+        ... ).collect()
+
         Split video into two parts at the midpoint:

         >>> duration = tbl.video.get_duration()
-        >>> tbl.select(segment_paths=tbl.video.segment_video(
+        >>> tbl.select(segment_paths=tbl.video.segment_video(segment_times=[duration / 2])).collect()
     """
     Env.get().require_binary('ffmpeg')
-    if duration <= 0:
+    if duration is not None and segment_times is not None:
+        raise pxt.Error('duration and segment_times cannot both be specified')
+    if duration is not None and duration <= 0:
         raise pxt.Error(f'duration must be positive, got {duration}')
+    if segment_times is not None and len(segment_times) == 0:
+        raise pxt.Error('segment_times cannot be empty')
+    if mode == 'fast':
+        if video_encoder is not None:
+            raise pxt.Error("video_encoder is not supported for mode='fast'")
+        if video_encoder_args is not None:
+            raise pxt.Error("video_encoder_args is not supported for mode='fast'")

     base_path = TempStore.create_path(extension='')

-
-
-
-
-
-
-
+    output_paths: list[str] = []
+    if mode == 'accurate':
+        # Use ffmpeg -f segment for accurate segmentation with re-encoding
+        output_pattern = f'{base_path}_segment_%04d.mp4'
+        cmd = av_utils.ffmpeg_segment_cmd(
+            str(video),
+            output_pattern,
+            segment_duration=duration,
+            segment_times=segment_times,
+            video_encoder=video_encoder,
+            video_encoder_args=video_encoder_args,
+        )

+        try:
             _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
-
-
-
-
-
-
-
+            output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
+            # TODO: is this actually an error?
+            # if len(output_paths) == 0:
+            #     stderr_output = result.stderr.strip() if result.stderr is not None else ''
+            #     raise pxt.Error(
+            #         f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
+            #     )
+            return output_paths
+
+        except subprocess.CalledProcessError as e:
+            _handle_ffmpeg_error(e)

-
-
-
-
-
-
-
+    else:
+        # Fast mode: extract consecutive clips using stream copy (no re-encoding)
+        # This is faster but can only split at keyframes, leading to approximate durations
+        start_time = 0.0
+        segment_idx = 0
+        try:
+            while True:
+                target_duration: float | None
+                if duration is not None:
+                    target_duration = duration
+                elif segment_idx < len(segment_times):
+                    target_duration = segment_times[segment_idx] - start_time
+                else:
+                    target_duration = None  # the rest
+                segment_path = f'{base_path}_segment_{len(output_paths)}.mp4'
+                cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, target_duration)
+
+                _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+                segment_duration = av_utils.get_video_duration(segment_path)
+                if segment_duration == 0.0:
+                    # we're done
+                    pathlib.Path(segment_path).unlink()
+                    return output_paths
+                output_paths.append(segment_path)
+                start_time += segment_duration  # use the actual segment duration here, it won't match duration exactly
+
+                segment_idx += 1
+                if segment_times is not None and segment_idx > len(segment_times):
+                    break
+
+            return output_paths
+
+        except subprocess.CalledProcessError as e:
+            # clean up partial results
+            for segment_path in output_paths:
+                pathlib.Path(segment_path).unlink()
+            _handle_ffmpeg_error(e)


 @pxt.udf(is_method=True)
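The helpers av_utils.ffmpeg_segment_cmd, av_utils.ffmpeg_clip_cmd, and av_utils.get_video_duration live in the new pixeltable/utils/av.py (+65 lines, not shown in this excerpt). A hypothetical sketch of what the accurate-mode command builder might look like, assuming standard ffmpeg segment-muxer flags; the real helper may differ:

from typing import Any

def ffmpeg_segment_cmd(
    input_path: str,
    output_pattern: str,
    segment_duration: float | None = None,
    segment_times: list[float] | None = None,
    video_encoder: str | None = None,
    video_encoder_args: dict[str, Any] | None = None,
) -> list[str]:
    # hypothetical reconstruction; not the actual pixeltable/utils/av.py code
    cmd = ['ffmpeg', '-i', input_path, '-f', 'segment', '-reset_timestamps', '1']
    if segment_duration is not None:
        cmd += ['-segment_time', str(segment_duration)]
    if segment_times is not None:
        cmd += ['-segment_times', ','.join(str(t) for t in segment_times)]
    if video_encoder is not None:
        cmd += ['-c:v', video_encoder]
        for k, v in (video_encoder_args or {}).items():
            cmd += [f'-{k}', str(v)]  # e.g. -crf 23 -preset slow
    cmd.append(output_pattern)
    return cmd

print(ffmpeg_segment_cmd('in.mp4', 'out_%04d.mp4', segment_duration=10.0, video_encoder='libx264'))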
pixeltable/io/globals.py
CHANGED
@@ -152,7 +152,7 @@ def export_images_as_fo_dataset(
     (or expression) containing image data, along with optional additional columns containing labels. Currently, only
     classification and detection labels are supported.

-    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/
+    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial contains a
     fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.

     Images in the dataset that already exist on disk will be exported directly, in whatever format they
@@ -211,7 +211,7 @@ def export_images_as_fo_dataset(
         ...     classifications=tbl.classifications
         ... )

-    See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/
+    See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial
     for a fully worked example.
     """
     Env.get().require_package('fiftyone')
pixeltable/io/parquet.py
CHANGED
@@ -62,7 +62,7 @@ def export_parquet(
     with Catalog.get().begin_xact(for_write=False):
         for record_batch in to_record_batches(df, partition_size_bytes):
             output_path = temp_path / f'part-{batch_num:05d}.parquet'
-            arrow_tbl = pa.Table.from_batches([record_batch])
+            arrow_tbl = pa.Table.from_batches([record_batch])
             pa.parquet.write_table(arrow_tbl, str(output_path))
             batch_num += 1

pixeltable/io/table_data_conduit.py
CHANGED

@@ -528,7 +528,7 @@ class ParquetTableDataConduit(TableDataConduit):
         from pixeltable.utils.arrow import iter_tuples2

         try:
-            for fragment in self.pq_ds.fragments:
+            for fragment in self.pq_ds.fragments:
                 for batch in fragment.to_batches():
                     dict_batch = list(iter_tuples2(batch, self.source_column_map, self.pxt_schema))
                     self.total_rows += len(dict_batch)