pixeltable 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -10
- pixeltable/catalog/catalog.py +139 -59
- pixeltable/catalog/column.py +32 -23
- pixeltable/catalog/globals.py +2 -45
- pixeltable/catalog/insertable_table.py +5 -2
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/table.py +173 -23
- pixeltable/catalog/table_version.py +156 -92
- pixeltable/catalog/table_version_handle.py +26 -1
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +12 -3
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +29 -0
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exprs/column_property_ref.py +23 -20
- pixeltable/exprs/column_ref.py +24 -18
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +46 -14
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/video.py +57 -10
- pixeltable/globals.py +61 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +39 -64
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +52 -48
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +14 -2
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +26 -18
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +121 -142
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +39 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/table.py
CHANGED
|
@@ -6,9 +6,10 @@ import json
|
|
|
6
6
|
import logging
|
|
7
7
|
from keyword import iskeyword as is_python_keyword
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, Union, overload
|
|
9
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Literal, Optional, Union, overload
|
|
10
10
|
|
|
11
11
|
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
12
|
+
import datetime
|
|
12
13
|
from uuid import UUID
|
|
13
14
|
|
|
14
15
|
import pandas as pd
|
|
@@ -17,6 +18,7 @@ import sqlalchemy as sql
|
|
|
17
18
|
import pixeltable as pxt
|
|
18
19
|
from pixeltable import catalog, env, exceptions as excs, exprs, index, type_system as ts
|
|
19
20
|
from pixeltable.metadata import schema
|
|
21
|
+
from pixeltable.metadata.utils import MetadataUtils
|
|
20
22
|
|
|
21
23
|
from ..exprs import ColumnRef
|
|
22
24
|
from ..utils.description_helper import DescriptionHelper
|
|
@@ -27,13 +29,13 @@ from .globals import (
|
|
|
27
29
|
IfExistsParam,
|
|
28
30
|
IfNotExistsParam,
|
|
29
31
|
MediaValidation,
|
|
30
|
-
UpdateStatus,
|
|
31
32
|
is_system_column_name,
|
|
32
33
|
is_valid_identifier,
|
|
33
34
|
)
|
|
34
35
|
from .schema_object import SchemaObject
|
|
35
36
|
from .table_version_handle import TableVersionHandle
|
|
36
37
|
from .table_version_path import TableVersionPath
|
|
38
|
+
from .update_status import UpdateStatus
|
|
37
39
|
|
|
38
40
|
if TYPE_CHECKING:
|
|
39
41
|
import torch.utils.data
|
|
@@ -107,8 +109,6 @@ class Table(SchemaObject):
|
|
|
107
109
|
|
|
108
110
|
def _get_metadata(self) -> dict[str, Any]:
|
|
109
111
|
md = super()._get_metadata()
|
|
110
|
-
base = self._get_base_table()
|
|
111
|
-
md['base'] = base._path() if base is not None else None
|
|
112
112
|
md['schema'] = self._get_schema()
|
|
113
113
|
md['is_replica'] = self._tbl_version_path.is_replica()
|
|
114
114
|
md['version'] = self._get_version()
|
|
@@ -508,15 +508,16 @@ class Table(SchemaObject):
|
|
|
508
508
|
for cname in cols_to_ignore:
|
|
509
509
|
assert cname in col_schema
|
|
510
510
|
del col_schema[cname]
|
|
511
|
+
result = UpdateStatus()
|
|
511
512
|
if len(col_schema) == 0:
|
|
512
|
-
return
|
|
513
|
+
return result
|
|
513
514
|
new_cols = self._create_columns(col_schema)
|
|
514
515
|
for new_col in new_cols:
|
|
515
516
|
self._verify_column(new_col)
|
|
516
517
|
assert self._tbl_version is not None
|
|
517
|
-
|
|
518
|
+
result += self._tbl_version.get().add_columns(new_cols, print_stats=False, on_error='abort')
|
|
518
519
|
FileCache.get().emit_eviction_warnings()
|
|
519
|
-
return
|
|
520
|
+
return result
|
|
520
521
|
|
|
521
522
|
def add_column(
|
|
522
523
|
self,
|
|
@@ -593,7 +594,7 @@ class Table(SchemaObject):
|
|
|
593
594
|
- `'abort'`: an exception will be raised and the column will not be added.
|
|
594
595
|
- `'ignore'`: execution will continue and the column will be added. Any rows
|
|
595
596
|
with errors will have a `None` value for the column, with information about the error stored in the
|
|
596
|
-
corresponding `tbl.col_name.
|
|
597
|
+
corresponding `tbl.col_name.errormsg` and `tbl.col_name.errortype` fields.
|
|
597
598
|
if_exists: Determines the behavior if the column already exists. Must be one of the following:
|
|
598
599
|
|
|
599
600
|
- `'error'`: an exception will be raised.
|
|
@@ -640,10 +641,10 @@ class Table(SchemaObject):
|
|
|
640
641
|
# Raise an error if the column expression refers to a column error property
|
|
641
642
|
if isinstance(spec, exprs.Expr):
|
|
642
643
|
for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
|
|
643
|
-
if e.
|
|
644
|
+
if e.is_cellmd_prop():
|
|
644
645
|
raise excs.Error(
|
|
645
|
-
'Use of a reference to
|
|
646
|
-
f'
|
|
646
|
+
f'Use of a reference to the {e.prop.name.lower()!r} property of another column '
|
|
647
|
+
f'is not allowed in a computed column.'
|
|
647
648
|
)
|
|
648
649
|
|
|
649
650
|
# handle existing columns based on if_exists parameter
|
|
@@ -652,16 +653,17 @@ class Table(SchemaObject):
|
|
|
652
653
|
)
|
|
653
654
|
# if the column to add already exists and user asked to ignore
|
|
654
655
|
# existing column, there's nothing to do.
|
|
656
|
+
result = UpdateStatus()
|
|
655
657
|
if len(cols_to_ignore) != 0:
|
|
656
658
|
assert cols_to_ignore[0] == col_name
|
|
657
|
-
return
|
|
659
|
+
return result
|
|
658
660
|
|
|
659
661
|
new_col = self._create_columns({col_name: col_schema})[0]
|
|
660
662
|
self._verify_column(new_col)
|
|
661
663
|
assert self._tbl_version is not None
|
|
662
|
-
|
|
664
|
+
result += self._tbl_version.get().add_columns([new_col], print_stats=print_stats, on_error=on_error)
|
|
663
665
|
FileCache.get().emit_eviction_warnings()
|
|
664
|
-
return
|
|
666
|
+
return result
|
|
665
667
|
|
|
666
668
|
@classmethod
|
|
667
669
|
def _validate_column_spec(cls, name: str, spec: dict[str, Any]) -> None:
|
|
@@ -840,11 +842,12 @@ class Table(SchemaObject):
|
|
|
840
842
|
_ = self._get_views(recursive=True, include_snapshots=False)
|
|
841
843
|
# See if this column has a dependent store. We need to look through all stores in all
|
|
842
844
|
# (transitive) views of this table.
|
|
845
|
+
col_handle = col.handle
|
|
843
846
|
dependent_stores = [
|
|
844
847
|
(view, store)
|
|
845
848
|
for view in (self, *self._get_views(recursive=True, include_snapshots=False))
|
|
846
849
|
for store in view._tbl_version.get().external_stores.values()
|
|
847
|
-
if
|
|
850
|
+
if col_handle in store.get_local_columns()
|
|
848
851
|
]
|
|
849
852
|
if len(dependent_stores) > 0:
|
|
850
853
|
dependent_store_names = [
|
|
@@ -1321,6 +1324,9 @@ class Table(SchemaObject):
|
|
|
1321
1324
|
where: a predicate to filter rows to update.
|
|
1322
1325
|
cascade: if True, also update all computed columns that transitively depend on the updated columns.
|
|
1323
1326
|
|
|
1327
|
+
Returns:
|
|
1328
|
+
An [`UpdateStatus`][pixeltable.UpdateStatus] object containing information about the update.
|
|
1329
|
+
|
|
1324
1330
|
Examples:
|
|
1325
1331
|
Set column `int_col` to 1 for all rows:
|
|
1326
1332
|
|
|
@@ -1343,9 +1349,9 @@ class Table(SchemaObject):
|
|
|
1343
1349
|
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
1344
1350
|
if self._tbl_version_path.is_snapshot():
|
|
1345
1351
|
raise excs.Error('Cannot update a snapshot')
|
|
1346
|
-
|
|
1352
|
+
result = self._tbl_version.get().update(value_spec, where, cascade)
|
|
1347
1353
|
FileCache.get().emit_eviction_warnings()
|
|
1348
|
-
return
|
|
1354
|
+
return result
|
|
1349
1355
|
|
|
1350
1356
|
def batch_update(
|
|
1351
1357
|
self,
|
|
@@ -1409,7 +1415,7 @@ class Table(SchemaObject):
|
|
|
1409
1415
|
raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
|
|
1410
1416
|
row_updates.append(col_vals)
|
|
1411
1417
|
|
|
1412
|
-
|
|
1418
|
+
result = self._tbl_version.get().batch_update(
|
|
1413
1419
|
row_updates,
|
|
1414
1420
|
rowids,
|
|
1415
1421
|
error_if_not_exists=if_not_exists == 'error',
|
|
@@ -1417,7 +1423,70 @@ class Table(SchemaObject):
|
|
|
1417
1423
|
cascade=cascade,
|
|
1418
1424
|
)
|
|
1419
1425
|
FileCache.get().emit_eviction_warnings()
|
|
1420
|
-
return
|
|
1426
|
+
return result
|
|
1427
|
+
|
|
1428
|
+
def recompute_columns(
    self, *columns: Union[str, ColumnRef], errors_only: bool = False, cascade: bool = True
) -> UpdateStatus:
    """Recompute the values in one or more computed columns of this table.

    Args:
        columns: The names or references of the computed columns to recompute.
        errors_only: If True, only run the recomputation for rows that have errors in the column (ie, the column's
            `errortype` property indicates that an error occurred). Only allowed for recomputing a single column.
        cascade: if True, also update all computed columns that transitively depend on the recomputed columns.

    Returns:
        An [`UpdateStatus`][pixeltable.UpdateStatus] object containing information about the update.

    Raises:
        Error: If this table is a snapshot; if no column is specified; if `errors_only=True` is combined with
            more than one column; or if any of the specified columns is unknown, is neither a column name nor a
            column reference, is not a computed column, or belongs to a base of this table.

    Examples:
        Recompute computed columns `c1` and `c2` for all rows in this table, and everything that transitively
        depends on them:

        >>> tbl.recompute_columns('c1', 'c2')

        Recompute computed columns `c1` and `c2` for all rows in this table, but don't recompute other columns
        that depend on them:

        >>> tbl.recompute_columns(tbl.c1, tbl.c2, cascade=False)

        Recompute column `c1` and its dependents, but only for rows that have errors in it:

        >>> tbl.recompute_columns('c1', errors_only=True)
    """
    from pixeltable.catalog import Catalog

    cat = Catalog.get()
    # lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
    with cat.begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
        if self._tbl_version_path.is_snapshot():
            raise excs.Error('Cannot recompute columns of a snapshot.')
        if len(columns) == 0:
            raise excs.Error('At least one column must be specified to recompute')
        if errors_only and len(columns) > 1:
            raise excs.Error('Cannot use errors_only=True with multiple columns')

        # Resolve every argument to a column name, validating each one before any recomputation starts.
        col_names: list[str] = []
        for column in columns:
            col_name: str
            col: Column
            if isinstance(column, str):
                col = self._tbl_version_path.get_column(column, include_bases=True)
                if col is None:
                    raise excs.Error(f'Unknown column: {column!r}')
                col_name = column
            elif isinstance(column, ColumnRef):
                col = column.col
                if not self._tbl_version_path.has_column(col, include_bases=True):
                    raise excs.Error(f'Unknown column: {col.name!r}')
                col_name = col.name
            else:
                # User input: report it as a regular error instead of relying on an assert, which is
                # stripped when Python runs with -O.
                raise excs.Error(
                    f'Invalid column argument (expected a column name or a column reference): {column!r}'
                )
            if not col.is_computed:
                raise excs.Error(f'Column {col_name!r} is not a computed column')
            # The column was looked up with include_bases=True, so it may belong to a base table.
            if col.tbl.id != self._tbl_version_path.tbl_id:
                raise excs.Error(f'Cannot recompute column of a base: {col_name!r}')
            col_names.append(col_name)

        result = self._tbl_version.get().recompute_columns(col_names, errors_only=errors_only, cascade=cascade)
        FileCache.get().emit_eviction_warnings()
        return result
|
|
1421
1490
|
|
|
1422
1491
|
def delete(self, where: Optional['exprs.Expr'] = None) -> UpdateStatus:
|
|
1423
1492
|
"""Delete rows in this table.
|
|
@@ -1519,7 +1588,7 @@ class Table(SchemaObject):
|
|
|
1519
1588
|
|
|
1520
1589
|
def sync(
|
|
1521
1590
|
self, stores: Optional[str | list[str]] = None, *, export_data: bool = True, import_data: bool = True
|
|
1522
|
-
) ->
|
|
1591
|
+
) -> UpdateStatus:
|
|
1523
1592
|
"""
|
|
1524
1593
|
Synchronizes this table with its linked external stores.
|
|
1525
1594
|
|
|
@@ -1532,7 +1601,7 @@ class Table(SchemaObject):
|
|
|
1532
1601
|
from pixeltable.catalog import Catalog
|
|
1533
1602
|
|
|
1534
1603
|
if self._tbl_version_path.is_snapshot():
|
|
1535
|
-
return
|
|
1604
|
+
return UpdateStatus()
|
|
1536
1605
|
# we lock the entire tree starting at the root base table in order to ensure that all synced columns can
|
|
1537
1606
|
# have their updates propagated down the tree
|
|
1538
1607
|
base_tv = self._tbl_version_path.get_tbl_versions()[-1]
|
|
@@ -1548,11 +1617,11 @@ class Table(SchemaObject):
|
|
|
1548
1617
|
if store not in all_stores:
|
|
1549
1618
|
raise excs.Error(f'Table `{self._name}` has no external store with that name: {store}')
|
|
1550
1619
|
|
|
1551
|
-
sync_status =
|
|
1620
|
+
sync_status = UpdateStatus()
|
|
1552
1621
|
for store in stores:
|
|
1553
1622
|
store_obj = self._tbl_version.get().external_stores[store]
|
|
1554
1623
|
store_sync_status = store_obj.sync(self, export_data=export_data, import_data=import_data)
|
|
1555
|
-
sync_status
|
|
1624
|
+
sync_status += store_sync_status
|
|
1556
1625
|
|
|
1557
1626
|
return sync_status
|
|
1558
1627
|
|
|
@@ -1561,3 +1630,84 @@ class Table(SchemaObject):
|
|
|
1561
1630
|
|
|
1562
1631
|
def _ipython_key_completions_(self) -> list[str]:
|
|
1563
1632
|
return list(self._get_schema().keys())
|
|
1633
|
+
|
|
1634
|
+
# Column names and types of the result set returned by history(). The entries are listed in the same
# order as the values that history() places in each report line.
_REPORT_SCHEMA: ClassVar[dict[str, ts.ColumnType]] = {
    'version': ts.IntType(),
    'created_at': ts.TimestampType(),
    'user': ts.StringType(nullable=True),
    # history() fills this position with the change type ('schema' or 'data')
    'note': ts.StringType(),
    # the next five positions are filled from the version's combined row-count stats
    # (ins_rows, upd_rows, del_rows, num_excs, computed_values)
    'inserts': ts.IntType(nullable=True),
    'updates': ts.IntType(nullable=True),
    'deletes': ts.IntType(nullable=True),
    'errors': ts.IntType(nullable=True),
    'computed': ts.IntType(),
    # description of the schema change for the version; empty string when history() finds none
    'schema_change': ts.StringType(),
}
|
|
1646
|
+
|
|
1647
|
+
def history(self, n: Optional[int] = None) -> pixeltable.dataframe.DataFrameResultSet:
    """Report the versions of this table, most recent first.

    Args:
        n: a limit to the number of versions listed

    Examples:
        Report history:

        >>> tbl.history()

        Report only the most recent 5 changes to the table:

        >>> tbl.history(n=5)

    Returns:
        A result set with one row of information per version, ordered from most recent to oldest version.
        The columns are those declared in `_REPORT_SCHEMA`.
    """
    from pixeltable.catalog import Catalog

    # No explicit limit: fall back to a bound large enough to cover every version.
    limit = 1_000_000_000 if n is None else n
    if not isinstance(limit, int) or limit < 1:
        raise excs.Error(f'Invalid value for n: {n}')

    # Retrieve the table history components from the catalog.
    # Collect an extra version, if available, to allow for computation of the first version's schema change
    versions = Catalog.get().collect_tbl_history(self._id, limit + 1)

    # Construct the metadata change description dictionary
    schema_changes = MetadataUtils._create_md_change_dict(
        [(entry.version_md.version, entry.schema_version_md.columns) for entry in versions]
    )

    # At most one version beyond the limit may come back; it is not reported.
    assert len(versions) <= limit + 1

    # Construct report lines; the values follow the key order of _REPORT_SCHEMA.
    rows: list[list[Any]] = []
    for entry in versions[:limit]:
        version_md = entry.version_md
        schema_change = schema_changes.get(version_md.version, '')
        status = UpdateStatus() if version_md.update_status is None else version_md.update_status
        counts = status.row_count_stats + status.cascade_row_count_stats
        rows.append(
            [
                version_md.version,
                datetime.datetime.fromtimestamp(version_md.created_at),
                version_md.user,
                # a version without a schema change description is reported as a data change
                'schema' if schema_change != '' else 'data',
                counts.ins_rows,
                counts.upd_rows,
                counts.del_rows,
                counts.num_excs,
                counts.computed_values,
                schema_change,
            ]
        )

    return pxt.dataframe.DataFrameResultSet(rows, self._REPORT_SCHEMA)
|