pixeltable 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (51) hide show
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +3 -10
  4. pixeltable/catalog/catalog.py +139 -59
  5. pixeltable/catalog/column.py +32 -23
  6. pixeltable/catalog/globals.py +2 -45
  7. pixeltable/catalog/insertable_table.py +5 -2
  8. pixeltable/catalog/path.py +6 -0
  9. pixeltable/catalog/table.py +173 -23
  10. pixeltable/catalog/table_version.py +156 -92
  11. pixeltable/catalog/table_version_handle.py +26 -1
  12. pixeltable/catalog/update_status.py +179 -0
  13. pixeltable/catalog/view.py +12 -3
  14. pixeltable/config.py +76 -12
  15. pixeltable/dataframe.py +1 -1
  16. pixeltable/env.py +29 -0
  17. pixeltable/exec/exec_node.py +7 -24
  18. pixeltable/exec/expr_eval/schedulers.py +134 -7
  19. pixeltable/exprs/column_property_ref.py +23 -20
  20. pixeltable/exprs/column_ref.py +24 -18
  21. pixeltable/exprs/data_row.py +9 -0
  22. pixeltable/exprs/function_call.py +2 -2
  23. pixeltable/exprs/row_builder.py +46 -14
  24. pixeltable/exprs/rowid_ref.py +0 -4
  25. pixeltable/func/function.py +3 -3
  26. pixeltable/functions/audio.py +36 -9
  27. pixeltable/functions/video.py +57 -10
  28. pixeltable/globals.py +61 -1
  29. pixeltable/io/__init__.py +1 -1
  30. pixeltable/io/external_store.py +39 -64
  31. pixeltable/io/globals.py +4 -4
  32. pixeltable/io/hf_datasets.py +10 -2
  33. pixeltable/io/label_studio.py +52 -48
  34. pixeltable/metadata/__init__.py +1 -1
  35. pixeltable/metadata/converters/convert_38.py +39 -0
  36. pixeltable/metadata/converters/convert_39.py +125 -0
  37. pixeltable/metadata/converters/util.py +3 -0
  38. pixeltable/metadata/notes.py +2 -0
  39. pixeltable/metadata/schema.py +14 -2
  40. pixeltable/metadata/utils.py +78 -0
  41. pixeltable/plan.py +26 -18
  42. pixeltable/share/packager.py +20 -38
  43. pixeltable/store.py +121 -142
  44. pixeltable/type_system.py +2 -2
  45. pixeltable/utils/coroutine.py +6 -23
  46. pixeltable/utils/media_store.py +39 -0
  47. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
  48. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
  49. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
  50. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
  51. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
@@ -6,9 +6,10 @@ import json
6
6
  import logging
7
7
  from keyword import iskeyword as is_python_keyword
8
8
  from pathlib import Path
9
- from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, Union, overload
9
+ from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Literal, Optional, Union, overload
10
10
 
11
11
  from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
12
+ import datetime
12
13
  from uuid import UUID
13
14
 
14
15
  import pandas as pd
@@ -17,6 +18,7 @@ import sqlalchemy as sql
17
18
  import pixeltable as pxt
18
19
  from pixeltable import catalog, env, exceptions as excs, exprs, index, type_system as ts
19
20
  from pixeltable.metadata import schema
21
+ from pixeltable.metadata.utils import MetadataUtils
20
22
 
21
23
  from ..exprs import ColumnRef
22
24
  from ..utils.description_helper import DescriptionHelper
@@ -27,13 +29,13 @@ from .globals import (
27
29
  IfExistsParam,
28
30
  IfNotExistsParam,
29
31
  MediaValidation,
30
- UpdateStatus,
31
32
  is_system_column_name,
32
33
  is_valid_identifier,
33
34
  )
34
35
  from .schema_object import SchemaObject
35
36
  from .table_version_handle import TableVersionHandle
36
37
  from .table_version_path import TableVersionPath
38
+ from .update_status import UpdateStatus
37
39
 
38
40
  if TYPE_CHECKING:
39
41
  import torch.utils.data
@@ -107,8 +109,6 @@ class Table(SchemaObject):
107
109
 
108
110
  def _get_metadata(self) -> dict[str, Any]:
109
111
  md = super()._get_metadata()
110
- base = self._get_base_table()
111
- md['base'] = base._path() if base is not None else None
112
112
  md['schema'] = self._get_schema()
113
113
  md['is_replica'] = self._tbl_version_path.is_replica()
114
114
  md['version'] = self._get_version()
@@ -508,15 +508,16 @@ class Table(SchemaObject):
508
508
  for cname in cols_to_ignore:
509
509
  assert cname in col_schema
510
510
  del col_schema[cname]
511
+ result = UpdateStatus()
511
512
  if len(col_schema) == 0:
512
- return UpdateStatus()
513
+ return result
513
514
  new_cols = self._create_columns(col_schema)
514
515
  for new_col in new_cols:
515
516
  self._verify_column(new_col)
516
517
  assert self._tbl_version is not None
517
- status = self._tbl_version.get().add_columns(new_cols, print_stats=False, on_error='abort')
518
+ result += self._tbl_version.get().add_columns(new_cols, print_stats=False, on_error='abort')
518
519
  FileCache.get().emit_eviction_warnings()
519
- return status
520
+ return result
520
521
 
521
522
  def add_column(
522
523
  self,
@@ -593,7 +594,7 @@ class Table(SchemaObject):
593
594
  - `'abort'`: an exception will be raised and the column will not be added.
594
595
  - `'ignore'`: execution will continue and the column will be added. Any rows
595
596
  with errors will have a `None` value for the column, with information about the error stored in the
596
- corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
597
+ corresponding `tbl.col_name.errormsg` and `tbl.col_name.errortype` fields.
597
598
  if_exists: Determines the behavior if the column already exists. Must be one of the following:
598
599
 
599
600
  - `'error'`: an exception will be raised.
@@ -640,10 +641,10 @@ class Table(SchemaObject):
640
641
  # Raise an error if the column expression refers to a column error property
641
642
  if isinstance(spec, exprs.Expr):
642
643
  for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
643
- if e.is_error_prop():
644
+ if e.is_cellmd_prop():
644
645
  raise excs.Error(
645
- 'Use of a reference to an error property of another column is not allowed in a computed '
646
- f'column. The specified computation for this column contains this reference: `{e!r}`'
646
+ f'Use of a reference to the {e.prop.name.lower()!r} property of another column '
647
+ f'is not allowed in a computed column.'
647
648
  )
648
649
 
649
650
  # handle existing columns based on if_exists parameter
@@ -652,16 +653,17 @@ class Table(SchemaObject):
652
653
  )
653
654
  # if the column to add already exists and user asked to ignore
654
655
  # existing column, there's nothing to do.
656
+ result = UpdateStatus()
655
657
  if len(cols_to_ignore) != 0:
656
658
  assert cols_to_ignore[0] == col_name
657
- return UpdateStatus()
659
+ return result
658
660
 
659
661
  new_col = self._create_columns({col_name: col_schema})[0]
660
662
  self._verify_column(new_col)
661
663
  assert self._tbl_version is not None
662
- status = self._tbl_version.get().add_columns([new_col], print_stats=print_stats, on_error=on_error)
664
+ result += self._tbl_version.get().add_columns([new_col], print_stats=print_stats, on_error=on_error)
663
665
  FileCache.get().emit_eviction_warnings()
664
- return status
666
+ return result
665
667
 
666
668
  @classmethod
667
669
  def _validate_column_spec(cls, name: str, spec: dict[str, Any]) -> None:
@@ -840,11 +842,12 @@ class Table(SchemaObject):
840
842
  _ = self._get_views(recursive=True, include_snapshots=False)
841
843
  # See if this column has a dependent store. We need to look through all stores in all
842
844
  # (transitive) views of this table.
845
+ col_handle = col.handle
843
846
  dependent_stores = [
844
847
  (view, store)
845
848
  for view in (self, *self._get_views(recursive=True, include_snapshots=False))
846
849
  for store in view._tbl_version.get().external_stores.values()
847
- if col in store.get_local_columns()
850
+ if col_handle in store.get_local_columns()
848
851
  ]
849
852
  if len(dependent_stores) > 0:
850
853
  dependent_store_names = [
@@ -1321,6 +1324,9 @@ class Table(SchemaObject):
1321
1324
  where: a predicate to filter rows to update.
1322
1325
  cascade: if True, also update all computed columns that transitively depend on the updated columns.
1323
1326
 
1327
+ Returns:
1328
+ An [`UpdateStatus`][pixeltable.UpdateStatus] object containing information about the update.
1329
+
1324
1330
  Examples:
1325
1331
  Set column `int_col` to 1 for all rows:
1326
1332
 
@@ -1343,9 +1349,9 @@ class Table(SchemaObject):
1343
1349
  with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
1344
1350
  if self._tbl_version_path.is_snapshot():
1345
1351
  raise excs.Error('Cannot update a snapshot')
1346
- status = self._tbl_version.get().update(value_spec, where, cascade)
1352
+ result = self._tbl_version.get().update(value_spec, where, cascade)
1347
1353
  FileCache.get().emit_eviction_warnings()
1348
- return status
1354
+ return result
1349
1355
 
1350
1356
  def batch_update(
1351
1357
  self,
@@ -1409,7 +1415,7 @@ class Table(SchemaObject):
1409
1415
  raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
1410
1416
  row_updates.append(col_vals)
1411
1417
 
1412
- status = self._tbl_version.get().batch_update(
1418
+ result = self._tbl_version.get().batch_update(
1413
1419
  row_updates,
1414
1420
  rowids,
1415
1421
  error_if_not_exists=if_not_exists == 'error',
@@ -1417,7 +1423,70 @@ class Table(SchemaObject):
1417
1423
  cascade=cascade,
1418
1424
  )
1419
1425
  FileCache.get().emit_eviction_warnings()
1420
- return status
1426
+ return result
1427
+
1428
+ def recompute_columns(
1429
+ self, *columns: Union[str, ColumnRef], errors_only: bool = False, cascade: bool = True
1430
+ ) -> UpdateStatus:
1431
+ """Recompute the values in one or more computed columns of this table.
1432
+
1433
+ Args:
1434
+ columns: The names or references of the computed columns to recompute.
1435
+ errors_only: If True, only run the recomputation for rows that have errors in the column (ie, the column's
1436
+ `errortype` property indicates that an error occurred). Only allowed for recomputing a single column.
1437
+ cascade: if True, also update all computed columns that transitively depend on the recomputed columns.
1438
+
1439
+ Examples:
1440
+ Recompute computed columns `c1` and `c2` for all rows in this table, and everything that transitively
1441
+ depends on them:
1442
+
1443
+ >>> tbl.recompute_columns('c1', 'c2')
1444
+
1445
+ Recompute computed column `c1` for all rows in this table, but don't recompute other columns that depend on
1446
+ it:
1447
+
1448
+ >>> tbl.recompute_columns(tbl.c1, cascade=False)
1449
+
1450
+ Recompute column `c1` and its dependents, but only for rows that have errors in it:
1451
+
1452
+ >>> tbl.recompute_columns('c1', errors_only=True)
1453
+ """
1454
+ from pixeltable.catalog import Catalog
1455
+
1456
+ cat = Catalog.get()
1457
+ # lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
1458
+ with cat.begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
1459
+ if self._tbl_version_path.is_snapshot():
1460
+ raise excs.Error('Cannot recompute columns of a snapshot.')
1461
+ if len(columns) == 0:
1462
+ raise excs.Error('At least one column must be specified to recompute')
1463
+ if errors_only and len(columns) > 1:
1464
+ raise excs.Error('Cannot use errors_only=True with multiple columns')
1465
+
1466
+ col_names: list[str] = []
1467
+ for column in columns:
1468
+ col_name: str
1469
+ col: Column
1470
+ if isinstance(column, str):
1471
+ col = self._tbl_version_path.get_column(column, include_bases=True)
1472
+ if col is None:
1473
+ raise excs.Error(f'Unknown column: {column!r}')
1474
+ col_name = column
1475
+ else:
1476
+ assert isinstance(column, ColumnRef)
1477
+ col = column.col
1478
+ if not self._tbl_version_path.has_column(col, include_bases=True):
1479
+ raise excs.Error(f'Unknown column: {col.name!r}')
1480
+ col_name = col.name
1481
+ if not col.is_computed:
1482
+ raise excs.Error(f'Column {col_name!r} is not a computed column')
1483
+ if col.tbl.id != self._tbl_version_path.tbl_id:
1484
+ raise excs.Error(f'Cannot recompute column of a base: {col_name!r}')
1485
+ col_names.append(col_name)
1486
+
1487
+ result = self._tbl_version.get().recompute_columns(col_names, errors_only=errors_only, cascade=cascade)
1488
+ FileCache.get().emit_eviction_warnings()
1489
+ return result
1421
1490
 
1422
1491
  def delete(self, where: Optional['exprs.Expr'] = None) -> UpdateStatus:
1423
1492
  """Delete rows in this table.
@@ -1519,7 +1588,7 @@ class Table(SchemaObject):
1519
1588
 
1520
1589
  def sync(
1521
1590
  self, stores: Optional[str | list[str]] = None, *, export_data: bool = True, import_data: bool = True
1522
- ) -> 'pxt.io.SyncStatus':
1591
+ ) -> UpdateStatus:
1523
1592
  """
1524
1593
  Synchronizes this table with its linked external stores.
1525
1594
 
@@ -1532,7 +1601,7 @@ class Table(SchemaObject):
1532
1601
  from pixeltable.catalog import Catalog
1533
1602
 
1534
1603
  if self._tbl_version_path.is_snapshot():
1535
- return pxt.io.SyncStatus.empty()
1604
+ return UpdateStatus()
1536
1605
  # we lock the entire tree starting at the root base table in order to ensure that all synced columns can
1537
1606
  # have their updates propagated down the tree
1538
1607
  base_tv = self._tbl_version_path.get_tbl_versions()[-1]
@@ -1548,11 +1617,11 @@ class Table(SchemaObject):
1548
1617
  if store not in all_stores:
1549
1618
  raise excs.Error(f'Table `{self._name}` has no external store with that name: {store}')
1550
1619
 
1551
- sync_status = pxt.io.SyncStatus.empty()
1620
+ sync_status = UpdateStatus()
1552
1621
  for store in stores:
1553
1622
  store_obj = self._tbl_version.get().external_stores[store]
1554
1623
  store_sync_status = store_obj.sync(self, export_data=export_data, import_data=import_data)
1555
- sync_status = sync_status.combine(store_sync_status)
1624
+ sync_status += store_sync_status
1556
1625
 
1557
1626
  return sync_status
1558
1627
 
@@ -1561,3 +1630,84 @@ class Table(SchemaObject):
1561
1630
 
1562
1631
  def _ipython_key_completions_(self) -> list[str]:
1563
1632
  return list(self._get_schema().keys())
1633
+
1634
+ _REPORT_SCHEMA: ClassVar[dict[str, ts.ColumnType]] = {
1635
+ 'version': ts.IntType(),
1636
+ 'created_at': ts.TimestampType(),
1637
+ 'user': ts.StringType(nullable=True),
1638
+ 'note': ts.StringType(),
1639
+ 'inserts': ts.IntType(nullable=True),
1640
+ 'updates': ts.IntType(nullable=True),
1641
+ 'deletes': ts.IntType(nullable=True),
1642
+ 'errors': ts.IntType(nullable=True),
1643
+ 'computed': ts.IntType(),
1644
+ 'schema_change': ts.StringType(),
1645
+ }
1646
+
1647
+ def history(self, n: Optional[int] = None) -> pixeltable.dataframe.DataFrameResultSet:
1648
+ """Returns rows of information about the versions of this table, most recent first.
1649
+
1650
+ Args:
1651
+ n: a limit to the number of versions listed
1652
+
1653
+ Examples:
1654
+ Report history:
1655
+
1656
+ >>> tbl.history()
1657
+
1658
+ Report only the most recent 5 changes to the table:
1659
+
1660
+ >>> tbl.history(n=5)
1661
+
1662
+ Returns:
1663
+ A result set with one row of information per version, ordered from most recent to oldest version.
1664
+ """
1665
+ from pixeltable.catalog import Catalog
1666
+
1667
+ if n is None:
1668
+ n = 1000_000_000
1669
+ if not isinstance(n, int) or n < 1:
1670
+ raise excs.Error(f'Invalid value for n: {n}')
1671
+
1672
+ # Retrieve the table history components from the catalog
1673
+ tbl_id = self._id
1674
+ # Collect an extra version, if available, to allow for computation of the first version's schema change
1675
+ vers_list = Catalog.get().collect_tbl_history(tbl_id, n + 1)
1676
+
1677
+ # Construct the metadata change description dictionary
1678
+ md_list = [(vers_md.version_md.version, vers_md.schema_version_md.columns) for vers_md in vers_list]
1679
+ md_dict = MetadataUtils._create_md_change_dict(md_list)
1680
+
1681
+ # Construct report lines
1682
+ if len(vers_list) > n:
1683
+ assert len(vers_list) == n + 1
1684
+ over_count = 1
1685
+ else:
1686
+ over_count = 0
1687
+
1688
+ report_lines: list[list[Any]] = []
1689
+ for vers_md in vers_list[0 : len(vers_list) - over_count]:
1690
+ version = vers_md.version_md.version
1691
+ schema_change = md_dict.get(version, '')
1692
+ update_status = vers_md.version_md.update_status
1693
+ if update_status is None:
1694
+ update_status = UpdateStatus()
1695
+ change_type = 'schema' if schema_change != '' else ''
1696
+ if change_type == '':
1697
+ change_type = 'data'
1698
+ rcs = update_status.row_count_stats + update_status.cascade_row_count_stats
1699
+ report_line = [
1700
+ version,
1701
+ datetime.datetime.fromtimestamp(vers_md.version_md.created_at),
1702
+ vers_md.version_md.user,
1703
+ change_type,
1704
+ rcs.ins_rows,
1705
+ rcs.upd_rows,
1706
+ rcs.del_rows,
1707
+ rcs.num_excs,
1708
+ rcs.computed_values,
1709
+ schema_change,
1710
+ ]
1711
+ report_lines.append(report_line)
1712
+
1713
+ return pxt.dataframe.DataFrameResultSet(report_lines, self._REPORT_SCHEMA)