pixeltable-0.4.12-py3-none-any.whl → pixeltable-0.4.14-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (55)
  1. pixeltable/__init__.py +11 -1
  2. pixeltable/catalog/__init__.py +2 -1
  3. pixeltable/catalog/catalog.py +179 -63
  4. pixeltable/catalog/column.py +24 -20
  5. pixeltable/catalog/table.py +96 -124
  6. pixeltable/catalog/table_metadata.py +96 -0
  7. pixeltable/catalog/table_version.py +15 -6
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +3 -2
  11. pixeltable/env.py +43 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/object_store_save_node.py +299 -0
  18. pixeltable/exec/sql_node.py +28 -33
  19. pixeltable/exprs/data_row.py +31 -25
  20. pixeltable/exprs/json_path.py +6 -5
  21. pixeltable/exprs/row_builder.py +6 -12
  22. pixeltable/functions/gemini.py +1 -1
  23. pixeltable/functions/openai.py +1 -1
  24. pixeltable/functions/video.py +5 -6
  25. pixeltable/globals.py +6 -7
  26. pixeltable/index/embedding_index.py +5 -8
  27. pixeltable/io/__init__.py +2 -1
  28. pixeltable/io/fiftyone.py +1 -1
  29. pixeltable/io/label_studio.py +4 -5
  30. pixeltable/io/lancedb.py +3 -0
  31. pixeltable/io/parquet.py +9 -89
  32. pixeltable/io/table_data_conduit.py +2 -2
  33. pixeltable/iterators/audio.py +1 -1
  34. pixeltable/iterators/document.py +10 -12
  35. pixeltable/iterators/video.py +1 -1
  36. pixeltable/metadata/schema.py +7 -0
  37. pixeltable/plan.py +26 -1
  38. pixeltable/share/packager.py +8 -2
  39. pixeltable/share/publish.py +3 -9
  40. pixeltable/type_system.py +1 -3
  41. pixeltable/utils/arrow.py +97 -2
  42. pixeltable/utils/dbms.py +31 -5
  43. pixeltable/utils/gcs_store.py +283 -0
  44. pixeltable/utils/lancedb.py +88 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
  50. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
@@ -7,9 +7,7 @@ import json
7
7
  import logging
8
8
  from keyword import iskeyword as is_python_keyword
9
9
  from pathlib import Path
10
- from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Literal, Optional, TypedDict, overload
11
-
12
- from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
10
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, overload
13
11
  from uuid import UUID
14
12
 
15
13
  import pandas as pd
@@ -17,8 +15,16 @@ import sqlalchemy as sql
17
15
 
18
16
  import pixeltable as pxt
19
17
  from pixeltable import catalog, env, exceptions as excs, exprs, index, type_system as ts
18
+ from pixeltable.catalog.table_metadata import (
19
+ ColumnMetadata,
20
+ EmbeddingIndexParams,
21
+ IndexMetadata,
22
+ TableMetadata,
23
+ VersionMetadata,
24
+ )
20
25
  from pixeltable.metadata import schema
21
26
  from pixeltable.metadata.utils import MetadataUtils
27
+ from pixeltable.utils.object_stores import ObjectOps
22
28
 
23
29
  from ..exprs import ColumnRef
24
30
  from ..utils.description_helper import DescriptionHelper
@@ -37,12 +43,16 @@ from .table_version_handle import TableVersionHandle
37
43
  from .table_version_path import TableVersionPath
38
44
  from .update_status import UpdateStatus
39
45
 
46
+ from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
47
+
48
+
40
49
  if TYPE_CHECKING:
41
50
  import torch.utils.data
42
51
 
43
52
  import pixeltable.plan
44
53
  from pixeltable.globals import TableDataSource
45
54
 
55
+
46
56
  _logger = logging.getLogger('pixeltable')
47
57
 
48
58
 
@@ -95,7 +105,7 @@ class Table(SchemaObject):
95
105
 
96
106
  return op()
97
107
 
98
- def _get_metadata(self) -> 'TableMetadata':
108
+ def _get_metadata(self) -> TableMetadata:
99
109
  columns = self._tbl_version_path.columns()
100
110
  column_info: dict[str, ColumnMetadata] = {}
101
111
  for col in columns:
@@ -481,8 +491,7 @@ class Table(SchemaObject):
481
491
  Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed
482
492
  columns, use [`add_computed_column()`][pixeltable.catalog.Table.add_computed_column] instead.
483
493
 
484
- The format of the `schema` argument is identical to the format of the schema in a call to
485
- [`create_table()`][pixeltable.globals.create_table].
494
+ The format of the `schema` argument is a dict mapping column names to their types.
486
495
 
487
496
  Args:
488
497
  schema: A dictionary mapping column names to types.
@@ -595,6 +604,7 @@ class Table(SchemaObject):
595
604
  self,
596
605
  *,
597
606
  stored: Optional[bool] = None,
607
+ destination: Optional[str | Path] = None,
598
608
  print_stats: bool = False,
599
609
  on_error: Literal['abort', 'ignore'] = 'abort',
600
610
  if_exists: Literal['error', 'ignore', 'replace'] = 'error',
@@ -606,6 +616,7 @@ class Table(SchemaObject):
606
616
  Args:
607
617
  kwargs: Exactly one keyword argument of the form `col_name=expression`.
608
618
  stored: Whether the column is materialized and stored or computed on demand.
619
+ destination: An object store reference for persisting computed files.
609
620
  print_stats: If `True`, print execution metrics during evaluation.
610
621
  on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
611
622
  row.
@@ -656,6 +667,9 @@ class Table(SchemaObject):
656
667
  if stored is not None:
657
668
  col_schema['stored'] = stored
658
669
 
670
+ if destination is not None:
671
+ col_schema['destination'] = destination
672
+
659
673
  # Raise an error if the column expression refers to a column error property
660
674
  if isinstance(spec, exprs.Expr):
661
675
  for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
@@ -670,7 +684,7 @@ class Table(SchemaObject):
670
684
  [col_name], IfExistsParam.validated(if_exists, 'if_exists')
671
685
  )
672
686
  # if the column to add already exists and user asked to ignore
673
- # exiting column, there's nothing to do.
687
+ # existing column, there's nothing to do.
674
688
  result = UpdateStatus()
675
689
  if len(cols_to_ignore) != 0:
676
690
  assert cols_to_ignore[0] == col_name
@@ -691,7 +705,7 @@ class Table(SchemaObject):
691
705
  (on account of containing Python Callables or Exprs).
692
706
  """
693
707
  assert isinstance(spec, dict)
694
- valid_keys = {'type', 'value', 'stored', 'media_validation'}
708
+ valid_keys = {'type', 'value', 'stored', 'media_validation', 'destination'}
695
709
  for k in spec:
696
710
  if k not in valid_keys:
697
711
  raise excs.Error(f'Column {name}: invalid key {k!r}')
@@ -715,6 +729,10 @@ class Table(SchemaObject):
715
729
  if 'stored' in spec and not isinstance(spec['stored'], bool):
716
730
  raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
717
731
 
732
+ d = spec.get('destination')
733
+ if d is not None and not isinstance(d, (str, Path)):
734
+ raise excs.Error(f'Column {name}: `destination` must be a string or path, got {d}')
735
+
718
736
  @classmethod
719
737
  def _create_columns(cls, schema: dict[str, Any]) -> list[Column]:
720
738
  """Construct list of Columns, given schema"""
@@ -725,6 +743,7 @@ class Table(SchemaObject):
725
743
  primary_key: bool = False
726
744
  media_validation: Optional[catalog.MediaValidation] = None
727
745
  stored = True
746
+ destination: Optional[str] = None
728
747
 
729
748
  if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
730
749
  col_type = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
@@ -749,6 +768,8 @@ class Table(SchemaObject):
749
768
  media_validation = (
750
769
  catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None else None
751
770
  )
771
+ if 'destination' in spec:
772
+ destination = ObjectOps.validate_destination(spec['destination'], name)
752
773
  else:
753
774
  raise excs.Error(f'Invalid value for column {name!r}')
754
775
 
@@ -759,6 +780,7 @@ class Table(SchemaObject):
759
780
  stored=stored,
760
781
  is_pk=primary_key,
761
782
  media_validation=media_validation,
783
+ destination=destination,
762
784
  )
763
785
  columns.append(column)
764
786
  return columns
@@ -784,14 +806,16 @@ class Table(SchemaObject):
784
806
  f'streaming function'
785
807
  )
786
808
  )
809
+ if col.destination is not None and not (col.stored and col.is_computed):
810
+ raise excs.Error(
811
+ f'Column {col.name!r}: destination={col.destination} only applies to stored computed columns'
812
+ )
787
813
 
788
814
  @classmethod
789
815
  def _verify_schema(cls, schema: list[Column]) -> None:
790
816
  """Check integrity of user-supplied schema and set defaults"""
791
- column_names: set[str] = set()
792
817
  for col in schema:
793
818
  cls._verify_column(col)
794
- column_names.add(col.name)
795
819
 
796
820
  def drop_column(self, column: str | ColumnRef, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
797
821
  """Drop a column from the table.
@@ -1690,43 +1714,35 @@ class Table(SchemaObject):
1690
1714
  def _ipython_key_completions_(self) -> list[str]:
1691
1715
  return list(self._get_schema().keys())
1692
1716
 
1693
- _REPORT_SCHEMA: ClassVar[dict[str, ts.ColumnType]] = {
1694
- 'version': ts.IntType(),
1695
- 'created_at': ts.TimestampType(),
1696
- 'user': ts.StringType(nullable=True),
1697
- 'note': ts.StringType(),
1698
- 'inserts': ts.IntType(nullable=True),
1699
- 'updates': ts.IntType(nullable=True),
1700
- 'deletes': ts.IntType(nullable=True),
1701
- 'errors': ts.IntType(nullable=True),
1702
- 'computed': ts.IntType(),
1703
- 'schema_change': ts.StringType(),
1704
- }
1705
-
1706
- def history(self, n: Optional[int] = None) -> pixeltable.dataframe.DataFrameResultSet:
1707
- """Returns rows of information about the versions of this table, most recent first.
1717
+ def get_versions(self, n: Optional[int] = None) -> list[VersionMetadata]:
1718
+ """
1719
+ Returns information about versions of this table, most recent first.
1720
+
1721
+ `get_versions()` is intended for programmatic access to version metadata; for human-readable
1722
+ output, use [`history()`][pixeltable.Table.history] instead.
1708
1723
 
1709
1724
  Args:
1710
- n: a limit to the number of versions listed
1725
+ n: if specified, will return at most `n` versions
1711
1726
 
1712
- Examples:
1713
- Report history:
1727
+ Returns:
1728
+ A list of [VersionMetadata][pixeltable.VersionMetadata] dictionaries, one per version retrieved, most
1729
+ recent first.
1714
1730
 
1715
- >>> tbl.history()
1731
+ Examples:
1732
+ Retrieve metadata about all versions of the table `tbl`:
1716
1733
 
1717
- Report only the most recent 5 changes to the table:
1734
+ >>> tbl.get_versions()
1718
1735
 
1719
- >>> tbl.history(n=5)
1736
+ Retrieve metadata about the most recent 5 versions of the table `tbl`:
1720
1737
 
1721
- Returns:
1722
- A list of information about each version, ordered from most recent to oldest version.
1738
+ >>> tbl.get_versions(n=5)
1723
1739
  """
1724
1740
  from pixeltable.catalog import Catalog
1725
1741
 
1726
1742
  if n is None:
1727
1743
  n = 1_000_000_000
1728
1744
  if not isinstance(n, int) or n < 1:
1729
- raise excs.Error(f'Invalid value for n: {n}')
1745
+ raise excs.Error(f'Invalid value for `n`: {n}')
1730
1746
 
1731
1747
  # Retrieve the table history components from the catalog
1732
1748
  tbl_id = self._id
@@ -1744,104 +1760,60 @@ class Table(SchemaObject):
1744
1760
  else:
1745
1761
  over_count = 0
1746
1762
 
1747
- report_lines: list[list[Any]] = []
1763
+ metadata_dicts: list[VersionMetadata] = []
1748
1764
  for vers_md in vers_list[0 : len(vers_list) - over_count]:
1749
1765
  version = vers_md.version_md.version
1750
- schema_change = md_dict.get(version, '')
1766
+ schema_change = md_dict.get(version, None)
1751
1767
  update_status = vers_md.version_md.update_status
1752
1768
  if update_status is None:
1753
1769
  update_status = UpdateStatus()
1754
- change_type = 'schema' if schema_change != '' else ''
1755
- if change_type == '':
1756
- change_type = 'data'
1770
+ change_type: Literal['schema', 'data'] = 'schema' if schema_change is not None else 'data'
1757
1771
  rcs = update_status.row_count_stats + update_status.cascade_row_count_stats
1758
- report_line = [
1759
- version,
1760
- datetime.datetime.fromtimestamp(vers_md.version_md.created_at),
1761
- vers_md.version_md.user,
1762
- change_type,
1763
- rcs.ins_rows,
1764
- rcs.upd_rows,
1765
- rcs.del_rows,
1766
- rcs.num_excs,
1767
- rcs.computed_values,
1768
- schema_change,
1769
- ]
1770
- report_lines.append(report_line)
1772
+ metadata_dicts.append(
1773
+ VersionMetadata(
1774
+ version=version,
1775
+ created_at=datetime.datetime.fromtimestamp(vers_md.version_md.created_at, tz=datetime.timezone.utc),
1776
+ user=vers_md.version_md.user,
1777
+ change_type=change_type,
1778
+ inserts=rcs.ins_rows,
1779
+ updates=rcs.upd_rows,
1780
+ deletes=rcs.del_rows,
1781
+ errors=rcs.num_excs,
1782
+ computed=rcs.computed_values,
1783
+ schema_change=schema_change,
1784
+ )
1785
+ )
1786
+
1787
+ return metadata_dicts
1788
+
1789
+ def history(self, n: Optional[int] = None) -> pd.DataFrame:
1790
+ """
1791
+ Returns a human-readable report about versions of this table.
1792
+
1793
+ `history()` is intended for human-readable output of version metadata; for programmatic access,
1794
+ use [`get_versions()`][pixeltable.Table.get_versions] instead.
1795
+
1796
+ Args:
1797
+ n: if specified, will return at most `n` versions
1798
+
1799
+ Returns:
1800
+ A report with information about each version, one per row, most recent first.
1771
1801
 
1772
- return pxt.dataframe.DataFrameResultSet(report_lines, self._REPORT_SCHEMA)
1802
+ Examples:
1803
+ Report all versions of the table:
1804
+
1805
+ >>> tbl.history()
1806
+
1807
+ Report only the most recent 5 changes to the table:
1808
+
1809
+ >>> tbl.history(n=5)
1810
+ """
1811
+ versions = self.get_versions(n)
1812
+ assert len(versions) > 0
1813
+ return pd.DataFrame([list(v.values()) for v in versions], columns=list(versions[0].keys()))
1773
1814
 
1774
1815
  def __check_mutable(self, op_descr: str) -> None:
1816
+ if self._tbl_version_path.is_replica():
1817
+ raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a replica.')
1775
1818
  if self._tbl_version_path.is_snapshot():
1776
1819
  raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a snapshot.')
1777
- if self._tbl_version_path.is_replica():
1778
- raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a {self._display_name()}.')
1779
-
1780
-
1781
- class ColumnMetadata(TypedDict):
1782
- """Metadata for a column of a Pixeltable table."""
1783
-
1784
- name: str
1785
- """The name of the column."""
1786
- type_: str
1787
- """The type specifier of the column."""
1788
- version_added: int
1789
- """The table version when this column was added."""
1790
- is_stored: bool
1791
- """`True` if this is a stored column; `False` if it is dynamically computed."""
1792
- is_primary_key: bool
1793
- """`True` if this column is part of the table's primary key."""
1794
- media_validation: Optional[Literal['on_read', 'on_write']]
1795
- """The media validation policy for this column."""
1796
- computed_with: Optional[str]
1797
- """Expression used to compute this column; `None` if this is not a computed column."""
1798
-
1799
-
1800
- class IndexMetadata(TypedDict):
1801
- """Metadata for a column of a Pixeltable table."""
1802
-
1803
- name: str
1804
- """The name of the index."""
1805
- columns: list[str]
1806
- """The table columns that are indexed."""
1807
- index_type: Literal['embedding']
1808
- """The type of index (currently only `'embedding'` is supported, but others will be added in the future)."""
1809
- parameters: EmbeddingIndexParams
1810
-
1811
-
1812
- class EmbeddingIndexParams(TypedDict):
1813
- metric: Literal['cosine', 'ip', 'l2']
1814
- """Index metric."""
1815
- embeddings: list[str]
1816
- """List of embeddings defined for this index."""
1817
-
1818
-
1819
- class TableMetadata(TypedDict):
1820
- """Metadata for a Pixeltable table."""
1821
-
1822
- name: str
1823
- """The name of the table (ex: `'my_table'`)."""
1824
- path: str
1825
- """The full path of the table (ex: `'my_dir.my_subdir.my_table'`)."""
1826
- columns: dict[str, ColumnMetadata]
1827
- """Column metadata for all of the visible columns of the table."""
1828
- indices: dict[str, IndexMetadata]
1829
- """Index metadata for all of the indices of the table."""
1830
- is_replica: bool
1831
- """`True` if this table is a replica of another (shared) table."""
1832
- is_view: bool
1833
- """`True` if this table is a view."""
1834
- is_snapshot: bool
1835
- """`True` if this table is a snapshot."""
1836
- version: int
1837
- """The current version of the table."""
1838
- version_created: datetime.datetime
1839
- """The timestamp when this table version was created."""
1840
- schema_version: int
1841
- """The current schema version of the table."""
1842
- comment: Optional[str]
1843
- """User-provided table comment, if one exists."""
1844
- media_validation: Literal['on_read', 'on_write']
1845
- """The media validation policy for this table."""
1846
- base: Optional[str]
1847
- """If this table is a view or snapshot, the full path of its base table; otherwise `None`."""
@@ -0,0 +1,96 @@
1
+ import datetime
2
+ from typing import Literal, Optional, TypedDict
3
+
4
+
5
+ class ColumnMetadata(TypedDict):
6
+ """Metadata for a column of a Pixeltable table."""
7
+
8
+ name: str
9
+ """The name of the column."""
10
+ type_: str
11
+ """The type specifier of the column."""
12
+ version_added: int
13
+ """The table version when this column was added."""
14
+ is_stored: bool
15
+ """`True` if this is a stored column; `False` if it is dynamically computed."""
16
+ is_primary_key: bool
17
+ """`True` if this column is part of the table's primary key."""
18
+ media_validation: Optional[Literal['on_read', 'on_write']]
19
+ """The media validation policy for this column."""
20
+ computed_with: Optional[str]
21
+ """Expression used to compute this column; `None` if this is not a computed column."""
22
+
23
+
24
+ class EmbeddingIndexParams(TypedDict):
25
+ metric: Literal['cosine', 'ip', 'l2']
26
+ """Index metric."""
27
+ embeddings: list[str]
28
+ """List of embeddings defined for this index."""
29
+
30
+
31
+ class IndexMetadata(TypedDict):
32
+ """Metadata for a column of a Pixeltable table."""
33
+
34
+ name: str
35
+ """The name of the index."""
36
+ columns: list[str]
37
+ """The table columns that are indexed."""
38
+ index_type: Literal['embedding']
39
+ """The type of index (currently only `'embedding'` is supported, but others will be added in the future)."""
40
+ parameters: EmbeddingIndexParams
41
+
42
+
43
+ class TableMetadata(TypedDict):
44
+ """Metadata for a Pixeltable table."""
45
+
46
+ name: str
47
+ """The name of the table (ex: `'my_table'`)."""
48
+ path: str
49
+ """The full path of the table (ex: `'my_dir.my_subdir.my_table'`)."""
50
+ columns: dict[str, ColumnMetadata]
51
+ """Column metadata for all of the visible columns of the table."""
52
+ indices: dict[str, IndexMetadata]
53
+ """Index metadata for all of the indices of the table."""
54
+ is_replica: bool
55
+ """`True` if this table is a replica of another (shared) table."""
56
+ is_view: bool
57
+ """`True` if this table is a view."""
58
+ is_snapshot: bool
59
+ """`True` if this table is a snapshot."""
60
+ version: int
61
+ """The current version of the table."""
62
+ version_created: datetime.datetime
63
+ """The timestamp when this table version was created."""
64
+ schema_version: int
65
+ """The current schema version of the table."""
66
+ comment: Optional[str]
67
+ """User-provided table comment, if one exists."""
68
+ media_validation: Literal['on_read', 'on_write']
69
+ """The media validation policy for this table."""
70
+ base: Optional[str]
71
+ """If this table is a view or snapshot, the full path of its base table; otherwise `None`."""
72
+
73
+
74
+ class VersionMetadata(TypedDict):
75
+ """Metadata for a specific version of a Pixeltable table."""
76
+
77
+ """The version number."""
78
+ version: int
79
+ """The timestamp when this version was created."""
80
+ created_at: datetime.datetime
81
+ """The user who created this version, if defined."""
82
+ user: str | None
83
+ """The type of table transformation that this version represents (`'data'` or `'schema'`)."""
84
+ change_type: Literal['data', 'schema']
85
+ """The number of rows inserted in this version."""
86
+ inserts: int
87
+ """The number of rows updated in this version."""
88
+ updates: int
89
+ """The number of rows deleted in this version."""
90
+ deletes: int
91
+ """The number of errors encountered during this version."""
92
+ errors: int
93
+ """The number of computed values calculated in this version."""
94
+ computed: int
95
+ """A description of the schema change that occurred in this version, if any."""
96
+ schema_change: str | None
@@ -20,7 +20,7 @@ from pixeltable.iterators import ComponentIterator
20
20
  from pixeltable.metadata import schema
21
21
  from pixeltable.utils.exception_handler import run_cleanup_on_exception
22
22
  from pixeltable.utils.filecache import FileCache
23
- from pixeltable.utils.media_store import MediaStore
23
+ from pixeltable.utils.object_stores import ObjectOps
24
24
 
25
25
  from .tbl_ops import TableOp
26
26
 
@@ -327,7 +327,7 @@ class TableVersion:
327
327
  from .table_version_path import TableVersionPath
328
328
 
329
329
  # clear out any remaining media files from an aborted previous attempt
330
- MediaStore.get().delete(self.id)
330
+ self.delete_media()
331
331
  view_path = TableVersionPath.from_dict(op.load_view_op.view_path)
332
332
  plan, _ = Planner.create_view_load_plan(view_path)
333
333
  _, row_counts = self.store_tbl.insert_rows(plan, v_min=self.version)
@@ -356,14 +356,23 @@ class TableVersion:
356
356
  cat = pxt.catalog.Catalog.get()
357
357
  # We're creating a new TableVersion replica, so we should never have seen this particular
358
358
  # TableVersion instance before.
359
- assert tbl_version.effective_version is not None
360
- assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
359
+ # Actually this isn't true, because we might be re-creating a dropped replica.
360
+ # TODO: Understand why old TableVersions are kept around even for a dropped table.
361
+ # assert tbl_version.effective_version is not None
362
+ # assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
361
363
  cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
362
364
  tbl_version.init()
363
365
  tbl_version.store_tbl.create()
364
366
  tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
365
367
  return tbl_version
366
368
 
369
+ def delete_media(self, tbl_version: Optional[int] = None) -> None:
370
+ # Assemble a set of column destinations and delete objects from all of them
371
+ # None is a valid column destination which refers to the default object location
372
+ destinations = {col.destination for col in self.cols if col.is_stored}
373
+ for dest in destinations:
374
+ ObjectOps.delete(dest, self.id, tbl_version=tbl_version)
375
+
367
376
  def drop(self) -> None:
368
377
  # if self.is_view and self.is_mutable:
369
378
  # # update mutable_views
@@ -374,7 +383,7 @@ class TableVersion:
374
383
  # if self.base.get().is_mutable:
375
384
  # self.base.get().mutable_views.remove(TableVersionHandle.create(self))
376
385
 
377
- MediaStore.get().delete(self.id)
386
+ self.delete_media()
378
387
  FileCache.get().clear(tbl_id=self.id)
379
388
  self.store_tbl.drop()
380
389
 
@@ -1236,7 +1245,7 @@ class TableVersion:
1236
1245
  )
1237
1246
 
1238
1247
  # delete newly-added data
1239
- MediaStore.get().delete(self.id, tbl_version=self.version)
1248
+ self.delete_media(tbl_version=self.version)
1240
1249
  conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))
1241
1250
 
1242
1251
  # revert new deletions
@@ -47,17 +47,13 @@ class View(Table):
47
47
  self._tbl_version = tbl_version_path.tbl_version
48
48
 
49
49
  def _display_name(self) -> str:
50
- name: str
51
- if self._tbl_version_path.is_snapshot():
52
- name = 'snapshot'
53
- elif self._tbl_version_path.is_view():
54
- name = 'view'
55
- else:
56
- assert self._tbl_version_path.is_replica()
57
- name = 'table'
58
50
  if self._tbl_version_path.is_replica():
59
- name = f'{name}-replica'
60
- return name
51
+ return 'replica'
52
+ if self._tbl_version_path.is_snapshot():
53
+ return 'snapshot'
54
+ if self._tbl_version_path.is_view():
55
+ return 'view'
56
+ return 'table'
61
57
 
62
58
  @classmethod
63
59
  def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
@@ -270,12 +266,12 @@ class View(Table):
270
266
  # Update name and path with version qualifiers.
271
267
  md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
272
268
  md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'
273
- base_tbl = self._get_base_table()
274
- if base_tbl is None:
275
- md['base'] = None
276
- else:
269
+ base_tbl_id = self._base_tbl_id
270
+ if base_tbl_id is not None:
271
+ base_tbl = self._get_base_table()
272
+ base_path = '<anonymous base table>' if base_tbl is None else base_tbl._path()
277
273
  base_version = self._effective_base_versions[0]
278
- md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
274
+ md['base'] = base_path if base_version is None else f'{base_path}:{base_version}'
279
275
  return md
280
276
 
281
277
  def insert(
@@ -294,17 +290,21 @@ class View(Table):
294
290
  def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
295
291
  raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
296
292
 
297
- def _get_base_table(self) -> Optional['Table']:
293
+ @property
294
+ def _base_tbl_id(self) -> Optional[UUID]:
298
295
  if self._tbl_version_path.tbl_id != self._id:
299
296
  # _tbl_version_path represents a different schema object from this one. This can only happen if this is a
300
297
  # named pure snapshot.
301
- base_id = self._tbl_version_path.tbl_id
302
- elif self._tbl_version_path.base is None:
298
+ return self._tbl_version_path.tbl_id
299
+ if self._tbl_version_path.base is None:
303
300
  return None
304
- else:
305
- base_id = self._tbl_version_path.base.tbl_id
306
- with catalog.Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
307
- return catalog.Catalog.get().get_table_by_id(base_id)
301
+ return self._tbl_version_path.base.tbl_id
302
+
303
+ def _get_base_table(self) -> Optional['Table']:
304
+ """Returns None if there is no base table, or if the base table is hidden."""
305
+ base_tbl_id = self._base_tbl_id
306
+ with catalog.Catalog.get().begin_xact(tbl_id=base_tbl_id, for_write=False):
307
+ return catalog.Catalog.get().get_table_by_id(base_tbl_id)
308
308
 
309
309
  @property
310
310
  def _effective_base_versions(self) -> list[Optional[int]]:
pixeltable/config.py CHANGED
@@ -161,6 +161,8 @@ KNOWN_CONFIG_OPTIONS = {
161
161
  'hide_warnings': 'Hide warnings from the console',
162
162
  'verbosity': 'Verbosity level for console output',
163
163
  'api_key': 'API key for Pixeltable cloud',
164
+ 'r2_profile': 'AWS config profile name used to access R2 storage',
165
+ 's3_profile': 'AWS config profile name used to access S3 storage',
164
166
  },
165
167
  'anthropic': {'api_key': 'Anthropic API key'},
166
168
  'bedrock': {'api_key': 'AWS Bedrock API key'},
pixeltable/dataframe.py CHANGED
@@ -1276,10 +1276,11 @@ class DataFrame:
1276
1276
 
1277
1277
  # TODO: Reconcile these with Table.__check_mutable()
1278
1278
  assert len(self._from_clause.tbls) == 1
1279
- if self._first_tbl.is_snapshot():
1280
- raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
1279
+ # First check if it's a replica, since every replica handle is also a snapshot
1281
1280
  if self._first_tbl.is_replica():
1282
1281
  raise excs.Error(f'Cannot use `{op_name}` on a replica.')
1282
+ if self._first_tbl.is_snapshot():
1283
+ raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
1283
1284
 
1284
1285
  def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
1285
1286
  """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""