pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +11 -1
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +179 -63
- pixeltable/catalog/column.py +24 -20
- pixeltable/catalog/table.py +96 -124
- pixeltable/catalog/table_metadata.py +96 -0
- pixeltable/catalog/table_version.py +15 -6
- pixeltable/catalog/view.py +22 -22
- pixeltable/config.py +2 -0
- pixeltable/dataframe.py +3 -2
- pixeltable/env.py +43 -21
- pixeltable/exec/__init__.py +1 -0
- pixeltable/exec/aggregation_node.py +0 -1
- pixeltable/exec/cache_prefetch_node.py +74 -98
- pixeltable/exec/data_row_batch.py +2 -18
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/object_store_save_node.py +299 -0
- pixeltable/exec/sql_node.py +28 -33
- pixeltable/exprs/data_row.py +31 -25
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/row_builder.py +6 -12
- pixeltable/functions/gemini.py +1 -1
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/video.py +5 -6
- pixeltable/globals.py +6 -7
- pixeltable/index/embedding_index.py +5 -8
- pixeltable/io/__init__.py +2 -1
- pixeltable/io/fiftyone.py +1 -1
- pixeltable/io/label_studio.py +4 -5
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/parquet.py +9 -89
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/iterators/audio.py +1 -1
- pixeltable/iterators/document.py +10 -12
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/schema.py +7 -0
- pixeltable/plan.py +26 -1
- pixeltable/share/packager.py +8 -2
- pixeltable/share/publish.py +3 -9
- pixeltable/type_system.py +1 -3
- pixeltable/utils/arrow.py +97 -2
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/object_stores.py +497 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +354 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
- pixeltable/utils/media_store.py +0 -248
- pixeltable/utils/s3.py +0 -17
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/catalog/table.py
CHANGED
|
@@ -7,9 +7,7 @@ import json
|
|
|
7
7
|
import logging
|
|
8
8
|
from keyword import iskeyword as is_python_keyword
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import TYPE_CHECKING, Any,
|
|
11
|
-
|
|
12
|
-
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, overload
|
|
13
11
|
from uuid import UUID
|
|
14
12
|
|
|
15
13
|
import pandas as pd
|
|
@@ -17,8 +15,16 @@ import sqlalchemy as sql
|
|
|
17
15
|
|
|
18
16
|
import pixeltable as pxt
|
|
19
17
|
from pixeltable import catalog, env, exceptions as excs, exprs, index, type_system as ts
|
|
18
|
+
from pixeltable.catalog.table_metadata import (
|
|
19
|
+
ColumnMetadata,
|
|
20
|
+
EmbeddingIndexParams,
|
|
21
|
+
IndexMetadata,
|
|
22
|
+
TableMetadata,
|
|
23
|
+
VersionMetadata,
|
|
24
|
+
)
|
|
20
25
|
from pixeltable.metadata import schema
|
|
21
26
|
from pixeltable.metadata.utils import MetadataUtils
|
|
27
|
+
from pixeltable.utils.object_stores import ObjectOps
|
|
22
28
|
|
|
23
29
|
from ..exprs import ColumnRef
|
|
24
30
|
from ..utils.description_helper import DescriptionHelper
|
|
@@ -37,12 +43,16 @@ from .table_version_handle import TableVersionHandle
|
|
|
37
43
|
from .table_version_path import TableVersionPath
|
|
38
44
|
from .update_status import UpdateStatus
|
|
39
45
|
|
|
46
|
+
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
47
|
+
|
|
48
|
+
|
|
40
49
|
if TYPE_CHECKING:
|
|
41
50
|
import torch.utils.data
|
|
42
51
|
|
|
43
52
|
import pixeltable.plan
|
|
44
53
|
from pixeltable.globals import TableDataSource
|
|
45
54
|
|
|
55
|
+
|
|
46
56
|
_logger = logging.getLogger('pixeltable')
|
|
47
57
|
|
|
48
58
|
|
|
@@ -95,7 +105,7 @@ class Table(SchemaObject):
|
|
|
95
105
|
|
|
96
106
|
return op()
|
|
97
107
|
|
|
98
|
-
def _get_metadata(self) ->
|
|
108
|
+
def _get_metadata(self) -> TableMetadata:
|
|
99
109
|
columns = self._tbl_version_path.columns()
|
|
100
110
|
column_info: dict[str, ColumnMetadata] = {}
|
|
101
111
|
for col in columns:
|
|
@@ -481,8 +491,7 @@ class Table(SchemaObject):
|
|
|
481
491
|
Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed
|
|
482
492
|
columns, use [`add_computed_column()`][pixeltable.catalog.Table.add_computed_column] instead.
|
|
483
493
|
|
|
484
|
-
The format of the `schema` argument is
|
|
485
|
-
[`create_table()`][pixeltable.globals.create_table].
|
|
494
|
+
The format of the `schema` argument is a dict mapping column names to their types.
|
|
486
495
|
|
|
487
496
|
Args:
|
|
488
497
|
schema: A dictionary mapping column names to types.
|
|
@@ -595,6 +604,7 @@ class Table(SchemaObject):
|
|
|
595
604
|
self,
|
|
596
605
|
*,
|
|
597
606
|
stored: Optional[bool] = None,
|
|
607
|
+
destination: Optional[str | Path] = None,
|
|
598
608
|
print_stats: bool = False,
|
|
599
609
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
600
610
|
if_exists: Literal['error', 'ignore', 'replace'] = 'error',
|
|
@@ -606,6 +616,7 @@ class Table(SchemaObject):
|
|
|
606
616
|
Args:
|
|
607
617
|
kwargs: Exactly one keyword argument of the form `col_name=expression`.
|
|
608
618
|
stored: Whether the column is materialized and stored or computed on demand.
|
|
619
|
+
destination: An object store reference for persisting computed files.
|
|
609
620
|
print_stats: If `True`, print execution metrics during evaluation.
|
|
610
621
|
on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
|
|
611
622
|
row.
|
|
@@ -656,6 +667,9 @@ class Table(SchemaObject):
|
|
|
656
667
|
if stored is not None:
|
|
657
668
|
col_schema['stored'] = stored
|
|
658
669
|
|
|
670
|
+
if destination is not None:
|
|
671
|
+
col_schema['destination'] = destination
|
|
672
|
+
|
|
659
673
|
# Raise an error if the column expression refers to a column error property
|
|
660
674
|
if isinstance(spec, exprs.Expr):
|
|
661
675
|
for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
|
|
@@ -670,7 +684,7 @@ class Table(SchemaObject):
|
|
|
670
684
|
[col_name], IfExistsParam.validated(if_exists, 'if_exists')
|
|
671
685
|
)
|
|
672
686
|
# if the column to add already exists and user asked to ignore
|
|
673
|
-
#
|
|
687
|
+
# existing column, there's nothing to do.
|
|
674
688
|
result = UpdateStatus()
|
|
675
689
|
if len(cols_to_ignore) != 0:
|
|
676
690
|
assert cols_to_ignore[0] == col_name
|
|
@@ -691,7 +705,7 @@ class Table(SchemaObject):
|
|
|
691
705
|
(on account of containing Python Callables or Exprs).
|
|
692
706
|
"""
|
|
693
707
|
assert isinstance(spec, dict)
|
|
694
|
-
valid_keys = {'type', 'value', 'stored', 'media_validation'}
|
|
708
|
+
valid_keys = {'type', 'value', 'stored', 'media_validation', 'destination'}
|
|
695
709
|
for k in spec:
|
|
696
710
|
if k not in valid_keys:
|
|
697
711
|
raise excs.Error(f'Column {name}: invalid key {k!r}')
|
|
@@ -715,6 +729,10 @@ class Table(SchemaObject):
|
|
|
715
729
|
if 'stored' in spec and not isinstance(spec['stored'], bool):
|
|
716
730
|
raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
|
|
717
731
|
|
|
732
|
+
d = spec.get('destination')
|
|
733
|
+
if d is not None and not isinstance(d, (str, Path)):
|
|
734
|
+
raise excs.Error(f'Column {name}: `destination` must be a string or path, got {d}')
|
|
735
|
+
|
|
718
736
|
@classmethod
|
|
719
737
|
def _create_columns(cls, schema: dict[str, Any]) -> list[Column]:
|
|
720
738
|
"""Construct list of Columns, given schema"""
|
|
@@ -725,6 +743,7 @@ class Table(SchemaObject):
|
|
|
725
743
|
primary_key: bool = False
|
|
726
744
|
media_validation: Optional[catalog.MediaValidation] = None
|
|
727
745
|
stored = True
|
|
746
|
+
destination: Optional[str] = None
|
|
728
747
|
|
|
729
748
|
if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
|
|
730
749
|
col_type = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
|
|
@@ -749,6 +768,8 @@ class Table(SchemaObject):
|
|
|
749
768
|
media_validation = (
|
|
750
769
|
catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None else None
|
|
751
770
|
)
|
|
771
|
+
if 'destination' in spec:
|
|
772
|
+
destination = ObjectOps.validate_destination(spec['destination'], name)
|
|
752
773
|
else:
|
|
753
774
|
raise excs.Error(f'Invalid value for column {name!r}')
|
|
754
775
|
|
|
@@ -759,6 +780,7 @@ class Table(SchemaObject):
|
|
|
759
780
|
stored=stored,
|
|
760
781
|
is_pk=primary_key,
|
|
761
782
|
media_validation=media_validation,
|
|
783
|
+
destination=destination,
|
|
762
784
|
)
|
|
763
785
|
columns.append(column)
|
|
764
786
|
return columns
|
|
@@ -784,14 +806,16 @@ class Table(SchemaObject):
|
|
|
784
806
|
f'streaming function'
|
|
785
807
|
)
|
|
786
808
|
)
|
|
809
|
+
if col.destination is not None and not (col.stored and col.is_computed):
|
|
810
|
+
raise excs.Error(
|
|
811
|
+
f'Column {col.name!r}: destination={col.destination} only applies to stored computed columns'
|
|
812
|
+
)
|
|
787
813
|
|
|
788
814
|
@classmethod
|
|
789
815
|
def _verify_schema(cls, schema: list[Column]) -> None:
|
|
790
816
|
"""Check integrity of user-supplied schema and set defaults"""
|
|
791
|
-
column_names: set[str] = set()
|
|
792
817
|
for col in schema:
|
|
793
818
|
cls._verify_column(col)
|
|
794
|
-
column_names.add(col.name)
|
|
795
819
|
|
|
796
820
|
def drop_column(self, column: str | ColumnRef, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
|
|
797
821
|
"""Drop a column from the table.
|
|
@@ -1690,43 +1714,35 @@ class Table(SchemaObject):
|
|
|
1690
1714
|
def _ipython_key_completions_(self) -> list[str]:
|
|
1691
1715
|
return list(self._get_schema().keys())
|
|
1692
1716
|
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
'updates': ts.IntType(nullable=True),
|
|
1700
|
-
'deletes': ts.IntType(nullable=True),
|
|
1701
|
-
'errors': ts.IntType(nullable=True),
|
|
1702
|
-
'computed': ts.IntType(),
|
|
1703
|
-
'schema_change': ts.StringType(),
|
|
1704
|
-
}
|
|
1705
|
-
|
|
1706
|
-
def history(self, n: Optional[int] = None) -> pixeltable.dataframe.DataFrameResultSet:
|
|
1707
|
-
"""Returns rows of information about the versions of this table, most recent first.
|
|
1717
|
+
def get_versions(self, n: Optional[int] = None) -> list[VersionMetadata]:
|
|
1718
|
+
"""
|
|
1719
|
+
Returns information about versions of this table, most recent first.
|
|
1720
|
+
|
|
1721
|
+
`get_versions()` is intended for programmatic access to version metadata; for human-readable
|
|
1722
|
+
output, use [`history()`][pixeltable.Table.history] instead.
|
|
1708
1723
|
|
|
1709
1724
|
Args:
|
|
1710
|
-
n:
|
|
1725
|
+
n: if specified, will return at most `n` versions
|
|
1711
1726
|
|
|
1712
|
-
|
|
1713
|
-
|
|
1727
|
+
Returns:
|
|
1728
|
+
A list of [VersionMetadata][pixeltable.VersionMetadata] dictionaries, one per version retrieved, most
|
|
1729
|
+
recent first.
|
|
1714
1730
|
|
|
1715
|
-
|
|
1731
|
+
Examples:
|
|
1732
|
+
Retrieve metadata about all versions of the table `tbl`:
|
|
1716
1733
|
|
|
1717
|
-
|
|
1734
|
+
>>> tbl.get_versions()
|
|
1718
1735
|
|
|
1719
|
-
|
|
1736
|
+
Retrieve metadata about the most recent 5 versions of the table `tbl`:
|
|
1720
1737
|
|
|
1721
|
-
|
|
1722
|
-
A list of information about each version, ordered from most recent to oldest version.
|
|
1738
|
+
>>> tbl.get_versions(n=5)
|
|
1723
1739
|
"""
|
|
1724
1740
|
from pixeltable.catalog import Catalog
|
|
1725
1741
|
|
|
1726
1742
|
if n is None:
|
|
1727
1743
|
n = 1_000_000_000
|
|
1728
1744
|
if not isinstance(n, int) or n < 1:
|
|
1729
|
-
raise excs.Error(f'Invalid value for n
|
|
1745
|
+
raise excs.Error(f'Invalid value for `n`: {n}')
|
|
1730
1746
|
|
|
1731
1747
|
# Retrieve the table history components from the catalog
|
|
1732
1748
|
tbl_id = self._id
|
|
@@ -1744,104 +1760,60 @@ class Table(SchemaObject):
|
|
|
1744
1760
|
else:
|
|
1745
1761
|
over_count = 0
|
|
1746
1762
|
|
|
1747
|
-
|
|
1763
|
+
metadata_dicts: list[VersionMetadata] = []
|
|
1748
1764
|
for vers_md in vers_list[0 : len(vers_list) - over_count]:
|
|
1749
1765
|
version = vers_md.version_md.version
|
|
1750
|
-
schema_change = md_dict.get(version,
|
|
1766
|
+
schema_change = md_dict.get(version, None)
|
|
1751
1767
|
update_status = vers_md.version_md.update_status
|
|
1752
1768
|
if update_status is None:
|
|
1753
1769
|
update_status = UpdateStatus()
|
|
1754
|
-
change_type = 'schema' if schema_change
|
|
1755
|
-
if change_type == '':
|
|
1756
|
-
change_type = 'data'
|
|
1770
|
+
change_type: Literal['schema', 'data'] = 'schema' if schema_change is not None else 'data'
|
|
1757
1771
|
rcs = update_status.row_count_stats + update_status.cascade_row_count_stats
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1772
|
+
metadata_dicts.append(
|
|
1773
|
+
VersionMetadata(
|
|
1774
|
+
version=version,
|
|
1775
|
+
created_at=datetime.datetime.fromtimestamp(vers_md.version_md.created_at, tz=datetime.timezone.utc),
|
|
1776
|
+
user=vers_md.version_md.user,
|
|
1777
|
+
change_type=change_type,
|
|
1778
|
+
inserts=rcs.ins_rows,
|
|
1779
|
+
updates=rcs.upd_rows,
|
|
1780
|
+
deletes=rcs.del_rows,
|
|
1781
|
+
errors=rcs.num_excs,
|
|
1782
|
+
computed=rcs.computed_values,
|
|
1783
|
+
schema_change=schema_change,
|
|
1784
|
+
)
|
|
1785
|
+
)
|
|
1786
|
+
|
|
1787
|
+
return metadata_dicts
|
|
1788
|
+
|
|
1789
|
+
def history(self, n: Optional[int] = None) -> pd.DataFrame:
|
|
1790
|
+
"""
|
|
1791
|
+
Returns a human-readable report about versions of this table.
|
|
1792
|
+
|
|
1793
|
+
`history()` is intended for human-readable output of version metadata; for programmatic access,
|
|
1794
|
+
use [`get_versions()`][pixeltable.Table.get_versions] instead.
|
|
1795
|
+
|
|
1796
|
+
Args:
|
|
1797
|
+
n: if specified, will return at most `n` versions
|
|
1798
|
+
|
|
1799
|
+
Returns:
|
|
1800
|
+
A report with information about each version, one per row, most recent first.
|
|
1771
1801
|
|
|
1772
|
-
|
|
1802
|
+
Examples:
|
|
1803
|
+
Report all versions of the table:
|
|
1804
|
+
|
|
1805
|
+
>>> tbl.history()
|
|
1806
|
+
|
|
1807
|
+
Report only the most recent 5 changes to the table:
|
|
1808
|
+
|
|
1809
|
+
>>> tbl.history(n=5)
|
|
1810
|
+
"""
|
|
1811
|
+
versions = self.get_versions(n)
|
|
1812
|
+
assert len(versions) > 0
|
|
1813
|
+
return pd.DataFrame([list(v.values()) for v in versions], columns=list(versions[0].keys()))
|
|
1773
1814
|
|
|
1774
1815
|
def __check_mutable(self, op_descr: str) -> None:
|
|
1816
|
+
if self._tbl_version_path.is_replica():
|
|
1817
|
+
raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a replica.')
|
|
1775
1818
|
if self._tbl_version_path.is_snapshot():
|
|
1776
1819
|
raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a snapshot.')
|
|
1777
|
-
if self._tbl_version_path.is_replica():
|
|
1778
|
-
raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a {self._display_name()}.')
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
class ColumnMetadata(TypedDict):
|
|
1782
|
-
"""Metadata for a column of a Pixeltable table."""
|
|
1783
|
-
|
|
1784
|
-
name: str
|
|
1785
|
-
"""The name of the column."""
|
|
1786
|
-
type_: str
|
|
1787
|
-
"""The type specifier of the column."""
|
|
1788
|
-
version_added: int
|
|
1789
|
-
"""The table version when this column was added."""
|
|
1790
|
-
is_stored: bool
|
|
1791
|
-
"""`True` if this is a stored column; `False` if it is dynamically computed."""
|
|
1792
|
-
is_primary_key: bool
|
|
1793
|
-
"""`True` if this column is part of the table's primary key."""
|
|
1794
|
-
media_validation: Optional[Literal['on_read', 'on_write']]
|
|
1795
|
-
"""The media validation policy for this column."""
|
|
1796
|
-
computed_with: Optional[str]
|
|
1797
|
-
"""Expression used to compute this column; `None` if this is not a computed column."""
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
class IndexMetadata(TypedDict):
|
|
1801
|
-
"""Metadata for a column of a Pixeltable table."""
|
|
1802
|
-
|
|
1803
|
-
name: str
|
|
1804
|
-
"""The name of the index."""
|
|
1805
|
-
columns: list[str]
|
|
1806
|
-
"""The table columns that are indexed."""
|
|
1807
|
-
index_type: Literal['embedding']
|
|
1808
|
-
"""The type of index (currently only `'embedding'` is supported, but others will be added in the future)."""
|
|
1809
|
-
parameters: EmbeddingIndexParams
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
class EmbeddingIndexParams(TypedDict):
|
|
1813
|
-
metric: Literal['cosine', 'ip', 'l2']
|
|
1814
|
-
"""Index metric."""
|
|
1815
|
-
embeddings: list[str]
|
|
1816
|
-
"""List of embeddings defined for this index."""
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
class TableMetadata(TypedDict):
|
|
1820
|
-
"""Metadata for a Pixeltable table."""
|
|
1821
|
-
|
|
1822
|
-
name: str
|
|
1823
|
-
"""The name of the table (ex: `'my_table'`)."""
|
|
1824
|
-
path: str
|
|
1825
|
-
"""The full path of the table (ex: `'my_dir.my_subdir.my_table'`)."""
|
|
1826
|
-
columns: dict[str, ColumnMetadata]
|
|
1827
|
-
"""Column metadata for all of the visible columns of the table."""
|
|
1828
|
-
indices: dict[str, IndexMetadata]
|
|
1829
|
-
"""Index metadata for all of the indices of the table."""
|
|
1830
|
-
is_replica: bool
|
|
1831
|
-
"""`True` if this table is a replica of another (shared) table."""
|
|
1832
|
-
is_view: bool
|
|
1833
|
-
"""`True` if this table is a view."""
|
|
1834
|
-
is_snapshot: bool
|
|
1835
|
-
"""`True` if this table is a snapshot."""
|
|
1836
|
-
version: int
|
|
1837
|
-
"""The current version of the table."""
|
|
1838
|
-
version_created: datetime.datetime
|
|
1839
|
-
"""The timestamp when this table version was created."""
|
|
1840
|
-
schema_version: int
|
|
1841
|
-
"""The current schema version of the table."""
|
|
1842
|
-
comment: Optional[str]
|
|
1843
|
-
"""User-provided table comment, if one exists."""
|
|
1844
|
-
media_validation: Literal['on_read', 'on_write']
|
|
1845
|
-
"""The media validation policy for this table."""
|
|
1846
|
-
base: Optional[str]
|
|
1847
|
-
"""If this table is a view or snapshot, the full path of its base table; otherwise `None`."""
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
from typing import Literal, Optional, TypedDict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ColumnMetadata(TypedDict):
|
|
6
|
+
"""Metadata for a column of a Pixeltable table."""
|
|
7
|
+
|
|
8
|
+
name: str
|
|
9
|
+
"""The name of the column."""
|
|
10
|
+
type_: str
|
|
11
|
+
"""The type specifier of the column."""
|
|
12
|
+
version_added: int
|
|
13
|
+
"""The table version when this column was added."""
|
|
14
|
+
is_stored: bool
|
|
15
|
+
"""`True` if this is a stored column; `False` if it is dynamically computed."""
|
|
16
|
+
is_primary_key: bool
|
|
17
|
+
"""`True` if this column is part of the table's primary key."""
|
|
18
|
+
media_validation: Optional[Literal['on_read', 'on_write']]
|
|
19
|
+
"""The media validation policy for this column."""
|
|
20
|
+
computed_with: Optional[str]
|
|
21
|
+
"""Expression used to compute this column; `None` if this is not a computed column."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class EmbeddingIndexParams(TypedDict):
|
|
25
|
+
metric: Literal['cosine', 'ip', 'l2']
|
|
26
|
+
"""Index metric."""
|
|
27
|
+
embeddings: list[str]
|
|
28
|
+
"""List of embeddings defined for this index."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class IndexMetadata(TypedDict):
|
|
32
|
+
"""Metadata for a column of a Pixeltable table."""
|
|
33
|
+
|
|
34
|
+
name: str
|
|
35
|
+
"""The name of the index."""
|
|
36
|
+
columns: list[str]
|
|
37
|
+
"""The table columns that are indexed."""
|
|
38
|
+
index_type: Literal['embedding']
|
|
39
|
+
"""The type of index (currently only `'embedding'` is supported, but others will be added in the future)."""
|
|
40
|
+
parameters: EmbeddingIndexParams
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TableMetadata(TypedDict):
|
|
44
|
+
"""Metadata for a Pixeltable table."""
|
|
45
|
+
|
|
46
|
+
name: str
|
|
47
|
+
"""The name of the table (ex: `'my_table'`)."""
|
|
48
|
+
path: str
|
|
49
|
+
"""The full path of the table (ex: `'my_dir.my_subdir.my_table'`)."""
|
|
50
|
+
columns: dict[str, ColumnMetadata]
|
|
51
|
+
"""Column metadata for all of the visible columns of the table."""
|
|
52
|
+
indices: dict[str, IndexMetadata]
|
|
53
|
+
"""Index metadata for all of the indices of the table."""
|
|
54
|
+
is_replica: bool
|
|
55
|
+
"""`True` if this table is a replica of another (shared) table."""
|
|
56
|
+
is_view: bool
|
|
57
|
+
"""`True` if this table is a view."""
|
|
58
|
+
is_snapshot: bool
|
|
59
|
+
"""`True` if this table is a snapshot."""
|
|
60
|
+
version: int
|
|
61
|
+
"""The current version of the table."""
|
|
62
|
+
version_created: datetime.datetime
|
|
63
|
+
"""The timestamp when this table version was created."""
|
|
64
|
+
schema_version: int
|
|
65
|
+
"""The current schema version of the table."""
|
|
66
|
+
comment: Optional[str]
|
|
67
|
+
"""User-provided table comment, if one exists."""
|
|
68
|
+
media_validation: Literal['on_read', 'on_write']
|
|
69
|
+
"""The media validation policy for this table."""
|
|
70
|
+
base: Optional[str]
|
|
71
|
+
"""If this table is a view or snapshot, the full path of its base table; otherwise `None`."""
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class VersionMetadata(TypedDict):
|
|
75
|
+
"""Metadata for a specific version of a Pixeltable table."""
|
|
76
|
+
|
|
77
|
+
"""The version number."""
|
|
78
|
+
version: int
|
|
79
|
+
"""The timestamp when this version was created."""
|
|
80
|
+
created_at: datetime.datetime
|
|
81
|
+
"""The user who created this version, if defined."""
|
|
82
|
+
user: str | None
|
|
83
|
+
"""The type of table transformation that this version represents (`'data'` or `'schema'`)."""
|
|
84
|
+
change_type: Literal['data', 'schema']
|
|
85
|
+
"""The number of rows inserted in this version."""
|
|
86
|
+
inserts: int
|
|
87
|
+
"""The number of rows updated in this version."""
|
|
88
|
+
updates: int
|
|
89
|
+
"""The number of rows deleted in this version."""
|
|
90
|
+
deletes: int
|
|
91
|
+
"""The number of errors encountered during this version."""
|
|
92
|
+
errors: int
|
|
93
|
+
"""The number of computed values calculated in this version."""
|
|
94
|
+
computed: int
|
|
95
|
+
"""A description of the schema change that occurred in this version, if any."""
|
|
96
|
+
schema_change: str | None
|
|
@@ -20,7 +20,7 @@ from pixeltable.iterators import ComponentIterator
|
|
|
20
20
|
from pixeltable.metadata import schema
|
|
21
21
|
from pixeltable.utils.exception_handler import run_cleanup_on_exception
|
|
22
22
|
from pixeltable.utils.filecache import FileCache
|
|
23
|
-
from pixeltable.utils.
|
|
23
|
+
from pixeltable.utils.object_stores import ObjectOps
|
|
24
24
|
|
|
25
25
|
from .tbl_ops import TableOp
|
|
26
26
|
|
|
@@ -327,7 +327,7 @@ class TableVersion:
|
|
|
327
327
|
from .table_version_path import TableVersionPath
|
|
328
328
|
|
|
329
329
|
# clear out any remaining media files from an aborted previous attempt
|
|
330
|
-
|
|
330
|
+
self.delete_media()
|
|
331
331
|
view_path = TableVersionPath.from_dict(op.load_view_op.view_path)
|
|
332
332
|
plan, _ = Planner.create_view_load_plan(view_path)
|
|
333
333
|
_, row_counts = self.store_tbl.insert_rows(plan, v_min=self.version)
|
|
@@ -356,14 +356,23 @@ class TableVersion:
|
|
|
356
356
|
cat = pxt.catalog.Catalog.get()
|
|
357
357
|
# We're creating a new TableVersion replica, so we should never have seen this particular
|
|
358
358
|
# TableVersion instance before.
|
|
359
|
-
|
|
360
|
-
|
|
359
|
+
# Actually this isn't true, because we might be re-creating a dropped replica.
|
|
360
|
+
# TODO: Understand why old TableVersions are kept around even for a dropped table.
|
|
361
|
+
# assert tbl_version.effective_version is not None
|
|
362
|
+
# assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
|
|
361
363
|
cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
|
|
362
364
|
tbl_version.init()
|
|
363
365
|
tbl_version.store_tbl.create()
|
|
364
366
|
tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
|
|
365
367
|
return tbl_version
|
|
366
368
|
|
|
369
|
+
def delete_media(self, tbl_version: Optional[int] = None) -> None:
|
|
370
|
+
# Assemble a set of column destinations and delete objects from all of them
|
|
371
|
+
# None is a valid column destination which refers to the default object location
|
|
372
|
+
destinations = {col.destination for col in self.cols if col.is_stored}
|
|
373
|
+
for dest in destinations:
|
|
374
|
+
ObjectOps.delete(dest, self.id, tbl_version=tbl_version)
|
|
375
|
+
|
|
367
376
|
def drop(self) -> None:
|
|
368
377
|
# if self.is_view and self.is_mutable:
|
|
369
378
|
# # update mutable_views
|
|
@@ -374,7 +383,7 @@ class TableVersion:
|
|
|
374
383
|
# if self.base.get().is_mutable:
|
|
375
384
|
# self.base.get().mutable_views.remove(TableVersionHandle.create(self))
|
|
376
385
|
|
|
377
|
-
|
|
386
|
+
self.delete_media()
|
|
378
387
|
FileCache.get().clear(tbl_id=self.id)
|
|
379
388
|
self.store_tbl.drop()
|
|
380
389
|
|
|
@@ -1236,7 +1245,7 @@ class TableVersion:
|
|
|
1236
1245
|
)
|
|
1237
1246
|
|
|
1238
1247
|
# delete newly-added data
|
|
1239
|
-
|
|
1248
|
+
self.delete_media(tbl_version=self.version)
|
|
1240
1249
|
conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))
|
|
1241
1250
|
|
|
1242
1251
|
# revert new deletions
|
pixeltable/catalog/view.py
CHANGED
|
@@ -47,17 +47,13 @@ class View(Table):
|
|
|
47
47
|
self._tbl_version = tbl_version_path.tbl_version
|
|
48
48
|
|
|
49
49
|
def _display_name(self) -> str:
|
|
50
|
-
name: str
|
|
51
|
-
if self._tbl_version_path.is_snapshot():
|
|
52
|
-
name = 'snapshot'
|
|
53
|
-
elif self._tbl_version_path.is_view():
|
|
54
|
-
name = 'view'
|
|
55
|
-
else:
|
|
56
|
-
assert self._tbl_version_path.is_replica()
|
|
57
|
-
name = 'table'
|
|
58
50
|
if self._tbl_version_path.is_replica():
|
|
59
|
-
|
|
60
|
-
|
|
51
|
+
return 'replica'
|
|
52
|
+
if self._tbl_version_path.is_snapshot():
|
|
53
|
+
return 'snapshot'
|
|
54
|
+
if self._tbl_version_path.is_view():
|
|
55
|
+
return 'view'
|
|
56
|
+
return 'table'
|
|
61
57
|
|
|
62
58
|
@classmethod
|
|
63
59
|
def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
|
|
@@ -270,12 +266,12 @@ class View(Table):
|
|
|
270
266
|
# Update name and path with version qualifiers.
|
|
271
267
|
md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
|
|
272
268
|
md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'
|
|
273
|
-
|
|
274
|
-
if
|
|
275
|
-
|
|
276
|
-
|
|
269
|
+
base_tbl_id = self._base_tbl_id
|
|
270
|
+
if base_tbl_id is not None:
|
|
271
|
+
base_tbl = self._get_base_table()
|
|
272
|
+
base_path = '<anonymous base table>' if base_tbl is None else base_tbl._path()
|
|
277
273
|
base_version = self._effective_base_versions[0]
|
|
278
|
-
md['base'] =
|
|
274
|
+
md['base'] = base_path if base_version is None else f'{base_path}:{base_version}'
|
|
279
275
|
return md
|
|
280
276
|
|
|
281
277
|
def insert(
|
|
@@ -294,17 +290,21 @@ class View(Table):
|
|
|
294
290
|
def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
|
|
295
291
|
raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
|
|
296
292
|
|
|
297
|
-
|
|
293
|
+
@property
|
|
294
|
+
def _base_tbl_id(self) -> Optional[UUID]:
|
|
298
295
|
if self._tbl_version_path.tbl_id != self._id:
|
|
299
296
|
# _tbl_version_path represents a different schema object from this one. This can only happen if this is a
|
|
300
297
|
# named pure snapshot.
|
|
301
|
-
|
|
302
|
-
|
|
298
|
+
return self._tbl_version_path.tbl_id
|
|
299
|
+
if self._tbl_version_path.base is None:
|
|
303
300
|
return None
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
301
|
+
return self._tbl_version_path.base.tbl_id
|
|
302
|
+
|
|
303
|
+
def _get_base_table(self) -> Optional['Table']:
|
|
304
|
+
"""Returns None if there is no base table, or if the base table is hidden."""
|
|
305
|
+
base_tbl_id = self._base_tbl_id
|
|
306
|
+
with catalog.Catalog.get().begin_xact(tbl_id=base_tbl_id, for_write=False):
|
|
307
|
+
return catalog.Catalog.get().get_table_by_id(base_tbl_id)
|
|
308
308
|
|
|
309
309
|
@property
|
|
310
310
|
def _effective_base_versions(self) -> list[Optional[int]]:
|
pixeltable/config.py
CHANGED
|
@@ -161,6 +161,8 @@ KNOWN_CONFIG_OPTIONS = {
|
|
|
161
161
|
'hide_warnings': 'Hide warnings from the console',
|
|
162
162
|
'verbosity': 'Verbosity level for console output',
|
|
163
163
|
'api_key': 'API key for Pixeltable cloud',
|
|
164
|
+
'r2_profile': 'AWS config profile name used to access R2 storage',
|
|
165
|
+
's3_profile': 'AWS config profile name used to access S3 storage',
|
|
164
166
|
},
|
|
165
167
|
'anthropic': {'api_key': 'Anthropic API key'},
|
|
166
168
|
'bedrock': {'api_key': 'AWS Bedrock API key'},
|
pixeltable/dataframe.py
CHANGED
|
@@ -1276,10 +1276,11 @@ class DataFrame:
|
|
|
1276
1276
|
|
|
1277
1277
|
# TODO: Reconcile these with Table.__check_mutable()
|
|
1278
1278
|
assert len(self._from_clause.tbls) == 1
|
|
1279
|
-
if
|
|
1280
|
-
raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
|
|
1279
|
+
# First check if it's a replica, since every replica handle is also a snapshot
|
|
1281
1280
|
if self._first_tbl.is_replica():
|
|
1282
1281
|
raise excs.Error(f'Cannot use `{op_name}` on a replica.')
|
|
1282
|
+
if self._first_tbl.is_snapshot():
|
|
1283
|
+
raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
|
|
1283
1284
|
|
|
1284
1285
|
def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
|
|
1285
1286
|
"""Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
|