pixeltable 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +619 -255
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +9 -9
- pixeltable/catalog/path.py +59 -20
- pixeltable/catalog/schema_object.py +10 -4
- pixeltable/catalog/table.py +51 -53
- pixeltable/catalog/table_version.py +216 -156
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/tbl_ops.py +44 -0
- pixeltable/catalog/view.py +63 -65
- pixeltable/config.py +12 -4
- pixeltable/dataframe.py +75 -6
- pixeltable/env.py +46 -17
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +2 -6
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +10 -51
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/in_memory_data_node.py +17 -16
- pixeltable/exec/sql_node.py +6 -7
- pixeltable/exprs/column_ref.py +2 -1
- pixeltable/exprs/data_row.py +13 -13
- pixeltable/exprs/row_builder.py +16 -4
- pixeltable/exprs/string_op.py +1 -1
- pixeltable/func/expr_template_function.py +1 -4
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/gemini.py +4 -4
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/openai.py +9 -6
- pixeltable/functions/timestamp.py +6 -6
- pixeltable/functions/video.py +2 -6
- pixeltable/globals.py +62 -33
- pixeltable/io/datarows.py +2 -1
- pixeltable/io/pandas.py +1 -0
- pixeltable/io/table_data_conduit.py +12 -13
- pixeltable/iterators/audio.py +17 -8
- pixeltable/iterators/image.py +5 -2
- pixeltable/metadata/schema.py +39 -2
- pixeltable/plan.py +5 -14
- pixeltable/share/packager.py +13 -13
- pixeltable/store.py +31 -7
- pixeltable/type_system.py +2 -1
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/media_store.py +90 -34
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/METADATA +1 -1
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/RECORD +52 -51
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py
CHANGED

@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Un
 import pandas as pd
 from pandas.io.formats.style import Styler
 
-from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
+from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
 from pixeltable.catalog import Catalog, TableVersionPath
 from pixeltable.catalog.insertable_table import OnErrorParameter
 from pixeltable.config import Config
@@ -44,7 +44,7 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
 
 
 def create_table(
-
+    path: str,
     schema: Optional[dict[str, Any]] = None,
     *,
     source: Optional[TableDataSource] = None,
@@ -58,14 +58,24 @@ def create_table(
     if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
     extra_args: Optional[dict[str, Any]] = None,  # Additional arguments to data source provider
 ) -> catalog.Table:
-    """Create a new base table.
+    """Create a new base table. Exactly one of `schema` or `source` must be provided.
+
+    If a `schema` is provided, then an empty table will be created with the specified schema.
+
+    If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
+    contents of the specified data, and the data will be imported from the specified source into the new table. The
+    source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
 
     Args:
-
-        schema:
-        source: A data source
-        source_format:
-
+        path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
+        schema: Schema for the new table, mapping column names to Pixeltable types.
+        source: A data source (file, URL, DataFrame, or list of rows) to import from.
+        source_format: Must be used in conjunction with a `source`.
+            If specified, then the given format will be used to read the source data. (Otherwise,
+            Pixeltable will attempt to infer the format from the source data.)
+        schema_overrides: Must be used in conjunction with a `source`.
+            If specified, then columns in `schema_overrides` will be given the specified types.
+            (Pixeltable will attempt to infer the types of any columns not specified.)
        on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
            invalid media file (such as a corrupt image) for one of the inserted rows.
@@ -81,14 +91,15 @@ def create_table(
 
            - `'on_read'`: validate media files at query time
            - `'on_write'`: validate media files during insert/update operations
-        if_exists:
-            Must be one of the following:
+        if_exists: Determines the behavior if a table already exists at the specified path location.
 
            - `'error'`: raise an error
            - `'ignore'`: do nothing and return the existing table handle
-            - `'replace'`: if the existing table has no views, drop and replace it with a new one
-
-
+            - `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
+                raise an error if the existing table has views or snapshots
+            - `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
+        extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
+            passed along to the source data provider.
 
    Returns:
        A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
@@ -114,7 +125,7 @@ def create_table(
    >>> tbl1 = pxt.get_table('orig_table')
    ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
 
-    Create a table if does not already exist, otherwise get the existing table:
+    Create a table if it does not already exist, otherwise get the existing table:
 
    >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
 
@@ -130,12 +141,12 @@ def create_table(
    from pixeltable.io.utils import normalize_primary_key_parameter
 
    if (schema is None) == (source is None):
-        raise excs.Error('
+        raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
 
    if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
        raise excs.Error('`schema` must be a non-empty dictionary')
 
-    path_obj = catalog.Path(
+    path_obj = catalog.Path.parse(path)
    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
    media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
    primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
@@ -146,7 +157,14 @@ def create_table(
    tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
    tds.check_source_format()
    data_source = tds.specialize()
-
+    src_schema_overrides: dict[str, ts.ColumnType] = {}
+    if schema_overrides is not None:
+        for col_name, py_type in schema_overrides.items():
+            col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
+            if col_type is None:
+                raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
+            src_schema_overrides[col_name] = col_type
+    data_source.src_schema_overrides = src_schema_overrides
    data_source.src_pk = primary_key
    data_source.infer_schema()
    schema = data_source.pxt_schema
@@ -255,9 +273,7 @@ def create_view(
        tbl_version_path = base._tbl_version_path
        sample_clause = None
    elif isinstance(base, DataFrame):
-        base.
-        if len(base._from_clause.tbls) > 1:
-            raise excs.Error('Cannot create a view of a join')
+        base._validate_mutable_op_sequence('create_view', allow_select=True)
        tbl_version_path = base._from_clause.tbls[0]
        where = base.where_clause
        sample_clause = base.sample_clause
@@ -268,7 +284,7 @@ def create_view(
        raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
    assert isinstance(base, (catalog.Table, DataFrame))
 
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path)
    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
    media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
 
@@ -429,8 +445,12 @@ def get_table(path: str) -> catalog.Table:
    Handles to views and snapshots are retrieved in the same way:
 
    >>> tbl = pxt.get_table('my_snapshot')
+
+    Get a handle to a specific version of a table:
+
+    >>> tbl = pxt.get_table('my_table:722')
    """
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path, allow_versioned_path=True)
    tbl = Catalog.get().get_table(path_obj)
    return tbl
 
@@ -456,7 +476,7 @@ def move(path: str, new_path: str) -> None:
    """
    if path == new_path:
        raise excs.Error('move(): source and destination cannot be identical')
-    path_obj, new_path_obj = catalog.Path(path), catalog.Path(new_path)
+    path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
    if path_obj.is_ancestor(new_path_obj):
        raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
    cat = Catalog.get()
@@ -509,7 +529,7 @@ def drop_table(
    assert isinstance(table, str)
    tbl_path = table
 
-    path_obj = catalog.Path(tbl_path)
+    path_obj = catalog.Path.parse(tbl_path)
    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
    Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
 
@@ -537,9 +557,12 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
 
    >>> pxt.list_tables('dir1')
    """
-
-
-
+    return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
+
+
+def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
+    path_obj = catalog.Path.parse(dir_path, allow_empty_path=True, allow_system_path=allow_system_paths)
+    contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
    return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
 
 
@@ -590,7 +613,7 @@ def create_dir(
 
    >>> pxt.create_dir('parent1.parent2.sub_dir', parents=True)
    """
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path)
    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
    return Catalog.get().create_dir(path_obj, if_exists=if_exists_, parents=parents)
 
@@ -632,7 +655,7 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
 
    >>> pxt.drop_dir('my_dir', force=True)
    """
-    path_obj = catalog.Path(path)  # validate format
+    path_obj = catalog.Path.parse(path)  # validate format
    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
    Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
 
@@ -647,13 +670,16 @@ def ls(path: str = '') -> pd.DataFrame:
    To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
    [list_dirs()][pixeltable.list_dirs] instead.
    """
+    from pixeltable.catalog import retry_loop
    from pixeltable.metadata import schema
 
    cat = Catalog.get()
-    path_obj = catalog.Path(path,
+    path_obj = catalog.Path.parse(path, allow_empty_path=True)
    dir_entries = cat.get_dir_contents(path_obj)
-
-
+
+    @retry_loop(for_write=False)
+    def op() -> list[list[str]]:
+        rows: list[list[str]] = []
        for name, entry in dir_entries.items():
            if name.startswith('_'):
                continue
@@ -679,6 +705,9 @@ def ls(path: str = '') -> pd.DataFrame:
            if md['is_replica']:
                kind = f'{kind}-replica'
            rows.append([name, kind, version, base])
+        return rows
+
+    rows = op()
 
    rows = sorted(rows, key=lambda x: x[0])
    df = pd.DataFrame(
@@ -734,7 +763,7 @@ def list_dirs(path: str = '', recursive: bool = True) -> list[str]:
    >>> cl.list_dirs('my_dir', recursive=True)
    ['my_dir', 'my_dir.sub_dir1']
    """
-    path_obj = catalog.Path(path,
+    path_obj = catalog.Path.parse(path, allow_empty_path=True)  # validate format
    cat = Catalog.get()
    contents = cat.get_dir_contents(path_obj, recursive=recursive)
    return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Dir)]
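
Note: the tightened docstrings above pin down the create_table() contract. A minimal usage sketch; the directory name demo, the file films.csv, and version number 722 are hypothetical:

import pixeltable as pxt

# Empty table from an explicit schema:
films = pxt.create_table('demo.films', schema={'title': pxt.String, 'year': pxt.Int})

# Schema inferred from a source, with one column type pinned via schema_overrides
# (exactly one of `schema` or `source` may be given):
imported = pxt.create_table(
    'demo.films_import',
    source='films.csv',                   # hypothetical local CSV file
    schema_overrides={'year': pxt.Int},
    if_exists='ignore',
)

# Versioned path, per the new get_table() docstring:
old_version = pxt.get_table('demo.films:722')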
pixeltable/io/datarows.py
CHANGED

@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
 
 
 def _infer_schema_from_rows(
-    rows: Iterable[dict[str, Any]], schema_overrides: dict[str,
+    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
    schema: dict[str, ts.ColumnType] = {}
    cols_with_nones: set[str] = set()
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
            # in which the column names are encountered in the input data, even if `schema_overrides`
            # is specified.
            if col_name not in schema:
+                assert isinstance(schema_overrides[col_name], ts.ColumnType)
                schema[col_name] = schema_overrides[col_name]
        elif value is not None:
            # If `key` is not in `schema_overrides`, then we infer its type from the data.
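
Both inference helpers can now assert ts.ColumnType because create_table() normalizes user-supplied override types up front. A sketch of that contract, mirroring the globals.py hunk above (the function name normalize_overrides is illustrative):

from typing import Any

from pixeltable import exceptions as excs, type_system as ts

def normalize_overrides(schema_overrides: dict[str, Any]) -> dict[str, ts.ColumnType]:
    # mirrors the loop added to create_table(): convert user-supplied types
    # (e.g. pxt.Int) to ts.ColumnType, rejecting anything unrecognized
    normalized: dict[str, ts.ColumnType] = {}
    for col_name, py_type in schema_overrides.items():
        col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
        if col_type is None:
            raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
        normalized[col_name] = col_type
    return normalized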
pixeltable/io/pandas.py
CHANGED

@@ -132,6 +132,7 @@ def df_infer_schema(
    pd_schema: dict[str, ts.ColumnType] = {}
    for pd_name, pd_dtype in zip(df.columns, df.dtypes):
        if pd_name in schema_overrides:
+            assert isinstance(schema_overrides[pd_name], ts.ColumnType)
            pxt_type = schema_overrides[pd_name]
        else:
            pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
pixeltable/io/table_data_conduit.py
CHANGED

@@ -47,13 +47,13 @@ class TableDataConduitFormat(str, enum.Enum):
 
 @dataclass
 class TableDataConduit:
-    source: TableDataSource
+    source: 'TableDataSource'
    source_format: Optional[str] = None
    source_column_map: Optional[dict[str, str]] = None
    if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
-    pxt_schema: Optional[dict[str,
-    src_schema_overrides: Optional[dict[str,
-    src_schema: Optional[dict[str,
+    pxt_schema: Optional[dict[str, ts.ColumnType]] = None
+    src_schema_overrides: Optional[dict[str, ts.ColumnType]] = None
+    src_schema: Optional[dict[str, ts.ColumnType]] = None
    pxt_pk: Optional[list[str]] = None
    src_pk: Optional[list[str]] = None
    valid_rows: Optional[RowData] = None
@@ -87,7 +87,7 @@ class TableDataConduit:
        for name, coltype in self.pxt_schema.items():
            self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        raise NotImplementedError
 
    def valid_row_batch(self) -> Iterator[RowData]:
@@ -137,7 +137,7 @@ class DFTableDataConduit(TableDataConduit):
        t.pxt_df = tds.source
        return t
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        self.pxt_schema = self.pxt_df.schema
        self.pxt_pk = self.src_pk
        return self.pxt_schema
@@ -168,7 +168,7 @@ class RowDataTableDataConduit(TableDataConduit):
        t.batch_count = 0
        return t
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        from .datarows import _infer_schema_from_rows
 
        if self.source_column_map is None:
@@ -239,7 +239,7 @@ class PandasTableDataConduit(TableDataConduit):
        t.batch_count = 0
        return t
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
        """Return inferred schema, inferred primary key, and source column map"""
        if self.source_column_map is None:
            if self.src_schema_overrides is None:
@@ -252,7 +252,7 @@ class PandasTableDataConduit(TableDataConduit):
        else:
            raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
        self.normalize_pxt_schema_types()
        _df_check_primary_key_values(self.pd_df, self.src_pk)
@@ -328,7 +328,6 @@ class HFTableDataConduit(TableDataConduit):
    hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
    column_name_for_split: Optional[str] = None
    categorical_features: dict[str, dict[int, str]]
-    hf_schema: dict[str, Any] = None
    dataset_dict: dict[str, datasets.Dataset] = None
    hf_schema_source: dict[str, Any] = None
 
@@ -356,7 +355,7 @@ class HFTableDataConduit(TableDataConduit):
        except ImportError:
            return False
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
        from pixeltable.io.hf_datasets import _get_hf_schema, huggingface_schema_to_pxt_schema
 
        if self.source_column_map is None:
@@ -469,7 +468,7 @@ class ParquetTableDataConduit(TableDataConduit):
        t.pq_ds = parquet.ParquetDataset(str(input_path))
        return t
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
        from pixeltable.utils.arrow import ar_infer_schema
 
        if self.source_column_map is None:
@@ -483,7 +482,7 @@ class ParquetTableDataConduit(TableDataConduit):
        else:
            raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
        self.normalize_pxt_schema_types()
        self.prepare_insert()
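
The conduits are exercised indirectly through create_table(). A small end-to-end sketch (the table name scores is illustrative): a pandas DataFrame is routed through PandasTableDataConduit, whose infer_schema() now returns the uniformly typed dict[str, ts.ColumnType]:

import pandas as pd
import pixeltable as pxt

df = pd.DataFrame({'name': ['a', 'b'], 'score': [1, 2]})
# create_table() specializes an UnkTableDataConduit to a PandasTableDataConduit,
# which infers the schema from the DataFrame and imports the two rows
tbl = pxt.create_table('scores', source=df, if_exists='ignore')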
pixeltable/iterators/audio.py
CHANGED

@@ -1,5 +1,4 @@
 import logging
-import uuid
 from fractions import Fraction
 from pathlib import Path
 from typing import Any, ClassVar, Optional
@@ -55,12 +54,9 @@ class AudioSplitter(ComponentIterator):
    def __init__(
        self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
    ):
-
-
-
-            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
-        if overlap_sec >= chunk_duration_sec:
-            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
+        assert chunk_duration_sec > 0.0
+        assert chunk_duration_sec >= min_chunk_duration_sec
+        assert overlap_sec < chunk_duration_sec
        audio_path = Path(audio)
        assert audio_path.exists() and audio_path.is_file()
        self.audio_path = audio_path
@@ -128,6 +124,19 @@ class AudioSplitter(ComponentIterator):
 
    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
+        params = dict(zip(param_names, args))
+        params.update(kwargs)
+
+        chunk_duration_sec = params['chunk_duration_sec']
+        min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
+        overlap_sec = params.get('overlap_sec', 0.0)
+        if chunk_duration_sec <= 0.0:
+            raise excs.Error('chunk_duration_sec must be a positive number')
+        if chunk_duration_sec < min_chunk_duration_sec:
+            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
+        if overlap_sec >= chunk_duration_sec:
+            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
        return {
            'start_time_sec': ts.FloatType(),
            'end_time_sec': ts.FloatType(),
@@ -140,7 +149,7 @@ class AudioSplitter(ComponentIterator):
        target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
        chunk_start_pts = 0
        chunk_end_pts = 0
-        chunk_file = str(env.Env.get().
+        chunk_file = str(env.Env.get().create_tmp_path(self.audio_path.suffix))
        output_container = av.open(chunk_file, mode='w')
        input_stream = self.container.streams.audio[0]
        codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
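
Moving the parameter checks into output_schema() means bad arguments are rejected when a view is defined rather than when the first row is split. A sketch, assuming a table media with an audio column named audio, and following the usual ComponentIterator.create pattern:

import pixeltable as pxt
from pixeltable.iterators import AudioSplitter

media = pxt.get_table('media')
chunks = pxt.create_view(
    'media_chunks',
    media,
    iterator=AudioSplitter.create(
        audio=media.audio,
        chunk_duration_sec=30.0,  # must be positive and >= min_chunk_duration_sec
        overlap_sec=2.0,          # must be < chunk_duration_sec
    ),
)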
pixeltable/iterators/image.py
CHANGED

@@ -31,8 +31,7 @@ class TileIterator(ComponentIterator):
    __j: int
 
    def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
-        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
+        assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
 
        self.__image = image
        self.__image.load()
@@ -79,4 +78,8 @@ class TileIterator(ComponentIterator):
 
    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        tile_size = kwargs.get('tile_size')
+        overlap = kwargs.get('overlap', (0, 0))
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
        return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
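
Same pattern for TileIterator: the overlap-vs-tile-size check now runs in output_schema() at view-creation time, with __init__ keeping only an assert. A sketch, assuming a table photos with an image column named img:

import pixeltable as pxt
from pixeltable.iterators import TileIterator

photos = pxt.get_table('photos')
tiles = pxt.create_view(
    'photo_tiles',
    photos,
    # overlap must be strictly smaller than tile_size in both dimensions
    iterator=TileIterator.create(image=photos.img, tile_size=(512, 512), overlap=(64, 64)),
)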
pixeltable/metadata/schema.py
CHANGED

@@ -24,7 +24,7 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
    """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
    if dataclasses.is_dataclass(data_class_type):
        fieldtypes = get_type_hints(data_class_type)
-        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
+        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
 
    origin = typing.get_origin(data_class_type)
    if origin is not None:
@@ -182,6 +182,7 @@ class TableMd:
    # sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
    # - incremented for each add/drop of a mutable view
    # - only maintained for mutable tables
+    # TODO: replace with mutable_views: list[UUID] to help with debugging
    view_sn: int
 
    # Metadata format for external stores:
@@ -193,6 +194,26 @@ class TableMd:
    view_md: Optional[ViewMd]
    additional_md: dict[str, Any]
 
+    has_pending_ops: bool = False
+
+    @property
+    def is_snapshot(self) -> bool:
+        return self.view_md is not None and self.view_md.is_snapshot
+
+    @property
+    def is_mutable(self) -> bool:
+        return not self.is_snapshot and not self.is_replica
+
+    @property
+    def is_pure_snapshot(self) -> bool:
+        return (
+            self.view_md is not None
+            and self.view_md.is_snapshot
+            and self.view_md.sample_clause is None
+            and self.view_md.predicate is None
+            and len(self.column_md) == 0
+        )
+
 
 class Table(Base):
    """
@@ -215,7 +236,7 @@ class Table(Base):
    lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
 
 
-@dataclasses.dataclass
+@dataclasses.dataclass
 class TableVersionMd:
    tbl_id: str  # uuid.UUID
    created_at: float  # time.time()
@@ -279,6 +300,22 @@ class TableSchemaVersion(Base):
    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableSchemaVersionMd
 
 
+class PendingTableOp(Base):
+    """
+    Table operation that needs to be completed before the table can be used.
+
+    Operations need to be completed in order of increasing seq_num.
+    """
+
+    __tablename__ = 'pendingtableops'
+
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    op_sn: orm.Mapped[int] = orm.mapped_column(Integer, primary_key=True, nullable=False)  # catalog.TableOp.op_sn
+    op: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # catalog.TableOp
+
+
 @dataclasses.dataclass
 class FunctionMd:
    name: str
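
The three new TableMd predicates encode the table taxonomy in one place. A self-contained illustration of their logic using stand-in dataclasses (field subset only; names follow this diff, ViewMd details are elided):

import dataclasses
from typing import Any, Optional

@dataclasses.dataclass
class _ViewMd:  # stand-in for ViewMd
    is_snapshot: bool
    sample_clause: Optional[Any] = None
    predicate: Optional[Any] = None

@dataclasses.dataclass
class _TableMd:  # stand-in for TableMd
    view_md: Optional[_ViewMd]
    column_md: dict[str, Any]
    is_replica: bool = False

    @property
    def is_snapshot(self) -> bool:
        return self.view_md is not None and self.view_md.is_snapshot

    @property
    def is_mutable(self) -> bool:
        return not self.is_snapshot and not self.is_replica

    @property
    def is_pure_snapshot(self) -> bool:
        # a snapshot with no sampling, no predicate, and no additional columns
        return (
            self.is_snapshot
            and self.view_md.sample_clause is None
            and self.view_md.predicate is None
            and len(self.column_md) == 0
        )

assert _TableMd(view_md=_ViewMd(is_snapshot=True), column_md={}).is_pure_snapshot
assert _TableMd(view_md=None, column_md={}).is_mutable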
pixeltable/plan.py
CHANGED

@@ -385,14 +385,7 @@ class Planner:
            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
        )
 
-        media_input_col_info = [
-            exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
-            for col_ref in row_builder.input_exprs
-            if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
-        ]
-        if len(media_input_col_info) > 0:
-            # prefetch external files for all input column refs
-            plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
+        plan = cls._insert_prefetch_node(tbl.id, row_builder.input_exprs, input_node=plan)
 
        computed_exprs = row_builder.output_exprs - row_builder.input_exprs
        if len(computed_exprs) > 0:
@@ -789,15 +782,13 @@ class Planner:
 
    @classmethod
    def _insert_prefetch_node(
-        cls, tbl_id: UUID,
+        cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
    ) -> exec.ExecNode:
-        """
+        """Return a CachePrefetchNode if needed, otherwise return input"""
        # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
        # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
        # aren't explicitly captured as dependencies
-        media_col_refs = [
-            e for e in list(row_builder.unique_exprs) if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()
-        ]
+        media_col_refs = [e for e in expressions if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()]
        if len(media_col_refs) == 0:
            return input_node
        # we need to prefetch external files for media column types
@@ -967,7 +958,7 @@ class Planner:
            stratify_exprs=analyzer.stratify_exprs,
        )
 
-        plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
+        plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder.unique_exprs, plan)
 
        if analyzer.group_by_clause is not None:
            # we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
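
The refactor centralizes a common planner pattern: scan a set of expressions for media ColumnRefs and wrap the plan in a CachePrefetchNode only when something needs prefetching. A generic, runnable sketch of that "conditionally wrap the input node" shape (all names here are illustrative, not Pixeltable API):

from typing import Callable, Iterable, TypeVar

E = TypeVar('E')
N = TypeVar('N')

def insert_wrapper_if_needed(
    input_node: N,
    expressions: Iterable[E],
    wants_wrapper: Callable[[E], bool],
    make_wrapper: Callable[[list[E], N], N],
) -> N:
    # collect the matching expressions (media ColumnRefs in the real helper)
    matches = [e for e in expressions if wants_wrapper(e)]
    if len(matches) == 0:
        return input_node  # nothing to prefetch: leave the plan unchanged
    return make_wrapper(matches, input_node)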
pixeltable/share/packager.py
CHANGED

@@ -1,7 +1,6 @@
 import base64
 import datetime
 import io
-import itertools
 import json
 import logging
 import tarfile
@@ -237,8 +236,7 @@ class TablePackager:
        - Videos are replaced by their first frame and resized as above
        - Documents are replaced by a thumbnail as a base64-encoded webp
        """
-
-        preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
+        preview_cols = self.table._get_schema()
        select_list = [self.table[col_name] for col_name in preview_cols]
        # First 5 rows
        rows = list(self.table.select(*select_list).head(n=5))
@@ -369,7 +367,7 @@ class TableRestorer:
        with cat.begin_xact(for_write=True):
            # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
            # versions that have not been seen before.
-            cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+            cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
 
        # Now we need to load data for replica_tbl and its ancestors, except that we skip
        # replica_tbl itself if it's a pure snapshot.
@@ -572,16 +570,18 @@ class TableRestorer:
        for col_name in pydict:
            assert col_name in tv.store_tbl.sa_tbl.columns
            sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-
+        media_cols: dict[str, catalog.Column] = {}
        for col in tv.cols:
            if col.is_stored and col.col_type.is_media_type():
-
+                assert tv.id == col.tbl.id
+                assert tv.version == col.tbl.version
+                media_cols[col.store_name()] = col
 
        row_count = len(next(iter(pydict.values())))
        rows: list[dict[str, Any]] = []
        for i in range(row_count):
            row = {
-                col_name: self.__from_pa_value(
+                col_name: self.__from_pa_value(col_vals[i], sql_types[col_name], media_cols.get(col_name))
                for col_name, col_vals in pydict.items()
            }
            rows.append(row)
@@ -589,19 +589,19 @@ class TableRestorer:
        return rows
 
    def __from_pa_value(
-        self,
+        self, val: Any, sql_type: sql.types.TypeEngine[Any], media_col: Optional[catalog.Column]
    ) -> Any:
        if val is None:
            return None
        if isinstance(sql_type, sql.JSON):
            return json.loads(val)
-        if
-
-            return self.__relocate_media_file(tv, media_col_id, val)
+        if media_col is not None:
+            return self.__relocate_media_file(media_col, val)
        return val
 
-    def __relocate_media_file(self,
+    def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
        # If this is a pxtmedia:// URL, relocate it
+        assert isinstance(url, str)
        parsed_url = urllib.parse.urlparse(url)
        assert parsed_url.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs
        if parsed_url.scheme == 'pxtmedia':
@@ -610,7 +610,7 @@ class TableRestorer:
            # in self.media_files.
            src_path = self.tmp_dir / 'media' / parsed_url.netloc
            # Move the file to the media store and update the URL.
-            self.media_files[url] = MediaStore.relocate_local_media_file(src_path,
+            self.media_files[url] = MediaStore.relocate_local_media_file(src_path, media_col)
            return self.media_files[url]
        # For any type of URL other than a local file, just return the URL as-is.
        return url