pixeltable 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable has been flagged as potentially problematic.

@@ -76,7 +76,7 @@ class TableVersionPath:
         elif self._cached_tbl_version is not None:
             return
 
-        with Catalog.get().begin_xact(for_write=False):
+        with Catalog.get().begin_xact(tbl_id=self.tbl_version.id, for_write=False):
            self._cached_tbl_version = self.tbl_version.get()
 
     def clear_cached_md(self) -> None:
@@ -0,0 +1,44 @@
+# This file contains all dataclasses related to schema.PendingTableOp:
+# - TableOp: the container for each log entry
+# - <>Op: the actual operation, which is performed by TableVersion.exec_op(); each <>Op class contains
+#   enough information for exec_op() to perform the operation without having to reference data outside of
+#   TableVersion
+
+import dataclasses
+from typing import Any, Optional
+
+
+@dataclasses.dataclass
+class CreateStoreTableOp:
+    pass
+
+
+@dataclasses.dataclass
+class LoadViewOp:
+    view_path: dict[str, Any]  # needed to create the view load plan
+
+
+@dataclasses.dataclass
+class DeleteTableMdOp:
+    pass
+
+
+@dataclasses.dataclass
+class DeleteTableMediaFilesOp:
+    pass
+
+
+@dataclasses.dataclass
+class DropStoreTableOp:
+    pass
+
+
+@dataclasses.dataclass
+class TableOp:
+    tbl_id: str  # uuid.UUID
+    op_sn: int  # sequence number within the update operation; [0, num_ops)
+    num_ops: int  # total number of ops forming the update operation
+    needs_xact: bool  # if True, op must be run as part of a transaction
+
+    create_store_table_op: Optional[CreateStoreTableOp] = None
+    load_view_op: Optional[LoadViewOp] = None
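
This new module defines a small op log: each multi-step catalog update is recorded as a fixed-length sequence of TableOp entries, replayed in op_sn order. As a hedged illustration (the helper below is not part of the diff; the module path is inferred from the imports further down), the two-op sequence that view creation builds later in this release could be assembled like this:

    from pixeltable.catalog.tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp

    def pending_view_ops(tbl_id: str, view_path_dict: dict) -> list[TableOp]:
        # step 0 runs outside a transaction (store-table DDL), step 1 inside one (view load)
        return [
            TableOp(tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False,
                    create_store_table_op=CreateStoreTableOp()),
            TableOp(tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True,
                    load_view_op=LoadViewOp(view_path_dict)),
        ]
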
@@ -9,7 +9,6 @@ import pixeltable.exceptions as excs
 import pixeltable.metadata.schema as md_schema
 import pixeltable.type_system as ts
 from pixeltable import catalog, exprs, func
-from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator
 
 if TYPE_CHECKING:
@@ -19,9 +18,10 @@ if TYPE_CHECKING:
 from .column import Column
 from .globals import _POS_COLUMN_NAME, MediaValidation
 from .table import Table
-from .table_version import TableVersion
+from .table_version import TableVersion, TableVersionMd
 from .table_version_handle import TableVersionHandle
 from .table_version_path import TableVersionPath
+from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
 from .update_status import UpdateStatus
 
 if TYPE_CHECKING:
@@ -45,9 +45,18 @@ class View(Table):
         if not snapshot_only:
             self._tbl_version = tbl_version_path.tbl_version
 
-    @classmethod
-    def _display_name(cls) -> str:
-        return 'view'
+    def _display_name(self) -> str:
+        name: str
+        if self._tbl_version_path.is_snapshot():
+            name = 'snapshot'
+        elif self._tbl_version_path.is_view():
+            name = 'view'
+        else:
+            assert self._tbl_version_path.is_replica()
+            name = 'table'
+        if self._tbl_version_path.is_replica():
+            name = f'{name}-replica'
+        return name
 
     @classmethod
     def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
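
The display name is now derived per-instance from the table-version path. A standalone restatement of the branch logic above (illustrative only, not code from the package):

    def display_name(is_snapshot: bool, is_view: bool, is_replica: bool) -> str:
        if is_snapshot:
            name = 'snapshot'
        elif is_view:
            name = 'view'
        else:
            assert is_replica  # a replica of a base table
            name = 'table'
        return f'{name}-replica' if is_replica else name

    assert display_name(True, False, False) == 'snapshot'
    assert display_name(False, True, True) == 'view-replica'
    assert display_name(False, False, True) == 'table-replica'
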
@@ -80,7 +89,7 @@ class View(Table):
         media_validation: MediaValidation,
         iterator_cls: Optional[type[ComponentIterator]],
         iterator_args: Optional[dict],
-    ) -> View:
+    ) -> tuple[TableVersionMd, Optional[list[TableOp]]]:
         from pixeltable.plan import SampleClause
 
         # Convert select_list to more additional_columns if present
@@ -167,11 +176,10 @@ class View(Table):
         for col in columns:
             if col.name in iterator_col_names:
                 raise excs.Error(
-                    f'Duplicate name: column {col.name} is already present in the iterator output schema'
+                    f'Duplicate name: column {col.name!r} is already present in the iterator output schema'
                 )
         columns = iterator_cols + columns
 
-        session = Env.get().session
         from pixeltable.exprs import InlineDict
 
         iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
@@ -200,54 +208,26 @@ class View(Table):
             iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
         )
 
-        id, tbl_version = TableVersion.create(
-            dir_id,
-            name,
-            columns,
-            num_retained_versions,
-            comment,
-            media_validation=media_validation,
-            # base_path=base_version_path,
-            view_md=view_md,
+        md = TableVersion.create_initial_md(
+            name, columns, num_retained_versions, comment, media_validation=media_validation, view_md=view_md
         )
-        if tbl_version is None:
-            # this is purely a snapshot: we use the base's tbl version path
-            view = cls(id, dir_id, name, base_version_path, snapshot_only=True)
-            _logger.info(f'created snapshot {name}')
+        if md.tbl_md.is_pure_snapshot:
+            # this is purely a snapshot: no store table to create or load
+            return md, None
         else:
-            view = cls(
-                id,
-                dir_id,
-                name,
-                TableVersionPath(
-                    TableVersionHandle(tbl_version.id, tbl_version.effective_version), base=base_version_path
-                ),
-                snapshot_only=False,
-            )
-            _logger.info(f'Created view `{name}`, id={tbl_version.id}')
-
-            from pixeltable.plan import Planner
-
-            try:
-                plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
-                _, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
-                status = UpdateStatus(row_count_stats=row_counts)
-                tbl_version._write_md_update_status(0, update_status=status)
-
-            except:
-                # we need to remove the orphaned TableVersion instance
-                del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
-                base_tbl_version = base.tbl_version.get()
-                if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
-                    # also remove tbl_version from the base
-                    base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
-                raise
-            Env.get().console_logger.info(
-                f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
+            tbl_id = md.tbl_md.tbl_id
+            view_path = TableVersionPath(
+                TableVersionHandle(UUID(tbl_id), effective_version=0 if is_snapshot else None), base=base_version_path
             )
-
-            session.commit()
-            return view
+            ops = [
+                TableOp(
+                    tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False, create_store_table_op=CreateStoreTableOp()
+                ),
+                TableOp(
+                    tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True, load_view_op=LoadViewOp(view_path.as_dict())
+                ),
+            ]
+            return md, ops
 
     @classmethod
     def _verify_column(cls, col: Column) -> None:
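
With this change, `_create` no longer materializes the view itself; it returns the initial metadata plus an optional op log for the caller to execute. A hedged sketch of a driver for that pair (`exec_op` and `begin_xact` stand in for catalog machinery that is not part of this diff):

    from typing import Callable, Optional

    def apply_table_ops(ops: Optional[list['TableOp']], exec_op: Callable, begin_xact: Callable) -> None:
        if ops is None:
            return  # pure snapshot: nothing to create or load
        for op in ops:  # entries arrive ordered by op_sn
            if op.needs_xact:
                with begin_xact(tbl_id=op.tbl_id, for_write=True):
                    exec_op(op)
            else:
                exec_op(op)
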
@@ -280,8 +260,11 @@ class View(Table):
         md['is_view'] = True
         md['is_snapshot'] = self._tbl_version_path.is_snapshot()
         base_tbl = self._get_base_table()
-        base_version = self._effective_base_versions[0]
-        md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
+        if base_tbl is None:
+            md['base'] = None
+        else:
+            base_version = self._effective_base_versions[0]
+            md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
         return md
 
     def insert(
@@ -295,16 +278,21 @@ class View(Table):
         print_stats: bool = False,
         **kwargs: Any,
     ) -> UpdateStatus:
-        raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
+        raise excs.Error(f'{self._display_str()}: Cannot insert into a {self._display_name()}.')
 
     def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
-        raise excs.Error(f'{self._display_name()} {self._name!r}: cannot delete from view')
+        raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
 
     def _get_base_table(self) -> Optional['Table']:
+        if self._tbl_version_path.base is None and not self._snapshot_only:
+            return None  # this can happen for a replica of a base table
         # if this is a pure snapshot, our tbl_version_path only reflects the base (there is no TableVersion instance
         # for the snapshot itself)
+        from pixeltable.catalog import Catalog
+
         base_id = self._tbl_version_path.tbl_id if self._snapshot_only else self._tbl_version_path.base.tbl_id
-        return catalog.Catalog.get().get_table_by_id(base_id)
+        with Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
+            return catalog.Catalog.get().get_table_by_id(base_id)
 
     @property
     def _effective_base_versions(self) -> list[Optional[int]]:
@@ -315,8 +303,7 @@ class View(Table):
         return effective_versions[1:]
 
     def _table_descriptor(self) -> str:
-        display_name = 'Snapshot' if self._snapshot_only else 'View'
-        result = [f'{display_name} {self._path()!r}']
+        result = [self._display_str()]
         bases_descrs: list[str] = []
         for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
             if effective_version is None:
pixeltable/dataframe.py CHANGED
@@ -1185,7 +1185,7 @@ class DataFrame:
         """
         self._validate_mutable('delete', False)
         if not self._first_tbl.is_insertable():
-            raise excs.Error('Cannot delete from view')
+            raise excs.Error('Cannot use `delete` on a view.')
         with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
             return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
 
@@ -1196,14 +1196,27 @@
             op_name: The name of the operation for which the test is being performed.
             allow_select: If True, allow a select() specification in the Dataframe.
         """
+        self._validate_mutable_op_sequence(op_name, allow_select)
+
+        # TODO: Reconcile these with Table.__check_mutable()
+        assert len(self._from_clause.tbls) == 1
+        if self._first_tbl.is_snapshot():
+            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
+        if self._first_tbl.is_replica():
+            raise excs.Error(f'Cannot use `{op_name}` on a replica.')
+
+    def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
+        """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
         if self.group_by_clause is not None or self.grouping_tbl is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `group_by`')
+            raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
         if self.order_by_clause is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `order_by`')
+            raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
         if self.select_list is not None and not allow_select:
-            raise excs.Error(f'Cannot use `{op_name}` after `select`')
+            raise excs.Error(f'Cannot use `{op_name}` after `select`.')
         if self.limit_val is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `limit`')
+            raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
+        if self._has_joins():
+            raise excs.Error(f'Cannot use `{op_name}` after `join`.')
 
     def as_dict(self) -> dict[str, Any]:
         """
pixeltable/env.py CHANGED
@@ -20,7 +20,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from sys import stdout
-from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Optional, TypeVar
 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 
 import nest_asyncio  # type: ignore[import-untyped]
@@ -86,6 +86,7 @@ class Env:
     _resource_pool_info: dict[str, Any]
     _current_conn: Optional[sql.Connection]
     _current_session: Optional[sql.orm.Session]
+    _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
     _dbms: Optional[Dbms]
     _event_loop: Optional[asyncio.AbstractEventLoop]  # event loop for ExecNode
 
@@ -99,6 +100,7 @@
     def _init_env(cls, reinit_db: bool = False) -> None:
         assert not cls.__initializing, 'Circular env initialization detected.'
         cls.__initializing = True
+        cls._instance = None
         env = Env()
         env._set_up(reinit_db=reinit_db)
         env._upgrade_metadata()
@@ -142,6 +144,7 @@ class Env:
         self._resource_pool_info = {}
         self._current_conn = None
         self._current_session = None
+        self._current_isolation_level = None
         self._dbms = None
         self._event_loop = None
 
@@ -230,20 +233,34 @@
         return self._db_server is not None
 
     @contextmanager
-    def begin_xact(self) -> Iterator[sql.Connection]:
-        """Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
+    def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
+        """
+        Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
+
+        for_write: if True, uses serializable isolation; if False, uses repeatable_read
+
+        TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
+        that avoids tripping over any pending ops
+        """
         if self._current_conn is None:
             assert self._current_session is None
             try:
-                with self.engine.begin() as conn, sql.orm.Session(conn) as session:
+                self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
+                with (
+                    self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
+                    sql.orm.Session(conn) as session,
+                    conn.begin(),
+                ):
                     self._current_conn = conn
                     self._current_session = session
                     yield conn
             finally:
                 self._current_session = None
                 self._current_conn = None
+                self._current_isolation_level = None
         else:
             assert self._current_session is not None
+            assert for_write == (self._current_isolation_level == 'serializable')
             yield self._current_conn
 
     def configure_logging(
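
The isolation level is now chosen per top-level transaction and recorded, so nested calls can reuse the open connection. A minimal sketch of the nesting contract, assuming a configured Env (the docstring recommends going through Catalog.begin_xact() in real code):

    from pixeltable.env import Env

    env = Env.get()
    with env.begin_xact(for_write=False) as conn:       # opens a REPEATABLE_READ transaction
        with env.begin_xact(for_write=False) as conn2:  # nested call: reuses the same connection
            assert conn2 is conn
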
@@ -90,7 +90,9 @@ class DataRowBatch:
         idx_range = slice(0, len(self.rows))
         for row in self.rows[idx_range]:
             for info in stored_img_info:
-                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.get().version))
+                col = info.col
+                assert col.tbl.id == self.tbl.id
+                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                 row.flush_img(info.slot_idx, filepath)
             for slot_idx in flushed_slot_idxs:
                 row.flush_img(slot_idx)
@@ -63,13 +63,12 @@ class InMemoryDataNode(ExecNode):
         for col_name, val in input_row.items():
             col_info = user_cols_by_name.get(col_name)
             assert col_info is not None
-
-            if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
-                # this is a literal image, ie, a sequence of bytes; we save this as a media file and store the path
-                path = str(MediaStore.prepare_media_path(self.tbl.id, col_info.col.id, self.tbl.get().version))
-                with open(path, 'wb') as fp:
-                    fp.write(val)
-                self.output_rows[row_idx][col_info.slot_idx] = path
+            col = col_info.col
+            if col.col_type.is_image_type() and isinstance(val, bytes):
+                # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
+                assert col.tbl.id == self.tbl.id
+                path = MediaStore.save_media_file(val, col.tbl.id, col.id, col.tbl.version)
+                self.output_rows[row_idx][col_info.slot_idx] = str(path)
             else:
                 self.output_rows[row_idx][col_info.slot_idx] = val
 
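
The open()/write() sequence is replaced by a single MediaStore helper. Its real implementation is not part of this diff; a sketch of what such a helper might look like, with the destination-directory logic simplified:

    import uuid
    from pathlib import Path

    def save_media_file(data: bytes, media_dir: Path, ext: str = '.bin') -> Path:
        # name the file uniquely and write the bytes; the real helper derives the
        # directory from (tbl_id, col_id, version) instead of taking it directly
        path = media_dir / f'{uuid.uuid4().hex}{ext}'
        path.write_bytes(data)
        return path
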
@@ -325,7 +325,8 @@ class ColumnRef(Expr):
     @classmethod
     def get_column(cls, d: dict) -> catalog.Column:
         tbl_id, version, col_id = UUID(d['tbl_id']), d['tbl_version'], d['col_id']
-        tbl_version = catalog.Catalog.get().get_tbl_version(tbl_id, version)
+        # validate_initialized=False: this gets called as part of TableVersion.init()
+        tbl_version = catalog.Catalog.get().get_tbl_version(tbl_id, version, validate_initialized=False)
         # don't use tbl_version.cols_by_id here, this might be a snapshot reference to a column that was then dropped
         col = next(col for col in tbl_version.cols if col.id == col_id)
         return col
@@ -7,7 +7,6 @@ the [Working with Gemini](https://pixeltable.readme.io/docs/working-with-gemini)
 
 import asyncio
 import io
-import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
@@ -215,9 +214,10 @@ async def generate_videos(
     video_bytes = await _genai_client().aio.files.download(file=video.video)  # type: ignore[arg-type]
     assert video_bytes is not None
 
-    _, output_filename = tempfile.mkstemp(suffix='.mp4', dir=str(env.Env.get().tmp_dir))
-    Path(output_filename).write_bytes(video_bytes)
-    return output_filename
+    # Create a temporary file to store the video bytes
+    output_path = env.Env.get().create_tmp_path('.mp4')
+    Path(output_path).write_bytes(video_bytes)
+    return str(output_path)
 
 
 @generate_videos.resource_pool
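
This is part of a recurring refactor in this release: scattered tempfile.mkstemp()/uuid temp-file naming is funneled through one Env helper. A sketch of what create_tmp_path plausibly does (an assumption; only its call sites appear in this diff):

    import uuid
    from pathlib import Path

    def create_tmp_path(tmp_dir: Path, extension: str = '') -> Path:
        # a unique path under the env's tmp dir; files there can be cleaned up centrally
        return tmp_dir / f'{uuid.uuid4().hex}{extension}'
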
@@ -13,7 +13,6 @@ import logging
 import math
 import pathlib
 import re
-import uuid
 from typing import TYPE_CHECKING, Any, Callable, Optional, Type
 
 import httpx
@@ -207,7 +206,7 @@ async def speech(input: str, *, model: str, voice: str, model_kwargs: Optional[d
 
     content = await _openai_client().audio.speech.create(input=input, model=model, voice=voice, **model_kwargs)
     ext = model_kwargs.get('response_format', 'mp3')
-    output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')
+    output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
     content.write_to_file(output_filename)
     return output_filename
 
@@ -2,9 +2,6 @@
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
 """
 
-import tempfile
-import uuid
-from pathlib import Path
 from typing import Any, Optional
 
 import av
@@ -59,8 +56,7 @@ class make_video(pxt.Aggregator):
         if frame is None:
             return
         if self.container is None:
-            (_, output_filename) = tempfile.mkstemp(suffix='.mp4', dir=str(env.Env.get().tmp_dir))
-            self.out_file = Path(output_filename)
+            self.out_file = env.Env.get().create_tmp_path('.mp4')
             self.container = av.open(str(self.out_file), mode='w')
             self.stream = self.container.add_stream('h264', rate=self.fps)
             self.stream.pix_fmt = 'yuv420p'
@@ -109,7 +105,7 @@ def extract_audio(
         return None
     audio_stream = container.streams.audio[stream_idx]
     # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
-    output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')
+    output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
 
     with av.open(output_filename, 'w', format=format) as output_container:
         output_stream = output_container.add_stream(codec or default_codec)
pixeltable/globals.py CHANGED
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Un
 import pandas as pd
 from pandas.io.formats.style import Styler
 
-from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
+from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
 from pixeltable.catalog import Catalog, TableVersionPath
 from pixeltable.catalog.insertable_table import OnErrorParameter
 from pixeltable.config import Config
@@ -44,7 +44,7 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
 
 
 def create_table(
-    path_str: str,
+    path: str,
     schema: Optional[dict[str, Any]] = None,
     *,
     source: Optional[TableDataSource] = None,
@@ -58,14 +58,24 @@ def create_table(
     if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
     extra_args: Optional[dict[str, Any]] = None,  # Additional arguments to data source provider
 ) -> catalog.Table:
-    """Create a new base table.
+    """Create a new base table. Exactly one of `schema` or `source` must be provided.
+
+    If a `schema` is provided, then an empty table will be created with the specified schema.
+
+    If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
+    contents of the specified data, and the data will be imported from the specified source into the new table. The
+    source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
 
     Args:
-        path_str: Path to the table.
-        schema: A dictionary that maps column names to column types
-        source: A data source from which a table schema can be inferred and data imported
-        source_format: A hint to the format of the source data
-        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+        path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
+        schema: Schema for the new table, mapping column names to Pixeltable types.
+        source: A data source (file, URL, DataFrame, or list of rows) to import from.
+        source_format: Must be used in conjunction with a `source`.
+            If specified, then the given format will be used to read the source data. (Otherwise,
+            Pixeltable will attempt to infer the format from the source data.)
+        schema_overrides: Must be used in conjunction with a `source`.
+            If specified, then columns in `schema_overrides` will be given the specified types.
+            (Pixeltable will attempt to infer the types of any columns not specified.)
         on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
             invalid media file (such as a corrupt image) for one of the inserted rows.
 
@@ -81,14 +91,15 @@ def create_table(
 
             - `'on_read'`: validate media files at query time
             - `'on_write'`: validate media files during insert/update operations
-        if_exists: Directive regarding how to handle if the path already exists.
-            Must be one of the following:
+        if_exists: Determines the behavior if a table already exists at the specified path location.
 
             - `'error'`: raise an error
             - `'ignore'`: do nothing and return the existing table handle
-            - `'replace'`: if the existing table has no views, drop and replace it with a new one
-            - `'replace_force'`: drop the existing table and all its views, and create a new one
-        extra_args: Additional arguments to pass to the source data provider
+            - `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
+              raise an error if the existing table has views or snapshots
+            - `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
+        extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
+            passed along to the source data provider.
 
     Returns:
         A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
@@ -114,7 +125,7 @@ def create_table(
     >>> tbl1 = pxt.get_table('orig_table')
     ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
 
-    Create a table if does not already exist, otherwise get the existing table:
+    Create a table if it does not already exist, otherwise get the existing table:
 
     >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
 
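
A usage example matching the revised docstring (the file name is made up):

    import pixeltable as pxt

    t = pxt.create_table(
        'films',
        source='films.csv',                      # format inferred from the extension
        schema_overrides={'rating': pxt.Float},  # pin this column's type; others are inferred
        if_exists='ignore',
    )
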
@@ -130,12 +141,12 @@ def create_table(
     from pixeltable.io.utils import normalize_primary_key_parameter
 
     if (schema is None) == (source is None):
-        raise excs.Error('Must provide either a `schema` or a `source`')
+        raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
 
     if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
         raise excs.Error('`schema` must be a non-empty dictionary')
 
-    path_obj = catalog.Path(path_str)
+    path_obj = catalog.Path(path)
     if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
     media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
     primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
@@ -146,7 +157,14 @@ def create_table(
         tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
         tds.check_source_format()
         data_source = tds.specialize()
-        data_source.src_schema_overrides = schema_overrides
+        src_schema_overrides: dict[str, ts.ColumnType] = {}
+        if schema_overrides is not None:
+            for col_name, py_type in schema_overrides.items():
+                col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
+                if col_type is None:
+                    raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
+                src_schema_overrides[col_name] = col_type
+        data_source.src_schema_overrides = src_schema_overrides
         data_source.src_pk = primary_key
         data_source.infer_schema()
         schema = data_source.pxt_schema
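
With this change, `schema_overrides` values are normalized to ts.ColumnType before schema inference runs, which is what the new asserts in pixeltable/io further down rely on. An illustrative check (assuming normalize_type returns None for types it rejects, as the error branch above implies):

    import pixeltable as pxt
    import pixeltable.type_system as ts

    col_type = ts.ColumnType.normalize_type(pxt.Float, nullable_default=True, allow_builtin_types=False)
    assert isinstance(col_type, ts.ColumnType)
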
@@ -255,9 +273,7 @@ def create_view(
         tbl_version_path = base._tbl_version_path
         sample_clause = None
     elif isinstance(base, DataFrame):
-        base._validate_mutable('create_view', allow_select=True)
-        if len(base._from_clause.tbls) > 1:
-            raise excs.Error('Cannot create a view of a join')
+        base._validate_mutable_op_sequence('create_view', allow_select=True)
         tbl_version_path = base._from_clause.tbls[0]
         where = base.where_clause
         sample_clause = base.sample_clause
@@ -537,9 +553,12 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
 
     >>> pxt.list_tables('dir1')
     """
-    path_obj = catalog.Path(dir_path, empty_is_valid=True)  # validate format
-    cat = Catalog.get()
-    contents = cat.get_dir_contents(path_obj, recursive=recursive)
+    return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
+
+
+def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
+    path_obj = catalog.Path(dir_path, empty_is_valid=True, allow_system_paths=allow_system_paths)
+    contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
     return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
 
 
@@ -647,13 +666,16 @@ def ls(path: str = '') -> pd.DataFrame:
     To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
     [list_dirs()][pixeltable.list_dirs] instead.
     """
+    from pixeltable.catalog import retry_loop
     from pixeltable.metadata import schema
 
     cat = Catalog.get()
     path_obj = catalog.Path(path, empty_is_valid=True)
     dir_entries = cat.get_dir_contents(path_obj)
-    rows: list[list[str]] = []
-    with Catalog.get().begin_xact():
+
+    @retry_loop(for_write=False)
+    def op() -> list[list[str]]:
+        rows: list[list[str]] = []
         for name, entry in dir_entries.items():
             if name.startswith('_'):
                 continue
@@ -679,6 +701,9 @@ def ls(path: str = '') -> pd.DataFrame:
             if md['is_replica']:
                 kind = f'{kind}-replica'
             rows.append([name, kind, version, base])
+        return rows
+
+    rows = op()
 
     rows = sorted(rows, key=lambda x: x[0])
     df = pd.DataFrame(
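
ls() now wraps its catalog reads in a retry loop instead of a bare transaction. A hedged sketch of the pattern (assuming retry_loop re-invokes the decorated callable on serialization conflicts, so the body must be idempotent; path_obj as in ls() above):

    from pixeltable.catalog import Catalog, retry_loop

    @retry_loop(for_write=False)
    def read_op() -> int:
        # executed inside a read transaction; may run more than once
        return len(Catalog.get().get_dir_contents(path_obj))

    result = read_op()
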
pixeltable/io/datarows.py CHANGED
@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
 
 
 def _infer_schema_from_rows(
-    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
+    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     schema: dict[str, ts.ColumnType] = {}
     cols_with_nones: set[str] = set()
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
         # in which the column names are encountered in the input data, even if `schema_overrides`
         # is specified.
         if col_name not in schema:
+            assert isinstance(schema_overrides[col_name], ts.ColumnType)
             schema[col_name] = schema_overrides[col_name]
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
pixeltable/io/pandas.py CHANGED
@@ -132,6 +132,7 @@ def df_infer_schema(
     pd_schema: dict[str, ts.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
+            assert isinstance(schema_overrides[pd_name], ts.ColumnType)
            pxt_type = schema_overrides[pd_name]
        else:
            pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)