PyPI - pixeltable - Versions diffs - 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl - Mend

pixeltable-0.3.6-py3-none-any.whl → pixeltable-0.3.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (127)

pixeltable/__init__.py +5 -3
pixeltable/__version__.py +2 -2
pixeltable/catalog/__init__.py +1 -0
pixeltable/catalog/catalog.py +335 -128
pixeltable/catalog/column.py +22 -5
pixeltable/catalog/dir.py +19 -6
pixeltable/catalog/insertable_table.py +34 -37
pixeltable/catalog/named_function.py +0 -4
pixeltable/catalog/schema_object.py +28 -42
pixeltable/catalog/table.py +193 -158
pixeltable/catalog/table_version.py +191 -232
pixeltable/catalog/table_version_handle.py +50 -0
pixeltable/catalog/table_version_path.py +49 -33
pixeltable/catalog/view.py +56 -96
pixeltable/config.py +103 -0
pixeltable/dataframe.py +89 -89
pixeltable/env.py +98 -168
pixeltable/exec/aggregation_node.py +5 -4
pixeltable/exec/cache_prefetch_node.py +1 -1
pixeltable/exec/component_iteration_node.py +13 -9
pixeltable/exec/data_row_batch.py +3 -3
pixeltable/exec/exec_context.py +0 -4
pixeltable/exec/exec_node.py +3 -2
pixeltable/exec/expr_eval/schedulers.py +2 -1
pixeltable/exec/in_memory_data_node.py +9 -4
pixeltable/exec/row_update_node.py +1 -2
pixeltable/exec/sql_node.py +20 -16
pixeltable/exprs/__init__.py +2 -0
pixeltable/exprs/arithmetic_expr.py +7 -11
pixeltable/exprs/array_slice.py +1 -1
pixeltable/exprs/column_property_ref.py +3 -3
pixeltable/exprs/column_ref.py +12 -13
pixeltable/exprs/comparison.py +3 -6
pixeltable/exprs/compound_predicate.py +4 -4
pixeltable/exprs/expr.py +31 -22
pixeltable/exprs/expr_dict.py +3 -3
pixeltable/exprs/expr_set.py +1 -1
pixeltable/exprs/function_call.py +110 -80
pixeltable/exprs/globals.py +3 -3
pixeltable/exprs/in_predicate.py +1 -1
pixeltable/exprs/inline_expr.py +3 -3
pixeltable/exprs/is_null.py +1 -1
pixeltable/exprs/json_mapper.py +2 -2
pixeltable/exprs/json_path.py +17 -10
pixeltable/exprs/literal.py +1 -1
pixeltable/exprs/method_ref.py +2 -2
pixeltable/exprs/row_builder.py +8 -17
pixeltable/exprs/rowid_ref.py +21 -10
pixeltable/exprs/similarity_expr.py +5 -5
pixeltable/exprs/sql_element_cache.py +1 -1
pixeltable/exprs/type_cast.py +2 -3
pixeltable/exprs/variable.py +2 -2
pixeltable/ext/__init__.py +2 -0
pixeltable/ext/functions/__init__.py +2 -0
pixeltable/ext/functions/yolox.py +3 -3
pixeltable/func/__init__.py +3 -1
pixeltable/func/aggregate_function.py +9 -9
pixeltable/func/callable_function.py +3 -4
pixeltable/func/expr_template_function.py +6 -16
pixeltable/func/function.py +48 -14
pixeltable/func/function_registry.py +1 -3
pixeltable/func/query_template_function.py +5 -12
pixeltable/func/signature.py +23 -22
pixeltable/func/tools.py +3 -3
pixeltable/func/udf.py +6 -4
pixeltable/functions/__init__.py +2 -0
pixeltable/functions/fireworks.py +7 -4
pixeltable/functions/globals.py +4 -5
pixeltable/functions/huggingface.py +1 -5
pixeltable/functions/image.py +17 -7
pixeltable/functions/llama_cpp.py +1 -1
pixeltable/functions/mistralai.py +1 -1
pixeltable/functions/ollama.py +4 -4
pixeltable/functions/openai.py +19 -19
pixeltable/functions/string.py +23 -30
pixeltable/functions/timestamp.py +11 -6
pixeltable/functions/together.py +14 -12
pixeltable/functions/util.py +1 -1
pixeltable/functions/video.py +5 -4
pixeltable/functions/vision.py +6 -9
pixeltable/functions/whisper.py +3 -3
pixeltable/globals.py +246 -260
pixeltable/index/__init__.py +2 -0
pixeltable/index/base.py +1 -1
pixeltable/index/btree.py +3 -1
pixeltable/index/embedding_index.py +11 -5
pixeltable/io/external_store.py +11 -12
pixeltable/io/label_studio.py +4 -3
pixeltable/io/parquet.py +57 -56
pixeltable/iterators/__init__.py +4 -2
pixeltable/iterators/audio.py +11 -11
pixeltable/iterators/document.py +10 -10
pixeltable/iterators/string.py +1 -2
pixeltable/iterators/video.py +14 -15
pixeltable/metadata/__init__.py +9 -5
pixeltable/metadata/converters/convert_10.py +0 -1
pixeltable/metadata/converters/convert_15.py +0 -2
pixeltable/metadata/converters/convert_23.py +0 -2
pixeltable/metadata/converters/convert_24.py +3 -3
pixeltable/metadata/converters/convert_25.py +1 -1
pixeltable/metadata/converters/convert_27.py +0 -2
pixeltable/metadata/converters/convert_28.py +0 -2
pixeltable/metadata/converters/convert_29.py +7 -8
pixeltable/metadata/converters/util.py +7 -7
pixeltable/metadata/schema.py +27 -19
pixeltable/plan.py +68 -40
pixeltable/share/__init__.py +2 -0
pixeltable/share/packager.py +15 -12
pixeltable/share/publish.py +3 -5
pixeltable/store.py +37 -38
pixeltable/type_system.py +41 -28
pixeltable/utils/coco.py +4 -4
pixeltable/utils/console_output.py +1 -3
pixeltable/utils/description_helper.py +1 -1
pixeltable/utils/documents.py +3 -3
pixeltable/utils/filecache.py +20 -9
pixeltable/utils/formatter.py +2 -3
pixeltable/utils/media_store.py +1 -1
pixeltable/utils/pytorch.py +1 -1
pixeltable/utils/sql.py +4 -4
pixeltable/utils/transactional_directory.py +2 -1
{pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/METADATA +1 -1
pixeltable-0.3.8.dist-info/RECORD +174 -0
pixeltable-0.3.6.dist-info/RECORD +0 -172
{pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/LICENSE +0 -0
{pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/WHEEL +0 -0
{pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/entry_points.txt +0 -0

pixeltable/index/base.py CHANGED Viewed

@@ -37,7 +37,7 @@ class IndexBase(abc.ABC):
         pass
     @abc.abstractmethod
-    def create_index(self, index_name: str, index_value_col: catalog.Column, conn: sql.engine.Connection) -> None:
+    def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Create the index on the index value column"""
         pass

pixeltable/index/btree.py CHANGED Viewed

@@ -6,6 +6,7 @@ import sqlalchemy as sql
 # import pixeltable.catalog as catalog
 import pixeltable.exceptions as excs
 from pixeltable import catalog, exprs
+from pixeltable.env import Env
 from pixeltable.func.udf import udf
 from .base import IndexBase
@@ -52,9 +53,10 @@ class BtreeIndex(IndexBase):
         """Return the sqlalchemy type of the index value column"""
         return self.value_expr.col_type.to_sa_type()
-    def create_index(self, index_name: str, index_value_col: 'catalog.Column', conn: sql.engine.Connection) -> None:
+    def create_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
         """Create the index on the index value column"""
         idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
+        conn = Env.get().conn
         idx.create(bind=conn)
     @classmethod

pixeltable/index/embedding_index.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 import enum
-from typing import Any, Optional
+from typing import Any, ClassVar, Optional
 import numpy as np
 import pgvector.sqlalchemy  # type: ignore[import-untyped]
@@ -11,6 +11,7 @@ import sqlalchemy as sql
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 from pixeltable import catalog, exprs, func
+from pixeltable.env import Env
 from .base import IndexBase
@@ -31,7 +32,11 @@ class EmbeddingIndex(IndexBase):
         IP = 2
         L2 = 3
-    PGVECTOR_OPS = {Metric.COSINE: 'vector_cosine_ops', Metric.IP: 'vector_ip_ops', Metric.L2: 'vector_l2_ops'}
+    PGVECTOR_OPS: ClassVar[dict[Metric, str]] = {
+        Metric.COSINE: 'vector_cosine_ops',
+        Metric.IP: 'vector_ip_ops',
+        Metric.L2: 'vector_l2_ops',
+    }
     metric: Metric
     value_expr: exprs.FunctionCall
@@ -55,7 +60,7 @@ class EmbeddingIndex(IndexBase):
         if metric.lower() not in metric_names:
             raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
         if not c.col_type.is_string_type() and not c.col_type.is_image_type():
-            raise excs.Error(f'Embedding index requires string or image column')
+            raise excs.Error('Embedding index requires string or image column')
         self.string_embed = None
         self.image_embed = None
@@ -131,7 +136,7 @@ class EmbeddingIndex(IndexBase):
         """Return the sqlalchemy type of the index value column"""
         return self.index_col_type
-    def create_index(self, index_name: str, index_value_col: catalog.Column, conn: sql.engine.Connection) -> None:
+    def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Create the index on the index value column"""
         idx = sql.Index(
             index_name,
@@ -140,6 +145,7 @@ class EmbeddingIndex(IndexBase):
             postgresql_with={'m': 16, 'ef_construction': 64},
             postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
         )
+        conn = Env.get().conn
         idx.create(bind=conn)
     def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
@@ -219,7 +225,7 @@ class EmbeddingIndex(IndexBase):
             )
         shape = return_type.shape
-        if len(shape) != 1 or shape[0] == None:
+        if len(shape) != 1 or shape[0] is None:
             raise excs.Error(
                 f'The function `{embed_fn.name}` is not a valid embedding: '
                 f'it must return a 1-dimensional array of a specific length, but returns {return_type}'

pixeltable/io/external_store.py CHANGED Viewed

@@ -8,12 +8,10 @@ from dataclasses import dataclass
 from typing import Any, Optional
 from uuid import UUID
-import sqlalchemy as sql
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 from pixeltable import Column, Table
-from pixeltable.catalog import TableVersion
+from pixeltable.catalog import TableVersion, TableVersionHandle
 _logger = logging.getLogger('pixeltable')
@@ -33,13 +31,13 @@ class ExternalStore(abc.ABC):
         return self.__name
     @abc.abstractmethod
-    def link(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
+    def link(self, tbl_version: TableVersion) -> None:
         """
         Called by `TableVersion.link()` to implement store-specific logic.
         """
     @abc.abstractmethod
-    def unlink(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
+    def unlink(self, tbl_version: TableVersion) -> None:
         """
         Called by `TableVersion.unlink()` to implement store-specific logic.
         """
@@ -94,7 +92,7 @@ class Project(ExternalStore, abc.ABC):
     def get_local_columns(self) -> list[Column]:
         return list(self.col_mapping.keys())
-    def link(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
+    def link(self, tbl_version: TableVersion) -> None:
         # All of the media columns being linked need to either be stored computed columns, or else have stored proxies.
         # This ensures that the media in those columns resides in the media store.
         # First determine which columns (if any) need stored proxies, but don't have one yet.
@@ -110,6 +108,7 @@ class Project(ExternalStore, abc.ABC):
                 if col not in self.stored_proxies:
                     # We didn't find it in an existing Project
                     stored_proxies_needed.append(col)
         if len(stored_proxies_needed) > 0:
             _logger.info(f'Creating stored proxies for columns: {[col.name for col in stored_proxies_needed]}')
             # Create stored proxies for columns that need one. Increment the schema version
@@ -119,12 +118,12 @@ class Project(ExternalStore, abc.ABC):
             tbl_version.schema_version = tbl_version.version
             proxy_cols = [self.create_stored_proxy(tbl_version, col) for col in stored_proxies_needed]
             # Add the columns; this will also update table metadata.
-            tbl_version._add_columns(proxy_cols, conn, print_stats=False, on_error='ignore')
+            tbl_version._add_columns(proxy_cols, print_stats=False, on_error='ignore')
             # We don't need to retain `UpdateStatus` since the stored proxies are intended to be
             # invisible to the user.
-            tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
+            tbl_version._update_md(time.time(), preceding_schema_version=preceding_schema_version)
-    def unlink(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
+    def unlink(self, tbl_version: TableVersion) -> None:
         # Determine which stored proxies can be deleted. (A stored proxy can be deleted if it is not referenced by
         # any *other* external store for this table.)
         deletions_needed: set[Column] = set(self.stored_proxies.values())
@@ -139,7 +138,7 @@ class Project(ExternalStore, abc.ABC):
             tbl_version.schema_version = tbl_version.version
             tbl_version._drop_columns(deletions_needed)
             self.stored_proxies.clear()
-            tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
+            tbl_version._update_md(time.time(), preceding_schema_version=preceding_schema_version)
     def create_stored_proxy(self, tbl_version: TableVersion, col: Column) -> Column:
         """
@@ -163,7 +162,7 @@ class Project(ExternalStore, abc.ABC):
             sa_col_type=col.col_type.to_sa_type(),
             schema_version_add=tbl_version.schema_version,
         )
-        proxy_col.tbl = tbl_version
+        proxy_col.tbl = TableVersionHandle(tbl_version.id, tbl_version.effective_version, tbl_version=tbl_version)
         tbl_version.next_col_id += 1
         self.stored_proxies[col] = proxy_col
         return proxy_col
@@ -279,7 +278,7 @@ class Project(ExternalStore, abc.ABC):
         tbl_id = UUID(d['tbl_id'])
         col_id = d['col_id']
-        return Catalog.get().tbl_versions[(tbl_id, None)].cols_by_id[col_id]
+        return Catalog.get().get_tbl_version(tbl_id, None).cols_by_id[col_id]
 @dataclass(frozen=True)

pixeltable/io/label_studio.py CHANGED Viewed

@@ -15,6 +15,7 @@ import pixeltable as pxt
 import pixeltable.env as env
 import pixeltable.exceptions as excs
 from pixeltable import Column, Table
+from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project, SyncStatus
 from pixeltable.utils import coco
@@ -356,7 +357,7 @@ class LabelStudioProject(Project):
     @classmethod
     def __localpath_to_lspath(cls, localpath: str) -> str:
         # Transform the local path into Label Studio's bespoke path format.
-        relpath = Path(localpath).relative_to(env.Env.get().home)
+        relpath = Path(localpath).relative_to(Config.get().home)
         return f'/data/local-files/?d={str(relpath)}'
     def __delete_stale_tasks(
@@ -410,7 +411,7 @@ class LabelStudioProject(Project):
             # batch_update on the actual ancestor table that holds the annotations column.
             # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
             ancestor = t
-            while local_annotations_col not in ancestor._tbl_version.cols:
+            while local_annotations_col not in ancestor._tbl_version.get().cols:
                 assert ancestor._base is not None
                 ancestor = ancestor._base
             update_status = ancestor.batch_update(updates)
@@ -618,7 +619,7 @@ class LabelStudioProject(Project):
         if media_import_method == 'file':
             # We need to set up a local storage connection to receive media files
-            os.environ['LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT'] = str(env.Env.get().home)
+            os.environ['LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT'] = str(Config.get().home)
             try:
                 project.connect_local_import_storage(local_store_path=str(env.Env.get().media_dir))
             except HTTPError as exc:

pixeltable/io/parquet.py CHANGED Viewed

@@ -90,63 +90,64 @@ def export_parquet(
         current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
         current_byte_estimate = 0
-        for data_row in df._exec():
-            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
-                val = data_row[e.slot_idx]
-                if val is None:
-                    current_value_batch[col_name].append(val)
-                    continue
-                assert val is not None
-                if col_type.is_image_type():
-                    # images get inlined into the parquet file
-                    if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                        # if there is a file, read directly to preserve information
-                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
-                            val = f.read()
-                    elif isinstance(val, PIL.Image.Image):
-                        # if no file available, eg. bc it is computed, convert to png
-                        buf = io.BytesIO()
-                        val.save(buf, format='PNG')
-                        val = buf.getvalue()
-                    else:
-                        assert False, f'unknown image type {type(val)}'
-                    length = len(val)
-                elif col_type.is_string_type():
-                    length = len(val)
-                elif col_type.is_video_type():
-                    if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                        val = data_row.file_paths[e.slot_idx]
+        with Env.get().begin_xact():
+            for data_row in df._exec():
+                for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
+                    val = data_row[e.slot_idx]
+                    if val is None:
+                        current_value_batch[col_name].append(val)
+                        continue
+                    assert val is not None
+                    if col_type.is_image_type():
+                        # images get inlined into the parquet file
+                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
+                            # if there is a file, read directly to preserve information
+                            with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                                val = f.read()
+                        elif isinstance(val, PIL.Image.Image):
+                            # if no file available, eg. bc it is computed, convert to png
+                            buf = io.BytesIO()
+                            val.save(buf, format='PNG')
+                            val = buf.getvalue()
+                        else:
+                            assert False, f'unknown image type {type(val)}'
+                        length = len(val)
+                    elif col_type.is_string_type():
+                        length = len(val)
+                    elif col_type.is_video_type():
+                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
+                            val = data_row.file_paths[e.slot_idx]
+                        else:
+                            assert False, f'unknown video type {type(val)}'
+                        length = len(val)
+                    elif col_type.is_json_type():
+                        val = json.dumps(val)
+                        length = len(val)
+                    elif col_type.is_array_type():
+                        length = val.nbytes
+                    elif col_type.is_int_type():
+                        length = 8
+                    elif col_type.is_float_type():
+                        length = 8
+                    elif col_type.is_bool_type():
+                        length = 1
+                    elif col_type.is_timestamp_type():
+                        val = val.astimezone(datetime.timezone.utc)
+                        length = 8
                     else:
-                        assert False, f'unknown video type {type(val)}'
-                    length = len(val)
-                elif col_type.is_json_type():
-                    val = json.dumps(val)
-                    length = len(val)
-                elif col_type.is_array_type():
-                    length = val.nbytes
-                elif col_type.is_int_type():
-                    length = 8
-                elif col_type.is_float_type():
-                    length = 8
-                elif col_type.is_bool_type():
-                    length = 1
-                elif col_type.is_timestamp_type():
-                    val = val.astimezone(datetime.timezone.utc)
-                    length = 8
-                else:
-                    assert False, f'unknown type {col_type} for {col_name}'
-                current_value_batch[col_name].append(val)
-                current_byte_estimate += length
-            if current_byte_estimate > partition_size_bytes:
-                assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-                batch_num += 1
-                current_value_batch = {k: deque() for k in df.schema.keys()}
-                current_byte_estimate = 0
-        _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+                        assert False, f'unknown type {col_type} for {col_name}'
+                    current_value_batch[col_name].append(val)
+                    current_byte_estimate += length
+                if current_byte_estimate > partition_size_bytes:
+                    assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
+                    _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+                    batch_num += 1
+                    current_value_batch = {k: deque() for k in df.schema.keys()}
+                    current_byte_estimate = 0
+            _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
 def import_parquet(

pixeltable/iterators/__init__.py CHANGED Viewed

@@ -1,3 +1,5 @@
+# ruff: noqa: F401
 from .audio import AudioSplitter
 from .base import ComponentIterator
 from .document import DocumentSplitter
@@ -5,9 +7,9 @@ from .image import TileIterator
 from .string import StringSplitter
 from .video import FrameIterator
-__default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
+__default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
 __removed_symbols = {'base', 'document', 'video'}
-__all__ = sorted(list(__default_dir - __removed_symbols))
+__all__ = sorted(__default_dir - __removed_symbols)
 def __dir__():

pixeltable/iterators/audio.py CHANGED Viewed

@@ -1,15 +1,12 @@
 import logging
-import math
 import uuid
 from fractions import Fraction
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, ClassVar, Optional
 import av
-import pixeltable.env as env
-import pixeltable.exceptions as excs
-import pixeltable.type_system as ts
+from pixeltable import env, exceptions as excs, type_system as ts
 from .base import ComponentIterator
@@ -18,7 +15,8 @@ _logger = logging.getLogger('pixeltable')
 class AudioSplitter(ComponentIterator):
     """
-    Iterator over chunks of an audio file. The audio file is split into smaller chunks, where the duration of each chunk is determined by chunk_duration_sec.
+    Iterator over chunks of an audio file. The audio file is split into smaller chunks,
+    where the duration of each chunk is determined by chunk_duration_sec.
     The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
     If the input contains no audio, no chunks are yielded.
@@ -39,11 +37,11 @@ class AudioSplitter(ComponentIterator):
     # List of chunks to extract
     # Each chunk is defined by start and end presentation timestamps in audio file (int)
-    chunks_to_extract_in_pts: Optional[list[tuple[int, int]]] = []
+    chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
     # next chunk to extract
     next_pos: int
-    __codec_map = {
+    __codec_map: ClassVar[dict[str, str]] = {
         'mp3': 'mp3',  # MP3 decoder -> mp3/libmp3lame encoder
         'mp3float': 'mp3',  # MP3float decoder -> mp3 encoder
         'aac': 'aac',  # AAC decoder -> AAC encoder
@@ -88,7 +86,8 @@ class AudioSplitter(ComponentIterator):
             )
         ]
         _logger.debug(
-            f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts} chunks_to_extract_in_pts={self.chunks_to_extract_in_pts}'
+            f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts} '
+            f'chunks_to_extract_in_pts={self.chunks_to_extract_in_pts}'
         )
     @classmethod
@@ -155,7 +154,7 @@ class AudioSplitter(ComponentIterator):
             try:
                 frame = next(self.container.decode(audio=0))
             except EOFError as e:
-                raise excs.Error(f'Failed to read audio file `{self.audio_path}`, error `{e}`')
+                raise excs.Error(f"Failed to read audio file '{self.audio_path}': {e}") from e
             except StopIteration:
                 # no more frames to scan
                 break
@@ -163,7 +162,8 @@ class AudioSplitter(ComponentIterator):
                 # Current frame is behind chunk's start time, always get frame next to chunk's start time
                 continue
             if frame.pts >= target_chunk_end:
-                # Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away the current frame
+                # Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away
+                # the current frame
                 break
             frame_end = frame.pts + frame.samples
             if frame_count == 0:

pixeltable/iterators/document.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import dataclasses
 import enum
 import logging
-from typing import Any, Iterable, Iterator, Optional, Union
+from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
 import ftfy
@@ -96,7 +96,7 @@ class DocumentSplitter(ComponentIterator):
     Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
     """
-    METADATA_COLUMN_TYPES = {
+    METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
         ChunkMetadata.TITLE: StringType(nullable=True),
         ChunkMetadata.HEADING: JsonType(nullable=True),
         ChunkMetadata.SOURCELINE: IntType(nullable=True),
@@ -164,7 +164,7 @@ class DocumentSplitter(ComponentIterator):
             assert self._doc_handle.txt_doc is not None
             self._sections = self._txt_sections()
         else:
-            assert False, f'Unsupported document format: {self._doc_handle.format}'
+            raise AssertionError(f'Unsupported document format: {self._doc_handle.format}')
         if Separator.SENTENCE in self._separators:
             self._sections = self._sentence_sections(self._sections)
@@ -215,7 +215,7 @@ class DocumentSplitter(ComponentIterator):
         # check dependencies at the end
         if Separator.SENTENCE in separators:
-            Env.get().require_package('spacy')
+            _ = Env.get().spacy_nlp
         if Separator.TOKEN_LIMIT in separators:
             Env.get().require_package('tiktoken')
@@ -259,9 +259,9 @@ class DocumentSplitter(ComponentIterator):
             sourceline = el.sourceline
             if el.name in _HTML_HEADINGS:
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings if l > el.name]
-                for l in lower_levels:
-                    del headings[l]
+                lower_levels = [lv for lv in headings if lv > el.name]
+                for lv in lower_levels:
+                    del headings[lv]
                 headings[el.name] = el.get_text().strip()
         def emit() -> Iterator[DocumentSection]:
@@ -320,9 +320,9 @@ class DocumentSplitter(ComponentIterator):
             level = f'h{lint}'
             text = heading['children'][0]['raw'].strip()
             # remove the previously seen lower levels
-            lower_levels = [l for l in headings.keys() if l > level]
-            for l in lower_levels:
-                del headings[l]
+            lower_levels = [lv for lv in headings if lv > level]
+            for lv in lower_levels:
+                del headings[lv]
             headings[level] = text
         def emit() -> Iterator[DocumentSection]:

pixeltable/iterators/string.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from typing import Any, Iterator
-import pixeltable.exceptions as excs
-import pixeltable.type_system as ts
+from pixeltable import exceptions as excs, type_system as ts
 from pixeltable.env import Env
 from pixeltable.iterators.base import ComponentIterator

pixeltable/iterators/video.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 from fractions import Fraction
 from pathlib import Path
-from typing import Any, Optional, Sequence
+from typing import Any, Optional
 import av
 import pandas as pd
@@ -91,21 +91,20 @@ class FrameIterator(ComponentIterator):
                 self.frames_to_extract = None
             else:
                 spacing = float(self.video_frame_count) / float(num_frames)
-                self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
+                self.frames_to_extract = [round(i * spacing) for i in range(num_frames)]
                 assert len(self.frames_to_extract) == num_frames
+        elif fps is None or fps == 0.0:
+            # Extract all frames
+            self.frames_to_extract = None
+        elif fps > float(self.video_framerate):
+            raise excs.Error(
+                f'Video {video}: requested fps ({fps}) exceeds that of the video ({float(self.video_framerate)})'
+            )
         else:
-            if fps is None or fps == 0.0:
-                # Extract all frames
-                self.frames_to_extract = None
-            elif fps > float(self.video_framerate):
-                raise excs.Error(
-                    f'Video {video}: requested fps ({fps}) exceeds that of the video ({float(self.video_framerate)})'
-                )
-            else:
-                # Extract frames at the implied frequency
-                freq = fps / float(self.video_framerate)
-                n = math.ceil(self.video_frame_count * freq)  # number of frames to extract
-                self.frames_to_extract = list(round(i / freq) for i in range(n))
+            # Extract frames at the implied frequency
+            freq = fps / float(self.video_framerate)
+            n = math.ceil(self.video_frame_count * freq)  # number of frames to extract
+            self.frames_to_extract = [round(i / freq) for i in range(n)]
         _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
         self.next_pos = 0
@@ -149,7 +148,7 @@ class FrameIterator(ComponentIterator):
             try:
                 frame = next(self.container.decode(video=0))
             except EOFError:
-                raise StopIteration
+                raise StopIteration from None
             # Compute the index of the current frame in the video based on the presentation timestamp (pts);
             # this ensures we have a canonical understanding of frame index, regardless of how we got here
             # (seek or iteration)

pixeltable/metadata/__init__.py CHANGED Viewed

@@ -1,14 +1,20 @@
 import dataclasses
 import importlib
+import logging
 import os
 import pkgutil
 from typing import Callable
 import sqlalchemy as sql
-import sqlalchemy.orm as orm
+from sqlalchemy import orm
+from pixeltable.utils.console_output import ConsoleLogger
 from .schema import SystemInfo, SystemInfoMd
+_console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 # current version of the metadata; this is incremented whenever the metadata schema changes
 VERSION = 30
@@ -30,7 +36,6 @@ converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}
 def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
     def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
-        global converter_cbs
         assert version not in converter_cbs
         converter_cbs[version] = fn
@@ -53,9 +58,8 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
         while md_version < VERSION:
             if md_version not in converter_cbs:
                 raise RuntimeError(f'No metadata converter for version {md_version}')
-            from pixeltable.env import Env
-            Env.get().console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
+            # We can't use the console logger in Env, because Env might not have been initialized yet.
+            _console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
             converter_cbs[md_version](engine)
             md_version += 1
         # update system info

pixeltable/metadata/converters/convert_10.py CHANGED Viewed

@@ -13,4 +13,3 @@ def _(engine: sql.engine.Engine) -> None:
         conn.execute(sql.update(Table).values(md=Table.md - 'parameters'))
         # Add `table_attrs` to all instances of tableschemaversions.md.
         conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs)))
-    return

pixeltable/metadata/converters/convert_15.py CHANGED Viewed

@@ -5,8 +5,6 @@ from typing import Any
 import cloudpickle  # type: ignore[import-untyped]
 import sqlalchemy as sql
-import pixeltable.func as func
-import pixeltable.type_system as ts
 from pixeltable.metadata import register_converter
 from pixeltable.metadata.schema import Function

pixeltable/metadata/converters/convert_23.py CHANGED Viewed

@@ -1,12 +1,10 @@
 import logging
-from typing import Any, Optional
 from uuid import UUID
 import sqlalchemy as sql
 from pixeltable.metadata import register_converter
 from pixeltable.metadata.converters.util import convert_table_md
-from pixeltable.metadata.schema import Table
 _logger = logging.getLogger('pixeltable')

pixeltable/metadata/converters/convert_24.py CHANGED Viewed

@@ -19,11 +19,11 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
         isinstance(v, dict)
         and '_classpath' in v
         and v['_classpath']
-        in [
+        in {
             'pixeltable.func.callable_function.CallableFunction',
             'pixeltable.func.aggregate_function.AggregateFunction',
             'pixeltable.func.expr_template_function.ExprTemplateFunction',
-        ]
+        }
     ):
         if 'path' in v:
             assert 'signature' not in v
@@ -50,6 +50,6 @@ def __substitute_path(path: str) -> str:
     # versions, it's necessary to resolve the function symbol to get the signature. The following
     # adjustment is necessary for function names that are stored in db artifacts of version < 25, but
     # have changed in some version > 25.
-    if path in ['pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image']:
+    if path in {'pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image'}:
         return 'pixeltable.functions.huggingface.clip'
     return path

pixeltable 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

Potentially problematic release.

pixeltable 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl