PyPI - pixeltable - Versions diffs - 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl - Mend

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (202)

pixeltable/__init__.py +23 -5
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +5 -3
pixeltable/catalog/catalog.py +1318 -404
pixeltable/catalog/column.py +186 -115
pixeltable/catalog/dir.py +1 -2
pixeltable/catalog/globals.py +11 -43
pixeltable/catalog/insertable_table.py +167 -79
pixeltable/catalog/path.py +61 -23
pixeltable/catalog/schema_object.py +9 -10
pixeltable/catalog/table.py +626 -308
pixeltable/catalog/table_metadata.py +101 -0
pixeltable/catalog/table_version.py +713 -569
pixeltable/catalog/table_version_handle.py +37 -6
pixeltable/catalog/table_version_path.py +42 -29
pixeltable/catalog/tbl_ops.py +50 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +108 -94
pixeltable/config.py +128 -22
pixeltable/dataframe.py +188 -100
pixeltable/env.py +407 -136
pixeltable/exceptions.py +6 -0
pixeltable/exec/__init__.py +3 -0
pixeltable/exec/aggregation_node.py +7 -8
pixeltable/exec/cache_prefetch_node.py +83 -110
pixeltable/exec/cell_materialization_node.py +231 -0
pixeltable/exec/cell_reconstruction_node.py +135 -0
pixeltable/exec/component_iteration_node.py +4 -3
pixeltable/exec/data_row_batch.py +8 -65
pixeltable/exec/exec_context.py +16 -4
pixeltable/exec/exec_node.py +13 -36
pixeltable/exec/expr_eval/evaluators.py +7 -6
pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
pixeltable/exec/expr_eval/globals.py +8 -5
pixeltable/exec/expr_eval/row_buffer.py +1 -2
pixeltable/exec/expr_eval/schedulers.py +190 -30
pixeltable/exec/globals.py +32 -0
pixeltable/exec/in_memory_data_node.py +18 -18
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +16 -9
pixeltable/exec/sql_node.py +206 -101
pixeltable/exprs/__init__.py +1 -1
pixeltable/exprs/arithmetic_expr.py +27 -22
pixeltable/exprs/array_slice.py +3 -3
pixeltable/exprs/column_property_ref.py +34 -30
pixeltable/exprs/column_ref.py +92 -96
pixeltable/exprs/comparison.py +5 -5
pixeltable/exprs/compound_predicate.py +5 -4
pixeltable/exprs/data_row.py +152 -55
pixeltable/exprs/expr.py +62 -43
pixeltable/exprs/expr_dict.py +3 -3
pixeltable/exprs/expr_set.py +17 -10
pixeltable/exprs/function_call.py +75 -37
pixeltable/exprs/globals.py +1 -2
pixeltable/exprs/in_predicate.py +4 -4
pixeltable/exprs/inline_expr.py +10 -27
pixeltable/exprs/is_null.py +1 -3
pixeltable/exprs/json_mapper.py +8 -8
pixeltable/exprs/json_path.py +56 -22
pixeltable/exprs/literal.py +5 -5
pixeltable/exprs/method_ref.py +2 -2
pixeltable/exprs/object_ref.py +2 -2
pixeltable/exprs/row_builder.py +127 -53
pixeltable/exprs/rowid_ref.py +8 -12
pixeltable/exprs/similarity_expr.py +50 -25
pixeltable/exprs/sql_element_cache.py +4 -4
pixeltable/exprs/string_op.py +5 -5
pixeltable/exprs/type_cast.py +3 -5
pixeltable/func/__init__.py +1 -0
pixeltable/func/aggregate_function.py +8 -8
pixeltable/func/callable_function.py +9 -9
pixeltable/func/expr_template_function.py +10 -10
pixeltable/func/function.py +18 -20
pixeltable/func/function_registry.py +6 -7
pixeltable/func/globals.py +2 -3
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +20 -18
pixeltable/func/signature.py +43 -16
pixeltable/func/tools.py +23 -13
pixeltable/func/udf.py +18 -20
pixeltable/functions/__init__.py +6 -0
pixeltable/functions/anthropic.py +93 -33
pixeltable/functions/audio.py +114 -10
pixeltable/functions/bedrock.py +13 -6
pixeltable/functions/date.py +1 -1
pixeltable/functions/deepseek.py +20 -9
pixeltable/functions/fireworks.py +2 -2
pixeltable/functions/gemini.py +28 -11
pixeltable/functions/globals.py +13 -13
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1046 -23
pixeltable/functions/image.py +9 -18
pixeltable/functions/llama_cpp.py +23 -8
pixeltable/functions/math.py +3 -4
pixeltable/functions/mistralai.py +4 -15
pixeltable/functions/ollama.py +16 -9
pixeltable/functions/openai.py +104 -82
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +2 -2
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +21 -28
pixeltable/functions/timestamp.py +13 -14
pixeltable/functions/together.py +4 -6
pixeltable/functions/twelvelabs.py +92 -0
pixeltable/functions/util.py +6 -1
pixeltable/functions/video.py +1388 -106
pixeltable/functions/vision.py +7 -7
pixeltable/functions/whisper.py +15 -7
pixeltable/functions/whisperx.py +179 -0
pixeltable/{ext/functions → functions}/yolox.py +2 -4
pixeltable/globals.py +332 -105
pixeltable/index/base.py +13 -22
pixeltable/index/btree.py +23 -22
pixeltable/index/embedding_index.py +32 -44
pixeltable/io/__init__.py +4 -2
pixeltable/io/datarows.py +7 -6
pixeltable/io/external_store.py +49 -77
pixeltable/io/fiftyone.py +11 -11
pixeltable/io/globals.py +29 -28
pixeltable/io/hf_datasets.py +17 -9
pixeltable/io/label_studio.py +70 -66
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +12 -11
pixeltable/io/parquet.py +13 -93
pixeltable/io/table_data_conduit.py +71 -47
pixeltable/io/utils.py +3 -3
pixeltable/iterators/__init__.py +2 -1
pixeltable/iterators/audio.py +21 -11
pixeltable/iterators/document.py +116 -55
pixeltable/iterators/image.py +5 -2
pixeltable/iterators/video.py +293 -13
pixeltable/metadata/__init__.py +4 -2
pixeltable/metadata/converters/convert_18.py +2 -2
pixeltable/metadata/converters/convert_19.py +2 -2
pixeltable/metadata/converters/convert_20.py +2 -2
pixeltable/metadata/converters/convert_21.py +2 -2
pixeltable/metadata/converters/convert_22.py +2 -2
pixeltable/metadata/converters/convert_24.py +2 -2
pixeltable/metadata/converters/convert_25.py +2 -2
pixeltable/metadata/converters/convert_26.py +2 -2
pixeltable/metadata/converters/convert_29.py +4 -4
pixeltable/metadata/converters/convert_34.py +2 -2
pixeltable/metadata/converters/convert_36.py +2 -2
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/util.py +13 -12
pixeltable/metadata/notes.py +4 -0
pixeltable/metadata/schema.py +79 -42
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +274 -223
pixeltable/share/__init__.py +1 -1
pixeltable/share/packager.py +259 -129
pixeltable/share/protocol/__init__.py +34 -0
pixeltable/share/protocol/common.py +170 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +109 -0
pixeltable/share/publish.py +213 -57
pixeltable/store.py +238 -175
pixeltable/type_system.py +104 -63
pixeltable/utils/__init__.py +2 -3
pixeltable/utils/arrow.py +108 -13
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +305 -0
pixeltable/utils/code.py +3 -3
pixeltable/utils/console_output.py +4 -1
pixeltable/utils/coroutine.py +6 -23
pixeltable/utils/dbms.py +31 -5
pixeltable/utils/description_helper.py +4 -5
pixeltable/utils/documents.py +5 -6
pixeltable/utils/exception_handler.py +7 -30
pixeltable/utils/filecache.py +6 -6
pixeltable/utils/formatter.py +4 -6
pixeltable/utils/gcs_store.py +283 -0
pixeltable/utils/http_server.py +2 -3
pixeltable/utils/iceberg.py +1 -2
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +88 -0
pixeltable/utils/local_store.py +316 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +528 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +5 -6
pixeltable/utils/s3_store.py +392 -0
pixeltable-0.4.20.dist-info/METADATA +587 -0
pixeltable-0.4.20.dist-info/RECORD +218 -0
{pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/ext/__init__.py +0 -17
pixeltable/ext/functions/__init__.py +0 -11
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/utils/media_store.py +0 -77
pixeltable/utils/s3.py +0 -17
pixeltable/utils/sample.py +0 -25
pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
{pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0

pixeltable/utils/dbms.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import abc
-from sqlalchemy import URL
+import sqlalchemy as sql
 class Dbms(abc.ABC):
@@ -11,9 +11,9 @@ class Dbms(abc.ABC):
     name: str
     transaction_isolation_level: str
     version_index_type: str
-    db_url: URL
+    db_url: sql.URL
-    def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: URL) -> None:
+    def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: sql.URL) -> None:
         self.name = name
         self.transaction_isolation_level = transaction_isolation_level
         self.version_index_type = version_index_type
@@ -28,13 +28,18 @@ class Dbms(abc.ABC):
     @abc.abstractmethod
     def default_system_db_url(self) -> str: ...
+    @abc.abstractmethod
+    def create_vector_index_stmt(
+        self, store_index_name: str, sa_value_col: sql.Column, metric: str
+    ) -> sql.Compiled: ...
 class PostgresqlDbms(Dbms):
     """
     Implements utilities to interact with Postgres database.
     """
-    def __init__(self, db_url: URL):
+    def __init__(self, db_url: sql.URL):
         super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)
     def drop_db_stmt(self, database: str) -> str:
@@ -47,13 +52,25 @@ class PostgresqlDbms(Dbms):
         a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
         return a
+    def create_vector_index_stmt(self, store_index_name: str, sa_value_col: sql.Column, metric: str) -> sql.Compiled:
+        from sqlalchemy.dialects import postgresql
+        sa_idx = sql.Index(
+            store_index_name,
+            sa_value_col,
+            postgresql_using='hnsw',
+            postgresql_with={'m': 16, 'ef_construction': 64},
+            postgresql_ops={sa_value_col.name: metric},
+        )
+        return sql.schema.CreateIndex(sa_idx, if_not_exists=True).compile(dialect=postgresql.dialect())
 class CockroachDbms(Dbms):
     """
     Implements utilities to interact with CockroachDb database.
     """
-    def __init__(self, db_url: URL):
+    def __init__(self, db_url: sql.URL):
         super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
     def drop_db_stmt(self, database: str) -> str:
@@ -64,3 +81,12 @@ class CockroachDbms(Dbms):
     def default_system_db_url(self) -> str:
         return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
+    def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index | None:
+        return None
+    def create_vector_index_stmt(self, store_index_name: str, sa_value_col: sql.Column, metric: str) -> sql.Compiled:
+        return sql.text(
+            f'CREATE VECTOR INDEX IF NOT EXISTS {store_index_name} ON {sa_value_col.table.name}'
+            f'({sa_value_col.name} {metric})'
+        ).compile()

pixeltable/utils/description_helper.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import dataclasses
-from typing import Optional, Union
 import pandas as pd
 from pandas.io.formats.style import Styler
@@ -7,11 +6,11 @@ from pandas.io.formats.style import Styler
 @dataclasses.dataclass
 class _Descriptor:
-    body: Union[str, pd.DataFrame]
+    body: str | pd.DataFrame
     # The remaining fields only affect the behavior if `body` is a pd.DataFrame.
     show_index: bool
     show_header: bool
-    styler: Optional[Styler] = None
+    styler: Styler | None = None
 class DescriptionHelper:
@@ -33,10 +32,10 @@ class DescriptionHelper:
     def append(
         self,
-        descriptor: Union[str, pd.DataFrame],
+        descriptor: str | pd.DataFrame,
         show_index: bool = False,
         show_header: bool = True,
-        styler: Optional[Styler] = None,
+        styler: Styler | None = None,
     ) -> None:
         self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))

pixeltable/utils/documents.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import dataclasses
 import os
-from typing import Optional
 import bs4
 import fitz  # type: ignore[import-untyped]
@@ -13,10 +12,10 @@ from pixeltable.env import Env
 @dataclasses.dataclass
 class DocumentHandle:
     format: ts.DocumentType.DocumentFormat
-    bs_doc: Optional[bs4.BeautifulSoup] = None
-    md_ast: Optional[dict] = None
-    pdf_doc: Optional[fitz.Document] = None
-    txt_doc: Optional[str] = None
+    bs_doc: bs4.BeautifulSoup | None = None
+    md_ast: dict | None = None
+    pdf_doc: fitz.Document | None = None
+    txt_doc: str | None = None
 def get_document_handle(path: str) -> DocumentHandle:
@@ -34,7 +33,7 @@ def get_document_handle(path: str) -> DocumentHandle:
     raise excs.Error(f'Unrecognized document format: {path}')
-def get_handle_by_extension(path: str, extension: str) -> Optional[DocumentHandle]:
+def get_handle_by_extension(path: str, extension: str) -> DocumentHandle | None:
     doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
     try:

pixeltable/utils/exception_handler.py CHANGED Viewed

@@ -1,35 +1,12 @@
 import logging
-import sys
-from typing import Any, Callable, Optional, TypeVar
+from typing import Any, Callable, TypeVar
 R = TypeVar('R')
-def _is_in_exception() -> bool:
-    """
-    Check if code is currently executing within an exception context.
-    """
-    current_exception = sys.exc_info()[1]
-    return current_exception is not None
-def run_cleanup_on_exception(cleanup_func: Callable[..., R], *args: Any, **kwargs: Any) -> Optional[R]:
-    """
-    Runs cleanup only when running in exception context.
-    The function `run_cleanup_on_exception()` should be used to clean up resources when an operation fails.
-    This is typically done using a try, except, and finally block, with the resource cleanup logic placed within
-    the except block. However, this pattern may not handle KeyboardInterrupt exceptions.
-    To ensure that resources are always cleaned up at least once when an exception or KeyboardInterrupt occurs,
-    create an idempotent function for cleaning up resources and pass it to the `run_cleanup_on_exception()` function
-    from the finally block.
-    """
-    if _is_in_exception():
-        return run_cleanup(cleanup_func, *args, raise_error=False, **kwargs)
-    return None
+logger = logging.getLogger('pixeltable')
-def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> Optional[R]:
+def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> R | None:
     """
     Runs a cleanup function. If interrupted, retry cleanup.
     The `run_cleanup()` function ensures that the `cleanup_func()` function executes at least once.
@@ -40,20 +17,20 @@ def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool =
         raise_error: raise an exception if an error occurs during cleanup.
     """
     try:
-        logging.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
+        logger.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
         return cleanup_func(*args, **kwargs)
     except KeyboardInterrupt as interrupt:
         # Save original exception and re-attempt cleanup
         original_exception = interrupt
-        logging.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
+        logger.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
         try:
             return cleanup_func(*args, **kwargs)
         except Exception as e:
             # Suppress this exception
-            logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+            logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
         raise KeyboardInterrupt from original_exception
     except Exception as e:
-        logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+        logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
         if raise_error:
             raise e
     return None

pixeltable/utils/filecache.py CHANGED Viewed

@@ -9,7 +9,7 @@ from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 from uuid import UUID
 import pixeltable.exceptions as excs
@@ -58,7 +58,7 @@ class FileCache:
     - implement MRU eviction for queries that exceed the capacity
     """
-    __instance: Optional[FileCache] = None
+    __instance: FileCache | None = None
     cache: OrderedDict[str, CacheEntry]
     total_size: int
@@ -126,12 +126,12 @@ class FileCache:
             return 0
         return int(self.total_size / len(self.cache))
-    def num_files(self, tbl_id: Optional[UUID] = None) -> int:
+    def num_files(self, tbl_id: UUID | None = None) -> int:
         if tbl_id is None:
             return len(self.cache)
         return sum(e.tbl_id == tbl_id for e in self.cache.values())
-    def clear(self, tbl_id: Optional[UUID] = None) -> None:
+    def clear(self, tbl_id: UUID | None = None) -> None:
         """
         For testing purposes: allow resetting capacity and stats.
         """
@@ -174,7 +174,7 @@ class FileCache:
         h.update(url.encode())
         return h.hexdigest()
-    def lookup(self, url: str) -> Optional[Path]:
+    def lookup(self, url: str) -> Path | None:
         self.num_requests += 1
         key = self._url_hash(url)
         entry = self.cache.get(key, None)
@@ -214,7 +214,7 @@ class FileCache:
         new_path = entry.path
         os.rename(str(path), str(new_path))
         new_path.touch(exist_ok=True)
-        _logger.debug(f'added entry for cell {url} to file cache')
+        _logger.debug(f'FileCache: cached url {url} with file name {new_path}')
         return new_path
     def ensure_capacity(self, size: int) -> None:

pixeltable/utils/formatter.py CHANGED Viewed

@@ -4,7 +4,7 @@ import io
 import json
 import logging
 import mimetypes
-from typing import Any, Callable, Optional
+from typing import Any, Callable
 import av
 import numpy as np
@@ -39,7 +39,7 @@ class Formatter:
         self.__num_cols = num_cols
         self.__http_address = http_address
-    def get_pandas_formatter(self, col_type: ts.ColumnType) -> Optional[Callable]:
+    def get_pandas_formatter(self, col_type: ts.ColumnType) -> Callable | None:
         if col_type.is_string_type():
             return self.format_string
         if col_type.is_float_type():
@@ -184,7 +184,7 @@ class Formatter:
         """
     @classmethod
-    def extract_first_video_frame(cls, file_path: str) -> Optional[Image.Image]:
+    def extract_first_video_frame(cls, file_path: str) -> Image.Image | None:
         with av.open(file_path) as container:
             try:
                 img = next(container.decode(video=0)).to_image()
@@ -224,9 +224,7 @@ class Formatter:
         """
     @classmethod
-    def make_document_thumbnail(
-        cls, file_path: str, max_width: int = 320, max_height: int = 320
-    ) -> Optional[Image.Image]:
+    def make_document_thumbnail(cls, file_path: str, max_width: int = 320, max_height: int = 320) -> Image.Image | None:
         """
         Returns a thumbnail image of a document.
         """

pixeltable/utils/gcs_store.py ADDED Viewed

@@ -0,0 +1,283 @@
+from __future__ import annotations
+import logging
+import re
+import urllib.parse
+import uuid
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Iterator
+from google.api_core.exceptions import GoogleAPIError
+from google.cloud import storage  # type: ignore[attr-defined]
+from google.cloud.exceptions import Forbidden, NotFound
+from google.cloud.storage.client import Client  # type: ignore[import-untyped]
+from pixeltable import env, exceptions as excs
+from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress, StorageTarget
+if TYPE_CHECKING:
+    from pixeltable.catalog import Column
+_logger = logging.getLogger('pixeltable')
+@env.register_client('gcs_store')
+def _() -> 'Client':
+    """Create and return a GCS client, using default credentials if available,
+    otherwise creating an anonymous client for public buckets.
+    """
+    try:
+        # Create a client with default credentials
+        # Note that if the default credentials have expired, gcloud will still create a client,
+        # which will report the expiry error when it is used.
+        # To create and use an anonymous client, expired credentials must be removed.
+        # For application default credentials, delete the file in ~/.config/gcloud/, or
+        #   gcloud auth application-default revoke
+        # OR
+        # For service account keys, you must delete the downloaded key file.
+        client = storage.Client()
+        return client
+    except Exception:
+        # If no credentials are found, create an anonymous client which can be used for public buckets.
+        client = storage.Client.create_anonymous_client()
+        return client
+class GCSStore(ObjectStoreBase):
+    """Class to handle Google Cloud Storage operations."""
+    # URI of the GCS bucket in the format gs://bucket_name/prefix/
+    # Always ends with a slash
+    __base_uri: str
+    # bucket name extracted from the URI
+    __bucket_name: str
+    # prefix path within the bucket, either empty or ending with a slash
+    __prefix_name: str
+    # The parsed form of the given destination address
+    soa: StorageObjectAddress
+    def __init__(self, soa: StorageObjectAddress):
+        assert soa.storage_target == StorageTarget.GCS_STORE, f'Expected storage_target "gs", got {soa.storage_target}'
+        self.soa = soa
+        self.__base_uri = soa.prefix_free_uri + soa.prefix
+        self.__bucket_name = soa.container
+        self.__prefix_name = soa.prefix
+    @classmethod
+    def client(cls) -> 'Client':
+        """Return the GCS client."""
+        return env.Env.get().get_client('gcs_store')
+    @property
+    def bucket_name(self) -> str:
+        """Return the bucket name from the base URI."""
+        return self.__bucket_name
+    @property
+    def prefix(self) -> str:
+        """Return the prefix from the base URI."""
+        return self.__prefix_name
+    def validate(self, error_col_name: str) -> str | None:
+        """
+        Checks if the URI exists.
+        Returns:
+            str: The base URI if the GCS bucket exists and is accessible, None otherwise.
+        """
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blobs = bucket.list_blobs(max_results=1)
+            # This will raise an exception if the destination doesn't exist or cannot be listed
+            _ = list(blobs)  # Force evaluation to check access
+            return self.__base_uri
+        except (NotFound, Forbidden, GoogleAPIError) as e:
+            self.handle_gcs_error(e, self.bucket_name, f'validate bucket {error_col_name}')
+        return None
+    def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: str | None = None) -> str:
+        """
+        Construct a new, unique URI for a persisted media file.
+        """
+        prefix, filename = ObjectPath.create_prefix_raw(tbl_id, col_id, tbl_version, ext)
+        parent = f'{self.__base_uri}{prefix}'
+        return f'{parent}/{filename}'
+    def _prepare_uri(self, col: Column, ext: str | None = None) -> str:
+        """
+        Construct a new, unique URI for a persisted media file.
+        """
+        assert col.get_tbl() is not None, 'Column must be associated with a table'
+        return self._prepare_uri_raw(col.get_tbl().id, col.id, col.get_tbl().version, ext=ext)
+    def copy_local_file(self, col: Column, src_path: Path) -> str:
+        """Copy a local file, and return its new URL"""
+        new_file_uri = self._prepare_uri(col, ext=src_path.suffix)
+        parsed = urllib.parse.urlparse(new_file_uri)
+        blob_name = parsed.path.lstrip('/')
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blob = bucket.blob(blob_name)
+            blob.upload_from_filename(str(src_path))
+            _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
+            return new_file_uri
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'upload file {src_path}')
+            raise
+    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
+        """Copies an object to a local file. Thread safe"""
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blob = bucket.blob(self.prefix + src_path)
+            blob.download_to_filename(str(dest_path))
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'download file {src_path}')
+            raise
+    def _get_filtered_objects(self, bucket: Any, tbl_id: uuid.UUID, tbl_version: int | None = None) -> Iterator:
+        """Private method to get filtered objects for a table, optionally filtered by version.
+        Args:
+            tbl_id: Table UUID to filter by
+            tbl_version: Optional table version to filter by
+        Returns:
+            Tuple of (iterator over GCS objects matching the criteria, bucket object)
+        """
+        table_prefix = ObjectPath.table_prefix(tbl_id)
+        prefix = f'{self.prefix}{table_prefix}/'
+        if tbl_version is None:
+            # Return all blobs with the table prefix
+            blob_iterator = bucket.list_blobs(prefix=prefix)
+        else:
+            # Filter by both table_id and table_version using the ObjectPath pattern
+            # Pattern: tbl_id_col_id_version_uuid
+            version_pattern = re.compile(rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*')
+            # Return filtered collection - this still uses lazy loading
+            all_blobs = bucket.list_blobs(prefix=prefix)
+            blob_iterator = (blob for blob in all_blobs if version_pattern.match(blob.name.split('/')[-1]))
+        return blob_iterator
+    def count(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
+        """Count the number of files belonging to tbl_id. If tbl_version is not None,
+        count only those files belonging to the specified tbl_version.
+        Args:
+            tbl_id: Table UUID to count objects for
+            tbl_version: Optional table version to filter by
+        Returns:
+            Number of objects matching the criteria
+        """
+        assert tbl_id is not None
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blob_iterator = self._get_filtered_objects(bucket, tbl_id, tbl_version)
+            return sum(1 for _ in blob_iterator)
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'setup iterator {self.prefix}')
+            raise
+    def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
+        """Delete all files belonging to tbl_id. If tbl_version is not None, delete
+        only those files belonging to the specified tbl_version.
+        Args:
+            tbl_id: Table UUID to delete objects for
+            tbl_version: Optional table version to filter by
+        Returns:
+            Number of objects deleted
+        """
+        assert tbl_id is not None
+        total_deleted = 0
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blob_iterator = self._get_filtered_objects(bucket, tbl_id, tbl_version)
+            # Collect blob names for batch deletion
+            blobs_to_delete = []
+            for blob in blob_iterator:
+                blobs_to_delete.append(blob)
+                # Process in batches for efficiency
+                if len(blobs_to_delete) >= 100:
+                    with client.batch():
+                        for b in blobs_to_delete:
+                            b.delete()
+                    total_deleted += len(blobs_to_delete)
+                    blobs_to_delete = []
+            # Delete any remaining blobs in the final batch
+            if len(blobs_to_delete) > 0:
+                with client.batch():
+                    for b in blobs_to_delete:
+                        b.delete()
+                total_deleted += len(blobs_to_delete)
+            return total_deleted
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'deleting with {self.prefix}')
+            raise
+    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
+        """Return a list of objects found in the specified destination bucket.
+        Each returned object includes the full set of prefixes.
+        if return_uri is True, full URI's are returned; otherwise, just the object keys.
+        """
+        p = self.soa.prefix_free_uri if return_uri else ''
+        gcs_client = self.client()
+        r: list[str] = []
+        try:
+            bucket = gcs_client.bucket(self.bucket_name)
+            # List blobs with the given prefix, limiting to n_max
+            blobs = bucket.list_blobs(prefix=self.prefix, max_results=n_max)
+            for blob in blobs:
+                r.append(f'{p}{blob.name}')
+                if len(r) >= n_max:
+                    break
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'list objects from {self.prefix}')
+        return r
+    @classmethod
+    def handle_gcs_error(cls, e: Exception, bucket_name: str, operation: str = '', *, ignore_404: bool = False) -> None:
+        """Handle GCS-specific errors and convert them to appropriate exceptions"""
+        if isinstance(e, NotFound):
+            if ignore_404:
+                return
+            raise excs.Error(f'Bucket or object {bucket_name} not found during {operation}: {str(e)!r}')
+        elif isinstance(e, Forbidden):
+            raise excs.Error(f'Access denied to bucket {bucket_name} during {operation}: {str(e)!r}')
+        elif isinstance(e, GoogleAPIError):
+            # Handle other Google API errors
+            error_message = str(e)
+            if 'Precondition' in error_message:
+                raise excs.Error(f'Precondition failed for bucket {bucket_name} during {operation}: {error_message}')
+            else:
+                raise excs.Error(f'Error during {operation} in bucket {bucket_name}: {error_message}')
+        else:
+            # Generic error handling
+            raise excs.Error(f'Unexpected error during {operation} in bucket {bucket_name}: {str(e)!r}')

pixeltable/utils/http_server.py CHANGED Viewed

@@ -2,7 +2,7 @@ import http
 import http.server
 import logging
 import pathlib
-import urllib
+import urllib.request
 from typing import Any
 _logger = logging.getLogger('pixeltable.http.server')
@@ -36,8 +36,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
         path = path.split('?', 1)[0]
         path = path.split('#', 1)[0]
-        path = pathlib.Path(urllib.request.url2pathname(path))
-        return str(path)
+        return str(pathlib.Path(urllib.request.url2pathname(path)))
     def log_message(self, format: str, *args: Any) -> None:
         """override logging to stderr in http.server.BaseHTTPRequestHandler"""

pixeltable/utils/iceberg.py CHANGED Viewed

@@ -1,10 +1,9 @@
 from pathlib import Path
-from typing import Union
 from pyiceberg.catalog.sql import SqlCatalog
-def sqlite_catalog(warehouse_path: Union[str, Path], name: str = 'pixeltable') -> SqlCatalog:
+def sqlite_catalog(warehouse_path: str | Path, name: str = 'pixeltable') -> SqlCatalog:
     """
     Instantiate a sqlite Iceberg catalog at the specified path. If no catalog exists, one will be created.
     """

pixeltable/utils/image.py ADDED Viewed

@@ -0,0 +1,17 @@
+import base64
+from io import BytesIO
+import PIL.Image
+def default_format(img: PIL.Image.Image) -> str:
+    # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
+    # In that case, use WebP instead.
+    return 'webp' if img.has_transparency_data else 'jpeg'
+def to_base64(image: PIL.Image.Image, format: str | None = None) -> str:
+    buffer = BytesIO()
+    image.save(buffer, format=format or image.format)
+    image_bytes = buffer.getvalue()
+    return base64.b64encode(image_bytes).decode('utf-8')

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

Potentially problematic release.

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl