pixeltable 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)
  1. pixeltable/__init__.py +2 -1
  2. pixeltable/catalog/catalog.py +187 -63
  3. pixeltable/catalog/column.py +24 -20
  4. pixeltable/catalog/table.py +24 -8
  5. pixeltable/catalog/table_metadata.py +1 -0
  6. pixeltable/catalog/table_version.py +16 -34
  7. pixeltable/catalog/update_status.py +12 -0
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +4 -2
  11. pixeltable/env.py +46 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/expr_eval/expr_eval_node.py +11 -0
  17. pixeltable/exec/in_memory_data_node.py +1 -1
  18. pixeltable/exec/object_store_save_node.py +299 -0
  19. pixeltable/exec/sql_node.py +28 -33
  20. pixeltable/exprs/data_row.py +31 -25
  21. pixeltable/exprs/json_path.py +6 -5
  22. pixeltable/exprs/row_builder.py +6 -12
  23. pixeltable/functions/gemini.py +1 -1
  24. pixeltable/functions/openai.py +1 -1
  25. pixeltable/functions/video.py +128 -15
  26. pixeltable/functions/whisperx.py +2 -0
  27. pixeltable/functions/yolox.py +2 -0
  28. pixeltable/globals.py +49 -30
  29. pixeltable/index/embedding_index.py +5 -8
  30. pixeltable/io/__init__.py +1 -0
  31. pixeltable/io/fiftyone.py +1 -1
  32. pixeltable/io/label_studio.py +4 -5
  33. pixeltable/iterators/__init__.py +1 -0
  34. pixeltable/iterators/audio.py +1 -1
  35. pixeltable/iterators/document.py +10 -12
  36. pixeltable/iterators/video.py +1 -1
  37. pixeltable/metadata/schema.py +7 -0
  38. pixeltable/plan.py +26 -1
  39. pixeltable/share/packager.py +8 -2
  40. pixeltable/share/publish.py +3 -10
  41. pixeltable/store.py +1 -1
  42. pixeltable/type_system.py +1 -3
  43. pixeltable/utils/dbms.py +31 -5
  44. pixeltable/utils/gcs_store.py +283 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/METADATA +1 -1
  50. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/RECORD +53 -50
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/licenses/LICENSE +0 -0
pixeltable/index/embedding_index.py CHANGED
@@ -138,15 +138,12 @@ class EmbeddingIndex(IndexBase):
 
     def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Create the index on the index value column"""
-        idx = sql.Index(
-            index_name,
-            index_value_col.sa_col,
-            postgresql_using='hnsw',
-            postgresql_with={'m': 16, 'ef_construction': 64},
-            postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
+        Env.get().dbms.create_vector_index(
+            index_name=index_name,
+            index_value_sa_col=index_value_col.sa_col,
+            conn=Env.get().conn,
+            metric=self.PGVECTOR_OPS[self.metric],
         )
-        conn = Env.get().conn
-        idx.create(bind=conn)
 
     def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Drop the index on the index value column"""
pixeltable/io/__init__.py CHANGED
@@ -1,3 +1,4 @@
+"""Functions for importing and exporting Pixeltable data."""
 # ruff: noqa: F401
 
 from .datarows import import_json, import_rows
pixeltable/io/fiftyone.py CHANGED
@@ -9,7 +9,7 @@ import puremagic
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import exprs
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 
 class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
pixeltable/io/label_studio.py CHANGED
@@ -19,7 +19,7 @@ from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project
 from pixeltable.utils import coco
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
 # the import two different ways to insure intercompatibility
@@ -46,6 +46,9 @@ class LabelStudioProject(Project):
     """
     An [`ExternalStore`][pixeltable.io.ExternalStore] that represents a Label Studio project, providing functionality
     for synchronizing between a Pixeltable table and a Label Studio project.
+
+    The constructor will NOT create a new Label Studio project; it is also used when loading
+    metadata for existing projects.
     """
 
     project_id: int  # Label Studio project ID
@@ -60,10 +63,6 @@ class LabelStudioProject(Project):
         col_mapping: dict[ColumnHandle, str],
         stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
     ):
-        """
-        The constructor will NOT create a new Label Studio project; it is also used when loading
-        metadata for existing projects.
-        """
         self.project_id = project_id
         self.media_import_method = media_import_method
         self._project = None
pixeltable/iterators/__init__.py CHANGED
@@ -1,3 +1,4 @@
+"""Iterators for splitting media and documents into components."""
 # ruff: noqa: F401
 
 from .audio import AudioSplitter
pixeltable/iterators/audio.py CHANGED
@@ -6,7 +6,7 @@ from typing import Any, ClassVar, Optional
 import av
 
 from pixeltable import exceptions as excs, type_system as ts
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .base import ComponentIterator
 
pixeltable/iterators/document.py CHANGED
@@ -94,6 +94,16 @@ class DocumentSplitter(ComponentIterator):
     include additional metadata fields if specified in the `metadata` parameter, as explained below.
 
     Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
+
+    Args:
+        separators: separators to use to chunk the document. Options are:
+            `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+            This may be a comma-separated string, e.g., `'heading,token_limit'`.
+        limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+            or `'char_limit'` is specified.
+        metadata: additional metadata fields to include in the output. Options are:
+            `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+            (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
     """
 
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -116,18 +126,6 @@ class DocumentSplitter(ComponentIterator):
         tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None,
     ):
-        """Init method for `DocumentSplitter` class.
-
-        Args:
-            separators: separators to use to chunk the document. Options are:
-                `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
-                This may be a comma-separated string, e.g., `'heading,token_limit'`.
-            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
-                or `'char_limit'` is specified.
-            metadata: additional metadata fields to include in the output. Options are:
-                `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
-                (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
-        """
         if html_skip_tags is None:
            html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
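
The relocated Args documentation above (moved from the __init__ docstring to the class docstring) describes the chunking parameters. As a minimal usage sketch, assuming Pixeltable's usual iterator-view pattern; the table and column names here are hypothetical, not part of this diff:

    import pixeltable as pxt
    from pixeltable.iterators import DocumentSplitter

    docs = pxt.create_table('docs', {'doc': pxt.Document})
    # Each output row is one chunk; requested metadata fields become extra view columns.
    chunks = pxt.create_view(
        'doc_chunks',
        docs,
        iterator=DocumentSplitter.create(
            document=docs.doc,
            separators='heading,token_limit',  # comma-separated, as documented above
            limit=512,  # max tokens per chunk; applies because 'token_limit' is requested
            metadata='title,heading',
        ),
    )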
pixeltable/iterators/video.py CHANGED
@@ -14,7 +14,7 @@ import pixeltable as pxt
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 import pixeltable.utils.av as av_utils
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .base import ComponentIterator
 
pixeltable/metadata/schema.py CHANGED
@@ -115,6 +115,9 @@ class ColumnMd:
     # if True, the column is present in the stored table
     stored: Optional[bool]
 
+    # If present, the URI for the destination for column values
+    destination: Optional[str] = None
+
 
 @dataclasses.dataclass
 class IndexMd:
@@ -244,6 +247,9 @@ class TableVersionMd:
     schema_version: int
     user: Optional[str] = None  # User that created this version
     update_status: Optional[UpdateStatus] = None  # UpdateStatus of the change that created this version
+    # A version fragment cannot be queried or instantiated via get_table(). A fragment represents a version of a
+    # replica table that has incomplete data, and exists only to provide base table support for a dependent view.
+    is_fragment: bool = False
     additional_md: dict[str, Any] = dataclasses.field(default_factory=dict)
 
 
@@ -353,6 +359,7 @@ class FullTableMd(NamedTuple):
     def is_pure_snapshot(self) -> bool:
         return (
             self.tbl_md.view_md is not None
+            and self.tbl_md.view_md.is_snapshot
             and self.tbl_md.view_md.predicate is None
             and len(self.schema_version_md.columns) == 0
         )
pixeltable/plan.py CHANGED
@@ -403,6 +403,8 @@ class Planner:
                 ignore_errors=ignore_errors,
             )
         )
+        plan = cls._insert_save_node(tbl.id, row_builder.stored_media_cols, input_node=plan)
+
         return plan
 
     @classmethod
@@ -499,6 +501,9 @@ class Planner:
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         plan.ctx.num_computed_exprs = len(recomputed_exprs)
+
+        plan = cls._insert_save_node(tbl.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
@@ -597,6 +602,7 @@ class Planner:
         # we're returning everything to the user, so we might as well do it in a single batch
         ctx.batch_size = 0
         plan.set_ctx(ctx)
+        plan = cls._insert_save_node(tbl.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
        recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return (
             plan,
@@ -650,6 +656,8 @@ class Planner:
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
+        plan = cls._insert_save_node(view.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         return plan
 
     @classmethod
@@ -718,6 +726,8 @@ class Planner:
 
         exec_ctx.ignore_errors = True
         plan.set_ctx(exec_ctx)
+        plan = cls._insert_save_node(view.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         return plan, len(row_builder.default_eval_ctx.target_exprs)
 
     @classmethod
@@ -762,6 +772,17 @@ class Planner:
             combined_ordering = combined
         return combined_ordering
 
+    @classmethod
+    def _insert_save_node(
+        cls, tbl_id: UUID, stored_media_cols: list[exprs.ColumnSlotIdx], input_node: exec.ExecNode
+    ) -> exec.ExecNode:
+        """Return an ObjectStoreSaveNode if stored media columns are present, otherwise return input"""
+        if len(stored_media_cols) == 0:
+            return input_node
+        save_node = exec.ObjectStoreSaveNode(tbl_id, stored_media_cols, input_node)
+        save_node.set_ctx(input_node.ctx)
+        return save_node
+
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
         """Returns True if l1 is contained in l2"""
@@ -771,7 +792,7 @@ class Planner:
     def _insert_prefetch_node(
         cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
     ) -> exec.ExecNode:
-        """Return a CachePrefetchNode if needed, otherwise return input"""
+        """Return a node to prefetch data if needed, otherwise return input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
         # aren't explicitly captured as dependencies
@@ -989,6 +1010,7 @@ class Planner:
             if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
                 plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
+            plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
         else:
             if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
@@ -1034,10 +1056,13 @@ class Planner:
         plan = cls._create_query_plan(
             row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True
         )
+
         plan.ctx.batch_size = 16
         plan.ctx.show_pbar = True
         plan.ctx.ignore_errors = True
         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
 
+        plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
+
         return plan
pixeltable/share/packager.py CHANGED
@@ -24,7 +24,8 @@ from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
-from pixeltable.utils.media_store import MediaStore, TempStore
+from pixeltable.utils.local_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps
 
 _logger = logging.getLogger('pixeltable')
 
@@ -362,6 +363,8 @@ class TableRestorer:
         for md in tbl_md:
             md.tbl_md.is_replica = True
 
+        assert not tbl_md[0].version_md.is_fragment  # Top-level table cannot be a version fragment
+
         cat = catalog.Catalog.get()
 
         with cat.begin_xact(for_write=True):
@@ -369,6 +372,9 @@ class TableRestorer:
             # versions that have not been seen before.
             cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
 
+            _logger.debug(f'Now will import data for {len(tbl_md)} table(s):')
+            _logger.debug(repr([md.tbl_md.tbl_id for md in tbl_md[::-1]]))
+
             # Now we need to load data for replica_tbl and its ancestors, except that we skip
             # replica_tbl itself if it's a pure snapshot.
             for md in tbl_md[::-1]:  # Base table first
@@ -619,7 +625,7 @@ class TableRestorer:
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
             # Move the file to the media store and update the URL.
-            self.media_files[url] = MediaStore.get().relocate_local_media_file(src_path, media_col)
+            self.media_files[url] = ObjectOps.put_file(media_col, src_path, relocate_or_delete=True)
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
pixeltable/share/publish.py CHANGED
@@ -14,7 +14,7 @@ import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils import sha256sum
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .packager import TablePackager, TableRestorer
 
@@ -79,16 +79,13 @@ def push_replica(
 
 
 def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
-    from pixeltable.utils.s3 import get_client
-
     bucket = parsed_location.netloc
     remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
     remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
 
     Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
 
-    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
-    s3_client = get_client(**boto_config)
+    s3_client = Env.get().get_client('s3')
 
     upload_args = {'ChecksumAlgorithm': 'SHA256'}
 
@@ -135,16 +132,13 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
 
 
 def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_filename: str) -> Path:
-    from pixeltable.utils.s3 import get_client
-
     bucket = parsed_location.netloc
     remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
     remote_path = str(remote_dir / bundle_filename)[1:]  # Remove initial /
 
     Env.get().console_logger.info(f'Downloading snapshot from: {bucket}:{remote_path}')
 
-    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
-    s3_client = get_client(**boto_config)
+    s3_client = Env.get().get_client('s3')
 
     obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
     bundle_size = obj['ContentLength']
@@ -260,7 +254,6 @@ def _download_from_presigned_url(
     session.close()
 
 
-# TODO: This will be replaced by drop_table with cloud table uri
 def delete_replica(dest_path: str) -> None:
    """Delete cloud replica"""
     delete_request_json = {'operation_type': 'delete_snapshot', 'table_uri': dest_path}
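
Both S3 helpers above now obtain their client from `Env.get().get_client('s3')` rather than the removed `pixeltable.utils.s3.get_client` helper, so connection-pool and retry configuration live in `Env`. For reference, a standalone sketch of an upload carrying the `ChecksumAlgorithm` setting seen in `upload_args`, using only public boto3 APIs (bucket, key, and file names are hypothetical):

    import boto3

    s3_client = boto3.client('s3')  # stand-in for Env.get().get_client('s3')
    # The checksum algorithm is passed per object, matching upload_args above.
    s3_client.upload_file(
        'bundle.tar.gz',
        'my-bucket',
        'snapshots/bundle.tar.gz',
        ExtraArgs={'ChecksumAlgorithm': 'SHA256'},
    )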
pixeltable/store.py CHANGED
@@ -274,7 +274,7 @@ class StoreBase:
             self.sa_md.remove(tmp_tbl)
             tmp_tbl.drop(bind=conn)
 
-        run_cleanup(remove_tmp_tbl, raise_error=True)
+        run_cleanup(remove_tmp_tbl, raise_error=False)
 
         return num_excs
 
pixeltable/type_system.py CHANGED
@@ -1081,9 +1081,7 @@ class ImageType(ColumnType):
         mode: Optional[str] = None,
         nullable: bool = False,
     ):
-        """
-        TODO: does it make sense to specify only width or height?
-        """
+        # TODO: does it make sense to specify only width or height?
         super().__init__(self.Type.IMAGE, nullable=nullable)
         assert not (width is not None and size is not None)
         assert not (height is not None and size is not None)
pixeltable/utils/dbms.py CHANGED
@@ -1,6 +1,6 @@
 import abc
 
-from sqlalchemy import URL
+import sqlalchemy as sql
 
 
 class Dbms(abc.ABC):
@@ -11,9 +11,9 @@ class Dbms(abc.ABC):
     name: str
     transaction_isolation_level: str
     version_index_type: str
-    db_url: URL
+    db_url: sql.URL
 
-    def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: URL) -> None:
+    def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: sql.URL) -> None:
         self.name = name
         self.transaction_isolation_level = transaction_isolation_level
         self.version_index_type = version_index_type
@@ -28,13 +28,18 @@ class Dbms(abc.ABC):
     @abc.abstractmethod
     def default_system_db_url(self) -> str: ...
 
+    @abc.abstractmethod
+    def create_vector_index(
+        self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
+    ) -> None: ...
+
 
 class PostgresqlDbms(Dbms):
     """
     Implements utilities to interact with Postgres database.
     """
 
-    def __init__(self, db_url: URL):
+    def __init__(self, db_url: sql.URL):
         super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)
 
     def drop_db_stmt(self, database: str) -> str:
@@ -47,13 +52,25 @@ class PostgresqlDbms(Dbms):
         a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
         return a
 
+    def create_vector_index(
+        self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
+    ) -> None:
+        idx = sql.Index(
+            index_name,
+            index_value_sa_col,
+            postgresql_using='hnsw',
+            postgresql_with={'m': 16, 'ef_construction': 64},
+            postgresql_ops={index_value_sa_col.name: metric},
+        )
+        idx.create(bind=conn)
+
 
 class CockroachDbms(Dbms):
     """
     Implements utilities to interact with CockroachDb database.
     """
 
-    def __init__(self, db_url: URL):
+    def __init__(self, db_url: sql.URL):
         super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
 
     def drop_db_stmt(self, database: str) -> str:
@@ -64,3 +81,12 @@ class CockroachDbms(Dbms):
 
     def default_system_db_url(self) -> str:
         return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
+
+    def create_vector_index(
+        self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
+    ) -> None:
+        create_index_sql = sql.text(
+            f"""CREATE VECTOR INDEX {index_name} ON {index_value_sa_col.table.name}
+            ({index_value_sa_col.name} {metric})"""
+        )
+        conn.execute(create_index_sql)
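
The two implementations reach the same goal by different paths: `PostgresqlDbms` builds a pgvector HNSW index through SQLAlchemy's `Index`, while `CockroachDbms` issues raw `CREATE VECTOR INDEX` DDL. A standalone sketch of the DDL the Postgres branch compiles to (table, column, and operator-class names are illustrative; the `PGVECTOR_OPS` mapping is not shown in this diff):

    import sqlalchemy as sql
    from sqlalchemy.dialects import postgresql
    from sqlalchemy.schema import CreateIndex

    md = sql.MetaData()
    # Stand-in column type; in pixeltable the column holds pgvector embeddings.
    tbl = sql.Table('tbl_0', md, sql.Column('idx_val', sql.LargeBinary))
    idx = sql.Index(
        'idx_0',
        tbl.c.idx_val,
        postgresql_using='hnsw',
        postgresql_with={'m': 16, 'ef_construction': 64},
        postgresql_ops={'idx_val': 'vector_cosine_ops'},  # illustrative operator class
    )
    print(CreateIndex(idx).compile(dialect=postgresql.dialect()))
    # Roughly: CREATE INDEX idx_0 ON tbl_0 USING hnsw (idx_val vector_cosine_ops)
    #          WITH (m = 16, ef_construction = 64)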