PyPI - pixeltable - Versions diffs - 0.2.24__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

pixeltable 0.2.24__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (101) hide show

pixeltable/__init__.py +2 -2
pixeltable/__version__.py +2 -2
pixeltable/catalog/__init__.py +1 -1
pixeltable/catalog/dir.py +6 -0
pixeltable/catalog/globals.py +25 -0
pixeltable/catalog/named_function.py +4 -0
pixeltable/catalog/path_dict.py +37 -11
pixeltable/catalog/schema_object.py +6 -0
pixeltable/catalog/table.py +531 -251
pixeltable/catalog/table_version.py +22 -8
pixeltable/catalog/view.py +8 -7
pixeltable/dataframe.py +439 -105
pixeltable/env.py +19 -5
pixeltable/exec/__init__.py +1 -1
pixeltable/exec/exec_node.py +6 -7
pixeltable/exec/expr_eval_node.py +1 -1
pixeltable/exec/sql_node.py +92 -45
pixeltable/exprs/__init__.py +1 -0
pixeltable/exprs/arithmetic_expr.py +1 -1
pixeltable/exprs/array_slice.py +1 -1
pixeltable/exprs/column_property_ref.py +1 -1
pixeltable/exprs/column_ref.py +29 -2
pixeltable/exprs/comparison.py +1 -1
pixeltable/exprs/compound_predicate.py +1 -1
pixeltable/exprs/expr.py +12 -5
pixeltable/exprs/expr_set.py +8 -0
pixeltable/exprs/function_call.py +147 -39
pixeltable/exprs/in_predicate.py +1 -1
pixeltable/exprs/inline_expr.py +25 -5
pixeltable/exprs/is_null.py +1 -1
pixeltable/exprs/json_mapper.py +1 -1
pixeltable/exprs/json_path.py +1 -1
pixeltable/exprs/method_ref.py +1 -1
pixeltable/exprs/row_builder.py +1 -1
pixeltable/exprs/rowid_ref.py +1 -1
pixeltable/exprs/similarity_expr.py +17 -7
pixeltable/exprs/sql_element_cache.py +4 -0
pixeltable/exprs/type_cast.py +2 -2
pixeltable/exprs/variable.py +3 -0
pixeltable/func/__init__.py +5 -4
pixeltable/func/aggregate_function.py +151 -68
pixeltable/func/callable_function.py +48 -16
pixeltable/func/expr_template_function.py +64 -23
pixeltable/func/function.py +227 -23
pixeltable/func/function_registry.py +2 -1
pixeltable/func/query_template_function.py +51 -9
pixeltable/func/signature.py +65 -7
pixeltable/func/tools.py +153 -0
pixeltable/func/udf.py +57 -35
pixeltable/functions/__init__.py +2 -2
pixeltable/functions/anthropic.py +51 -4
pixeltable/functions/gemini.py +85 -0
pixeltable/functions/globals.py +54 -34
pixeltable/functions/huggingface.py +10 -28
pixeltable/functions/json.py +3 -8
pixeltable/functions/math.py +67 -0
pixeltable/functions/mistralai.py +0 -2
pixeltable/functions/ollama.py +8 -8
pixeltable/functions/openai.py +51 -4
pixeltable/functions/timestamp.py +1 -1
pixeltable/functions/video.py +3 -9
pixeltable/functions/vision.py +1 -1
pixeltable/globals.py +374 -89
pixeltable/index/embedding_index.py +106 -29
pixeltable/io/__init__.py +1 -1
pixeltable/io/label_studio.py +1 -1
pixeltable/io/parquet.py +39 -19
pixeltable/iterators/__init__.py +1 -0
pixeltable/iterators/document.py +12 -0
pixeltable/iterators/image.py +100 -0
pixeltable/iterators/video.py +7 -8
pixeltable/metadata/__init__.py +1 -1
pixeltable/metadata/converters/convert_16.py +2 -1
pixeltable/metadata/converters/convert_17.py +2 -1
pixeltable/metadata/converters/convert_22.py +17 -0
pixeltable/metadata/converters/convert_23.py +35 -0
pixeltable/metadata/converters/convert_24.py +56 -0
pixeltable/metadata/converters/convert_25.py +19 -0
pixeltable/metadata/converters/util.py +4 -2
pixeltable/metadata/notes.py +4 -0
pixeltable/metadata/schema.py +1 -0
pixeltable/plan.py +129 -51
pixeltable/store.py +1 -1
pixeltable/type_system.py +196 -54
pixeltable/utils/arrow.py +8 -3
pixeltable/utils/description_helper.py +89 -0
pixeltable/utils/documents.py +14 -0
{pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/METADATA +32 -22
pixeltable-0.3.0.dist-info/RECORD +155 -0
{pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/WHEEL +1 -1
pixeltable-0.3.0.dist-info/entry_points.txt +3 -0
pixeltable/tool/create_test_db_dump.py +0 -308
pixeltable/tool/create_test_video.py +0 -81
pixeltable/tool/doc_plugins/griffe.py +0 -50
pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
pixeltable/tool/embed_udf.py +0 -9
pixeltable/tool/mypy_plugin.py +0 -55
pixeltable-0.2.24.dist-info/RECORD +0 -153
pixeltable-0.2.24.dist-info/entry_points.txt +0 -3
{pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/LICENSE +0 -0

pixeltable/index/embedding_index.py CHANGED Viewed

@@ -37,30 +37,89 @@ class EmbeddingIndex(IndexBase):
         Metric.L2: 'vector_l2_ops'
     }
+    metric: Metric
+    value_expr: exprs.FunctionCall
+    string_embed: Optional[func.Function]
+    image_embed: Optional[func.Function]
+    string_embed_signature_idx: int
+    image_embed_signature_idx: int
+    index_col_type: pgvector.sqlalchemy.Vector
     def __init__(
-            self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
-            image_embed: Optional[func.Function] = None):
+        self,
+        c: catalog.Column,
+        metric: str,
+        embed: Optional[func.Function] = None,
+        string_embed: Optional[func.Function] = None,
+        image_embed: Optional[func.Function] = None,
+    ):
+        if embed is None and string_embed is None and image_embed is None:
+            raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
         metric_names = [m.name.lower() for m in self.Metric]
         if metric.lower() not in metric_names:
             raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
         if not c.col_type.is_string_type() and not c.col_type.is_image_type():
             raise excs.Error(f'Embedding index requires string or image column')
-        if c.col_type.is_string_type() and string_embed is None:
-                raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
-        if c.col_type.is_image_type() and image_embed is None:
-            raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
+        self.string_embed = None
+        self.image_embed = None
+        # Resolve the specific embedding functions corresponding to the user-provided `string_embed`, `image_embed`,
+        # and/or `embed`. For string embeddings, `string_embed` will be used if specified; otherwise, `embed` will
+        # be used as a fallback, if it has a matching signature. Likewise for image embeddings.
         if string_embed is not None:
-            # verify signature
-            self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
+            # `string_embed` is specified; it MUST be valid.
+            self.string_embed = self._resolve_embedding_fn(string_embed, ts.ColumnType.Type.STRING)
+            if self.string_embed is None:
+                raise excs.Error(
+                    f'The function `{string_embed.name}` is not a valid string embedding: '
+                    'it must take a single string parameter'
+                )
+        elif embed is not None:
+            # `embed` is specified; see if it has a string signature.
+            self.string_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.STRING)
         if image_embed is not None:
-            # verify signature
-            self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
+            # `image_embed` is specified; it MUST be valid.
+            self.image_embed = self._resolve_embedding_fn(image_embed, ts.ColumnType.Type.IMAGE)
+            if self.image_embed is None:
+                raise excs.Error(
+                    f'The function `{image_embed.name}` is not a valid image embedding: '
+                    'it must take a single image parameter'
+                )
+        elif embed is not None:
+            # `embed` is specified; see if it has an image signature.
+            self.image_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.IMAGE)
+        if self.string_embed is None and self.image_embed is None:
+            # No string OR image signature was found. This can only happen if `embed` was specified and
+            # contains no matching signatures.
+            assert embed is not None
+            raise excs.Error(
+                f'The function `{embed.name}` is not a valid embedding: '
+                'it must take a single string or image parameter'
+            )
+        # Now validate the return types of the embedding functions.
+        if self.string_embed is not None:
+            self._validate_embedding_fn(self.string_embed, ts.ColumnType.Type.STRING)
+        if self.image_embed is not None:
+            self._validate_embedding_fn(self.image_embed, ts.ColumnType.Type.IMAGE)
+        if c.col_type.is_string_type() and self.string_embed is None:
+            raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
+        if c.col_type.is_image_type() and self.image_embed is None:
+            raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
         self.metric = self.Metric[metric.upper()]
-        self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
+        self.value_expr = (
+            self.string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type()
+            else self.image_embed(exprs.ColumnRef(c))
+        )
         assert isinstance(self.value_expr.col_type, ts.ArrayType)
-        self.string_embed = string_embed
-        self.image_embed = image_embed
         vector_size = self.value_expr.col_type.shape[0]
         assert vector_size is not None
         self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
@@ -91,10 +150,10 @@ class EmbeddingIndex(IndexBase):
         assert isinstance(item, (str, PIL.Image.Image))
         if isinstance(item, str):
             assert self.string_embed is not None
-            embedding = self.string_embed.exec(item)
+            embedding = self.string_embed.exec([item], {})
         if isinstance(item, PIL.Image.Image):
             assert self.image_embed is not None
-            embedding = self.image_embed.exec(item)
+            embedding = self.image_embed.exec([item], {})
         if self.metric == self.Metric.COSINE:
             return val_column.sa_col.cosine_distance(embedding) * -1 + 1
@@ -110,10 +169,10 @@ class EmbeddingIndex(IndexBase):
         embedding: Optional[np.ndarray] = None
         if isinstance(item, str):
             assert self.string_embed is not None
-            embedding = self.string_embed.exec(item)
+            embedding = self.string_embed.exec([item], {})
         if isinstance(item, PIL.Image.Image):
             assert self.image_embed is not None
-            embedding = self.image_embed.exec(item)
+            embedding = self.image_embed.exec([item], {})
         assert embedding is not None
         if self.metric == self.Metric.COSINE:
@@ -132,29 +191,47 @@ class EmbeddingIndex(IndexBase):
         return 'embedding'
     @classmethod
-    def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> None:
-        """Validate the signature"""
+    def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> Optional[func.Function]:
+        """Find an overload resolution for `embed_fn` that matches the given type."""
         assert isinstance(embed_fn, func.Function)
+        for resolved_fn in embed_fn._resolved_fns:
+            # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
+            # has more than one parameter, as long as it has at most one *required* parameter.
+            sig = resolved_fn.signature
+            if (len(sig.parameters) >= 1
+                and len(sig.required_parameters) <= 1
+                and sig.parameters_by_pos[0].col_type.type_enum == expected_type):
+                return resolved_fn
+        return None
+    @classmethod
+    def _validate_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> None:
+        """Validate the given embedding function."""
+        assert not embed_fn.is_polymorphic
         sig = embed_fn.signature
-        if len(sig.parameters) != 1 or sig.parameters_by_pos[0].col_type.type_enum != expected_type:
-            raise excs.Error(
-                f'{name} must take a single {expected_type.name.lower()} parameter, but has signature {sig}')
         # validate return type
         param_name = sig.parameters_by_pos[0].name
         if expected_type == ts.ColumnType.Type.STRING:
-            return_type = embed_fn.call_return_type({param_name: 'dummy'})
+            return_type = embed_fn.call_return_type([], {param_name: 'dummy'})
         else:
             assert expected_type == ts.ColumnType.Type.IMAGE
             img = PIL.Image.new('RGB', (512, 512))
-            return_type = embed_fn.call_return_type({param_name: img})
+            return_type = embed_fn.call_return_type([], {param_name: img})
         assert return_type is not None
         if not isinstance(return_type, ts.ArrayType):
-            raise excs.Error(f'{name} must return an array, but returns {return_type}')
-        else:
-            shape = return_type.shape
-            if len(shape) != 1 or shape[0] == None:
-                raise excs.Error(f'{name} must return a 1D array of a specific length, but returns {return_type}')
+            raise excs.Error(
+                f'The function `{embed_fn.name}` is not a valid embedding: '
+                f'it must return an array, but returns {return_type}'
+            )
+        shape = return_type.shape
+        if len(shape) != 1 or shape[0] == None:
+            raise excs.Error(
+                f'The function `{embed_fn.name}` is not a valid embedding: '
+                f'it must return a 1-dimensional array of a specific length, but returns {return_type}'
+            )
     def as_dict(self) -> dict:
         return {

pixeltable/io/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@ from .external_store import ExternalStore, SyncStatus
 from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
 from .hf_datasets import import_huggingface_dataset
 from .pandas import import_csv, import_excel, import_pandas
-from .parquet import import_parquet
+from .parquet import import_parquet, export_parquet
 __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
 __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}

pixeltable/io/label_studio.py CHANGED Viewed

@@ -574,7 +574,7 @@ class LabelStudioProject(Project):
             else:
                 local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
             if local_annotations_column not in t._schema.keys():
-                t[local_annotations_column] = pxt.JsonType(nullable=True)
+                t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})
         resolved_col_mapping = cls.validate_columns(
             t, config.export_columns, {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}, col_mapping)

pixeltable/io/parquet.py CHANGED Viewed

@@ -7,11 +7,14 @@ import random
 import typing
 from collections import deque
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Union
 import numpy as np
 import PIL.Image
+import datetime
+import pixeltable as pxt
+from pixeltable.env import Env
 import pixeltable.exceptions as exc
 import pixeltable.type_system as ts
 from pixeltable.utils.transactional_directory import transactional_directory
@@ -39,28 +42,44 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
     parquet.write_table(tab, str(output_path))
-def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
+def export_parquet(
+            table_or_df: Union[pxt.Table, pxt.DataFrame],
+            parquet_path: Path,
+            partition_size_bytes: int = 100_000_000,
+            inline_images: bool = False
+            ) -> None:
     """
-    Internal method to stream dataframe data to parquet format.
-    Does not materialize the dataset to memory.
+    Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
-    It preserves pixeltable type metadata in a json file, which would otherwise
+    It additionally writes the pixeltable metadata in a json file, which would otherwise
     not be available in the parquet format.
-    Images are stored inline in a compressed format in their parquet file.
     Args:
-        df : dataframe to save.
-        dest_path : path to directory to save the parquet files to.
-        partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
+        table_or_df : Table or Dataframe to export.
+        parquet_path : Path to directory to write the parquet files to.
+        partition_size_bytes : The maximum target size for each chunk. Default 100_000_000 bytes.
+        inline_images : If True, images are stored inline in the parquet file. This is useful
+                        for small images, to be imported as pytorch dataset. But can be inefficient
+                        for large images, and cannot be imported into pixeltable.
+                        If False, will raise an error if the Dataframe has any image column.
+                        Default False.
     """
     from pixeltable.utils.arrow import to_arrow_schema
+    df: pxt.DataFrame
+    if isinstance(table_or_df, pxt.catalog.Table):
+        df = table_or_df._df()
+    else:
+        df = table_or_df
     type_dict = {k: v.as_dict() for k, v in df.schema.items()}
     arrow_schema = to_arrow_schema(df.schema)
+    if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
+        raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
     # store the changes atomically
-    with transactional_directory(dest_path) as temp_path:
+    with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
@@ -111,6 +130,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
                 elif col_type.is_bool_type():
                     length = 1
                 elif col_type.is_timestamp_type():
+                    val = val.astimezone(datetime.timezone.utc)
                     length = 8
                 else:
                     assert False, f'unknown type {col_type} for {col_name}'
@@ -139,7 +159,7 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional
 def import_parquet(
-    table_path: str,
+    table: str,
     *,
     parquet_path: str,
     schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
@@ -148,7 +168,7 @@ def import_parquet(
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
     Args:
-        table_path: Path to the table.
+        table: Fully qualified name of the table to import the data into.
         parquet_path: Path to an individual Parquet file or directory of Parquet files.
         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
             name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
@@ -157,7 +177,7 @@ def import_parquet(
         kwargs: Additional arguments to pass to `create_table`.
     Returns:
-        A handle to the newly created [`Table`][pixeltable.Table].
+        A handle to the newly created table.
     """
     from pyarrow import parquet
@@ -176,11 +196,11 @@ def import_parquet(
         if v is None:
             raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
-    if table_path in pxt.list_tables():
-        raise exc.Error(f'Table {table_path} already exists')
+    if table in pxt.list_tables():
+        raise exc.Error(f'Table {table} already exists')
     try:
-        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
+        tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
         tab = pxt.create_table(tmp_name, schema, **kwargs)
         for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
             for batch in fragment.to_batches():
@@ -190,5 +210,5 @@ def import_parquet(
         _logger.error(f'Error while inserting Parquet file into table: {e}')
         raise e
-    pxt.move(tmp_name, table_path)
-    return pxt.get_table(table_path)
+    pxt.move(tmp_name, table)
+    return pxt.get_table(table)

pixeltable/iterators/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from .base import ComponentIterator
 from .document import DocumentSplitter
+from .image import TileIterator
 from .string import StringSplitter
 from .video import FrameIterator

pixeltable/iterators/document.py CHANGED Viewed

@@ -151,6 +151,9 @@ class DocumentSplitter(ComponentIterator):
         elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
             assert self._doc_handle.pdf_doc is not None
             self._sections = self._pdf_sections()
+        elif self._doc_handle.format == DocumentType.DocumentFormat.TXT:
+            assert self._doc_handle.txt_doc is not None
+            self._sections = self._txt_sections()
         else:
             assert False, f'Unsupported document format: {self._doc_handle.format}'
@@ -389,6 +392,15 @@ class DocumentSplitter(ComponentIterator):
         if accumulated_text and not emit_on_page:
             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
+    def _txt_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections for text files.
+        Currently, it returns the entire text as a single section.
+        TODO: Add support for paragraphs.
+        """
+        assert self._doc_handle.txt_doc is not None
+        yield DocumentSection(text=ftfy.fix_text(self._doc_handle.txt_doc), metadata=DocumentSectionMetadata())
     def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         """Split the input sections into sentences"""
         for section in input_sections:

pixeltable/iterators/image.py ADDED Viewed

@@ -0,0 +1,100 @@
+from typing import Any, Sequence
+import PIL.Image
+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+from pixeltable.iterators.base import ComponentIterator
+class TileIterator(ComponentIterator):
+    """
+    Iterator over tiles of an image. Each image will be divided into tiles of size `tile_size`, and the tiles will be
+    iterated over in row-major order (left-to-right, then top-to-bottom). An optional `overlap` parameter may be
+    specified. If the tiles do not exactly cover the image, then the rightmost and bottommost tiles will be padded with
+    blackspace, so that the output images all have the exact size `tile_size`.
+    Args:
+        image: Image to split into tiles.
+        tile_size: Size of each tile, as a pair of integers `[width, height]`.
+        overlap: Amount of overlap between adjacent tiles, as a pair of integers `[width, height]`.
+    """
+    __image: PIL.Image.Image
+    __tile_size: Sequence[int]
+    __overlap: Sequence[int]
+    __width: int
+    __height: int
+    __xlen: int
+    __ylen: int
+    __i: int
+    __j: int
+    def __init__(
+        self,
+        image: PIL.Image.Image,
+        *,
+        tile_size: tuple[int, int],
+        overlap: tuple[int, int] = (0, 0),
+    ):
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f"overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}")
+        self.__image = image
+        self.__image.load()
+        self.__tile_size = tile_size
+        self.__overlap = overlap
+        self.__width, self.__height = image.size
+        # Justification for this formula: let t = tile_size[0], o = overlap[0]. Then the values of w (= width) that
+        # exactly accommodate an integer number of tiles are t, 2t - o, 3t - 2o, 4t - 3o, ...
+        # This formula ensures that t, 2t - o, 3t - 2o, ... result in an xlen of 1, 2, 3, ...
+        # but t + 1, 2t - o + 1, 3t - 2o + 1, ... result in an xlen of 2, 3, 4, ...
+        self.__xlen = (self.__width - overlap[0] - 1) // (tile_size[0] - overlap[0]) + 1
+        self.__ylen = (self.__height - overlap[1] - 1) // (tile_size[1] - overlap[1]) + 1
+        self.__i = 0
+        self.__j = 0
+    def __next__(self) -> dict[str, Any]:
+        if self.__j >= self.__ylen:
+            raise StopIteration
+        x1 = self.__i * (self.__tile_size[0] - self.__overlap[0])
+        y1 = self.__j * (self.__tile_size[1] - self.__overlap[1])
+        # If x2 > self.__width, PIL does the right thing and pads the image with blackspace
+        x2 = x1 + self.__tile_size[0]
+        y2 = y1 + self.__tile_size[1]
+        tile = self.__image.crop((x1, y1, x2, y2))
+        result = {
+            'tile': tile,
+            'tile_coord': [self.__i, self.__j],
+            'tile_box': [x1, y1, x2, y2]
+        }
+        self.__i += 1
+        if self.__i >= self.__xlen:
+            self.__i = 0
+            self.__j += 1
+        return result
+    def close(self) -> None:
+        pass
+    def set_pos(self, pos: int) -> None:
+        self.__j = pos // self.__xlen
+        self.__i = pos % self.__xlen
+    @classmethod
+    def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
+        return {
+            'image': ts.ImageType(),
+            'tile_size': ts.JsonType(),
+            'overlap': ts.JsonType(),
+        }
+    @classmethod
+    def output_schema(cls,  *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        return {
+            'tile': ts.ImageType(),
+            'tile_coord': ts.JsonType(),
+            'tile_box': ts.JsonType(),
+        }, ['tile']

pixeltable/iterators/video.py CHANGED Viewed

@@ -23,13 +23,13 @@ class FrameIterator(ComponentIterator):
     exact number of frames will be extracted. If neither is specified, then all frames will be extracted. The first
     frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
-        Args:
-            video: URL or path of the video to use for frame extraction.
-            fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
-                If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
-                extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
-            num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
-                `num_frames` is greater than the number of frames in the video, all frames will be extracted.
+    Args:
+        video: URL or path of the video to use for frame extraction.
+        fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
+            If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
+            extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
+        num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
+            `num_frames` is greater than the number of frames in the video, all frames will be extracted.
     """
     # Input parameters
@@ -180,7 +180,6 @@ class FrameIterator(ComponentIterator):
         self.container.close()
     def set_pos(self, pos: int) -> None:
-        """Seek to frame idx"""
         if pos == self.next_pos:
             return  # already there

pixeltable/metadata/__init__.py CHANGED Viewed

@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 22
+VERSION = 26
 def create_system_info(engine: sql.engine.Engine) -> None:

pixeltable/metadata/converters/convert_16.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from uuid import UUID
 import sqlalchemy as sql
 from pixeltable.metadata import register_converter
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
     )
-def __update_table_md(table_md: dict) -> None:
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
     # External stores are not migratable; just drop them
     del table_md['remotes']
     table_md['external_stores'] = {}

pixeltable/metadata/converters/convert_17.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from uuid import UUID
 import sqlalchemy as sql
 from pixeltable.metadata import register_converter
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
     )
-def __update_table_md(table_md: dict) -> None:
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
     # key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
     if len(table_md['index_md']) == 0:
         return

pixeltable/metadata/converters/convert_22.py ADDED Viewed

@@ -0,0 +1,17 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+@register_converter(version=22)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'DataFrame':
+        v['from_clause'] = {'tbls': [v['tbl']], 'join_clauses': []}
+        return k, v
+    return None

pixeltable/metadata/converters/convert_23.py ADDED Viewed

@@ -0,0 +1,35 @@
+import logging
+from typing import Any, Optional
+from uuid import UUID
+import sqlalchemy as sql
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+from pixeltable.metadata.schema import Table
+_logger = logging.getLogger('pixeltable')
+@register_converter(version=23)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(
+        engine,
+        table_md_updater=__update_table_md
+    )
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """update the index metadata to add indexed_col_tbl_id column if it is missing
+    Args:
+        table_md (dict): copy of the original table metadata. this gets updated in place.
+        table_id (UUID): the table id
+    """
+    if len(table_md['index_md']) == 0:
+        return
+    for idx_md in table_md['index_md'].values():
+        if 'indexed_col_tbl_id' not in idx_md:
+            # index metadata is missing indexed_col_tbl_id
+            # assume that the indexed column is in the same table
+            # and update the index metadata.
+            _logger.info(f'Updating index metadata for table: {table_id} index: {idx_md["id"]}')
+            idx_md['indexed_col_tbl_id'] = str(table_id)

pixeltable/metadata/converters/convert_24.py ADDED Viewed

@@ -0,0 +1,56 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+@register_converter(version=24)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    from pixeltable import func
+    from pixeltable.func.globals import resolve_symbol
+    if (isinstance(v, dict) and
+        '_classpath' in v and
+        v['_classpath'] in ['pixeltable.func.callable_function.CallableFunction',
+                            'pixeltable.func.aggregate_function.AggregateFunction',
+                            'pixeltable.func.expr_template_function.ExprTemplateFunction']):
+        if 'path' in v:
+            assert 'signature' not in v
+            f = resolve_symbol(__substitute_path(v['path']))
+            assert isinstance(f, func.Function)
+            v['signature'] = f.signatures[0].as_dict()
+        return k, v
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'FunctionCall':
+        # Correct an older serialization mechanism where Expr elements of FunctionCall args and
+        # kwargs were indicated with idx == -1 rather than None. This was fixed for InlineList
+        # and InlineDict back in convert_20, but not for FunctionCall.
+        assert 'args' in v and isinstance(v['args'], list)
+        assert 'kwargs' in v and isinstance(v['kwargs'], dict)
+        v['args'] = [
+            (None, arg) if idx == -1 else (idx, arg)
+            for idx, arg in v['args']
+        ]
+        v['kwargs'] = {
+            k: (None, arg) if idx == -1 else (idx, arg)
+            for k, (idx, arg) in v['kwargs'].items()
+        }
+        return k, v
+    return None
+def __substitute_path(path: str) -> str:
+    # Starting with version 25, function signatures are preserved in metadata. To migrate from older
+    # versions, it's necessary to resolve the function symbol to get the signature. The following
+    # adjustment is necessary for function names that are stored in db artifacts of version < 25, but
+    # have changed in some version > 25.
+    if path in ['pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image']:
+        return 'pixeltable.functions.huggingface.clip'
+    return path

pixeltable 0.2.24__py3-none-any.whl → 0.3.0__py3-none-any.whl

Potentially problematic release.

pixeltable 0.2.24__py3-none-any.whl → 0.3.0__py3-none-any.whl