PyPI - pixeltable - Versions diffs - 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (150)

pixeltable/__init__.py +64 -11
pixeltable/__version__.py +2 -2
pixeltable/catalog/__init__.py +1 -1
pixeltable/catalog/catalog.py +50 -27
pixeltable/catalog/column.py +27 -11
pixeltable/catalog/dir.py +6 -4
pixeltable/catalog/globals.py +8 -1
pixeltable/catalog/insertable_table.py +22 -12
pixeltable/catalog/named_function.py +10 -6
pixeltable/catalog/path.py +3 -2
pixeltable/catalog/path_dict.py +8 -6
pixeltable/catalog/schema_object.py +2 -1
pixeltable/catalog/table.py +121 -101
pixeltable/catalog/table_version.py +291 -142
pixeltable/catalog/table_version_path.py +8 -5
pixeltable/catalog/view.py +67 -26
pixeltable/dataframe.py +106 -81
pixeltable/env.py +28 -24
pixeltable/exec/__init__.py +2 -2
pixeltable/exec/aggregation_node.py +10 -4
pixeltable/exec/cache_prefetch_node.py +5 -3
pixeltable/exec/component_iteration_node.py +9 -9
pixeltable/exec/data_row_batch.py +21 -10
pixeltable/exec/exec_context.py +10 -3
pixeltable/exec/exec_node.py +23 -12
pixeltable/exec/expr_eval/evaluators.py +13 -7
pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
pixeltable/exec/expr_eval/globals.py +30 -7
pixeltable/exec/expr_eval/row_buffer.py +5 -6
pixeltable/exec/expr_eval/schedulers.py +151 -31
pixeltable/exec/in_memory_data_node.py +8 -7
pixeltable/exec/row_update_node.py +15 -5
pixeltable/exec/sql_node.py +56 -27
pixeltable/exprs/__init__.py +2 -2
pixeltable/exprs/arithmetic_expr.py +57 -26
pixeltable/exprs/array_slice.py +1 -1
pixeltable/exprs/column_property_ref.py +2 -1
pixeltable/exprs/column_ref.py +20 -15
pixeltable/exprs/comparison.py +6 -2
pixeltable/exprs/compound_predicate.py +1 -3
pixeltable/exprs/data_row.py +2 -2
pixeltable/exprs/expr.py +108 -72
pixeltable/exprs/expr_dict.py +2 -1
pixeltable/exprs/expr_set.py +3 -1
pixeltable/exprs/function_call.py +39 -41
pixeltable/exprs/globals.py +1 -0
pixeltable/exprs/in_predicate.py +2 -2
pixeltable/exprs/inline_expr.py +20 -17
pixeltable/exprs/json_mapper.py +4 -2
pixeltable/exprs/json_path.py +12 -18
pixeltable/exprs/literal.py +5 -9
pixeltable/exprs/method_ref.py +1 -0
pixeltable/exprs/object_ref.py +1 -1
pixeltable/exprs/row_builder.py +32 -17
pixeltable/exprs/rowid_ref.py +14 -5
pixeltable/exprs/similarity_expr.py +11 -6
pixeltable/exprs/sql_element_cache.py +1 -1
pixeltable/exprs/type_cast.py +24 -9
pixeltable/ext/__init__.py +1 -0
pixeltable/ext/functions/__init__.py +1 -0
pixeltable/ext/functions/whisperx.py +2 -2
pixeltable/ext/functions/yolox.py +11 -11
pixeltable/func/aggregate_function.py +17 -13
pixeltable/func/callable_function.py +6 -6
pixeltable/func/expr_template_function.py +15 -14
pixeltable/func/function.py +16 -16
pixeltable/func/function_registry.py +11 -8
pixeltable/func/globals.py +4 -2
pixeltable/func/query_template_function.py +12 -13
pixeltable/func/signature.py +18 -9
pixeltable/func/tools.py +10 -17
pixeltable/func/udf.py +106 -11
pixeltable/functions/__init__.py +21 -2
pixeltable/functions/anthropic.py +16 -12
pixeltable/functions/fireworks.py +63 -5
pixeltable/functions/gemini.py +13 -3
pixeltable/functions/globals.py +18 -6
pixeltable/functions/huggingface.py +20 -38
pixeltable/functions/image.py +7 -3
pixeltable/functions/json.py +1 -0
pixeltable/functions/llama_cpp.py +1 -4
pixeltable/functions/mistralai.py +31 -20
pixeltable/functions/ollama.py +4 -18
pixeltable/functions/openai.py +231 -113
pixeltable/functions/replicate.py +11 -10
pixeltable/functions/string.py +70 -7
pixeltable/functions/timestamp.py +21 -8
pixeltable/functions/together.py +66 -52
pixeltable/functions/video.py +1 -0
pixeltable/functions/vision.py +14 -11
pixeltable/functions/whisper.py +2 -1
pixeltable/globals.py +60 -26
pixeltable/index/__init__.py +1 -1
pixeltable/index/btree.py +5 -3
pixeltable/index/embedding_index.py +15 -14
pixeltable/io/__init__.py +1 -1
pixeltable/io/external_store.py +30 -25
pixeltable/io/fiftyone.py +6 -14
pixeltable/io/globals.py +33 -27
pixeltable/io/hf_datasets.py +2 -1
pixeltable/io/label_studio.py +77 -68
pixeltable/io/pandas.py +36 -23
pixeltable/io/parquet.py +9 -12
pixeltable/iterators/__init__.py +1 -0
pixeltable/iterators/audio.py +205 -0
pixeltable/iterators/document.py +19 -8
pixeltable/iterators/image.py +6 -24
pixeltable/iterators/string.py +3 -6
pixeltable/iterators/video.py +1 -7
pixeltable/metadata/__init__.py +7 -1
pixeltable/metadata/converters/convert_10.py +2 -2
pixeltable/metadata/converters/convert_15.py +1 -5
pixeltable/metadata/converters/convert_16.py +2 -4
pixeltable/metadata/converters/convert_17.py +2 -4
pixeltable/metadata/converters/convert_18.py +2 -4
pixeltable/metadata/converters/convert_19.py +2 -5
pixeltable/metadata/converters/convert_20.py +1 -4
pixeltable/metadata/converters/convert_21.py +4 -6
pixeltable/metadata/converters/convert_22.py +1 -0
pixeltable/metadata/converters/convert_23.py +5 -5
pixeltable/metadata/converters/convert_24.py +12 -13
pixeltable/metadata/converters/convert_26.py +23 -0
pixeltable/metadata/converters/util.py +3 -4
pixeltable/metadata/notes.py +1 -0
pixeltable/metadata/schema.py +13 -2
pixeltable/plan.py +173 -98
pixeltable/share/__init__.py +0 -0
pixeltable/share/packager.py +218 -0
pixeltable/store.py +42 -26
pixeltable/type_system.py +102 -75
pixeltable/utils/arrow.py +7 -8
pixeltable/utils/coco.py +16 -17
pixeltable/utils/code.py +1 -1
pixeltable/utils/console_output.py +6 -3
pixeltable/utils/description_helper.py +7 -7
pixeltable/utils/documents.py +3 -1
pixeltable/utils/filecache.py +12 -7
pixeltable/utils/http_server.py +9 -8
pixeltable/utils/iceberg.py +14 -0
pixeltable/utils/media_store.py +3 -2
pixeltable/utils/pytorch.py +11 -14
pixeltable/utils/s3.py +1 -0
pixeltable/utils/sql.py +1 -0
pixeltable/utils/transactional_directory.py +2 -2
{pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/METADATA +9 -9
pixeltable-0.3.4.dist-info/RECORD +166 -0
pixeltable-0.3.2.dist-info/RECORD +0 -161
{pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/LICENSE +0 -0
{pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/WHEEL +0 -0
{pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt +0 -0

pixeltable/io/pandas.py CHANGED

@@ -9,10 +9,13 @@ import pixeltable.type_system as ts
 def import_pandas(
-    tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    tbl_name: str,
+    df: pd.DataFrame,
+    *,
+    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
-    comment: str = ''
+    comment: str = '',
 ) -> pxt.Table:
     """Creates a new base table from a Pandas
     [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
@@ -45,17 +48,21 @@ def import_pandas(
     schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
     tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
-    table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
+    table = pxt.create_table(
+        tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
+    )
     table.insert(tbl_rows)
     return table
 def import_csv(
-    tbl_name: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
+    tbl_name: str,
+    filepath_or_buffer,
+    schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs
+    **kwargs,
 ) -> pxt.Table:
     """
     Creates a new base table from a csv file. This is a convenience method and is equivalent
@@ -67,15 +74,25 @@ def import_csv(
         A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_csv(filepath_or_buffer, **kwargs)
-    return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    return import_pandas(
+        tbl_name,
+        df,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
+    )
 def import_excel(
-    tbl_name: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
+    tbl_name: str,
+    io,
+    *args,
+    schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs
+    **kwargs,
 ) -> pxt.Table:
     """
     Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
@@ -87,7 +104,14 @@ def import_excel(
         A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_excel(io, *args, **kwargs)
-    return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    return import_pandas(
+        tbl_name,
+        df,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
+    )
 def __df_to_pxt_schema(
@@ -161,20 +185,9 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
     """
     Infers a Pixeltable type based on a Numpy dtype.
     """
-    if np.issubdtype(np_dtype, np.integer):
-        return pxt.IntType(nullable=nullable)
-    if np.issubdtype(np_dtype, np.floating):
-        return pxt.FloatType(nullable=nullable)
-    if np.issubdtype(np_dtype, np.bool_):
-        return pxt.BoolType(nullable=nullable)
-    if np.issubdtype(np_dtype, np.character):
-        return pxt.StringType(nullable=nullable)
-    if np.issubdtype(np_dtype, np.datetime64):
-        return pxt.TimestampType(nullable=nullable)
+    pxttype = ts.ArrayType.from_np_dtype(np_dtype, nullable)
+    if pxttype is not None:
+        return pxttype
     if np_dtype == np.object_:
         # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type

pixeltable/io/parquet.py CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import datetime
 import io
 import json
 import logging
@@ -11,16 +12,16 @@ from typing import Any, Optional, Union
 import numpy as np
 import PIL.Image
-import datetime
 import pixeltable as pxt
-from pixeltable.env import Env
 import pixeltable.exceptions as exc
 import pixeltable.type_system as ts
+from pixeltable.env import Env
 from pixeltable.utils.transactional_directory import transactional_directory
 if typing.TYPE_CHECKING:
     import pyarrow as pa
     import pixeltable as pxt
 _logger = logging.getLogger('pixeltable')
@@ -43,11 +44,11 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
 def export_parquet(
-            table_or_df: Union[pxt.Table, pxt.DataFrame],
-            parquet_path: Path,
-            partition_size_bytes: int = 100_000_000,
-            inline_images: bool = False
-            ) -> None:
+    table_or_df: Union[pxt.Table, pxt.DataFrame],
+    parquet_path: Path,
+    partition_size_bytes: int = 100_000_000,
+    inline_images: bool = False,
+) -> None:
     """
     Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
@@ -159,11 +160,7 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional
 def import_parquet(
-    table: str,
-    *,
-    parquet_path: str,
-    schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
-    **kwargs: Any,
+    table: str, *, parquet_path: str, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs: Any
 ) -> pxt.Table:
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.

pixeltable/iterators/__init__.py CHANGED

@@ -1,3 +1,4 @@
+from .audio import AudioSplitter
 from .base import ComponentIterator
 from .document import DocumentSplitter
 from .image import TileIterator

pixeltable/iterators/audio.py ADDED

@@ -0,0 +1,205 @@
+import logging
+import math
+import uuid
+from fractions import Fraction
+from pathlib import Path
+from typing import Any, Optional
+import av  # type: ignore[import-untyped]
+import pixeltable.env as env
+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+from .base import ComponentIterator
+_logger = logging.getLogger('pixeltable')
+class AudioSplitter(ComponentIterator):
+    """
+    Iterator over chunks of an audio file. The audio file is split into smaller chunks, where the duration of each chunk is determined by chunk_duration_sec.
+    The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
+    If the input contains no audio, no chunks are yielded.
+    Args:
+        chunk_duration_sec: Audio chunk duration in seconds
+        overlap_sec: Overlap between consecutive chunks in seconds.
+        min_chunk_duration_sec: Drop the last chunk if it is smaller than min_chunk_duration_sec
+    """
+    # Input parameters
+    audio_path: Path
+    chunk_duration_sec: float
+    overlap_sec: float
+    # audio stream details
+    container: av.container.input.InputContainer
+    audio_time_base: Fraction  # seconds per presentation time
+    # List of chunks to extract
+    # Each chunk is defined by start and end presentation timestamps in audio file (int)
+    chunks_to_extract_in_pts: Optional[list[tuple[int, int]]] = []
+    # next chunk to extract
+    next_pos: int
+    __codec_map = {
+        'mp3': 'mp3',  # MP3 decoder -> mp3/libmp3lame encoder
+        'mp3float': 'mp3',  # MP3float decoder -> mp3 encoder
+        'aac': 'aac',  # AAC decoder -> AAC encoder
+        'vorbis': 'libvorbis',  # Vorbis decoder -> libvorbis encoder
+        'opus': 'libopus',  # Opus decoder -> libopus encoder
+        'flac': 'flac',  # FLAC decoder -> FLAC encoder
+        'wavpack': 'wavpack',  # WavPack decoder -> WavPack encoder
+        'alac': 'alac',  # ALAC decoder -> ALAC encoder
+    }
+    def __init__(
+        self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
+    ):
+        if chunk_duration_sec <= 0.0:
+            raise excs.Error('chunk_duration_sec must be a positive number')
+        if chunk_duration_sec < min_chunk_duration_sec:
+            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
+        if overlap_sec >= chunk_duration_sec:
+            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
+        audio_path = Path(audio)
+        assert audio_path.exists() and audio_path.is_file()
+        self.audio_path = audio_path
+        self.next_pos = 0
+        self.container = av.open(str(audio_path))
+        if len(self.container.streams.audio) == 0:
+            # No audio stream
+            return
+        self.chunk_duration_sec = chunk_duration_sec
+        self.overlap_sec = overlap_sec
+        self.min_chunk_duration_sec = min_chunk_duration_sec
+        self.audio_time_base = self.container.streams.audio[0].time_base
+        audio_start_time_pts = self.container.streams.audio[0].start_time or 0
+        audio_start_time_sec = float(audio_start_time_pts * self.audio_time_base)
+        total_audio_duration_pts = self.container.streams.audio[0].duration or 0
+        total_audio_duration_sec = float(total_audio_duration_pts * self.audio_time_base)
+        self.chunks_to_extract_in_pts = [
+            (round(start / self.audio_time_base), round(end / self.audio_time_base))
+            for (start, end) in self.build_chunks(
+                audio_start_time_sec, total_audio_duration_sec, chunk_duration_sec, overlap_sec, min_chunk_duration_sec
+            )
+        ]
+        _logger.debug(
+            f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts} chunks_to_extract_in_pts={self.chunks_to_extract_in_pts}'
+        )
+    @classmethod
+    def build_chunks(
+        cls,
+        start_time_sec: float,
+        total_duration_sec: float,
+        chunk_duration_sec: float,
+        overlap_sec: float,
+        min_chunk_duration_sec: float,
+    ) -> list[tuple[float, float]]:
+        chunks_to_extract_in_sec: list[tuple[float, float]] = []
+        current_pos = start_time_sec
+        end_time = start_time_sec + total_duration_sec
+        while current_pos < end_time:
+            chunk_start = current_pos
+            chunk_end = min(chunk_start + chunk_duration_sec, end_time)
+            chunks_to_extract_in_sec.append((chunk_start, chunk_end))
+            if chunk_end >= end_time:
+                break
+            current_pos = chunk_end - overlap_sec
+        # If the last chunk is smaller than min_chunk_duration_sec then drop the last chunk from the list
+        if (
+            len(chunks_to_extract_in_sec) > 0
+            and (chunks_to_extract_in_sec[-1][1] - chunks_to_extract_in_sec[-1][0]) < min_chunk_duration_sec
+        ):
+            return chunks_to_extract_in_sec[:-1]  # return all but the last chunk
+        return chunks_to_extract_in_sec
+    @classmethod
+    def input_schema(cls) -> dict[str, ts.ColumnType]:
+        return {
+            'audio': ts.AudioType(nullable=False),
+            'chunk_duration_sec': ts.FloatType(nullable=True),
+            'overlap_sec': ts.FloatType(nullable=True),
+            'min_chunk_duration_sec': ts.FloatType(nullable=True),
+        }
+    @classmethod
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        return {
+            'start_time_sec': ts.FloatType(),
+            'end_time_sec': ts.FloatType(),
+            'audio_chunk': ts.AudioType(nullable=True),
+        }, []
+    def __next__(self) -> dict[str, Any]:
+        if self.next_pos >= len(self.chunks_to_extract_in_pts):
+            raise StopIteration
+        target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
+        chunk_start_pts = 0
+        chunk_end_pts = 0
+        chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
+        output_container = av.open(chunk_file, mode='w')
+        input_stream = self.container.streams.audio[0]
+        codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
+        output_stream = output_container.add_stream(codec_name, rate=input_stream.codec_context.sample_rate)
+        frame_count = 0
+        # Since frames don't align with chunk boundaries, we may have read an extra frame in previous iteration
+        # Seek to the nearest frame in stream at current chunk start time
+        self.container.seek(target_chunk_start, backward=True, stream=self.container.streams.audio[0])
+        while True:
+            try:
+                frame = next(self.container.decode(audio=0))
+            except EOFError as e:
+                raise excs.Error(f'Failed to read audio file `{self.audio_path}`, error `{e}`')
+            except StopIteration:
+                # no more frames to scan
+                break
+            if frame.pts < target_chunk_start:
+                # Current frame is behind chunk's start time, always get frame next to chunk's start time
+                continue
+            if frame.pts >= target_chunk_end:
+                # Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away the current frame
+                break
+            frame_end = frame.pts + frame.samples
+            if frame_count == 0:
+                # Record start of the first frame
+                chunk_start_pts = frame.pts
+            # Write frame to output container
+            frame_count += 1
+            # If encode returns packets, write them to output container. Some encoders will buffer the frames.
+            output_container.mux(output_stream.encode(frame))
+            # record this frame's end as chunks end
+            chunk_end_pts = frame_end
+            # Check if frame's end has crossed the chunk boundary
+            if frame_end >= target_chunk_end:
+                break
+        # record result
+        if frame_count > 0:
+            # flush encoder
+            output_container.mux(output_stream.encode(None))
+            output_container.close()
+            result = {
+                'start_time_sec': round(float(chunk_start_pts * self.audio_time_base), 4),
+                'end_time_sec': round(float(chunk_end_pts * self.audio_time_base), 4),
+                'audio_chunk': chunk_file if frame_count > 0 else None,
+            }
+            _logger.debug('audio chunk result: %s', result)
+            self.next_pos += 1
+            return result
+        else:
+            # It's possible that there are no frames in the range of the last chunk, stop the iterator in this case.
+            # Note that start_time points at the first frame so case applies only for the last chunk
+            assert self.next_pos == len(self.chunks_to_extract_in_pts) - 1
+            self.next_pos += 1
+            raise StopIteration
+    def close(self) -> None:
+        self.container.close()
+    def set_pos(self, pos: int) -> None:
+        pass

pixeltable/iterators/document.py CHANGED

@@ -35,6 +35,7 @@ class Separator(enum.Enum):
 @dataclasses.dataclass
 class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
     # html and markdown metadata
     sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
@@ -50,6 +51,7 @@ class DocumentSectionMetadata:
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
     text: Optional[str]
     metadata: Optional[DocumentSectionMetadata]
@@ -93,6 +95,7 @@ class DocumentSplitter(ComponentIterator):
     Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
     """
     METADATA_COLUMN_TYPES = {
         ChunkMetadata.TITLE: StringType(nullable=True),
         ChunkMetadata.HEADING: JsonType(nullable=True),
@@ -102,10 +105,16 @@ class DocumentSplitter(ComponentIterator):
     }
     def __init__(
-            self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
-            metadata: str = '',
-            html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
-            tiktoken_target_model: Optional[str] = None
+        self,
+        document: str,
+        *,
+        separators: str,
+        limit: Optional[int] = None,
+        overlap: Optional[int] = None,
+        metadata: str = '',
+        html_skip_tags: Optional[list[str]] = None,
+        tiktoken_encoding: Optional[str] = 'cl100k_base',
+        tiktoken_target_model: Optional[str] = None,
     ):
         """Init method for `DocumentSplitter` class.
@@ -234,13 +243,14 @@ class DocumentSplitter(ComponentIterator):
     def _html_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the html-specific separators"""
         import bs4
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
         accumulated_text: list[str] = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: dict[str, str] = {}   # current state of observed headings (level -> text)
+        headings: dict[str, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline
         def update_metadata(el: bs4.Tag) -> None:
@@ -300,7 +310,7 @@ class DocumentSplitter(ComponentIterator):
         # current state
         accumulated_text: list[str] = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: dict[str, str] = {}   # current state of observed headings (level -> text)
+        headings: dict[str, str] = {}  # current state of observed headings (level -> text)
         def update_headings(heading: dict) -> None:
             # update current state
@@ -353,6 +363,7 @@ class DocumentSplitter(ComponentIterator):
     def _pdf_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the pdf-specific separators"""
         import fitz  # type: ignore[import-untyped]
         doc: fitz.Document = self._doc_handle.pdf_doc
         assert doc is not None
@@ -385,8 +396,7 @@ class DocumentSplitter(ComponentIterator):
                     yield DocumentSection(text=_emit_text(), metadata=metadata)
             if accumulated_text and emit_on_page and not emit_on_paragraph:
-                yield DocumentSection(text=_emit_text(),
-                                      metadata=DocumentSectionMetadata(page=page_number))
+                yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
                 accumulated_text = []
         if accumulated_text and not emit_on_page:
@@ -411,6 +421,7 @@ class DocumentSplitter(ComponentIterator):
     def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         import tiktoken
         if self._tiktoken_target_model is not None:
             encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
         else:

pixeltable/iterators/image.py CHANGED

@@ -30,15 +30,9 @@ class TileIterator(ComponentIterator):
     __i: int
     __j: int
-    def __init__(
-        self,
-        image: PIL.Image.Image,
-        *,
-        tile_size: tuple[int, int],
-        overlap: tuple[int, int] = (0, 0),
-    ):
+    def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
         if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f"overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}")
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
         self.__image = image
         self.__image.load()
@@ -64,11 +58,7 @@ class TileIterator(ComponentIterator):
         x2 = x1 + self.__tile_size[0]
         y2 = y1 + self.__tile_size[1]
         tile = self.__image.crop((x1, y1, x2, y2))
-        result = {
-            'tile': tile,
-            'tile_coord': [self.__i, self.__j],
-            'tile_box': [x1, y1, x2, y2]
-        }
+        result = {'tile': tile, 'tile_coord': [self.__i, self.__j], 'tile_box': [x1, y1, x2, y2]}
         self.__i += 1
         if self.__i >= self.__xlen:
@@ -85,16 +75,8 @@ class TileIterator(ComponentIterator):
     @classmethod
     def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
-        return {
-            'image': ts.ImageType(),
-            'tile_size': ts.JsonType(),
-            'overlap': ts.JsonType(),
-        }
+        return {'image': ts.ImageType(), 'tile_size': ts.JsonType(), 'overlap': ts.JsonType()}
     @classmethod
-    def output_schema(cls,  *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        return {
-            'tile': ts.ImageType(),
-            'tile_coord': ts.JsonType(),
-            'tile_box': ts.JsonType(),
-        }, ['tile']
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']

pixeltable/iterators/string.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Iterator, Any
+from typing import Any, Iterator
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
@@ -30,11 +30,8 @@ class StringSplitter(ComponentIterator):
     @classmethod
     def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
-        return {
-            'text': ts.StringType(),
-            'separators': ts.StringType(),
-        }
+        return {'text': ts.StringType(), 'separators': ts.StringType()}
     @classmethod
-    def output_schema(cls,  *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
         return {'text': ts.StringType()}, []

pixeltable/iterators/video.py CHANGED

@@ -24,7 +24,6 @@ class FrameIterator(ComponentIterator):
     frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
     Args:
-        video: URL or path of the video to use for frame extraction.
         fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
             If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
             extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
@@ -167,12 +166,7 @@ class FrameIterator(ComponentIterator):
             img = frame.to_image()
             assert isinstance(img, PIL.Image.Image)
             pos_msec = float(pts * self.video_time_base * 1000)
-            result = {
-                'frame_idx': self.next_pos,
-                'pos_msec': pos_msec,
-                'pos_frame': video_idx,
-                'frame': img,
-            }
+            result = {'frame_idx': self.next_pos, 'pos_msec': pos_msec, 'pos_frame': video_idx, 'frame': img}
             self.next_pos += 1
             return result

pixeltable/metadata/__init__.py CHANGED

@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 26
+VERSION = 27
 def create_system_info(engine: sql.engine.Engine) -> None:
@@ -22,20 +22,25 @@ def create_system_info(engine: sql.engine.Engine) -> None:
         session.flush()
         session.commit()
 # conversion functions for upgrading the metadata schema from one version to the following
 # key: old schema version
 converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}
 def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
     def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
         global converter_cbs
         converter_cbs[version] = fn
     return decorator
 # load all converter modules
 for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
     importlib.import_module('pixeltable.metadata.converters.' + modname)
 def upgrade_md(engine: sql.engine.Engine) -> None:
     """Upgrade the metadata schema to the current version"""
     with orm.Session(engine) as session:
@@ -48,6 +53,7 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
             if md_version not in converter_cbs:
                 raise RuntimeError(f'No metadata converter for version {md_version}')
             from pixeltable.env import Env
             Env.get().console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
             converter_cbs[md_version](engine)
             md_version += 1

pixeltable/metadata/converters/convert_10.py CHANGED

@@ -1,12 +1,12 @@
 import sqlalchemy as sql
-from pixeltable.metadata.schema import Table, TableSchemaVersion
 from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Table, TableSchemaVersion
 @register_converter(version=10)
 def _(engine: sql.engine.Engine) -> None:
-    default_table_attrs = {"comment": None, "num_retained_versions": 10}
+    default_table_attrs = {'comment': None, 'num_retained_versions': 10}
     with engine.begin() as conn:
         # Because `parameters` wasn't actually used for anything,
         # we can simply delete it without any data loss.

pixeltable/metadata/converters/convert_15.py CHANGED

@@ -1,4 +1,3 @@
 import inspect
 import logging
 from typing import Any
@@ -37,8 +36,5 @@ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
         params.append(func.Parameter(name=name, col_type=col_type, kind=kind, default=default, is_batched=is_batched))
     is_batched = 'batch_size' in orig_d
     sig = func.Signature(return_type, params, is_batched=is_batched)
-    d = {
-        'signature': sig.as_dict(),
-        'batch_size': orig_d['batch_size'] if is_batched else None,
-    }
+    d = {'signature': sig.as_dict(), 'batch_size': orig_d['batch_size'] if is_batched else None}
     return d

pixeltable/metadata/converters/convert_16.py CHANGED

@@ -1,4 +1,5 @@
 from uuid import UUID
 import sqlalchemy as sql
 from pixeltable.metadata import register_converter
@@ -7,10 +8,7 @@ from pixeltable.metadata.converters.util import convert_table_md
 @register_converter(version=16)
 def _(engine: sql.engine.Engine) -> None:
-    convert_table_md(
-        engine,
-        table_md_updater=__update_table_md
-    )
+    convert_table_md(engine, table_md_updater=__update_table_md)
 def __update_table_md(table_md: dict, table_id: UUID) -> None:

pixeltable/metadata/converters/convert_17.py CHANGED

@@ -1,4 +1,5 @@
 from uuid import UUID
 import sqlalchemy as sql
 from pixeltable.metadata import register_converter
@@ -7,10 +8,7 @@ from pixeltable.metadata.converters.util import convert_table_md
 @register_converter(version=17)
 def _(engine: sql.engine.Engine) -> None:
-    convert_table_md(
-        engine,
-        table_md_updater=__update_table_md
-    )
+    convert_table_md(engine, table_md_updater=__update_table_md)
 def __update_table_md(table_md: dict, table_id: UUID) -> None:

pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

Potentially problematic release.

pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl