pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +144 -118
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +139 -124
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +315 -246
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +69 -78
- pixeltable/env.py +78 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +28 -27
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +1033 -6
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +36 -31
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +75 -40
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/document.py +88 -57
- pixeltable/iterators/video.py +66 -37
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +32 -34
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +126 -41
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +74 -38
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.17.dist-info/RECORD +0 -211
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/io/table_data_conduit.py CHANGED

@@ -8,7 +8,7 @@ import urllib.parse
 import urllib.request
 from dataclasses import dataclass, field, fields
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal,
+from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, cast

 import numpy as np
 import pandas as pd
@@ -50,15 +50,15 @@ class TableDataConduitFormat(str, enum.Enum):
 @dataclass
 class TableDataConduit:
     source: 'TableDataSource'
-    source_format:
-    source_column_map:
+    source_format: str | None = None
+    source_column_map: dict[str, str] | None = None
     if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
-    pxt_schema:
-    src_schema_overrides:
-    src_schema:
-    pxt_pk:
-    src_pk:
-    valid_rows:
+    pxt_schema: dict[str, ts.ColumnType] | None = None
+    src_schema_overrides: dict[str, ts.ColumnType] | None = None
+    src_schema: dict[str, ts.ColumnType] | None = None
+    pxt_pk: list[str] | None = None
+    src_pk: list[str] | None = None
+    valid_rows: RowData | None = None
     extra_fields: dict[str, Any] = field(default_factory=dict)

     reqd_col_names: set[str] = field(default_factory=set)
@@ -151,7 +151,7 @@ class DFTableDataConduit(TableDataConduit):


 class RowDataTableDataConduit(TableDataConduit):
-    raw_rows:
+    raw_rows: RowData | None = None
     disable_mapping: bool = True
     batch_count: int = 0

@@ -332,7 +332,7 @@ class HFTableDataConduit(TableDataConduit):
     - use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
     """

-    column_name_for_split:
+    column_name_for_split: str | None = None
     categorical_features: dict[str, dict[int, str]]
     dataset_dict: dict[str, datasets.Dataset] = None
     hf_schema_source: dict[str, Any] = None
@@ -478,7 +478,7 @@ class HFTableDataConduit(TableDataConduit):


 class ParquetTableDataConduit(TableDataConduit):
-    pq_ds:
+    pq_ds: ParquetDataset | None = None

     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> 'ParquetTableDataConduit':
pixeltable/io/utils.py CHANGED

@@ -1,5 +1,5 @@
 from keyword import iskeyword as is_python_keyword
-from typing import Any
+from typing import Any

 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -40,7 +40,7 @@ def normalize_schema_names(
     primary_key: list[str],
     schema_overrides: dict[str, Any],
     require_valid_pxt_column_names: bool = False,
-) -> tuple[dict[str, Any], list[str],
+) -> tuple[dict[str, Any], list[str], dict[str, str] | None]:
     """
     Convert all names in the input schema from source names to valid Pixeltable identifiers
     - Ensure that all names are unique.
pixeltable/iterators/audio.py CHANGED

@@ -1,7 +1,7 @@
 import logging
 from fractions import Fraction
 from pathlib import Path
-from typing import Any, ClassVar
+from typing import Any, ClassVar

 import av

@@ -37,7 +37,7 @@ class AudioSplitter(ComponentIterator):

     # List of chunks to extract
     # Each chunk is defined by start and end presentation timestamps in audio file (int)
-    chunks_to_extract_in_pts:
+    chunks_to_extract_in_pts: list[tuple[int, int]] | None
     # next chunk to extract
     next_pos: int

pixeltable/iterators/document.py CHANGED

@@ -2,7 +2,7 @@ import dataclasses
 import enum
 import io
 import logging
-from typing import Any, ClassVar, Iterable, Iterator,
+from typing import Any, ClassVar, Iterable, Iterator, Literal

 import fitz  # type: ignore[import-untyped]
 import ftfy
@@ -11,7 +11,7 @@ from bs4.element import NavigableString, Tag

 from pixeltable.env import Env
 from pixeltable.exceptions import Error
-from pixeltable.type_system import
+from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
 from pixeltable.utils.documents import get_document_handle

 from .base import ComponentIterator
@@ -19,6 +19,11 @@ from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')


+class Element(enum.Enum):
+    TEXT = 1
+    IMAGE = 2
+
+
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -41,28 +46,28 @@ class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""

     # html and markdown metadata
-    sourceline:
+    sourceline: int | None = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading:
+    heading: dict[str, str] | None = None

     # pdf-specific metadata
-    page:
+    page: int | None = None
     # bounding box as an {x1, y1, x2, y2} dictionary
-    bounding_box:
+    bounding_box: dict[str, float] | None = None


 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""

-    text:
-
-
+    text: str | None = None
+    image: PIL.Image.Image | None = None
+    metadata: DocumentSectionMetadata | None = None


 def _parse_separators(separators: str) -> list[Separator]:
-    ret = []
+    ret: list[Separator] = []
     for s in separators.split(','):
         clean_s = s.strip().upper()
         if not clean_s:
@@ -76,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:


 def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
-    ret = []
+    ret: list[ChunkMetadata] = []
     for m in metadata.split(','):
         clean_m = m.strip().upper()
         if not clean_m:
@@ -89,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
     return ret


+def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+    result: list[Element] = []
+    for e in elements:
+        clean_e = e.strip().upper()
+        if clean_e not in Element.__members__:
+            raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+        result.append(Element[clean_e])
+    if len(result) == 0:
+        raise Error('elements cannot be empty')
+    return result
+
+
 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}


@@ -106,11 +123,16 @@ class DocumentSplitter(ComponentIterator):
        separators: separators to use to chunk the document. Options are:
            `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
            This may be a comma-separated string, e.g., `'heading,token_limit'`.
+       elements: list of elements to extract from the document. Options are:
+           `'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
+           for the `'page'` separator on PDF documents.
        limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
            or `'char_limit'` is specified.
        metadata: additional metadata fields to include in the output. Options are:
            `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
            (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+       image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
+       image_format: format to use when extracting images from PDFs. Defaults to 'png'.
     """

     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -121,34 +143,41 @@ class DocumentSplitter(ComponentIterator):
         ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }

+    _doc_handle: Any
+    _separators: list[Separator]
+    _elements: list[Element]
+    _metadata_fields: list[ChunkMetadata]
+    _doc_title: str
+    _limit: int
+    _skip_tags: list[str]
+    _overlap: int
+    _tiktoken_encoding: str | None
+    _tiktoken_target_model: str | None
+    _image_dpi: int
+    _image_format: str
+
+    _sections: Iterator[DocumentSection]
+
     def __init__(
         self,
         document: str,
         *,
         separators: str,
-
-
+        elements: list[Literal['text', 'image']] | None = None,
+        limit: int | None = None,
+        overlap: int | None = None,
         metadata: str = '',
-        html_skip_tags:
-        tiktoken_encoding:
-        tiktoken_target_model:
-
-
-        page_image_dpi: int = 300,
-        page_image_format: str = 'png',
+        html_skip_tags: list[str] | None = None,
+        tiktoken_encoding: str | None = 'cl100k_base',
+        tiktoken_target_model: str | None = None,
+        image_dpi: int = 300,
+        image_format: str = 'png',
     ):
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
+        self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
         assert self._doc_handle is not None
-        # calling the output_schema method to validate the input arguments
-        self.output_schema(
-            separators=separators,
-            metadata=metadata,
-            limit=limit,
-            overlap=overlap,
-            include_page_image=include_page_image,
-        )
         self._separators = _parse_separators(separators)
         self._metadata_fields = _parse_metadata(metadata)
         if self._doc_handle.bs_doc is not None:
@@ -164,10 +193,8 @@ class DocumentSplitter(ComponentIterator):
         self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
-
-        self.
-        self._page_image_dpi = page_image_dpi
-        self._page_image_format = page_image_format
+        self._image_dpi = image_dpi
+        self._image_format = image_format

         # set up processing pipeline
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -197,23 +224,28 @@ class DocumentSplitter(ComponentIterator):
         return {
             'document': DocumentType(nullable=False),
             'separators': StringType(nullable=False),
+            'elements': JsonType(nullable=False),
             'metadata': StringType(nullable=False),
             'limit': IntType(nullable=True),
             'overlap': IntType(nullable=True),
             'skip_tags': StringType(nullable=True),
             'tiktoken_encoding': StringType(nullable=True),
             'tiktoken_target_model': StringType(nullable=True),
-
-            '
-            'page_image_dpi': IntType(nullable=True),
-            'page_image_format': StringType(nullable=True),
+            'image_dpi': IntType(nullable=True),
+            'image_format': StringType(nullable=True),
         }

     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema: dict[str, ColumnType] = {
-
-
+        schema: dict[str, ColumnType] = {}
+        elements = _parse_elements(kwargs.get('elements', ['text']))
+        for element in elements:
+            if element == Element.TEXT:
+                schema['text'] = StringType(nullable=False)
+            elif element == Element.IMAGE:
+                schema['image'] = ImageType(nullable=False)
+
+        md_fields = _parse_metadata(kwargs.get('metadata', ''))
         for md_field in md_fields:
             schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]

@@ -223,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
         limit = kwargs.get('limit')
         overlap = kwargs.get('overlap')

+        if Element.IMAGE in elements and separators != [Separator.PAGE]:
+            raise Error('Image elements are only supported for the "page" separator on PDF documents')
         if limit is not None or overlap is not None:
             if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -236,23 +270,25 @@ class DocumentSplitter(ComponentIterator):
             if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')

-        # check dependencies at the end
         if Separator.SENTENCE in separators:
             _ = Env.get().spacy_nlp
         if Separator.TOKEN_LIMIT in separators:
             Env.get().require_package('tiktoken')

-        if kwargs.get('include_page_image'):
-            schema['image'] = ImageType(nullable=True)
-
         return schema, []

     def __next__(self) -> dict[str, Any]:
         while True:
             section = next(self._sections)
-            if section.text is None:
+            if section.text is None and section.image is None:
                 continue
-            result: dict[str, Any] = {
+            result: dict[str, Any] = {}
+            for element in self._elements:
+                if element == Element.TEXT:
+                    result['text'] = section.text
+                elif element == Element.IMAGE:
+                    result['image'] = section.image
+
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -265,10 +301,6 @@ class DocumentSplitter(ComponentIterator):
                 elif md_field == ChunkMetadata.BOUNDING_BOX:
                     result[md_field.name.lower()] = section.metadata.bounding_box

-            # FIX: only include image if schema supports it
-            if self._include_page_image:
-                result['image'] = section.image
-
             return result

     def _html_sections(self) -> Iterator[DocumentSection]:
@@ -411,11 +443,10 @@ class DocumentSplitter(ComponentIterator):
             return txt

         for page_idx, page in enumerate(doc.pages()):
-
-
-
-
-            page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))
+            img: PIL.Image.Image | None = None
+            if Element.IMAGE in self._elements:
+                pix = page.get_pixmap(dpi=self._image_dpi)
+                img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))

             for block in page.get_text('blocks'):
                 x1, y1, x2, y2, text, *_ = block
@@ -423,14 +454,14 @@
                 if accumulated_text and emit_on_paragraph:
                     bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
                     md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
-                    yield DocumentSection(text=_emit_text(), metadata=md
+                    yield DocumentSection(text=_emit_text(), metadata=md)

             if accumulated_text and emit_on_page and not emit_on_paragraph:
                 md = DocumentSectionMetadata(page=page_idx)
-                yield DocumentSection(text=_emit_text(),
+                yield DocumentSection(text=_emit_text(), image=img, metadata=md)

         if accumulated_text and not emit_on_page:
-            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata()
+            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())

     def _txt_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections for text files.
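Taken together, the document.py changes replace the old include_page_image/page_image_* arguments with an `elements` parameter: `DocumentSplitter` can now emit the page text, a rendered page image, or both, with images supported only for the `'page'` separator on PDFs. A minimal sketch of how this could be used through Pixeltable's usual view-plus-iterator pattern (table and column names here are hypothetical, not taken from the package):

    import pixeltable as pxt
    from pixeltable.iterators import DocumentSplitter

    # hypothetical base table holding PDF documents
    docs = pxt.create_table('docs', {'document': pxt.Document})

    # one output row per page, carrying both the extracted text and a rendered page image;
    # per the docstring above, 'image' requires the 'page' separator and a PDF input
    pages = pxt.create_view(
        'doc_pages',
        docs,
        iterator=DocumentSplitter.create(
            document=docs.document,
            separators='page',
            elements=['text', 'image'],
            image_dpi=150,       # default is 300
            image_format='png',  # default
        ),
    )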
pixeltable/iterators/video.py CHANGED

@@ -4,7 +4,7 @@ import math
 import subprocess
 from fractions import Fraction
 from pathlib import Path
-from typing import Any, Iterator, Literal
+from typing import Any, Iterator, Literal

 import av
 import pandas as pd
@@ -42,9 +42,9 @@ class FrameIterator(ComponentIterator):
     [Frame](https://pyav.org/docs/develop/api/frame.html)):

     * `index` (`int`)
-    * `pts` (`
-    * `dts` (`
-    * `time` (`
+    * `pts` (`int | None`)
+    * `dts` (`int | None`)
+    * `time` (`float | None`)
     * `is_corrupt` (`bool`)
     * `key_frame` (`bool`)
     * `pict_type` (`int`)
@@ -55,8 +55,8 @@ class FrameIterator(ComponentIterator):

     # Input parameters
     video_path: Path
-    fps:
-    num_frames:
+    fps: float | None
+    num_frames: int | None
     all_frame_attrs: bool

     # Video info
@@ -67,19 +67,14 @@
     video_start_time: int

     # List of frame indices to be extracted, or None to extract all frames
-    frames_to_extract:
+    frames_to_extract: list[int] | None

     # Next frame to extract, as an iterator `pos` index. If `frames_to_extract` is None, this is the same as the
     # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
     next_pos: int

     def __init__(
-        self,
-        video: str,
-        *,
-        fps: Optional[float] = None,
-        num_frames: Optional[int] = None,
-        all_frame_attrs: bool = False,
+        self, video: str, *, fps: float | None = None, num_frames: int | None = None, all_frame_attrs: bool = False
     ):
         if fps is not None and num_frames is not None:
             raise excs.Error('At most one of `fps` or `num_frames` may be specified')
@@ -251,7 +246,8 @@ class VideoSplitter(ComponentIterator):

     # Input parameters
     video_path: Path
-    segment_duration: float
+    segment_duration: float | None
+    segment_times: list[float] | None
     overlap: float
     min_segment_duration: float
     video_encoder: str | None
@@ -268,25 +264,31 @@
         self,
         video: str,
         *,
-        duration: float,
-        overlap: float =
-        min_segment_duration: float =
-
+        duration: float | None = None,
+        overlap: float | None = None,
+        min_segment_duration: float | None = None,
+        segment_times: list[float] | None = None,
+        mode: Literal['fast', 'accurate'] = 'accurate',
         video_encoder: str | None = None,
         video_encoder_args: dict[str, Any] | None = None,
     ):
         Env.get().require_binary('ffmpeg')
-        assert duration
-
-
+        assert (duration is not None) != (segment_times is not None)
+        if segment_times is not None:
+            assert len(segment_times) > 0
+        if duration is not None:
+            assert duration > 0.0
+            assert duration >= min_segment_duration
+            assert overlap is None or overlap < duration

         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()

         self.video_path = video_path
         self.segment_duration = duration
-        self.overlap = overlap
-        self.min_segment_duration = min_segment_duration
+        self.overlap = overlap if overlap is not None else 0.0
+        self.min_segment_duration = min_segment_duration if min_segment_duration is not None else 0.0
+        self.segment_times = segment_times
         self.video_encoder = video_encoder
         self.video_encoder_args = video_encoder_args

@@ -304,6 +306,7 @@ class VideoSplitter(ComponentIterator):
             'duration': ts.FloatType(nullable=True),
             'overlap': ts.FloatType(nullable=True),
             'min_segment_duration': ts.FloatType(nullable=True),
+            'segment_times': ts.JsonType(nullable=True),
             'mode': ts.StringType(nullable=False),
             'video_encoder': ts.StringType(nullable=True),
             'video_encoder_args': ts.JsonType(nullable=True),
@@ -311,23 +314,34 @@

     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        param_names = ['duration', 'overlap', 'min_segment_duration']
+        param_names = ['duration', 'overlap', 'min_segment_duration', 'segment_times']
         params = dict(zip(param_names, args))
         params.update(kwargs)

-        segment_duration = params
-
-        overlap = params.get('overlap'
+        segment_duration = params.get('duration')
+        segment_times = params.get('segment_times')
+        overlap = params.get('overlap')
+        min_segment_duration = params.get('min_segment_duration')
         mode = params.get('mode', 'fast')

-        if segment_duration
-            raise excs.Error('
-        if segment_duration
-            raise excs.Error('duration
-        if
+        if segment_duration is None and segment_times is None:
+            raise excs.Error('Must specify either duration or segment_times')
+        if segment_duration is not None and segment_times is not None:
+            raise excs.Error('duration and segment_times cannot both be specified')
+        if segment_times is not None:
+            if len(segment_times) == 0:
+                raise excs.Error('segment_times cannot be empty')
+            if overlap is not None:
+                raise excs.Error('overlap cannot be specified with segment_times')
+        if segment_duration is not None:
+            if segment_duration <= 0.0:
+                raise excs.Error('duration must be a positive number')
+            if min_segment_duration is not None and segment_duration < min_segment_duration:
+                raise excs.Error('duration must be at least min_segment_duration')
+            if overlap is not None and overlap >= segment_duration:
+                raise excs.Error('overlap must be less than duration')
+        if mode == 'accurate' and overlap is not None:
             raise excs.Error("Cannot specify overlap for mode='accurate'")
-        if overlap >= segment_duration:
-            raise excs.Error('overlap must be less than duration')
         if mode == 'fast':
             if params.get('video_encoder') is not None:
                 raise excs.Error("Cannot specify video_encoder for mode='fast'")
@@ -343,13 +357,22 @@
         }, []

     def fast_iter(self) -> Iterator[dict[str, Any]]:
-        segment_path: str
+        segment_path: str = ''
         try:
             start_time = 0.0
             start_pts = 0
+            segment_idx = 0
             while True:
+                target_duration: float | None
+                if self.segment_duration is not None:
+                    target_duration = self.segment_duration
+                elif self.segment_times is not None and segment_idx < len(self.segment_times):
+                    target_duration = self.segment_times[segment_idx] - start_time
+                else:
+                    target_duration = None  # the rest of the video
+
                 segment_path = str(TempStore.create_path(extension='.mp4'))
-                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time,
+                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, target_duration)
                 _ = subprocess.run(cmd, capture_output=True, text=True, check=True)

                 # use the actual duration
@@ -373,8 +396,13 @@
                 start_time = segment_end - self.overlap
                 start_pts = segment_end_pts - round(self.overlap / self.video_time_base)

+                segment_idx += 1
+                if self.segment_times is not None and segment_idx > len(self.segment_times):
+                    # We've created all segments including the final segment after the last segment_time
+                    break
+
         except subprocess.CalledProcessError as e:
-            if Path(segment_path).exists():
+            if segment_path and Path(segment_path).exists():
                 Path(segment_path).unlink()
             error_msg = f'ffmpeg failed with return code {e.returncode}'
             if e.stderr:
@@ -389,6 +417,7 @@
             str(self.video_path),
             output_pattern,
             segment_duration=self.segment_duration,
+            segment_times=self.segment_times,
             video_encoder=self.video_encoder,
             video_encoder_args=self.video_encoder_args,
         )
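The video.py changes add a `segment_times` alternative to `duration` in `VideoSplitter`: rather than cutting fixed-length segments, the video is split at an explicit list of timestamps, with a final segment covering the remainder; `segment_times` cannot be combined with `duration` or `overlap`. A minimal sketch under the same assumptions as above (hypothetical table and column names):

    import pixeltable as pxt
    from pixeltable.iterators import VideoSplitter

    videos = pxt.create_table('videos', {'video': pxt.Video})

    # cut each video at 10s, 30s and 60s; a fourth segment covers whatever remains
    clips = pxt.create_view(
        'video_clips',
        videos,
        iterator=VideoSplitter.create(
            video=videos.video,
            segment_times=[10.0, 30.0, 60.0],
            mode='fast',
        ),
    )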
pixeltable/metadata/converters/convert_18.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any

 import sqlalchemy as sql

@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
     convert_table_md(engine, substitution_fn=__substitute_md)


-def __substitute_md(k:
+def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
     # Migrate a few changed function names
     if k == 'path' and v == 'pixeltable.functions.string.str_format':
         return 'path', 'pixeltable.functions.string.format'

pixeltable/metadata/converters/convert_19.py CHANGED

@@ -1,5 +1,5 @@
 import datetime
-from typing import Any
+from typing import Any

 import sqlalchemy as sql

@@ -28,7 +28,7 @@ def _(engine: sql.engine.Engine) -> None:
             conn.execute(sql.text(f'ALTER TABLE {store_name} ALTER COLUMN col_{col_id} TYPE TIMESTAMPTZ'))


-def __update_timestamp_literals(k: Any, v: Any) ->
+def __update_timestamp_literals(k: Any, v: Any) -> tuple[Any, Any] | None:
     if isinstance(v, dict) and 'val_t' in v:
         # It's a literal with an explicit 'val_t' field. In version 19 this can only mean a
         # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.

pixeltable/metadata/converters/convert_20.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any

 import sqlalchemy as sql

@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
     convert_table_md(engine, substitution_fn=__substitute_md)


-def __substitute_md(k:
+def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
     if isinstance(v, dict) and '_classname' in v:
         # The way InlineArray is represented changed in v20. Previously, literal values were stored
         # directly in the Inline expr; now we store them in Literal sub-exprs. This converter

pixeltable/metadata/converters/convert_21.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any

 import sqlalchemy as sql

@@ -24,7 +24,7 @@ def __update_schema_column(schema_column: dict) -> None:
     schema_column['media_validation'] = None


-def __substitute_md(k:
+def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
     if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
         if 'perform_validation' not in v:
             v['perform_validation'] = False