PyPI - pixeltable - Versions diffs - 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

pixeltable/__init__.py +42 -8
pixeltable/{dataframe.py → _query.py} +470 -206
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +5 -4
pixeltable/catalog/catalog.py +1785 -432
pixeltable/catalog/column.py +190 -113
pixeltable/catalog/dir.py +2 -4
pixeltable/catalog/globals.py +19 -46
pixeltable/catalog/insertable_table.py +191 -98
pixeltable/catalog/path.py +63 -23
pixeltable/catalog/schema_object.py +11 -15
pixeltable/catalog/table.py +843 -436
pixeltable/catalog/table_metadata.py +103 -0
pixeltable/catalog/table_version.py +978 -657
pixeltable/catalog/table_version_handle.py +72 -16
pixeltable/catalog/table_version_path.py +112 -43
pixeltable/catalog/tbl_ops.py +53 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +134 -90
pixeltable/config.py +134 -22
pixeltable/env.py +471 -157
pixeltable/exceptions.py +6 -0
pixeltable/exec/__init__.py +4 -1
pixeltable/exec/aggregation_node.py +7 -8
pixeltable/exec/cache_prefetch_node.py +83 -110
pixeltable/exec/cell_materialization_node.py +268 -0
pixeltable/exec/cell_reconstruction_node.py +168 -0
pixeltable/exec/component_iteration_node.py +4 -3
pixeltable/exec/data_row_batch.py +8 -65
pixeltable/exec/exec_context.py +16 -4
pixeltable/exec/exec_node.py +13 -36
pixeltable/exec/expr_eval/evaluators.py +11 -7
pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
pixeltable/exec/expr_eval/globals.py +8 -5
pixeltable/exec/expr_eval/row_buffer.py +1 -2
pixeltable/exec/expr_eval/schedulers.py +106 -56
pixeltable/exec/globals.py +35 -0
pixeltable/exec/in_memory_data_node.py +19 -19
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +16 -9
pixeltable/exec/sql_node.py +351 -84
pixeltable/exprs/__init__.py +1 -1
pixeltable/exprs/arithmetic_expr.py +27 -22
pixeltable/exprs/array_slice.py +3 -3
pixeltable/exprs/column_property_ref.py +36 -23
pixeltable/exprs/column_ref.py +213 -89
pixeltable/exprs/comparison.py +5 -5
pixeltable/exprs/compound_predicate.py +5 -4
pixeltable/exprs/data_row.py +164 -54
pixeltable/exprs/expr.py +70 -44
pixeltable/exprs/expr_dict.py +3 -3
pixeltable/exprs/expr_set.py +17 -10
pixeltable/exprs/function_call.py +100 -40
pixeltable/exprs/globals.py +2 -2
pixeltable/exprs/in_predicate.py +4 -4
pixeltable/exprs/inline_expr.py +18 -32
pixeltable/exprs/is_null.py +7 -3
pixeltable/exprs/json_mapper.py +8 -8
pixeltable/exprs/json_path.py +56 -22
pixeltable/exprs/literal.py +27 -5
pixeltable/exprs/method_ref.py +2 -2
pixeltable/exprs/object_ref.py +2 -2
pixeltable/exprs/row_builder.py +167 -67
pixeltable/exprs/rowid_ref.py +25 -10
pixeltable/exprs/similarity_expr.py +58 -40
pixeltable/exprs/sql_element_cache.py +4 -4
pixeltable/exprs/string_op.py +5 -5
pixeltable/exprs/type_cast.py +3 -5
pixeltable/func/__init__.py +1 -0
pixeltable/func/aggregate_function.py +8 -8
pixeltable/func/callable_function.py +9 -9
pixeltable/func/expr_template_function.py +17 -11
pixeltable/func/function.py +18 -20
pixeltable/func/function_registry.py +6 -7
pixeltable/func/globals.py +2 -3
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +29 -27
pixeltable/func/signature.py +46 -19
pixeltable/func/tools.py +31 -13
pixeltable/func/udf.py +18 -20
pixeltable/functions/__init__.py +16 -0
pixeltable/functions/anthropic.py +123 -77
pixeltable/functions/audio.py +147 -10
pixeltable/functions/bedrock.py +13 -6
pixeltable/functions/date.py +7 -4
pixeltable/functions/deepseek.py +35 -43
pixeltable/functions/document.py +81 -0
pixeltable/functions/fal.py +76 -0
pixeltable/functions/fireworks.py +11 -20
pixeltable/functions/gemini.py +195 -39
pixeltable/functions/globals.py +142 -14
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1056 -24
pixeltable/functions/image.py +115 -57
pixeltable/functions/json.py +1 -1
pixeltable/functions/llama_cpp.py +28 -13
pixeltable/functions/math.py +67 -5
pixeltable/functions/mistralai.py +18 -55
pixeltable/functions/net.py +70 -0
pixeltable/functions/ollama.py +20 -13
pixeltable/functions/openai.py +240 -226
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +4 -4
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +239 -69
pixeltable/functions/timestamp.py +16 -16
pixeltable/functions/together.py +24 -84
pixeltable/functions/twelvelabs.py +188 -0
pixeltable/functions/util.py +6 -1
pixeltable/functions/uuid.py +30 -0
pixeltable/functions/video.py +1515 -107
pixeltable/functions/vision.py +8 -8
pixeltable/functions/voyageai.py +289 -0
pixeltable/functions/whisper.py +16 -8
pixeltable/functions/whisperx.py +179 -0
pixeltable/{ext/functions → functions}/yolox.py +2 -4
pixeltable/globals.py +362 -115
pixeltable/index/base.py +17 -21
pixeltable/index/btree.py +28 -22
pixeltable/index/embedding_index.py +100 -118
pixeltable/io/__init__.py +4 -2
pixeltable/io/datarows.py +8 -7
pixeltable/io/external_store.py +56 -105
pixeltable/io/fiftyone.py +13 -13
pixeltable/io/globals.py +31 -30
pixeltable/io/hf_datasets.py +61 -16
pixeltable/io/label_studio.py +74 -70
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +21 -12
pixeltable/io/parquet.py +25 -105
pixeltable/io/table_data_conduit.py +250 -123
pixeltable/io/utils.py +4 -4
pixeltable/iterators/__init__.py +2 -1
pixeltable/iterators/audio.py +26 -25
pixeltable/iterators/base.py +9 -3
pixeltable/iterators/document.py +112 -78
pixeltable/iterators/image.py +12 -15
pixeltable/iterators/string.py +11 -4
pixeltable/iterators/video.py +523 -120
pixeltable/metadata/__init__.py +14 -3
pixeltable/metadata/converters/convert_13.py +2 -2
pixeltable/metadata/converters/convert_18.py +2 -2
pixeltable/metadata/converters/convert_19.py +2 -2
pixeltable/metadata/converters/convert_20.py +2 -2
pixeltable/metadata/converters/convert_21.py +2 -2
pixeltable/metadata/converters/convert_22.py +2 -2
pixeltable/metadata/converters/convert_24.py +2 -2
pixeltable/metadata/converters/convert_25.py +2 -2
pixeltable/metadata/converters/convert_26.py +2 -2
pixeltable/metadata/converters/convert_29.py +4 -4
pixeltable/metadata/converters/convert_30.py +34 -21
pixeltable/metadata/converters/convert_34.py +2 -2
pixeltable/metadata/converters/convert_35.py +9 -0
pixeltable/metadata/converters/convert_36.py +38 -0
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/convert_41.py +12 -0
pixeltable/metadata/converters/convert_42.py +9 -0
pixeltable/metadata/converters/convert_43.py +44 -0
pixeltable/metadata/converters/util.py +20 -31
pixeltable/metadata/notes.py +9 -0
pixeltable/metadata/schema.py +140 -53
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +382 -115
pixeltable/share/__init__.py +1 -1
pixeltable/share/packager.py +547 -83
pixeltable/share/protocol/__init__.py +33 -0
pixeltable/share/protocol/common.py +165 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +119 -0
pixeltable/share/publish.py +257 -59
pixeltable/store.py +311 -194
pixeltable/type_system.py +373 -211
pixeltable/utils/__init__.py +2 -3
pixeltable/utils/arrow.py +131 -17
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +346 -0
pixeltable/utils/coco.py +6 -6
pixeltable/utils/code.py +3 -3
pixeltable/utils/console_output.py +4 -1
pixeltable/utils/coroutine.py +6 -23
pixeltable/utils/dbms.py +32 -6
pixeltable/utils/description_helper.py +4 -5
pixeltable/utils/documents.py +7 -18
pixeltable/utils/exception_handler.py +7 -30
pixeltable/utils/filecache.py +6 -6
pixeltable/utils/formatter.py +86 -48
pixeltable/utils/gcs_store.py +295 -0
pixeltable/utils/http.py +133 -0
pixeltable/utils/http_server.py +2 -3
pixeltable/utils/iceberg.py +1 -2
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +90 -0
pixeltable/utils/local_store.py +322 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +573 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +5 -6
pixeltable/utils/s3_store.py +527 -0
pixeltable/utils/sql.py +26 -0
pixeltable/utils/system.py +30 -0
pixeltable-0.5.7.dist-info/METADATA +579 -0
pixeltable-0.5.7.dist-info/RECORD +227 -0
{pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/catalog/named_function.py +0 -40
pixeltable/ext/__init__.py +0 -17
pixeltable/ext/functions/__init__.py +0 -11
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/utils/media_store.py +0 -77
pixeltable/utils/s3.py +0 -17
pixeltable-0.3.14.dist-info/METADATA +0 -434
pixeltable-0.3.14.dist-info/RECORD +0 -186
pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
{pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0

pixeltable/iterators/audio.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import logging
-import uuid
 from fractions import Fraction
 from pathlib import Path
-from typing import Any, ClassVar, Optional
+from typing import Any, ClassVar
 import av
+from deprecated import deprecated
-from pixeltable import env, exceptions as excs, type_system as ts
+from pixeltable import exceptions as excs, type_system as ts
+from pixeltable.utils.local_store import TempStore
 from .base import ComponentIterator
@@ -14,18 +15,6 @@ _logger = logging.getLogger('pixeltable')
 class AudioSplitter(ComponentIterator):
-    """
-    Iterator over chunks of an audio file. The audio file is split into smaller chunks,
-    where the duration of each chunk is determined by chunk_duration_sec.
-    The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
-    If the input contains no audio, no chunks are yielded.
-    Args:
-        chunk_duration_sec: Audio chunk duration in seconds
-        overlap_sec: Overlap between consecutive chunks in seconds.
-        min_chunk_duration_sec: Drop the last chunk if it is smaller than min_chunk_duration_sec
-    """
     # Input parameters
     audio_path: Path
     chunk_duration_sec: float
@@ -37,7 +26,7 @@ class AudioSplitter(ComponentIterator):
     # List of chunks to extract
     # Each chunk is defined by start and end presentation timestamps in audio file (int)
-    chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
+    chunks_to_extract_in_pts: list[tuple[int, int]] | None
     # next chunk to extract
     next_pos: int
@@ -55,12 +44,9 @@ class AudioSplitter(ComponentIterator):
     def __init__(
         self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
     ):
-        if chunk_duration_sec <= 0.0:
-            raise excs.Error('chunk_duration_sec must be a positive number')
-        if chunk_duration_sec < min_chunk_duration_sec:
-            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
-        if overlap_sec >= chunk_duration_sec:
-            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
+        assert chunk_duration_sec > 0.0
+        assert chunk_duration_sec >= min_chunk_duration_sec
+        assert overlap_sec < chunk_duration_sec
         audio_path = Path(audio)
         assert audio_path.exists() and audio_path.is_file()
         self.audio_path = audio_path
@@ -128,6 +114,19 @@ class AudioSplitter(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
+        params = dict(zip(param_names, args))
+        params.update(kwargs)
+        chunk_duration_sec = params['chunk_duration_sec']
+        min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
+        overlap_sec = params.get('overlap_sec', 0.0)
+        if chunk_duration_sec <= 0.0:
+            raise excs.Error('chunk_duration_sec must be a positive number')
+        if chunk_duration_sec < min_chunk_duration_sec:
+            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
+        if overlap_sec >= chunk_duration_sec:
+            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
         return {
             'start_time_sec': ts.FloatType(),
             'end_time_sec': ts.FloatType(),
@@ -140,7 +139,7 @@ class AudioSplitter(ComponentIterator):
         target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
         chunk_start_pts = 0
         chunk_end_pts = 0
-        chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
+        chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
         output_container = av.open(chunk_file, mode='w')
         input_stream = self.container.streams.audio[0]
         codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
@@ -202,5 +201,7 @@ class AudioSplitter(ComponentIterator):
     def close(self) -> None:
         self.container.close()
-    def set_pos(self, pos: int) -> None:
-        pass
+    @classmethod
+    @deprecated('create() is deprecated; use `pixeltable.functions.audio.audio_splitter` instead', version='0.5.6')
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return super()._create(**kwargs)

pixeltable/iterators/base.py CHANGED Viewed

@@ -43,11 +43,17 @@ class ComponentIterator(ABC):
         """Close the iterator and release all resources"""
         raise NotImplementedError
-    @abstractmethod
-    def set_pos(self, pos: int) -> None:
+    def set_pos(self, pos: int, **kwargs: Any) -> None:
         """Set the iterator position to pos"""
-        raise NotImplementedError
+        pass
     @classmethod
     def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        # TODO: This is still needed for compatibility with existing user-defined iterators; it will become deprecated
+        #     when the new decorator pattern is introduced for iterators
+        return cls._create(**kwargs)
+    @classmethod
+    def _create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        # create() variant that can be called by subclasses without generating a deprecation warning.
         return cls, kwargs

pixeltable/iterators/document.py CHANGED Viewed

@@ -1,13 +1,17 @@
 import dataclasses
 import enum
 import logging
-from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
+from typing import Any, ClassVar, Iterable, Iterator, Literal
 import ftfy
+import PIL.Image
+from bs4.element import NavigableString, Tag
+from deprecated import deprecated
+from pypdfium2 import PdfDocument  # type: ignore[import-untyped]
 from pixeltable.env import Env
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
+from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
 from pixeltable.utils.documents import get_document_handle
 from .base import ComponentIterator
@@ -15,6 +19,11 @@ from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')
+class Element(enum.Enum):
+    TEXT = 1
+    IMAGE = 2
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -37,27 +46,28 @@ class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
     # html and markdown metadata
-    sourceline: Optional[int] = None
+    sourceline: int | None = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[dict[str, str]] = None
+    heading: dict[str, str] | None = None
     # pdf-specific metadata
-    page: Optional[int] = None
+    page: int | None = None
     # bounding box as an {x1, y1, x2, y2} dictionary
-    bounding_box: Optional[dict[str, float]] = None
+    bounding_box: dict[str, float] | None = None
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
-    text: Optional[str]
-    metadata: Optional[DocumentSectionMetadata]
+    text: str | None = None
+    image: PIL.Image.Image | None = None
+    metadata: DocumentSectionMetadata | None = None
 def _parse_separators(separators: str) -> list[Separator]:
-    ret = []
+    ret: list[Separator] = []
     for s in separators.split(','):
         clean_s = s.strip().upper()
         if not clean_s:
@@ -71,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
 def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
-    ret = []
+    ret: list[ChunkMetadata] = []
     for m in metadata.split(','):
         clean_m = m.strip().upper()
         if not clean_m:
@@ -84,18 +94,22 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
     return ret
-_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
+def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+    result: list[Element] = []
+    for e in elements:
+        clean_e = e.strip().upper()
+        if clean_e not in Element.__members__:
+            raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+        result.append(Element[clean_e])
+    if len(result) == 0:
+        raise Error('elements cannot be empty')
+    return result
-class DocumentSplitter(ComponentIterator):
-    """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
-    The iterator yields a `text` field containing the text of the chunk, and it may also
-    include additional metadata fields if specified in the `metadata` parameter, as explained below.
+_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
-    Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
-    """
+class DocumentSplitter(ComponentIterator):
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
         ChunkMetadata.TITLE: StringType(nullable=True),
         ChunkMetadata.HEADING: JsonType(nullable=True),
@@ -104,36 +118,41 @@ class DocumentSplitter(ComponentIterator):
         ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
+    _doc_handle: Any
+    _separators: list[Separator]
+    _elements: list[Element]
+    _metadata_fields: list[ChunkMetadata]
+    _doc_title: str
+    _limit: int
+    _skip_tags: list[str]
+    _overlap: int
+    _tiktoken_encoding: str | None
+    _tiktoken_target_model: str | None
+    _image_dpi: int
+    _image_format: str
+    _sections: Iterator[DocumentSection]
     def __init__(
         self,
         document: str,
         *,
         separators: str,
-        limit: Optional[int] = None,
-        overlap: Optional[int] = None,
+        elements: list[Literal['text', 'image']] | None = None,
+        limit: int | None = None,
+        overlap: int | None = None,
         metadata: str = '',
-        html_skip_tags: Optional[list[str]] = None,
-        tiktoken_encoding: Optional[str] = 'cl100k_base',
-        tiktoken_target_model: Optional[str] = None,
+        skip_tags: list[str] | None = None,
+        tiktoken_encoding: str | None = 'cl100k_base',
+        tiktoken_target_model: str | None = None,
+        image_dpi: int = 300,
+        image_format: str = 'png',
     ):
-        """Init method for `DocumentSplitter` class.
-        Args:
-            separators: separators to use to chunk the document. Options are:
-                 `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
-                 This may be a comma-separated string, e.g., `'heading,token_limit'`.
-            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
-                 or `'char_limit'` is specified.
-            metadata: additional metadata fields to include in the output. Options are:
-                 `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
-                 (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
-        """
-        if html_skip_tags is None:
-            html_skip_tags = ['nav']
+        if skip_tags is None:
+            skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
+        self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
         assert self._doc_handle is not None
-        # calling the output_schema method to validate the input arguments
-        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
         self._separators = _parse_separators(separators)
         self._metadata_fields = _parse_metadata(metadata)
         if self._doc_handle.bs_doc is not None:
@@ -145,10 +164,12 @@ class DocumentSplitter(ComponentIterator):
         else:
             self._doc_title = ''
         self._limit = 0 if limit is None else limit
-        self._skip_tags = html_skip_tags
+        self._skip_tags = skip_tags
         self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
+        self._image_dpi = image_dpi
+        self._image_format = image_format
         # set up processing pipeline
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -178,19 +199,28 @@ class DocumentSplitter(ComponentIterator):
         return {
             'document': DocumentType(nullable=False),
             'separators': StringType(nullable=False),
+            'elements': JsonType(nullable=False),
             'metadata': StringType(nullable=False),
             'limit': IntType(nullable=True),
             'overlap': IntType(nullable=True),
             'skip_tags': StringType(nullable=True),
             'tiktoken_encoding': StringType(nullable=True),
             'tiktoken_target_model': StringType(nullable=True),
+            'image_dpi': IntType(nullable=True),
+            'image_format': StringType(nullable=True),
         }
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema: dict[str, ColumnType] = {'text': StringType()}
-        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
+        schema: dict[str, ColumnType] = {}
+        elements = _parse_elements(kwargs.get('elements', ['text']))
+        for element in elements:
+            if element == Element.TEXT:
+                schema['text'] = StringType(nullable=False)
+            elif element == Element.IMAGE:
+                schema['image'] = ImageType(nullable=False)
+        md_fields = _parse_metadata(kwargs.get('metadata', ''))
         for md_field in md_fields:
             schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
@@ -200,6 +230,8 @@ class DocumentSplitter(ComponentIterator):
         limit = kwargs.get('limit')
         overlap = kwargs.get('overlap')
+        if Element.IMAGE in elements and separators != [Separator.PAGE]:
+            raise Error('Image elements are only supported for the "page" separator on PDF documents')
         if limit is not None or overlap is not None:
             if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -213,7 +245,6 @@ class DocumentSplitter(ComponentIterator):
             if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
-        # check dependencies at the end
         if Separator.SENTENCE in separators:
             _ = Env.get().spacy_nlp
         if Separator.TOKEN_LIMIT in separators:
@@ -224,9 +255,15 @@ class DocumentSplitter(ComponentIterator):
     def __next__(self) -> dict[str, Any]:
         while True:
             section = next(self._sections)
-            if section.text is None:
+            if section.text is None and section.image is None:
                 continue
-            result: dict[str, Any] = {'text': section.text}
+            result: dict[str, Any] = {}
+            for element in self._elements:
+                if element == Element.TEXT:
+                    result['text'] = section.text
+                elif element == Element.IMAGE:
+                    result['image'] = section.image
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -238,6 +275,7 @@ class DocumentSplitter(ComponentIterator):
                     result[md_field.name.lower()] = section.metadata.page
                 elif md_field == ChunkMetadata.BOUNDING_BOX:
                     result[md_field.name.lower()] = section.metadata.bounding_box
             return result
     def _html_sections(self) -> Iterator[DocumentSection]:
@@ -273,7 +311,7 @@ class DocumentSplitter(ComponentIterator):
                 yield DocumentSection(text=full_text, metadata=md)
                 accumulated_text = []
-        def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
+        def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
             nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
@@ -361,43 +399,35 @@ class DocumentSplitter(ComponentIterator):
         yield from emit()
     def _pdf_sections(self) -> Iterator[DocumentSection]:
-        """Create DocumentSections reflecting the pdf-specific separators"""
-        import fitz  # type: ignore[import-untyped]
-        doc: fitz.Document = self._doc_handle.pdf_doc
-        assert doc is not None
+        if Separator.PARAGRAPH in self._separators:
+            raise Error(
+                'Paragraph splitting is not currently supported for PDF documents. Please contact'
+                ' us at https://github.com/pixeltable/pixeltable/issues if you need this feature.'
+            )
-        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
-        emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
+        doc: PdfDocument = self._doc_handle.pdf_doc
+        assert isinstance(doc, PdfDocument)
-        accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+        emit_on_page = Separator.PAGE in self._separators
+        accumulated_text: list[str] = []
-        def _add_cleaned_text(raw_text: str) -> None:
-            fixed = ftfy.fix_text(raw_text)
+        def _add_cleaned(raw: str) -> None:
+            fixed = ftfy.fix_text(raw)
             if fixed:
                 accumulated_text.append(fixed)
         def _emit_text() -> str:
-            full_text = ''.join(accumulated_text)
+            txt = ''.join(accumulated_text)
             accumulated_text.clear()
-            return full_text
-        for page_number, page in enumerate(doc.pages()):
-            for block in page.get_text('blocks'):
-                # there is no concept of paragraph in pdf, block is the closest thing
-                # we can get (eg a paragraph in text may cut across pages)
-                # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
-                # other libraries like pdfminer also lack an explicit paragraph concept
-                x1, y1, x2, y2, text, _, _ = block
-                _add_cleaned_text(text)
-                if accumulated_text and emit_on_paragraph:
-                    bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
-                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
-                    yield DocumentSection(text=_emit_text(), metadata=metadata)
-            if accumulated_text and emit_on_page and not emit_on_paragraph:
-                yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
-                accumulated_text = []
+            return txt
+        for page_idx, page in enumerate(doc):
+            img = page.render().to_pil() if Element.IMAGE in self._elements else None
+            text = page.get_textpage().get_text_bounded()
+            _add_cleaned(text)
+            if accumulated_text and emit_on_page:
+                md = DocumentSectionMetadata(page=page_idx)
+                yield DocumentSection(text=_emit_text(), image=img, metadata=md)
         if accumulated_text and not emit_on_page:
             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
@@ -465,5 +495,9 @@ class DocumentSplitter(ComponentIterator):
     def close(self) -> None:
         pass
-    def set_pos(self, pos: int) -> None:
-        pass
+    @classmethod
+    @deprecated(
+        'create() is deprecated; use `pixeltable.functions.document.document_splitter` instead', version='0.5.6'
+    )
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return super()._create(**kwargs)

pixeltable/iterators/image.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from typing import Any, Sequence
 import PIL.Image
+from deprecated import deprecated
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
@@ -8,18 +9,6 @@ from pixeltable.iterators.base import ComponentIterator
 class TileIterator(ComponentIterator):
-    """
-    Iterator over tiles of an image. Each image will be divided into tiles of size `tile_size`, and the tiles will be
-    iterated over in row-major order (left-to-right, then top-to-bottom). An optional `overlap` parameter may be
-    specified. If the tiles do not exactly cover the image, then the rightmost and bottommost tiles will be padded with
-    blackspace, so that the output images all have the exact size `tile_size`.
-    Args:
-        image: Image to split into tiles.
-        tile_size: Size of each tile, as a pair of integers `[width, height]`.
-        overlap: Amount of overlap between adjacent tiles, as a pair of integers `[width, height]`.
-    """
     __image: PIL.Image.Image
     __tile_size: Sequence[int]
     __overlap: Sequence[int]
@@ -31,8 +20,7 @@ class TileIterator(ComponentIterator):
     __j: int
     def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
-        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
+        assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
         self.__image = image
         self.__image.load()
@@ -69,7 +57,7 @@ class TileIterator(ComponentIterator):
     def close(self) -> None:
         pass
-    def set_pos(self, pos: int) -> None:
+    def set_pos(self, pos: int, **kwargs: Any) -> None:
         self.__j = pos // self.__xlen
         self.__i = pos % self.__xlen
@@ -79,4 +67,13 @@ class TileIterator(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        tile_size = kwargs.get('tile_size')
+        overlap = kwargs.get('overlap', (0, 0))
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
         return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
+    @classmethod
+    @deprecated('create() is deprecated; use `pixeltable.functions.image.tile_iterator` instead', version='0.5.6')
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return super()._create(**kwargs)

pixeltable/iterators/string.py CHANGED Viewed

@@ -1,12 +1,17 @@
 from typing import Any, Iterator
+from deprecated import deprecated
 from pixeltable import exceptions as excs, type_system as ts
 from pixeltable.env import Env
 from pixeltable.iterators.base import ComponentIterator
 class StringSplitter(ComponentIterator):
-    # TODO(aaron-siegel): Merge this with `DocumentSplitter` in order to provide additional capabilities.
+    _text: str
+    doc: Any  # spacy doc
+    iter: Iterator[dict[str, Any]]
     def __init__(self, text: str, *, separators: str):
         if separators != 'sentence':
             raise excs.Error('Only `sentence` separators are currently supported.')
@@ -24,9 +29,6 @@ class StringSplitter(ComponentIterator):
     def close(self) -> None:
         pass
-    def set_pos(self, pos: int) -> None:
-        pass
     @classmethod
     def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
         return {'text': ts.StringType(), 'separators': ts.StringType()}
@@ -34,3 +36,8 @@ class StringSplitter(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
         return {'text': ts.StringType()}, []
+    @classmethod
+    @deprecated('create() is deprecated; use `pixeltable.functions.string.string_splitter` instead', version='0.5.6')
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return super()._create(**kwargs)

pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl