PyPI - pixeltable - Versions diffs - 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl - Mend

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (202)

pixeltable/__init__.py +23 -5
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +5 -3
pixeltable/catalog/catalog.py +1318 -404
pixeltable/catalog/column.py +186 -115
pixeltable/catalog/dir.py +1 -2
pixeltable/catalog/globals.py +11 -43
pixeltable/catalog/insertable_table.py +167 -79
pixeltable/catalog/path.py +61 -23
pixeltable/catalog/schema_object.py +9 -10
pixeltable/catalog/table.py +626 -308
pixeltable/catalog/table_metadata.py +101 -0
pixeltable/catalog/table_version.py +713 -569
pixeltable/catalog/table_version_handle.py +37 -6
pixeltable/catalog/table_version_path.py +42 -29
pixeltable/catalog/tbl_ops.py +50 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +108 -94
pixeltable/config.py +128 -22
pixeltable/dataframe.py +188 -100
pixeltable/env.py +407 -136
pixeltable/exceptions.py +6 -0
pixeltable/exec/__init__.py +3 -0
pixeltable/exec/aggregation_node.py +7 -8
pixeltable/exec/cache_prefetch_node.py +83 -110
pixeltable/exec/cell_materialization_node.py +231 -0
pixeltable/exec/cell_reconstruction_node.py +135 -0
pixeltable/exec/component_iteration_node.py +4 -3
pixeltable/exec/data_row_batch.py +8 -65
pixeltable/exec/exec_context.py +16 -4
pixeltable/exec/exec_node.py +13 -36
pixeltable/exec/expr_eval/evaluators.py +7 -6
pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
pixeltable/exec/expr_eval/globals.py +8 -5
pixeltable/exec/expr_eval/row_buffer.py +1 -2
pixeltable/exec/expr_eval/schedulers.py +190 -30
pixeltable/exec/globals.py +32 -0
pixeltable/exec/in_memory_data_node.py +18 -18
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +16 -9
pixeltable/exec/sql_node.py +206 -101
pixeltable/exprs/__init__.py +1 -1
pixeltable/exprs/arithmetic_expr.py +27 -22
pixeltable/exprs/array_slice.py +3 -3
pixeltable/exprs/column_property_ref.py +34 -30
pixeltable/exprs/column_ref.py +92 -96
pixeltable/exprs/comparison.py +5 -5
pixeltable/exprs/compound_predicate.py +5 -4
pixeltable/exprs/data_row.py +152 -55
pixeltable/exprs/expr.py +62 -43
pixeltable/exprs/expr_dict.py +3 -3
pixeltable/exprs/expr_set.py +17 -10
pixeltable/exprs/function_call.py +75 -37
pixeltable/exprs/globals.py +1 -2
pixeltable/exprs/in_predicate.py +4 -4
pixeltable/exprs/inline_expr.py +10 -27
pixeltable/exprs/is_null.py +1 -3
pixeltable/exprs/json_mapper.py +8 -8
pixeltable/exprs/json_path.py +56 -22
pixeltable/exprs/literal.py +5 -5
pixeltable/exprs/method_ref.py +2 -2
pixeltable/exprs/object_ref.py +2 -2
pixeltable/exprs/row_builder.py +127 -53
pixeltable/exprs/rowid_ref.py +8 -12
pixeltable/exprs/similarity_expr.py +50 -25
pixeltable/exprs/sql_element_cache.py +4 -4
pixeltable/exprs/string_op.py +5 -5
pixeltable/exprs/type_cast.py +3 -5
pixeltable/func/__init__.py +1 -0
pixeltable/func/aggregate_function.py +8 -8
pixeltable/func/callable_function.py +9 -9
pixeltable/func/expr_template_function.py +10 -10
pixeltable/func/function.py +18 -20
pixeltable/func/function_registry.py +6 -7
pixeltable/func/globals.py +2 -3
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +20 -18
pixeltable/func/signature.py +43 -16
pixeltable/func/tools.py +23 -13
pixeltable/func/udf.py +18 -20
pixeltable/functions/__init__.py +6 -0
pixeltable/functions/anthropic.py +93 -33
pixeltable/functions/audio.py +114 -10
pixeltable/functions/bedrock.py +13 -6
pixeltable/functions/date.py +1 -1
pixeltable/functions/deepseek.py +20 -9
pixeltable/functions/fireworks.py +2 -2
pixeltable/functions/gemini.py +28 -11
pixeltable/functions/globals.py +13 -13
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1046 -23
pixeltable/functions/image.py +9 -18
pixeltable/functions/llama_cpp.py +23 -8
pixeltable/functions/math.py +3 -4
pixeltable/functions/mistralai.py +4 -15
pixeltable/functions/ollama.py +16 -9
pixeltable/functions/openai.py +104 -82
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +2 -2
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +21 -28
pixeltable/functions/timestamp.py +13 -14
pixeltable/functions/together.py +4 -6
pixeltable/functions/twelvelabs.py +92 -0
pixeltable/functions/util.py +6 -1
pixeltable/functions/video.py +1388 -106
pixeltable/functions/vision.py +7 -7
pixeltable/functions/whisper.py +15 -7
pixeltable/functions/whisperx.py +179 -0
pixeltable/{ext/functions → functions}/yolox.py +2 -4
pixeltable/globals.py +332 -105
pixeltable/index/base.py +13 -22
pixeltable/index/btree.py +23 -22
pixeltable/index/embedding_index.py +32 -44
pixeltable/io/__init__.py +4 -2
pixeltable/io/datarows.py +7 -6
pixeltable/io/external_store.py +49 -77
pixeltable/io/fiftyone.py +11 -11
pixeltable/io/globals.py +29 -28
pixeltable/io/hf_datasets.py +17 -9
pixeltable/io/label_studio.py +70 -66
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +12 -11
pixeltable/io/parquet.py +13 -93
pixeltable/io/table_data_conduit.py +71 -47
pixeltable/io/utils.py +3 -3
pixeltable/iterators/__init__.py +2 -1
pixeltable/iterators/audio.py +21 -11
pixeltable/iterators/document.py +116 -55
pixeltable/iterators/image.py +5 -2
pixeltable/iterators/video.py +293 -13
pixeltable/metadata/__init__.py +4 -2
pixeltable/metadata/converters/convert_18.py +2 -2
pixeltable/metadata/converters/convert_19.py +2 -2
pixeltable/metadata/converters/convert_20.py +2 -2
pixeltable/metadata/converters/convert_21.py +2 -2
pixeltable/metadata/converters/convert_22.py +2 -2
pixeltable/metadata/converters/convert_24.py +2 -2
pixeltable/metadata/converters/convert_25.py +2 -2
pixeltable/metadata/converters/convert_26.py +2 -2
pixeltable/metadata/converters/convert_29.py +4 -4
pixeltable/metadata/converters/convert_34.py +2 -2
pixeltable/metadata/converters/convert_36.py +2 -2
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/util.py +13 -12
pixeltable/metadata/notes.py +4 -0
pixeltable/metadata/schema.py +79 -42
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +274 -223
pixeltable/share/__init__.py +1 -1
pixeltable/share/packager.py +259 -129
pixeltable/share/protocol/__init__.py +34 -0
pixeltable/share/protocol/common.py +170 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +109 -0
pixeltable/share/publish.py +213 -57
pixeltable/store.py +238 -175
pixeltable/type_system.py +104 -63
pixeltable/utils/__init__.py +2 -3
pixeltable/utils/arrow.py +108 -13
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +305 -0
pixeltable/utils/code.py +3 -3
pixeltable/utils/console_output.py +4 -1
pixeltable/utils/coroutine.py +6 -23
pixeltable/utils/dbms.py +31 -5
pixeltable/utils/description_helper.py +4 -5
pixeltable/utils/documents.py +5 -6
pixeltable/utils/exception_handler.py +7 -30
pixeltable/utils/filecache.py +6 -6
pixeltable/utils/formatter.py +4 -6
pixeltable/utils/gcs_store.py +283 -0
pixeltable/utils/http_server.py +2 -3
pixeltable/utils/iceberg.py +1 -2
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +88 -0
pixeltable/utils/local_store.py +316 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +528 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +5 -6
pixeltable/utils/s3_store.py +392 -0
pixeltable-0.4.20.dist-info/METADATA +587 -0
pixeltable-0.4.20.dist-info/RECORD +218 -0
{pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/ext/__init__.py +0 -17
pixeltable/ext/functions/__init__.py +0 -11
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/utils/media_store.py +0 -77
pixeltable/utils/s3.py +0 -17
pixeltable/utils/sample.py +0 -25
pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
{pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0

pixeltable/iterators/document.py CHANGED Viewed

@@ -1,13 +1,17 @@
 import dataclasses
 import enum
+import io
 import logging
-from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
+from typing import Any, ClassVar, Iterable, Iterator, Literal
+import fitz  # type: ignore[import-untyped]
 import ftfy
+import PIL.Image
+from bs4.element import NavigableString, Tag
 from pixeltable.env import Env
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
+from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
 from pixeltable.utils.documents import get_document_handle
 from .base import ComponentIterator
@@ -15,6 +19,11 @@ from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')
+class Element(enum.Enum):
+    TEXT = 1
+    IMAGE = 2
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -37,27 +46,28 @@ class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
     # html and markdown metadata
-    sourceline: Optional[int] = None
+    sourceline: int | None = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[dict[str, str]] = None
+    heading: dict[str, str] | None = None
     # pdf-specific metadata
-    page: Optional[int] = None
+    page: int | None = None
     # bounding box as an {x1, y1, x2, y2} dictionary
-    bounding_box: Optional[dict[str, float]] = None
+    bounding_box: dict[str, float] | None = None
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
-    text: Optional[str]
-    metadata: Optional[DocumentSectionMetadata]
+    text: str | None = None
+    image: PIL.Image.Image | None = None
+    metadata: DocumentSectionMetadata | None = None
 def _parse_separators(separators: str) -> list[Separator]:
-    ret = []
+    ret: list[Separator] = []
     for s in separators.split(','):
         clean_s = s.strip().upper()
         if not clean_s:
@@ -71,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
 def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
-    ret = []
+    ret: list[ChunkMetadata] = []
     for m in metadata.split(','):
         clean_m = m.strip().upper()
         if not clean_m:
@@ -84,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
     return ret
+def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+    result: list[Element] = []
+    for e in elements:
+        clean_e = e.strip().upper()
+        if clean_e not in Element.__members__:
+            raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+        result.append(Element[clean_e])
+    if len(result) == 0:
+        raise Error('elements cannot be empty')
+    return result
 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
@@ -94,6 +116,23 @@ class DocumentSplitter(ComponentIterator):
     include additional metadata fields if specified in the `metadata` parameter, as explained below.
     Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
+    How to init the `DocumentSplitter` class?
+    Args:
+        separators: separators to use to chunk the document. Options are:
+             `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+             This may be a comma-separated string, e.g., `'heading,token_limit'`.
+        elements: list of elements to extract from the document. Options are:
+            `'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
+            for the `'page'` separator on PDF documents.
+        limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+             or `'char_limit'` is specified.
+        metadata: additional metadata fields to include in the output. Options are:
+             `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+             (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+        image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
+        image_format: format to use when extracting images from PDFs. Defaults to 'png'.
     """
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -104,36 +143,41 @@ class DocumentSplitter(ComponentIterator):
         ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
+    _doc_handle: Any
+    _separators: list[Separator]
+    _elements: list[Element]
+    _metadata_fields: list[ChunkMetadata]
+    _doc_title: str
+    _limit: int
+    _skip_tags: list[str]
+    _overlap: int
+    _tiktoken_encoding: str | None
+    _tiktoken_target_model: str | None
+    _image_dpi: int
+    _image_format: str
+    _sections: Iterator[DocumentSection]
     def __init__(
         self,
         document: str,
         *,
         separators: str,
-        limit: Optional[int] = None,
-        overlap: Optional[int] = None,
+        elements: list[Literal['text', 'image']] | None = None,
+        limit: int | None = None,
+        overlap: int | None = None,
         metadata: str = '',
-        html_skip_tags: Optional[list[str]] = None,
-        tiktoken_encoding: Optional[str] = 'cl100k_base',
-        tiktoken_target_model: Optional[str] = None,
+        html_skip_tags: list[str] | None = None,
+        tiktoken_encoding: str | None = 'cl100k_base',
+        tiktoken_target_model: str | None = None,
+        image_dpi: int = 300,
+        image_format: str = 'png',
     ):
-        """Init method for `DocumentSplitter` class.
-        Args:
-            separators: separators to use to chunk the document. Options are:
-                 `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
-                 This may be a comma-separated string, e.g., `'heading,token_limit'`.
-            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
-                 or `'char_limit'` is specified.
-            metadata: additional metadata fields to include in the output. Options are:
-                 `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
-                 (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
-        """
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
+        self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
         assert self._doc_handle is not None
-        # calling the output_schema method to validate the input arguments
-        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
         self._separators = _parse_separators(separators)
         self._metadata_fields = _parse_metadata(metadata)
         if self._doc_handle.bs_doc is not None:
@@ -149,6 +193,8 @@ class DocumentSplitter(ComponentIterator):
         self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
+        self._image_dpi = image_dpi
+        self._image_format = image_format
         # set up processing pipeline
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -178,19 +224,28 @@ class DocumentSplitter(ComponentIterator):
         return {
             'document': DocumentType(nullable=False),
             'separators': StringType(nullable=False),
+            'elements': JsonType(nullable=False),
             'metadata': StringType(nullable=False),
             'limit': IntType(nullable=True),
             'overlap': IntType(nullable=True),
             'skip_tags': StringType(nullable=True),
             'tiktoken_encoding': StringType(nullable=True),
             'tiktoken_target_model': StringType(nullable=True),
+            'image_dpi': IntType(nullable=True),
+            'image_format': StringType(nullable=True),
         }
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema: dict[str, ColumnType] = {'text': StringType()}
-        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
+        schema: dict[str, ColumnType] = {}
+        elements = _parse_elements(kwargs.get('elements', ['text']))
+        for element in elements:
+            if element == Element.TEXT:
+                schema['text'] = StringType(nullable=False)
+            elif element == Element.IMAGE:
+                schema['image'] = ImageType(nullable=False)
+        md_fields = _parse_metadata(kwargs.get('metadata', ''))
         for md_field in md_fields:
             schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
@@ -200,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
         limit = kwargs.get('limit')
         overlap = kwargs.get('overlap')
+        if Element.IMAGE in elements and separators != [Separator.PAGE]:
+            raise Error('Image elements are only supported for the "page" separator on PDF documents')
         if limit is not None or overlap is not None:
             if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -213,7 +270,6 @@ class DocumentSplitter(ComponentIterator):
             if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
-        # check dependencies at the end
         if Separator.SENTENCE in separators:
             _ = Env.get().spacy_nlp
         if Separator.TOKEN_LIMIT in separators:
@@ -224,9 +280,15 @@ class DocumentSplitter(ComponentIterator):
     def __next__(self) -> dict[str, Any]:
         while True:
             section = next(self._sections)
-            if section.text is None:
+            if section.text is None and section.image is None:
                 continue
-            result: dict[str, Any] = {'text': section.text}
+            result: dict[str, Any] = {}
+            for element in self._elements:
+                if element == Element.TEXT:
+                    result['text'] = section.text
+                elif element == Element.IMAGE:
+                    result['image'] = section.image
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -238,6 +300,7 @@ class DocumentSplitter(ComponentIterator):
                     result[md_field.name.lower()] = section.metadata.page
                 elif md_field == ChunkMetadata.BOUNDING_BOX:
                     result[md_field.name.lower()] = section.metadata.bounding_box
             return result
     def _html_sections(self) -> Iterator[DocumentSection]:
@@ -273,7 +336,7 @@ class DocumentSplitter(ComponentIterator):
                 yield DocumentSection(text=full_text, metadata=md)
                 accumulated_text = []
-        def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
+        def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
             nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
@@ -361,43 +424,41 @@ class DocumentSplitter(ComponentIterator):
         yield from emit()
     def _pdf_sections(self) -> Iterator[DocumentSection]:
-        """Create DocumentSections reflecting the pdf-specific separators"""
-        import fitz  # type: ignore[import-untyped]
         doc: fitz.Document = self._doc_handle.pdf_doc
         assert doc is not None
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
-        accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+        accumulated_text: list[str] = []
-        def _add_cleaned_text(raw_text: str) -> None:
-            fixed = ftfy.fix_text(raw_text)
+        def _add_cleaned(raw: str) -> None:
+            fixed = ftfy.fix_text(raw)
             if fixed:
                 accumulated_text.append(fixed)
         def _emit_text() -> str:
-            full_text = ''.join(accumulated_text)
+            txt = ''.join(accumulated_text)
             accumulated_text.clear()
-            return full_text
+            return txt
+        for page_idx, page in enumerate(doc.pages()):
+            img: PIL.Image.Image | None = None
+            if Element.IMAGE in self._elements:
+                pix = page.get_pixmap(dpi=self._image_dpi)
+                img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))
-        for page_number, page in enumerate(doc.pages()):
             for block in page.get_text('blocks'):
-                # there is no concept of paragraph in pdf, block is the closest thing
-                # we can get (eg a paragraph in text may cut across pages)
-                # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
-                # other libraries like pdfminer also lack an explicit paragraph concept
-                x1, y1, x2, y2, text, _, _ = block
-                _add_cleaned_text(text)
+                x1, y1, x2, y2, text, *_ = block
+                _add_cleaned(text)
                 if accumulated_text and emit_on_paragraph:
                     bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
-                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
-                    yield DocumentSection(text=_emit_text(), metadata=metadata)
+                    md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
+                    yield DocumentSection(text=_emit_text(), metadata=md)
             if accumulated_text and emit_on_page and not emit_on_paragraph:
-                yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
-                accumulated_text = []
+                md = DocumentSectionMetadata(page=page_idx)
+                yield DocumentSection(text=_emit_text(), image=img, metadata=md)
         if accumulated_text and not emit_on_page:
             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())

pixeltable/iterators/image.py CHANGED Viewed

@@ -31,8 +31,7 @@ class TileIterator(ComponentIterator):
     __j: int
     def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
-        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
+        assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
         self.__image = image
         self.__image.load()
@@ -79,4 +78,8 @@ class TileIterator(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        tile_size = kwargs.get('tile_size')
+        overlap = kwargs.get('overlap', (0, 0))
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
         return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

Potentially problematic release.

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl