PyPI - pixeltable - Versions diffs - 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (245) hide show

pixeltable/__init__.py +83 -19
pixeltable/_query.py +1444 -0
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +7 -4
pixeltable/catalog/catalog.py +2394 -119
pixeltable/catalog/column.py +225 -104
pixeltable/catalog/dir.py +38 -9
pixeltable/catalog/globals.py +53 -34
pixeltable/catalog/insertable_table.py +265 -115
pixeltable/catalog/path.py +80 -17
pixeltable/catalog/schema_object.py +28 -43
pixeltable/catalog/table.py +1270 -677
pixeltable/catalog/table_metadata.py +103 -0
pixeltable/catalog/table_version.py +1270 -751
pixeltable/catalog/table_version_handle.py +109 -0
pixeltable/catalog/table_version_path.py +137 -42
pixeltable/catalog/tbl_ops.py +53 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +251 -134
pixeltable/config.py +215 -0
pixeltable/env.py +736 -285
pixeltable/exceptions.py +26 -2
pixeltable/exec/__init__.py +7 -2
pixeltable/exec/aggregation_node.py +39 -21
pixeltable/exec/cache_prefetch_node.py +87 -109
pixeltable/exec/cell_materialization_node.py +268 -0
pixeltable/exec/cell_reconstruction_node.py +168 -0
pixeltable/exec/component_iteration_node.py +25 -28
pixeltable/exec/data_row_batch.py +11 -46
pixeltable/exec/exec_context.py +26 -11
pixeltable/exec/exec_node.py +35 -27
pixeltable/exec/expr_eval/__init__.py +3 -0
pixeltable/exec/expr_eval/evaluators.py +365 -0
pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
pixeltable/exec/expr_eval/globals.py +200 -0
pixeltable/exec/expr_eval/row_buffer.py +74 -0
pixeltable/exec/expr_eval/schedulers.py +413 -0
pixeltable/exec/globals.py +35 -0
pixeltable/exec/in_memory_data_node.py +35 -27
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +44 -29
pixeltable/exec/sql_node.py +414 -115
pixeltable/exprs/__init__.py +8 -5
pixeltable/exprs/arithmetic_expr.py +79 -45
pixeltable/exprs/array_slice.py +5 -5
pixeltable/exprs/column_property_ref.py +40 -26
pixeltable/exprs/column_ref.py +254 -61
pixeltable/exprs/comparison.py +14 -9
pixeltable/exprs/compound_predicate.py +9 -10
pixeltable/exprs/data_row.py +213 -72
pixeltable/exprs/expr.py +270 -104
pixeltable/exprs/expr_dict.py +6 -5
pixeltable/exprs/expr_set.py +20 -11
pixeltable/exprs/function_call.py +383 -284
pixeltable/exprs/globals.py +18 -5
pixeltable/exprs/in_predicate.py +7 -7
pixeltable/exprs/inline_expr.py +37 -37
pixeltable/exprs/is_null.py +8 -4
pixeltable/exprs/json_mapper.py +120 -54
pixeltable/exprs/json_path.py +90 -60
pixeltable/exprs/literal.py +61 -16
pixeltable/exprs/method_ref.py +7 -6
pixeltable/exprs/object_ref.py +19 -8
pixeltable/exprs/row_builder.py +238 -75
pixeltable/exprs/rowid_ref.py +53 -15
pixeltable/exprs/similarity_expr.py +65 -50
pixeltable/exprs/sql_element_cache.py +5 -5
pixeltable/exprs/string_op.py +107 -0
pixeltable/exprs/type_cast.py +25 -13
pixeltable/exprs/variable.py +2 -2
pixeltable/func/__init__.py +9 -5
pixeltable/func/aggregate_function.py +197 -92
pixeltable/func/callable_function.py +119 -35
pixeltable/func/expr_template_function.py +101 -48
pixeltable/func/function.py +375 -62
pixeltable/func/function_registry.py +20 -19
pixeltable/func/globals.py +6 -5
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +151 -35
pixeltable/func/signature.py +178 -49
pixeltable/func/tools.py +164 -0
pixeltable/func/udf.py +176 -53
pixeltable/functions/__init__.py +44 -4
pixeltable/functions/anthropic.py +226 -47
pixeltable/functions/audio.py +148 -11
pixeltable/functions/bedrock.py +137 -0
pixeltable/functions/date.py +188 -0
pixeltable/functions/deepseek.py +113 -0
pixeltable/functions/document.py +81 -0
pixeltable/functions/fal.py +76 -0
pixeltable/functions/fireworks.py +72 -20
pixeltable/functions/gemini.py +249 -0
pixeltable/functions/globals.py +208 -53
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1088 -95
pixeltable/functions/image.py +155 -84
pixeltable/functions/json.py +8 -11
pixeltable/functions/llama_cpp.py +31 -19
pixeltable/functions/math.py +169 -0
pixeltable/functions/mistralai.py +50 -75
pixeltable/functions/net.py +70 -0
pixeltable/functions/ollama.py +29 -36
pixeltable/functions/openai.py +548 -160
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +15 -14
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +310 -85
pixeltable/functions/timestamp.py +37 -19
pixeltable/functions/together.py +77 -120
pixeltable/functions/twelvelabs.py +188 -0
pixeltable/functions/util.py +7 -2
pixeltable/functions/uuid.py +30 -0
pixeltable/functions/video.py +1528 -117
pixeltable/functions/vision.py +26 -26
pixeltable/functions/voyageai.py +289 -0
pixeltable/functions/whisper.py +19 -10
pixeltable/functions/whisperx.py +179 -0
pixeltable/functions/yolox.py +112 -0
pixeltable/globals.py +716 -236
pixeltable/index/__init__.py +3 -1
pixeltable/index/base.py +17 -21
pixeltable/index/btree.py +32 -22
pixeltable/index/embedding_index.py +155 -92
pixeltable/io/__init__.py +12 -7
pixeltable/io/datarows.py +140 -0
pixeltable/io/external_store.py +83 -125
pixeltable/io/fiftyone.py +24 -33
pixeltable/io/globals.py +47 -182
pixeltable/io/hf_datasets.py +96 -127
pixeltable/io/label_studio.py +171 -156
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +136 -115
pixeltable/io/parquet.py +40 -153
pixeltable/io/table_data_conduit.py +702 -0
pixeltable/io/utils.py +100 -0
pixeltable/iterators/__init__.py +8 -4
pixeltable/iterators/audio.py +207 -0
pixeltable/iterators/base.py +9 -3
pixeltable/iterators/document.py +144 -87
pixeltable/iterators/image.py +17 -38
pixeltable/iterators/string.py +15 -12
pixeltable/iterators/video.py +523 -127
pixeltable/metadata/__init__.py +33 -8
pixeltable/metadata/converters/convert_10.py +2 -3
pixeltable/metadata/converters/convert_13.py +2 -2
pixeltable/metadata/converters/convert_15.py +15 -11
pixeltable/metadata/converters/convert_16.py +4 -5
pixeltable/metadata/converters/convert_17.py +4 -5
pixeltable/metadata/converters/convert_18.py +4 -6
pixeltable/metadata/converters/convert_19.py +6 -9
pixeltable/metadata/converters/convert_20.py +3 -6
pixeltable/metadata/converters/convert_21.py +6 -8
pixeltable/metadata/converters/convert_22.py +3 -2
pixeltable/metadata/converters/convert_23.py +33 -0
pixeltable/metadata/converters/convert_24.py +55 -0
pixeltable/metadata/converters/convert_25.py +19 -0
pixeltable/metadata/converters/convert_26.py +23 -0
pixeltable/metadata/converters/convert_27.py +29 -0
pixeltable/metadata/converters/convert_28.py +13 -0
pixeltable/metadata/converters/convert_29.py +110 -0
pixeltable/metadata/converters/convert_30.py +63 -0
pixeltable/metadata/converters/convert_31.py +11 -0
pixeltable/metadata/converters/convert_32.py +15 -0
pixeltable/metadata/converters/convert_33.py +17 -0
pixeltable/metadata/converters/convert_34.py +21 -0
pixeltable/metadata/converters/convert_35.py +9 -0
pixeltable/metadata/converters/convert_36.py +38 -0
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/convert_41.py +12 -0
pixeltable/metadata/converters/convert_42.py +9 -0
pixeltable/metadata/converters/convert_43.py +44 -0
pixeltable/metadata/converters/util.py +44 -18
pixeltable/metadata/notes.py +21 -0
pixeltable/metadata/schema.py +185 -42
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +616 -225
pixeltable/share/__init__.py +3 -0
pixeltable/share/packager.py +797 -0
pixeltable/share/protocol/__init__.py +33 -0
pixeltable/share/protocol/common.py +165 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +119 -0
pixeltable/share/publish.py +349 -0
pixeltable/store.py +398 -232
pixeltable/type_system.py +730 -267
pixeltable/utils/__init__.py +40 -0
pixeltable/utils/arrow.py +201 -29
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +346 -0
pixeltable/utils/coco.py +26 -27
pixeltable/utils/code.py +4 -4
pixeltable/utils/console_output.py +46 -0
pixeltable/utils/coroutine.py +24 -0
pixeltable/utils/dbms.py +92 -0
pixeltable/utils/description_helper.py +11 -12
pixeltable/utils/documents.py +60 -61
pixeltable/utils/exception_handler.py +36 -0
pixeltable/utils/filecache.py +38 -22
pixeltable/utils/formatter.py +88 -51
pixeltable/utils/gcs_store.py +295 -0
pixeltable/utils/http.py +133 -0
pixeltable/utils/http_server.py +14 -13
pixeltable/utils/iceberg.py +13 -0
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +90 -0
pixeltable/utils/local_store.py +322 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +573 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +20 -20
pixeltable/utils/s3_store.py +527 -0
pixeltable/utils/sql.py +32 -5
pixeltable/utils/system.py +30 -0
pixeltable/utils/transactional_directory.py +4 -3
pixeltable-0.5.7.dist-info/METADATA +579 -0
pixeltable-0.5.7.dist-info/RECORD +227 -0
{pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/catalog/named_function.py +0 -36
pixeltable/catalog/path_dict.py +0 -141
pixeltable/dataframe.py +0 -894
pixeltable/exec/expr_eval_node.py +0 -232
pixeltable/ext/__init__.py +0 -14
pixeltable/ext/functions/__init__.py +0 -8
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/ext/functions/yolox.py +0 -157
pixeltable/tool/create_test_db_dump.py +0 -311
pixeltable/tool/create_test_video.py +0 -81
pixeltable/tool/doc_plugins/griffe.py +0 -50
pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
pixeltable/tool/embed_udf.py +0 -9
pixeltable/tool/mypy_plugin.py +0 -55
pixeltable/utils/media_store.py +0 -76
pixeltable/utils/s3.py +0 -16
pixeltable-0.2.26.dist-info/METADATA +0 -400
pixeltable-0.2.26.dist-info/RECORD +0 -156
pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
{pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0

pixeltable/iterators/document.py CHANGED Viewed

@@ -1,13 +1,17 @@
 import dataclasses
 import enum
 import logging
-from typing import Any, Iterable, Iterator, Optional, Union
+from typing import Any, ClassVar, Iterable, Iterator, Literal
 import ftfy
+import PIL.Image
+from bs4.element import NavigableString, Tag
+from deprecated import deprecated
+from pypdfium2 import PdfDocument  # type: ignore[import-untyped]
 from pixeltable.env import Env
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
+from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
 from pixeltable.utils.documents import get_document_handle
 from .base import ComponentIterator
@@ -15,6 +19,11 @@ from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')
+class Element(enum.Enum):
+    TEXT = 1
+    IMAGE = 2
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -35,27 +44,30 @@ class Separator(enum.Enum):
 @dataclasses.dataclass
 class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
     # html and markdown metadata
-    sourceline: Optional[int] = None
+    sourceline: int | None = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[dict[str, str]] = None
+    heading: dict[str, str] | None = None
     # pdf-specific metadata
-    page: Optional[int] = None
+    page: int | None = None
     # bounding box as an {x1, y1, x2, y2} dictionary
-    bounding_box: Optional[dict[str, float]] = None
+    bounding_box: dict[str, float] | None = None
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
-    text: Optional[str]
-    metadata: Optional[DocumentSectionMetadata]
+    text: str | None = None
+    image: PIL.Image.Image | None = None
+    metadata: DocumentSectionMetadata | None = None
 def _parse_separators(separators: str) -> list[Separator]:
-    ret = []
+    ret: list[Separator] = []
     for s in separators.split(','):
         clean_s = s.strip().upper()
         if not clean_s:
@@ -69,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
 def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
-    ret = []
+    ret: list[ChunkMetadata] = []
     for m in metadata.split(','):
         clean_m = m.strip().upper()
         if not clean_m:
@@ -82,18 +94,23 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
     return ret
-_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
+def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+    result: list[Element] = []
+    for e in elements:
+        clean_e = e.strip().upper()
+        if clean_e not in Element.__members__:
+            raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+        result.append(Element[clean_e])
+    if len(result) == 0:
+        raise Error('elements cannot be empty')
+    return result
-class DocumentSplitter(ComponentIterator):
-    """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
+_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
-    The iterator yields a `text` field containing the text of the chunk, and it may also
-    include additional metadata fields if specified in the `metadata` parameter, as explained below.
-    Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
-    """
-    METADATA_COLUMN_TYPES = {
+class DocumentSplitter(ComponentIterator):
+    METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
         ChunkMetadata.TITLE: StringType(nullable=True),
         ChunkMetadata.HEADING: JsonType(nullable=True),
         ChunkMetadata.SOURCELINE: IntType(nullable=True),
@@ -101,30 +118,41 @@ class DocumentSplitter(ComponentIterator):
         ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
+    _doc_handle: Any
+    _separators: list[Separator]
+    _elements: list[Element]
+    _metadata_fields: list[ChunkMetadata]
+    _doc_title: str
+    _limit: int
+    _skip_tags: list[str]
+    _overlap: int
+    _tiktoken_encoding: str | None
+    _tiktoken_target_model: str | None
+    _image_dpi: int
+    _image_format: str
+    _sections: Iterator[DocumentSection]
     def __init__(
-            self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
-            metadata: str = '',
-            html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
-            tiktoken_target_model: Optional[str] = None
+        self,
+        document: str,
+        *,
+        separators: str,
+        elements: list[Literal['text', 'image']] | None = None,
+        limit: int | None = None,
+        overlap: int | None = None,
+        metadata: str = '',
+        skip_tags: list[str] | None = None,
+        tiktoken_encoding: str | None = 'cl100k_base',
+        tiktoken_target_model: str | None = None,
+        image_dpi: int = 300,
+        image_format: str = 'png',
     ):
-        """Init method for `DocumentSplitter` class.
-        Args:
-            separators: separators to use to chunk the document. Options are:
-                 `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
-                 This may be a comma-separated string, e.g., `'heading,token_limit'`.
-            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
-                 or `'char_limit'` is specified.
-            metadata: additional metadata fields to include in the output. Options are:
-                 `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
-                 (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
-        """
-        if html_skip_tags is None:
-            html_skip_tags = ['nav']
+        if skip_tags is None:
+            skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
+        self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
         assert self._doc_handle is not None
-        # calling the output_schema method to validate the input arguments
-        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
         self._separators = _parse_separators(separators)
         self._metadata_fields = _parse_metadata(metadata)
         if self._doc_handle.bs_doc is not None:
@@ -136,10 +164,12 @@ class DocumentSplitter(ComponentIterator):
         else:
             self._doc_title = ''
         self._limit = 0 if limit is None else limit
-        self._skip_tags = html_skip_tags
+        self._skip_tags = skip_tags
         self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
+        self._image_dpi = image_dpi
+        self._image_format = image_format
         # set up processing pipeline
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -151,8 +181,11 @@ class DocumentSplitter(ComponentIterator):
         elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
             assert self._doc_handle.pdf_doc is not None
             self._sections = self._pdf_sections()
+        elif self._doc_handle.format == DocumentType.DocumentFormat.TXT:
+            assert self._doc_handle.txt_doc is not None
+            self._sections = self._txt_sections()
         else:
-            assert False, f'Unsupported document format: {self._doc_handle.format}'
+            raise AssertionError(f'Unsupported document format: {self._doc_handle.format}')
         if Separator.SENTENCE in self._separators:
             self._sections = self._sentence_sections(self._sections)
@@ -166,19 +199,28 @@ class DocumentSplitter(ComponentIterator):
         return {
             'document': DocumentType(nullable=False),
             'separators': StringType(nullable=False),
+            'elements': JsonType(nullable=False),
             'metadata': StringType(nullable=False),
             'limit': IntType(nullable=True),
             'overlap': IntType(nullable=True),
             'skip_tags': StringType(nullable=True),
             'tiktoken_encoding': StringType(nullable=True),
             'tiktoken_target_model': StringType(nullable=True),
+            'image_dpi': IntType(nullable=True),
+            'image_format': StringType(nullable=True),
         }
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema: dict[str, ColumnType] = {'text': StringType()}
-        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
+        schema: dict[str, ColumnType] = {}
+        elements = _parse_elements(kwargs.get('elements', ['text']))
+        for element in elements:
+            if element == Element.TEXT:
+                schema['text'] = StringType(nullable=False)
+            elif element == Element.IMAGE:
+                schema['image'] = ImageType(nullable=False)
+        md_fields = _parse_metadata(kwargs.get('metadata', ''))
         for md_field in md_fields:
             schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
@@ -188,6 +230,8 @@ class DocumentSplitter(ComponentIterator):
         limit = kwargs.get('limit')
         overlap = kwargs.get('overlap')
+        if Element.IMAGE in elements and separators != [Separator.PAGE]:
+            raise Error('Image elements are only supported for the "page" separator on PDF documents')
         if limit is not None or overlap is not None:
             if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -201,9 +245,8 @@ class DocumentSplitter(ComponentIterator):
             if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
-        # check dependencies at the end
         if Separator.SENTENCE in separators:
-            Env.get().require_package('spacy')
+            _ = Env.get().spacy_nlp
         if Separator.TOKEN_LIMIT in separators:
             Env.get().require_package('tiktoken')
@@ -212,9 +255,15 @@ class DocumentSplitter(ComponentIterator):
     def __next__(self) -> dict[str, Any]:
         while True:
             section = next(self._sections)
-            if section.text is None:
+            if section.text is None and section.image is None:
                 continue
-            result: dict[str, Any] = {'text': section.text}
+            result: dict[str, Any] = {}
+            for element in self._elements:
+                if element == Element.TEXT:
+                    result['text'] = section.text
+                elif element == Element.IMAGE:
+                    result['image'] = section.image
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -226,18 +275,20 @@ class DocumentSplitter(ComponentIterator):
                     result[md_field.name.lower()] = section.metadata.page
                 elif md_field == ChunkMetadata.BOUNDING_BOX:
                     result[md_field.name.lower()] = section.metadata.bounding_box
             return result
     def _html_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the html-specific separators"""
         import bs4
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
         accumulated_text: list[str] = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: dict[str, str] = {}   # current state of observed headings (level -> text)
+        headings: dict[str, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline
         def update_metadata(el: bs4.Tag) -> None:
@@ -246,9 +297,9 @@ class DocumentSplitter(ComponentIterator):
             sourceline = el.sourceline
             if el.name in _HTML_HEADINGS:
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings if l > el.name]
-                for l in lower_levels:
-                    del headings[l]
+                lower_levels = [lv for lv in headings if lv > el.name]
+                for lv in lower_levels:
+                    del headings[lv]
                 headings[el.name] = el.get_text().strip()
         def emit() -> Iterator[DocumentSection]:
@@ -260,7 +311,7 @@ class DocumentSplitter(ComponentIterator):
                 yield DocumentSection(text=full_text, metadata=md)
                 accumulated_text = []
-        def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
+        def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
             nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
@@ -297,7 +348,7 @@ class DocumentSplitter(ComponentIterator):
         # current state
         accumulated_text: list[str] = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: dict[str, str] = {}   # current state of observed headings (level -> text)
+        headings: dict[str, str] = {}  # current state of observed headings (level -> text)
         def update_headings(heading: dict) -> None:
             # update current state
@@ -307,9 +358,9 @@ class DocumentSplitter(ComponentIterator):
             level = f'h{lint}'
             text = heading['children'][0]['raw'].strip()
             # remove the previously seen lower levels
-            lower_levels = [l for l in headings.keys() if l > level]
-            for l in lower_levels:
-                del headings[l]
+            lower_levels = [lv for lv in headings if lv > level]
+            for lv in lower_levels:
+                del headings[lv]
             headings[level] = text
         def emit() -> Iterator[DocumentSection]:
@@ -348,47 +399,48 @@ class DocumentSplitter(ComponentIterator):
         yield from emit()
     def _pdf_sections(self) -> Iterator[DocumentSection]:
-        """Create DocumentSections reflecting the pdf-specific separators"""
-        import fitz  # type: ignore[import-untyped]
-        doc: fitz.Document = self._doc_handle.pdf_doc
-        assert doc is not None
+        if Separator.PARAGRAPH in self._separators:
+            raise Error(
+                'Paragraph splitting is not currently supported for PDF documents. Please contact'
+                ' us at https://github.com/pixeltable/pixeltable/issues if you need this feature.'
+            )
-        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
-        emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
+        doc: PdfDocument = self._doc_handle.pdf_doc
+        assert isinstance(doc, PdfDocument)
-        accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+        emit_on_page = Separator.PAGE in self._separators
+        accumulated_text: list[str] = []
-        def _add_cleaned_text(raw_text: str) -> None:
-            fixed = ftfy.fix_text(raw_text)
+        def _add_cleaned(raw: str) -> None:
+            fixed = ftfy.fix_text(raw)
             if fixed:
                 accumulated_text.append(fixed)
         def _emit_text() -> str:
-            full_text = ''.join(accumulated_text)
+            txt = ''.join(accumulated_text)
             accumulated_text.clear()
-            return full_text
-        for page_number, page in enumerate(doc.pages()):
-            for block in page.get_text('blocks'):
-                # there is no concept of paragraph in pdf, block is the closest thing
-                # we can get (eg a paragraph in text may cut across pages)
-                # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
-                # other libraries like pdfminer also lack an explicit paragraph concept
-                x1, y1, x2, y2, text, _, _ = block
-                _add_cleaned_text(text)
-                if accumulated_text and emit_on_paragraph:
-                    bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
-                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
-                    yield DocumentSection(text=_emit_text(), metadata=metadata)
-            if accumulated_text and emit_on_page and not emit_on_paragraph:
-                yield DocumentSection(text=_emit_text(),
-                                      metadata=DocumentSectionMetadata(page=page_number))
-                accumulated_text = []
+            return txt
+        for page_idx, page in enumerate(doc):
+            img = page.render().to_pil() if Element.IMAGE in self._elements else None
+            text = page.get_textpage().get_text_bounded()
+            _add_cleaned(text)
+            if accumulated_text and emit_on_page:
+                md = DocumentSectionMetadata(page=page_idx)
+                yield DocumentSection(text=_emit_text(), image=img, metadata=md)
         if accumulated_text and not emit_on_page:
             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
+    def _txt_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections for text files.
+        Currently, it returns the entire text as a single section.
+        TODO: Add support for paragraphs.
+        """
+        assert self._doc_handle.txt_doc is not None
+        yield DocumentSection(text=ftfy.fix_text(self._doc_handle.txt_doc), metadata=DocumentSectionMetadata())
     def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         """Split the input sections into sentences"""
         for section in input_sections:
@@ -399,6 +451,7 @@ class DocumentSplitter(ComponentIterator):
     def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         import tiktoken
         if self._tiktoken_target_model is not None:
             encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
         else:
@@ -442,5 +495,9 @@ class DocumentSplitter(ComponentIterator):
     def close(self) -> None:
         pass
-    def set_pos(self, pos: int) -> None:
-        pass
+    @classmethod
+    @deprecated(
+        'create() is deprecated; use `pixeltable.functions.document.document_splitter` instead', version='0.5.6'
+    )
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return super()._create(**kwargs)

pixeltable/iterators/image.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from typing import Any, Sequence
 import PIL.Image
+from deprecated import deprecated
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
@@ -8,18 +9,6 @@ from pixeltable.iterators.base import ComponentIterator
 class TileIterator(ComponentIterator):
-    """
-    Iterator over tiles of an image. Each image will be divided into tiles of size `tile_size`, and the tiles will be
-    iterated over in row-major order (left-to-right, then top-to-bottom). An optional `overlap` parameter may be
-    specified. If the tiles do not exactly cover the image, then the rightmost and bottommost tiles will be padded with
-    blackspace, so that the output images all have the exact size `tile_size`.
-    Args:
-        image: Image to split into tiles.
-        tile_size: Size of each tile, as a pair of integers `[width, height]`.
-        overlap: Amount of overlap between adjacent tiles, as a pair of integers `[width, height]`.
-    """
     __image: PIL.Image.Image
     __tile_size: Sequence[int]
     __overlap: Sequence[int]
@@ -30,15 +19,8 @@ class TileIterator(ComponentIterator):
     __i: int
     __j: int
-    def __init__(
-        self,
-        image: PIL.Image.Image,
-        *,
-        tile_size: tuple[int, int],
-        overlap: tuple[int, int] = (0, 0),
-    ):
-        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f"overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}")
+    def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
+        assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
         self.__image = image
         self.__image.load()
@@ -64,11 +46,7 @@ class TileIterator(ComponentIterator):
         x2 = x1 + self.__tile_size[0]
         y2 = y1 + self.__tile_size[1]
         tile = self.__image.crop((x1, y1, x2, y2))
-        result = {
-            'tile': tile,
-            'tile_coord': [self.__i, self.__j],
-            'tile_box': [x1, y1, x2, y2]
-        }
+        result = {'tile': tile, 'tile_coord': [self.__i, self.__j], 'tile_box': [x1, y1, x2, y2]}
         self.__i += 1
         if self.__i >= self.__xlen:
@@ -79,22 +57,23 @@ class TileIterator(ComponentIterator):
     def close(self) -> None:
         pass
-    def set_pos(self, pos: int) -> None:
+    def set_pos(self, pos: int, **kwargs: Any) -> None:
         self.__j = pos // self.__xlen
         self.__i = pos % self.__xlen
     @classmethod
     def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
-        return {
-            'image': ts.ImageType(),
-            'tile_size': ts.JsonType(),
-            'overlap': ts.JsonType(),
-        }
+        return {'image': ts.ImageType(), 'tile_size': ts.JsonType(), 'overlap': ts.JsonType()}
+    @classmethod
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        tile_size = kwargs.get('tile_size')
+        overlap = kwargs.get('overlap', (0, 0))
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
+        return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
     @classmethod
-    def output_schema(cls,  *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        return {
-            'tile': ts.ImageType(),
-            'tile_coord': ts.JsonType(),
-            'tile_box': ts.JsonType(),
-        }, ['tile']
+    @deprecated('create() is deprecated; use `pixeltable.functions.image.tile_iterator` instead', version='0.5.6')
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return super()._create(**kwargs)

pixeltable/iterators/string.py CHANGED Viewed

@@ -1,13 +1,17 @@
-from typing import Iterator, Any
+from typing import Any, Iterator
-import pixeltable.exceptions as excs
-import pixeltable.type_system as ts
+from deprecated import deprecated
+from pixeltable import exceptions as excs, type_system as ts
 from pixeltable.env import Env
 from pixeltable.iterators.base import ComponentIterator
 class StringSplitter(ComponentIterator):
-    # TODO(aaron-siegel): Merge this with `DocumentSplitter` in order to provide additional capabilities.
+    _text: str
+    doc: Any  # spacy doc
+    iter: Iterator[dict[str, Any]]
     def __init__(self, text: str, *, separators: str):
         if separators != 'sentence':
             raise excs.Error('Only `sentence` separators are currently supported.')
@@ -25,16 +29,15 @@ class StringSplitter(ComponentIterator):
     def close(self) -> None:
         pass
-    def set_pos(self, pos: int) -> None:
-        pass
     @classmethod
     def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
-        return {
-            'text': ts.StringType(),
-            'separators': ts.StringType(),
-        }
+        return {'text': ts.StringType(), 'separators': ts.StringType()}
     @classmethod
-    def output_schema(cls,  *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
         return {'text': ts.StringType()}, []
+    @classmethod
+    @deprecated('create() is deprecated; use `pixeltable.functions.string.string_splitter` instead', version='0.5.6')
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return super()._create(**kwargs)

pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl