pixeltable 0.4.16 → 0.4.18 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -2,10 +2,11 @@
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
 """
 
+import glob
 import logging
 import pathlib
 import subprocess
-from typing import Literal, NoReturn
+from typing import Any, Literal, NoReturn
 
 import av
 import av.stream
@@ -305,7 +306,14 @@ def _handle_ffmpeg_error(e: subprocess.CalledProcessError) -> NoReturn:
 
 @pxt.udf(is_method=True)
 def clip(
-    video: pxt.Video, *, start_time: float, end_time: float | None = None, duration: float | None = None
+    video: pxt.Video,
+    *,
+    start_time: float,
+    end_time: float | None = None,
+    duration: float | None = None,
+    mode: Literal['fast', 'accurate'] = 'accurate',
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
 ) -> pxt.Video | None:
     """
     Extract a clip from a video, specified by `start_time` and either `end_time` or `duration` (in seconds).
@@ -322,6 +330,14 @@ def clip(
         start_time: Start time in seconds
         end_time: End time in seconds
         duration: Duration of the clip in seconds
+        mode:
+
+            - `'fast'`: avoids re-encoding, but starts the clip at the nearest keyframe; as a result, the clip
+              duration will be slightly longer than requested
+            - `'accurate'`: extracts a frame-accurate clip, but requires re-encoding
+        video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+            Only available for `mode='accurate'`.
+        video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
 
     Returns:
         New video containing only the specified time range, or None if start_time is beyond the end of the video.
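The new parameters compose as follows; this is a usage sketch, not part of the diff (the table `tbl` and its `video` column are assumptions carried over from the docstring examples, and `libx264` with a CRF of 23 is just one plausible encoder choice):

>>> # frame-accurate clip, re-encoded with explicit encoder settings
>>> tbl.select(
...     tbl.video.clip(
...         start_time=10.0,
...         duration=5.0,
...         mode='accurate',
...         video_encoder='libx264',
...         video_encoder_args={'crf': 23, 'preset': 'slow'},
...     )
... ).collect()
>>> # keyframe-aligned clip without re-encoding; may run slightly longer than 5s
>>> tbl.select(tbl.video.clip(start_time=10.0, duration=5.0, mode='fast')).collect()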
@@ -335,6 +351,11 @@ def clip(
         raise pxt.Error(f'duration must be positive, got {duration}')
     if end_time is not None and duration is not None:
         raise pxt.Error('end_time and duration cannot both be specified')
+    if mode == 'fast':
+        if video_encoder is not None:
+            raise pxt.Error("video_encoder is not supported for mode='fast'")
+        if video_encoder_args is not None:
+            raise pxt.Error("video_encoder_args is not supported for mode='fast'")
 
     video_duration = av_utils.get_video_duration(video)
     if video_duration is not None and start_time > video_duration:
@@ -344,7 +365,15 @@ def clip(
 
     if end_time is not None:
         duration = end_time - start_time
-    cmd = av_utils.ffmpeg_clip_cmd(str(video), output_path, start_time, duration)
+    cmd = av_utils.ffmpeg_clip_cmd(
+        str(video),
+        output_path,
+        start_time,
+        duration,
+        fast=(mode == 'fast'),
+        video_encoder=video_encoder,
+        video_encoder_args=video_encoder_args,
+    )
 
     try:
         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
@@ -358,9 +387,17 @@ def clip(
 
 
 @pxt.udf(is_method=True)
-def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
+def segment_video(
+    video: pxt.Video,
+    *,
+    duration: float | None = None,
+    segment_times: list[float] | None = None,
+    mode: Literal['fast', 'accurate'] = 'accurate',
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
     """
-    Split a video into fixed-size segments.
+    Split a video into segments.
 
     __Requirements:__
 
@@ -368,7 +405,19 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
 
     Args:
         video: Input video file to segment
-        duration: Approximate duration of each segment (in seconds).
+        duration: Duration of each segment (in seconds). For `mode='fast'`, this is approximate;
+            for `mode='accurate'`, segments will have exact durations. Cannot be specified together with
+            `segment_times`.
+        segment_times: List of timestamps (in seconds) in the video at which to split. Note that these are not
+            segment durations. If all segment times are less than the duration of the video, produces exactly
+            `len(segment_times) + 1` segments. Cannot be empty or be specified together with `duration`.
+        mode: Segmentation mode:
+
+            - `'fast'`: quick segmentation using stream copy (splits only at keyframes, approximate durations)
+            - `'accurate'`: precise segmentation with re-encoding (exact durations, slower)
+        video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+            Only available for `mode='accurate'`.
+        video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
 
     Returns:
         List of file paths for the generated video segments.
@@ -377,45 +426,105 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
         pxt.Error: If the video is missing timing information.
 
     Examples:
-        Split a video at 1-minute intervals
+        Split a video at 1-minute intervals using fast mode:
 
-        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()
+        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60, mode='fast')).collect()
+
+        Split a video into exact 10-second segments with the default accurate mode, using the libx264 encoder
+        with a CRF of 23 and the slow preset (for smaller output files):
+
+        >>> tbl.select(
+        ...     segment_paths=tbl.video.segment_video(
+        ...         duration=10,
+        ...         video_encoder='libx264',
+        ...         video_encoder_args={'crf': 23, 'preset': 'slow'}
+        ...     )
+        ... ).collect()
 
         Split video into two parts at the midpoint:
 
         >>> duration = tbl.video.get_duration()
-        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=duration / 2 + 1)).collect()
+        >>> tbl.select(segment_paths=tbl.video.segment_video(segment_times=[duration / 2])).collect()
     """
     Env.get().require_binary('ffmpeg')
-    if duration <= 0:
+    if duration is not None and segment_times is not None:
+        raise pxt.Error('duration and segment_times cannot both be specified')
+    if duration is not None and duration <= 0:
         raise pxt.Error(f'duration must be positive, got {duration}')
+    if segment_times is not None and len(segment_times) == 0:
+        raise pxt.Error('segment_times cannot be empty')
+    if mode == 'fast':
+        if video_encoder is not None:
+            raise pxt.Error("video_encoder is not supported for mode='fast'")
+        if video_encoder_args is not None:
+            raise pxt.Error("video_encoder_args is not supported for mode='fast'")
 
     base_path = TempStore.create_path(extension='')
 
-    # we extract consecutive clips instead of running ffmpeg -f segment, which is inexplicably much slower
-    start_time = 0.0
-    result: list[str] = []
-    try:
-        while True:
-            segment_path = f'{base_path}_segment_{len(result)}.mp4'
-            cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, duration)
+    output_paths: list[str] = []
+    if mode == 'accurate':
+        # Use ffmpeg -f segment for accurate segmentation with re-encoding
+        output_pattern = f'{base_path}_segment_%04d.mp4'
+        cmd = av_utils.ffmpeg_segment_cmd(
+            str(video),
+            output_pattern,
+            segment_duration=duration,
+            segment_times=segment_times,
+            video_encoder=video_encoder,
+            video_encoder_args=video_encoder_args,
+        )
 
+        try:
             _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
-            segment_duration = av_utils.get_video_duration(segment_path)
-            if segment_duration == 0.0:
-                # we're done
-                pathlib.Path(segment_path).unlink()
-                return result
-            result.append(segment_path)
-            start_time += segment_duration  # use the actual segment duration here, it won't match duration exactly
+            output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
+            # TODO: is this actually an error?
+            # if len(output_paths) == 0:
+            #     stderr_output = result.stderr.strip() if result.stderr is not None else ''
+            #     raise pxt.Error(
+            #         f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
+            #     )
+            return output_paths
+
+        except subprocess.CalledProcessError as e:
+            _handle_ffmpeg_error(e)
 
-        return result
-
-    except subprocess.CalledProcessError as e:
-        # clean up partial results
-        for segment_path in result:
-            pathlib.Path(segment_path).unlink()
-        _handle_ffmpeg_error(e)
+    else:
+        # Fast mode: extract consecutive clips using stream copy (no re-encoding).
+        # This is faster but can only split at keyframes, leading to approximate durations.
+        start_time = 0.0
+        segment_idx = 0
+        try:
+            while True:
+                target_duration: float | None
+                if duration is not None:
+                    target_duration = duration
+                elif segment_idx < len(segment_times):
+                    target_duration = segment_times[segment_idx] - start_time
+                else:
+                    target_duration = None  # the rest
+                segment_path = f'{base_path}_segment_{len(output_paths)}.mp4'
+                cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, target_duration)
 
+                _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+                segment_duration = av_utils.get_video_duration(segment_path)
+                if segment_duration == 0.0:
+                    # we're done
+                    pathlib.Path(segment_path).unlink()
+                    return output_paths
+                output_paths.append(segment_path)
+                start_time += segment_duration  # use the actual segment duration here, it won't match duration exactly
+
+                segment_idx += 1
+                if segment_times is not None and segment_idx > len(segment_times):
+                    break
+
+            return output_paths
+
+        except subprocess.CalledProcessError as e:
+            # clean up partial results
+            for segment_path in output_paths:
+                pathlib.Path(segment_path).unlink()
+            _handle_ffmpeg_error(e)
 
 
 @pxt.udf(is_method=True)
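A quick sketch of the new `segment_times` path (again assuming a table `tbl` with a `video` column, as in the docstring examples):

>>> # split at the 30s and 90s marks: up to three segments, stream-copied at keyframes
>>> tbl.select(tbl.video.segment_video(segment_times=[30.0, 90.0], mode='fast')).collect()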
pixeltable/globals.py CHANGED
@@ -487,12 +487,28 @@ def get_table(path: str, if_not_exists: Literal['error', 'ignore'] = 'error') ->
     return tbl
 
 
-def move(path: str, new_path: str) -> None:
+def move(
+    path: str,
+    new_path: str,
+    *,
+    if_exists: Literal['error', 'ignore'] = 'error',
+    if_not_exists: Literal['error', 'ignore'] = 'error',
+) -> None:
     """Move a schema object to a new directory and/or rename a schema object.
 
     Args:
         path: absolute path to the existing schema object.
         new_path: absolute new path for the schema object.
+        if_exists: Directive regarding how to handle if a schema object already exists at the new path.
+            Must be one of the following:
+
+            - `'error'`: raise an error
+            - `'ignore'`: do nothing and return
+        if_not_exists: Directive regarding how to handle if the source path does not exist.
+            Must be one of the following:
+
+            - `'error'`: raise an error
+            - `'ignore'`: do nothing and return
 
     Raises:
         Error: If path does not exist or new_path already exists.
@@ -506,13 +522,16 @@ def move(path: str, new_path: str) -> None:
 
         >>>> pxt.move('dir1.my_table', 'dir1.new_name')
     """
+    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
+    if if_exists_ not in (catalog.IfExistsParam.ERROR, catalog.IfExistsParam.IGNORE):
+        raise excs.Error("`if_exists` must be one of 'error' or 'ignore'")
+    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
    if path == new_path:
        raise excs.Error('move(): source and destination cannot be identical')
    path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
    if path_obj.is_ancestor(new_path_obj):
        raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
-    cat = Catalog.get()
-    cat.move(path_obj, new_path_obj)
+    Catalog.get().move(path_obj, new_path_obj, if_exists_, if_not_exists_)
 
 
 def drop_table(
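At the call site, the new directives read as follows (a sketch; the paths are made up):

>>> # succeeds even if the source is already gone or the destination already exists
>>> pxt.move('dir1.old_name', 'dir1.new_name', if_exists='ignore', if_not_exists='ignore')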
@@ -660,7 +679,7 @@ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths:
 
 
 def create_dir(
-    path: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
+    path: str, *, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
 ) -> Optional[catalog.Dir]:
     """Create a directory.
 
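The added `*` makes `if_exists` and `parents` keyword-only, so callers must now spell them out (a sketch with a made-up path):

>>> pxt.create_dir('dir1.subdir', if_exists='ignore', parents=True)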
pixeltable/io/globals.py CHANGED
@@ -152,7 +152,7 @@ def export_images_as_fo_dataset(
     (or expression) containing image data, along with optional additional columns containing labels. Currently, only
     classification and detection labels are supported.
 
-    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
+    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial contains a
     fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.
 
     Images in the dataset that already exist on disk will be exported directly, in whatever format they
@@ -211,7 +211,7 @@ def export_images_as_fo_dataset(
         ...     classifications=tbl.classifications
         ... )
 
-    See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
+    See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial
     for a fully worked example.
     """
     Env.get().require_package('fiftyone')
pixeltable/io/parquet.py CHANGED
@@ -62,7 +62,7 @@ def export_parquet(
     with Catalog.get().begin_xact(for_write=False):
         for record_batch in to_record_batches(df, partition_size_bytes):
             output_path = temp_path / f'part-{batch_num:05d}.parquet'
-            arrow_tbl = pa.Table.from_batches([record_batch])  # type: ignore
+            arrow_tbl = pa.Table.from_batches([record_batch])
             pa.parquet.write_table(arrow_tbl, str(output_path))
             batch_num += 1
 
@@ -528,7 +528,7 @@ class ParquetTableDataConduit(TableDataConduit):
         from pixeltable.utils.arrow import iter_tuples2
 
         try:
-            for fragment in self.pq_ds.fragments:  # type: ignore[attr-defined]
+            for fragment in self.pq_ds.fragments:
                 for batch in fragment.to_batches():
                     dict_batch = list(iter_tuples2(batch, self.source_column_map, self.pxt_schema))
                     self.total_rows += len(dict_batch)
@@ -1,13 +1,17 @@
 import dataclasses
 import enum
+import io
 import logging
-from typing import Any, ClassVar, Iterable, Iterator, Optional
+from typing import Any, ClassVar, Iterable, Iterator, Literal
 
+import fitz  # type: ignore[import-untyped]
 import ftfy
+import PIL.Image
+from bs4.element import NavigableString, Tag
 
 from pixeltable.env import Env
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
+from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
 from pixeltable.utils.documents import get_document_handle
 
 from .base import ComponentIterator
@@ -15,6 +19,11 @@ from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
+class Element(enum.Enum):
+    TEXT = 1
+    IMAGE = 2
+
+
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -37,27 +46,28 @@ class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
 
     # html and markdown metadata
-    sourceline: Optional[int] = None
+    sourceline: int | None = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[dict[str, str]] = None
+    heading: dict[str, str] | None = None
 
     # pdf-specific metadata
-    page: Optional[int] = None
+    page: int | None = None
     # bounding box as an {x1, y1, x2, y2} dictionary
-    bounding_box: Optional[dict[str, float]] = None
+    bounding_box: dict[str, float] | None = None
 
 
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
 
-    text: Optional[str]
-    metadata: Optional[DocumentSectionMetadata]
+    text: str | None = None
+    image: PIL.Image.Image | None = None
+    metadata: DocumentSectionMetadata | None = None
 
 
 def _parse_separators(separators: str) -> list[Separator]:
-    ret = []
+    ret: list[Separator] = []
     for s in separators.split(','):
         clean_s = s.strip().upper()
         if not clean_s:
@@ -71,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
 
 
 def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
-    ret = []
+    ret: list[ChunkMetadata] = []
     for m in metadata.split(','):
         clean_m = m.strip().upper()
         if not clean_m:
@@ -84,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
     return ret
 
 
+def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+    result: list[Element] = []
+    for e in elements:
+        clean_e = e.strip().upper()
+        if clean_e not in Element.__members__:
+            raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+        result.append(Element[clean_e])
+    if len(result) == 0:
+        raise Error('elements cannot be empty')
+    return result
+
+
 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
 
 
@@ -95,15 +117,22 @@ class DocumentSplitter(ComponentIterator):
 
     Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
 
+    How to init the `DocumentSplitter` class?
+
     Args:
         separators: separators to use to chunk the document. Options are:
             `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
             This may be a comma-separated string, e.g., `'heading,token_limit'`.
+        elements: list of elements to extract from the document. Options are:
+            `'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
+            for the `'page'` separator on PDF documents.
         limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
             or `'char_limit'` is specified.
         metadata: additional metadata fields to include in the output. Options are:
             `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
             (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+        image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
+        image_format: format to use when extracting images from PDFs. Defaults to 'png'.
     """
 
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
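A usage sketch of the new image extraction (the table `docs` and its `document` column are assumptions, and `pxt.create_view` with `iterator=DocumentSplitter.create(...)` follows the usual component-iterator pattern rather than anything shown in this diff):

>>> chunks = pxt.create_view(
...     'doc_chunks',
...     docs,
...     iterator=DocumentSplitter.create(
...         document=docs.document,
...         separators='page',
...         elements=['text', 'image'],
...         metadata='page',
...         image_dpi=150,
...     ),
... )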
@@ -114,24 +143,41 @@ class DocumentSplitter(ComponentIterator):
         ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
 
+    _doc_handle: Any
+    _separators: list[Separator]
+    _elements: list[Element]
+    _metadata_fields: list[ChunkMetadata]
+    _doc_title: str
+    _limit: int
+    _skip_tags: list[str]
+    _overlap: int
+    _tiktoken_encoding: str | None
+    _tiktoken_target_model: str | None
+    _image_dpi: int
+    _image_format: str
+
+    _sections: Iterator[DocumentSection]
+
     def __init__(
         self,
         document: str,
         *,
         separators: str,
-        limit: Optional[int] = None,
-        overlap: Optional[int] = None,
+        elements: list[Literal['text', 'image']] | None = None,
+        limit: int | None = None,
+        overlap: int | None = None,
         metadata: str = '',
-        html_skip_tags: Optional[list[str]] = None,
-        tiktoken_encoding: Optional[str] = 'cl100k_base',
-        tiktoken_target_model: Optional[str] = None,
+        html_skip_tags: list[str] | None = None,
+        tiktoken_encoding: str | None = 'cl100k_base',
+        tiktoken_target_model: str | None = None,
+        image_dpi: int = 300,
+        image_format: str = 'png',
     ):
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
+        self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
         assert self._doc_handle is not None
-        # calling the output_schema method to validate the input arguments
-        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
         self._separators = _parse_separators(separators)
         self._metadata_fields = _parse_metadata(metadata)
         if self._doc_handle.bs_doc is not None:
@@ -147,6 +193,8 @@ class DocumentSplitter(ComponentIterator):
         self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
+        self._image_dpi = image_dpi
+        self._image_format = image_format
 
         # set up processing pipeline
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -176,19 +224,28 @@ class DocumentSplitter(ComponentIterator):
         return {
             'document': DocumentType(nullable=False),
             'separators': StringType(nullable=False),
+            'elements': JsonType(nullable=False),
             'metadata': StringType(nullable=False),
             'limit': IntType(nullable=True),
             'overlap': IntType(nullable=True),
             'skip_tags': StringType(nullable=True),
             'tiktoken_encoding': StringType(nullable=True),
             'tiktoken_target_model': StringType(nullable=True),
+            'image_dpi': IntType(nullable=True),
+            'image_format': StringType(nullable=True),
         }
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema: dict[str, ColumnType] = {'text': StringType()}
-        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
-
+        schema: dict[str, ColumnType] = {}
+        elements = _parse_elements(kwargs.get('elements', ['text']))
+        for element in elements:
+            if element == Element.TEXT:
+                schema['text'] = StringType(nullable=False)
+            elif element == Element.IMAGE:
+                schema['image'] = ImageType(nullable=False)
+
+        md_fields = _parse_metadata(kwargs.get('metadata', ''))
         for md_field in md_fields:
             schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
 
@@ -198,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
         limit = kwargs.get('limit')
         overlap = kwargs.get('overlap')
 
+        if Element.IMAGE in elements and separators != [Separator.PAGE]:
+            raise Error('Image elements are only supported for the "page" separator on PDF documents')
         if limit is not None or overlap is not None:
             if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -211,14 +270,25 @@ class DocumentSplitter(ComponentIterator):
             if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
 
+        if Separator.SENTENCE in separators:
+            _ = Env.get().spacy_nlp
+        if Separator.TOKEN_LIMIT in separators:
+            Env.get().require_package('tiktoken')
+
         return schema, []
 
     def __next__(self) -> dict[str, Any]:
         while True:
             section = next(self._sections)
-            if section.text is None:
+            if section.text is None and section.image is None:
                 continue
-            result: dict[str, Any] = {'text': section.text}
+            result: dict[str, Any] = {}
+            for element in self._elements:
+                if element == Element.TEXT:
+                    result['text'] = section.text
+                elif element == Element.IMAGE:
+                    result['image'] = section.image
+
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -230,6 +300,7 @@ class DocumentSplitter(ComponentIterator):
                     result[md_field.name.lower()] = section.metadata.page
                 elif md_field == ChunkMetadata.BOUNDING_BOX:
                     result[md_field.name.lower()] = section.metadata.bounding_box
+
             return result
 
     def _html_sections(self) -> Iterator[DocumentSection]:
@@ -265,7 +336,7 @@ class DocumentSplitter(ComponentIterator):
                 yield DocumentSection(text=full_text, metadata=md)
                 accumulated_text = []
 
-        def process_element(el: bs4.element.Tag | bs4.NavigableString) -> Iterator[DocumentSection]:
+        def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
             nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
 
@@ -353,43 +424,41 @@ class DocumentSplitter(ComponentIterator):
             yield from emit()
 
     def _pdf_sections(self) -> Iterator[DocumentSection]:
-        """Create DocumentSections reflecting the pdf-specific separators"""
-        import fitz  # type: ignore[import-untyped]
-
         doc: fitz.Document = self._doc_handle.pdf_doc
         assert doc is not None
 
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
 
-        accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+        accumulated_text: list[str] = []
 
-        def _add_cleaned_text(raw_text: str) -> None:
-            fixed = ftfy.fix_text(raw_text)
+        def _add_cleaned(raw: str) -> None:
+            fixed = ftfy.fix_text(raw)
             if fixed:
                 accumulated_text.append(fixed)
 
         def _emit_text() -> str:
-            full_text = ''.join(accumulated_text)
+            txt = ''.join(accumulated_text)
             accumulated_text.clear()
-            return full_text
+            return txt
+
+        for page_idx, page in enumerate(doc.pages()):
+            img: PIL.Image.Image | None = None
+            if Element.IMAGE in self._elements:
+                pix = page.get_pixmap(dpi=self._image_dpi)
+                img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))
 
-        for page_number, page in enumerate(doc.pages()):
             for block in page.get_text('blocks'):
-                # there is no concept of paragraph in pdf, block is the closest thing
-                # we can get (eg a paragraph in text may cut across pages)
-                # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
-                # other libraries like pdfminer also lack an explicit paragraph concept
-                x1, y1, x2, y2, text, _, _ = block
-                _add_cleaned_text(text)
+                x1, y1, x2, y2, text, *_ = block
+                _add_cleaned(text)
                 if accumulated_text and emit_on_paragraph:
                     bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
-                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
-                    yield DocumentSection(text=_emit_text(), metadata=metadata)
+                    md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
+                    yield DocumentSection(text=_emit_text(), metadata=md)
 
             if accumulated_text and emit_on_page and not emit_on_paragraph:
-                yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
-            accumulated_text = []
+                md = DocumentSectionMetadata(page=page_idx)
+                yield DocumentSection(text=_emit_text(), image=img, metadata=md)
 
         if accumulated_text and not emit_on_page:
             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
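For reference, the page-to-image conversion that the new `_pdf_sections` body relies on, as a standalone sketch (the input path is hypothetical; `get_pixmap` and `tobytes` are standard PyMuPDF calls):

>>> import io
>>> import fitz  # PyMuPDF
>>> import PIL.Image
>>> doc = fitz.open('sample.pdf')  # hypothetical input file
>>> page = next(doc.pages())
>>> pix = page.get_pixmap(dpi=300)  # render the page to a raster image at 300 DPI
>>> img = PIL.Image.open(io.BytesIO(pix.tobytes('png')))  # PNG-encode, then decode into a PIL image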