pixeltable 0.4.16__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

@@ -1,13 +1,17 @@
  import dataclasses
  import enum
+ import io
  import logging
  from typing import Any, ClassVar, Iterable, Iterator, Optional

+ import fitz  # type: ignore[import-untyped]
  import ftfy
+ import PIL.Image
+ from bs4.element import NavigableString, Tag

  from pixeltable.env import Env
  from pixeltable.exceptions import Error
- from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
+ from pixeltable.type_system import BoolType, ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
  from pixeltable.utils.documents import get_document_handle

  from .base import ComponentIterator
@@ -54,6 +58,7 @@ class DocumentSection:

      text: Optional[str]
      metadata: Optional[DocumentSectionMetadata]
+     image: Optional[PIL.Image.Image] = None


  def _parse_separators(separators: str) -> list[Separator]:
@@ -95,6 +100,8 @@ class DocumentSplitter(ComponentIterator):

      Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.

+     How to init the `DocumentSplitter` class?
+
      Args:
          separators: separators to use to chunk the document. Options are:
              `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
@@ -125,13 +132,23 @@ class DocumentSplitter(ComponentIterator):
          html_skip_tags: Optional[list[str]] = None,
          tiktoken_encoding: Optional[str] = 'cl100k_base',
          tiktoken_target_model: Optional[str] = None,
+         # (PDF-processing-only)
+         include_page_image: bool = False,
+         page_image_dpi: int = 300,
+         page_image_format: str = 'png',
      ):
          if html_skip_tags is None:
              html_skip_tags = ['nav']
          self._doc_handle = get_document_handle(document)
          assert self._doc_handle is not None
          # calling the output_schema method to validate the input arguments
-         self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
+         self.output_schema(
+             separators=separators,
+             metadata=metadata,
+             limit=limit,
+             overlap=overlap,
+             include_page_image=include_page_image,
+         )
          self._separators = _parse_separators(separators)
          self._metadata_fields = _parse_metadata(metadata)
          if self._doc_handle.bs_doc is not None:
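Taken together, these document.py changes add optional page-image output for PDF inputs: `include_page_image` (with `page_image_dpi` and `page_image_format`) is threaded from `__init__` through `output_schema`, so each emitted chunk can carry the rendered page. A minimal usage sketch; the table and column names are illustrative, and it assumes the usual `create_view()` / `DocumentSplitter.create()` pattern from the pixeltable API:

    import pixeltable as pxt
    from pixeltable.iterators import DocumentSplitter

    docs = pxt.get_table('docs')  # hypothetical table with a 'document' column

    # New in 0.4.17 (PDF inputs only): include_page_image, page_image_dpi, page_image_format.
    # With include_page_image=True, each chunk row also exposes an 'image' column
    # holding the page rendered by PyMuPDF.
    chunks = pxt.create_view(
        'doc_chunks',
        docs,
        iterator=DocumentSplitter.create(
            document=docs.document,
            separators='page',
            metadata='page,bounding_box',
            include_page_image=True,
            page_image_dpi=150,
        ),
    )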
@@ -148,6 +165,10 @@ class DocumentSplitter(ComponentIterator):
          self._tiktoken_encoding = tiktoken_encoding
          self._tiktoken_target_model = tiktoken_target_model

+         self._include_page_image = include_page_image
+         self._page_image_dpi = page_image_dpi
+         self._page_image_format = page_image_format
+
          # set up processing pipeline
          if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
              assert self._doc_handle.bs_doc is not None
@@ -182,6 +203,10 @@ class DocumentSplitter(ComponentIterator):
              'skip_tags': StringType(nullable=True),
              'tiktoken_encoding': StringType(nullable=True),
              'tiktoken_target_model': StringType(nullable=True),
+             # PDF options must be declared so validation accepts them:
+             'include_page_image': BoolType(nullable=True),
+             'page_image_dpi': IntType(nullable=True),
+             'page_image_format': StringType(nullable=True),
          }

      @classmethod
@@ -211,6 +236,15 @@ class DocumentSplitter(ComponentIterator):
          if kwargs.get('limit') is None:
              raise Error('limit is required with "token_limit"/"char_limit" separators')

+         # check dependencies at the end
+         if Separator.SENTENCE in separators:
+             _ = Env.get().spacy_nlp
+         if Separator.TOKEN_LIMIT in separators:
+             Env.get().require_package('tiktoken')
+
+         if kwargs.get('include_page_image'):
+             schema['image'] = ImageType(nullable=True)
+
          return schema, []

      def __next__(self) -> dict[str, Any]:
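With `include_page_image=True`, the validated output schema gains a nullable `image` column (`ImageType`). A minimal sketch of the classmethod call that `__init__` performs, assuming the remaining keyword arguments can be left at their defaults:

    from pixeltable.iterators import DocumentSplitter

    schema, _ = DocumentSplitter.output_schema(
        separators='page', metadata='page,bounding_box', include_page_image=True
    )
    assert 'image' in schema  # rendered PDF page, nullable ImageType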
@@ -230,6 +264,11 @@ class DocumentSplitter(ComponentIterator):
                  result[md_field.name.lower()] = section.metadata.page
              elif md_field == ChunkMetadata.BOUNDING_BOX:
                  result[md_field.name.lower()] = section.metadata.bounding_box
+
+         # FIX: only include image if schema supports it
+         if self._include_page_image:
+             result['image'] = section.image
+
          return result

      def _html_sections(self) -> Iterator[DocumentSection]:
@@ -265,7 +304,7 @@ class DocumentSplitter(ComponentIterator):
              yield DocumentSection(text=full_text, metadata=md)
              accumulated_text = []

-         def process_element(el: bs4.element.Tag | bs4.NavigableString) -> Iterator[DocumentSection]:
+         def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
              # process the element and emit sections as necessary
              nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph

@@ -353,46 +392,45 @@
          yield from emit()

      def _pdf_sections(self) -> Iterator[DocumentSection]:
-         """Create DocumentSections reflecting the pdf-specific separators"""
-         import fitz  # type: ignore[import-untyped]
-
          doc: fitz.Document = self._doc_handle.pdf_doc
          assert doc is not None

          emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
          emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph

-         accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+         accumulated_text: list[str] = []

-         def _add_cleaned_text(raw_text: str) -> None:
-             fixed = ftfy.fix_text(raw_text)
+         def _add_cleaned(raw: str) -> None:
+             fixed = ftfy.fix_text(raw)
              if fixed:
                  accumulated_text.append(fixed)

          def _emit_text() -> str:
-             full_text = ''.join(accumulated_text)
+             txt = ''.join(accumulated_text)
              accumulated_text.clear()
-             return full_text
+             return txt
+
+         for page_idx, page in enumerate(doc.pages()):
+             # render once per page if requested
+             page_image = None
+             if self._include_page_image:
+                 pix = page.get_pixmap(dpi=self._page_image_dpi)  # ← single render
+                 page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))

-         for page_number, page in enumerate(doc.pages()):
              for block in page.get_text('blocks'):
-                 # there is no concept of paragraph in pdf, block is the closest thing
-                 # we can get (eg a paragraph in text may cut across pages)
-                 # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
-                 # other libraries like pdfminer also lack an explicit paragraph concept
-                 x1, y1, x2, y2, text, _, _ = block
-                 _add_cleaned_text(text)
+                 x1, y1, x2, y2, text, *_ = block
+                 _add_cleaned(text)
                  if accumulated_text and emit_on_paragraph:
                      bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
-                     metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
-                     yield DocumentSection(text=_emit_text(), metadata=metadata)
+                     md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
+                     yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)

              if accumulated_text and emit_on_page and not emit_on_paragraph:
-                 yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
-                 accumulated_text = []
+                 md = DocumentSectionMetadata(page=page_idx)
+                 yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)

          if accumulated_text and not emit_on_page:
-             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
+             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(), image=None)

      def _txt_sections(self) -> Iterator[DocumentSection]:
          """Create DocumentSections for text files.
@@ -1,10 +1,10 @@
+ import glob
  import logging
  import math
- import shutil
  import subprocess
  from fractions import Fraction
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Iterator, Literal, Optional

  import av
  import pandas as pd
@@ -14,6 +14,7 @@ import pixeltable as pxt
  import pixeltable.exceptions as excs
  import pixeltable.type_system as ts
  import pixeltable.utils.av as av_utils
+ from pixeltable.env import Env
  from pixeltable.utils.local_store import TempStore

  from .base import ComponentIterator
@@ -237,9 +238,15 @@ class VideoSplitter(ComponentIterator):
      seconds.

      Args:
-         segment_duration: Video segment duration in seconds
-         overlap: Overlap between consecutive segments in seconds.
-         min_segment_duration: Drop the last segment if it is smaller than min_segment_duration
+         duration: Video segment duration in seconds
+         overlap: Overlap between consecutive segments in seconds. Only available for `mode='fast'`.
+         min_segment_duration: Drop the last segment if it is smaller than min_segment_duration.
+         mode: Segmentation mode:
+             - `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
+             - `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
+         video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+             Only available for `mode='accurate'`.
+         video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
      """

      # Input parameters
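On the video side, `segment_duration` is renamed to `duration`, the ffmpeg check moves to `Env.get().require_binary('ffmpeg')`, and a new `mode` parameter chooses between keyframe-based stream copy (`'fast'`) and re-encoded exact cuts (`'accurate'`); as the validation below shows, `overlap` is rejected in accurate mode and `video_encoder`/`video_encoder_args` are rejected in fast mode. A usage sketch with illustrative table and column names, assuming `VideoSplitter` is exposed from `pixeltable.iterators` and used through the usual `create_view()` / `create()` pattern:

    import pixeltable as pxt
    from pixeltable.iterators import VideoSplitter

    videos = pxt.get_table('videos')  # hypothetical table with a 'video' column

    # 'fast' (default): stream copy, cuts land on keyframes, overlap allowed.
    clips_fast = pxt.create_view(
        'clips_fast',
        videos,
        iterator=VideoSplitter.create(video=videos.video, duration=30.0, overlap=2.0),
    )

    # 'accurate': re-encodes for exact durations; overlap is rejected, but an
    # explicit encoder and encoder args may be passed through to ffmpeg.
    clips_exact = pxt.create_view(
        'clips_exact',
        videos,
        iterator=VideoSplitter.create(
            video=videos.video,
            duration=30.0,
            mode='accurate',
            video_encoder='libx264',  # assumption: any encoder name ffmpeg accepts
        ),
    )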
@@ -247,65 +254,85 @@ class VideoSplitter(ComponentIterator):
      segment_duration: float
      overlap: float
      min_segment_duration: float
+     video_encoder: str | None
+     video_encoder_args: dict[str, Any] | None

      # Video metadata
      video_duration: float
      video_time_base: Fraction
      video_start_time: int

-     # position tracking
-     next_segment_start: float
-     next_segment_start_pts: int
+     output_iter: Iterator[dict[str, Any]]

-     def __init__(self, video: str, segment_duration: float, *, overlap: float = 0.0, min_segment_duration: float = 0.0):
-         assert segment_duration > 0.0
-         assert segment_duration >= min_segment_duration
-         assert overlap < segment_duration
+     def __init__(
+         self,
+         video: str,
+         *,
+         duration: float,
+         overlap: float = 0.0,
+         min_segment_duration: float = 0.0,
+         mode: Literal['fast', 'accurate'] = 'fast',
+         video_encoder: str | None = None,
+         video_encoder_args: dict[str, Any] | None = None,
+     ):
+         Env.get().require_binary('ffmpeg')
+         assert duration > 0.0
+         assert duration >= min_segment_duration
+         assert overlap < duration

          video_path = Path(video)
          assert video_path.exists() and video_path.is_file()

-         if not shutil.which('ffmpeg'):
-             raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use VideoSplitter.')
-
          self.video_path = video_path
-         self.segment_duration = segment_duration
+         self.segment_duration = duration
          self.overlap = overlap
          self.min_segment_duration = min_segment_duration
+         self.video_encoder = video_encoder
+         self.video_encoder_args = video_encoder_args

          with av.open(str(video_path)) as container:
              video_stream = container.streams.video[0]
              self.video_time_base = video_stream.time_base
              self.video_start_time = video_stream.start_time or 0

-         self.next_segment_start = float(self.video_start_time * self.video_time_base)
-         self.next_segment_start_pts = self.video_start_time
+         self.output_iter = self.fast_iter() if mode == 'fast' else self.accurate_iter()

      @classmethod
      def input_schema(cls) -> dict[str, ts.ColumnType]:
          return {
              'video': ts.VideoType(nullable=False),
-             'segment_duration': ts.FloatType(nullable=False),
+             'duration': ts.FloatType(nullable=True),
              'overlap': ts.FloatType(nullable=True),
              'min_segment_duration': ts.FloatType(nullable=True),
+             'mode': ts.StringType(nullable=False),
+             'video_encoder': ts.StringType(nullable=True),
+             'video_encoder_args': ts.JsonType(nullable=True),
          }

      @classmethod
      def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-         param_names = ['segment_duration', 'overlap', 'min_segment_duration']
+         param_names = ['duration', 'overlap', 'min_segment_duration']
          params = dict(zip(param_names, args))
          params.update(kwargs)

-         segment_duration = params['segment_duration']
+         segment_duration = params['duration']
          min_segment_duration = params.get('min_segment_duration', 0.0)
          overlap = params.get('overlap', 0.0)
+         mode = params.get('mode', 'fast')

          if segment_duration <= 0.0:
-             raise excs.Error('segment_duration must be a positive number')
+             raise excs.Error('duration must be a positive number')
          if segment_duration < min_segment_duration:
-             raise excs.Error('segment_duration must be at least min_segment_duration')
+             raise excs.Error('duration must be at least min_segment_duration')
+         if mode == 'accurate' and overlap > 0:
+             raise excs.Error("Cannot specify overlap for mode='accurate'")
          if overlap >= segment_duration:
-             raise excs.Error('overlap must be less than segment_duration')
+             raise excs.Error('overlap must be less than duration')
+         if mode == 'fast':
+             if params.get('video_encoder') is not None:
+                 raise excs.Error("Cannot specify video_encoder for mode='fast'")
+             if params.get('video_encoder_args') is not None:
+                 raise excs.Error("Cannot specify video_encoder_args for mode='fast'")


          return {
@@ -315,48 +342,94 @@
              'video_segment': ts.VideoType(nullable=False),
          }, []

-     def __next__(self) -> dict[str, Any]:
-         segment_path = str(TempStore.create_path(extension='.mp4'))
+     def fast_iter(self) -> Iterator[dict[str, Any]]:
+         segment_path: str
          try:
-             cmd = av_utils.ffmpeg_clip_cmd(
-                 str(self.video_path), segment_path, self.next_segment_start, self.segment_duration
-             )
-             _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+             start_time = 0.0
+             start_pts = 0
+             while True:
+                 segment_path = str(TempStore.create_path(extension='.mp4'))
+                 cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, self.segment_duration)
+                 _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+
+                 # use the actual duration
+                 segment_duration = av_utils.get_video_duration(segment_path)
+                 if segment_duration - self.overlap == 0.0 or segment_duration < self.min_segment_duration:
+                     # we're done
+                     Path(segment_path).unlink()
+                     return
+
+                 segment_end = start_time + segment_duration
+                 segment_end_pts = start_pts + round(segment_duration / self.video_time_base)
+                 result = {
+                     'segment_start': start_time,
+                     'segment_start_pts': start_pts,
+                     'segment_end': segment_end,
+                     'segment_end_pts': segment_end_pts,
+                     'video_segment': segment_path,
+                 }
+                 yield result

-             # use the actual duration
-             segment_duration = av_utils.get_video_duration(segment_path)
-             if segment_duration - self.overlap == 0.0:
-                 # we're done
-                 Path(segment_path).unlink()
-                 raise StopIteration
+                 start_time = segment_end - self.overlap
+                 start_pts = segment_end_pts - round(self.overlap / self.video_time_base)

-             if segment_duration < self.min_segment_duration:
+         except subprocess.CalledProcessError as e:
+             if Path(segment_path).exists():
                  Path(segment_path).unlink()
-                 raise StopIteration
-
-             segment_end = self.next_segment_start + segment_duration
-             segment_end_pts = self.next_segment_start_pts + round(segment_duration / self.video_time_base)
+             error_msg = f'ffmpeg failed with return code {e.returncode}'
+             if e.stderr:
+                 error_msg += f': {e.stderr.strip()}'
+             raise pxt.Error(error_msg) from e

-             result = {
-                 'segment_start': self.next_segment_start,
-                 'segment_start_pts': self.next_segment_start_pts,
-                 'segment_end': segment_end,
-                 'segment_end_pts': segment_end_pts,
-                 'video_segment': segment_path,
-             }
-             self.next_segment_start = segment_end - self.overlap
-             self.next_segment_start_pts = segment_end_pts - round(self.overlap / self.video_time_base)
+     def accurate_iter(self) -> Iterator[dict[str, Any]]:
+         base_path = TempStore.create_path(extension='')
+         # Use ffmpeg -f segment for accurate segmentation with re-encoding
+         output_pattern = f'{base_path}_segment_%04d.mp4'
+         cmd = av_utils.ffmpeg_segment_cmd(
+             str(self.video_path),
+             output_pattern,
+             segment_duration=self.segment_duration,
+             video_encoder=self.video_encoder,
+             video_encoder_args=self.video_encoder_args,
+         )

-             return result
+         try:
+             _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+             output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
+             # TODO: is this actually an error?
+             # if len(output_paths) == 0:
+             #     stderr_output = result.stderr.strip() if result.stderr is not None else ''
+             #     raise pxt.Error(
+             #         f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
+             #     )
+             start_time = 0.0
+             start_pts = 0
+             for segment_path in output_paths:
+                 segment_duration = av_utils.get_video_duration(segment_path)
+                 if segment_duration < self.min_segment_duration:
+                     Path(segment_path).unlink()
+                     return
+
+                 result = {
+                     'segment_start': start_time,
+                     'segment_start_pts': start_pts,
+                     'segment_end': start_time + segment_duration,
+                     'segment_end_pts': start_pts + round(segment_duration / self.video_time_base),
+                     'video_segment': segment_path,
+                 }
+                 yield result
+                 start_time += segment_duration
+                 start_pts += round(segment_duration / self.video_time_base)

          except subprocess.CalledProcessError as e:
-             if Path(segment_path).exists():
-                 Path(segment_path).unlink()
              error_msg = f'ffmpeg failed with return code {e.returncode}'
              if e.stderr:
                  error_msg += f': {e.stderr.strip()}'
              raise pxt.Error(error_msg) from e

+     def __next__(self) -> dict[str, Any]:
+         return next(self.output_iter)
+
      def close(self) -> None:
          pass
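The helpers `ffmpeg_clip_cmd` and `ffmpeg_segment_cmd` live in `pixeltable.utils.av` and are not part of this diff, so their exact command lines are unknown. Purely as an illustration of the 'accurate' path, an equivalent fixed-length split with re-encoding can be expressed with ffmpeg's segment muxer; the flag choices below are assumptions, not the helper's actual output:

    import subprocess

    def segment_cmd(src: str, out_pattern: str, seg_dur: float, encoder: str = 'libx264') -> list[str]:
        # Illustrative only: re-encode and cut into fixed-length pieces with the
        # ffmpeg segment muxer; the real ffmpeg_segment_cmd may differ.
        return [
            'ffmpeg', '-i', src,
            '-c:v', encoder, '-c:a', 'aac',
            '-f', 'segment', '-segment_time', str(seg_dur),
            '-reset_timestamps', '1',
            out_pattern,  # e.g. '/tmp/clip_segment_%04d.mp4'
        ]

    subprocess.run(segment_cmd('input.mp4', '/tmp/clip_segment_%04d.mp4', 30.0), check=True)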