PyPI - pixeltable - Versions diffs - 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl - Mend

pixeltable 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (20)

pixeltable/catalog/catalog.py +26 -19
pixeltable/catalog/table.py +33 -14
pixeltable/catalog/table_version.py +16 -12
pixeltable/dataframe.py +1 -1
pixeltable/env.py +4 -0
pixeltable/exec/exec_context.py +15 -2
pixeltable/exec/sql_node.py +3 -2
pixeltable/functions/huggingface.py +1031 -2
pixeltable/functions/video.py +34 -7
pixeltable/globals.py +23 -4
pixeltable/iterators/document.py +88 -57
pixeltable/iterators/video.py +58 -24
pixeltable/plan.py +2 -6
pixeltable/store.py +24 -3
pixeltable/utils/av.py +66 -38
{pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/METADATA +4 -4
{pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/RECORD +20 -20
{pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/WHEEL +0 -0
{pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/entry_points.txt +0 -0
{pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/licenses/LICENSE +0 -0

pixeltable/functions/video.py CHANGED Viewed

@@ -306,7 +306,14 @@ def _handle_ffmpeg_error(e: subprocess.CalledProcessError) -> NoReturn:
 @pxt.udf(is_method=True)
 def clip(
-    video: pxt.Video, *, start_time: float, end_time: float | None = None, duration: float | None = None
+    video: pxt.Video,
+    *,
+    start_time: float,
+    end_time: float | None = None,
+    duration: float | None = None,
+    mode: Literal['fast', 'accurate'] = 'accurate',
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
 ) -> pxt.Video | None:
     """
     Extract a clip from a video, specified by `start_time` and either `end_time` or `duration` (in seconds).
@@ -323,6 +330,14 @@ def clip(
         start_time: Start time in seconds
         end_time: End time in seconds
         duration: Duration of the clip in seconds
+        mode:
+            - `'fast'`: avoids re-encoding but starts the clip at the nearest keyframes and as a result, the clip
+                duration will be slightly longer than requested
+            - `'accurate'`: extracts a frame-accurate clip, but requires re-encoding
+        video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+            Only available for `mode='accurate'`.
+        video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
     Returns:
         New video containing only the specified time range or None if start_time is beyond the end of the video.
@@ -336,6 +351,11 @@ def clip(
         raise pxt.Error(f'duration must be positive, got {duration}')
     if end_time is not None and duration is not None:
         raise pxt.Error('end_time and duration cannot both be specified')
+    if mode == 'fast':
+        if video_encoder is not None:
+            raise pxt.Error("video_encoder is not supported for mode='fast'")
+        if video_encoder_args is not None:
+            raise pxt.Error("video_encoder_args is not supported for mode='fast'")
     video_duration = av_utils.get_video_duration(video)
     if video_duration is not None and start_time > video_duration:
@@ -345,7 +365,15 @@ def clip(
     if end_time is not None:
         duration = end_time - start_time
-    cmd = av_utils.ffmpeg_clip_cmd(str(video), output_path, start_time, duration)
+    cmd = av_utils.ffmpeg_clip_cmd(
+        str(video),
+        output_path,
+        start_time,
+        duration,
+        fast=(mode == 'fast'),
+        video_encoder=video_encoder,
+        video_encoder_args=video_encoder_args,
+    )
     try:
         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
@@ -364,7 +392,7 @@ def segment_video(
     *,
     duration: float | None = None,
     segment_times: list[float] | None = None,
-    mode: Literal['fast', 'accurate'] = 'fast',
+    mode: Literal['fast', 'accurate'] = 'accurate',
     video_encoder: str | None = None,
     video_encoder_args: dict[str, Any] | None = None,
 ) -> list[str]:
@@ -400,15 +428,14 @@ def segment_video(
     Examples:
         Split a video at 1 minute intervals using fast mode:
-        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()
+        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60, mode='fast')).collect()
-        Split video into exact 10-second segments with accurate mode, using the libx264 encoder with a CRF of 23 and
-        slow preset (for smaller output files):
+        Split video into exact 10-second segments with default accurate mode, using the libx264 encoder with a CRF of 23
+        and slow preset (for smaller output files):
         >>> tbl.select(
         ...     segment_paths=tbl.video.segment_video(
         ...         duration=10,
-        ...         mode='accurate',
         ...         video_encoder='libx264',
         ...         video_encoder_args={'crf': 23, 'preset': 'slow'}
         ...     )

pixeltable/globals.py CHANGED Viewed

@@ -487,12 +487,28 @@ def get_table(path: str, if_not_exists: Literal['error', 'ignore'] = 'error') ->
     return tbl
-def move(path: str, new_path: str) -> None:
+def move(
+    path: str,
+    new_path: str,
+    *,
+    if_exists: Literal['error', 'ignore'] = 'error',
+    if_not_exists: Literal['error', 'ignore'] = 'error',
+) -> None:
     """Move a schema object to a new directory and/or rename a schema object.
     Args:
         path: absolute path to the existing schema object.
         new_path: absolute new path for the schema object.
+        if_exists: Directive regarding how to handle if a schema object already exists at the new path.
+            Must be one of the following:
+            - `'error'`: raise an error
+            - `'ignore'`: do nothing and return
+        if_not_exists: Directive regarding how to handle if the source path does not exist.
+            Must be one of the following:
+            - `'error'`: raise an error
+            - `'ignore'`: do nothing and return
     Raises:
         Error: If path does not exist or new_path already exists.
@@ -506,13 +522,16 @@ def move(path: str, new_path: str) -> None:
         >>>> pxt.move('dir1.my_table', 'dir1.new_name')
     """
+    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
+    if if_exists_ not in (catalog.IfExistsParam.ERROR, catalog.IfExistsParam.IGNORE):
+        raise excs.Error("`if_exists` must be one of 'error' or 'ignore'")
+    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
     if path == new_path:
         raise excs.Error('move(): source and destination cannot be identical')
     path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
     if path_obj.is_ancestor(new_path_obj):
         raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
-    cat = Catalog.get()
-    cat.move(path_obj, new_path_obj)
+    Catalog.get().move(path_obj, new_path_obj, if_exists_, if_not_exists_)
 def drop_table(
@@ -660,7 +679,7 @@ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths:
 def create_dir(
-    path: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
+    path: str, *, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
 ) -> Optional[catalog.Dir]:
     """Create a directory.

pixeltable/iterators/document.py CHANGED Viewed

@@ -2,7 +2,7 @@ import dataclasses
 import enum
 import io
 import logging
-from typing import Any, ClassVar, Iterable, Iterator, Optional
+from typing import Any, ClassVar, Iterable, Iterator, Literal
 import fitz  # type: ignore[import-untyped]
 import ftfy
@@ -11,7 +11,7 @@ from bs4.element import NavigableString, Tag
 from pixeltable.env import Env
 from pixeltable.exceptions import Error
-from pixeltable.type_system import BoolType, ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
+from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
 from pixeltable.utils.documents import get_document_handle
 from .base import ComponentIterator
@@ -19,6 +19,11 @@ from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')
+class Element(enum.Enum):
+    TEXT = 1
+    IMAGE = 2
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -41,28 +46,28 @@ class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
     # html and markdown metadata
-    sourceline: Optional[int] = None
+    sourceline: int | None = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[dict[str, str]] = None
+    heading: dict[str, str] | None = None
     # pdf-specific metadata
-    page: Optional[int] = None
+    page: int | None = None
     # bounding box as an {x1, y1, x2, y2} dictionary
-    bounding_box: Optional[dict[str, float]] = None
+    bounding_box: dict[str, float] | None = None
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
-    text: Optional[str]
-    metadata: Optional[DocumentSectionMetadata]
-    image: Optional[PIL.Image.Image] = None
+    text: str | None = None
+    image: PIL.Image.Image | None = None
+    metadata: DocumentSectionMetadata | None = None
 def _parse_separators(separators: str) -> list[Separator]:
-    ret = []
+    ret: list[Separator] = []
     for s in separators.split(','):
         clean_s = s.strip().upper()
         if not clean_s:
@@ -76,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
 def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
-    ret = []
+    ret: list[ChunkMetadata] = []
     for m in metadata.split(','):
         clean_m = m.strip().upper()
         if not clean_m:
@@ -89,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
     return ret
+def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+    result: list[Element] = []
+    for e in elements:
+        clean_e = e.strip().upper()
+        if clean_e not in Element.__members__:
+            raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+        result.append(Element[clean_e])
+    if len(result) == 0:
+        raise Error('elements cannot be empty')
+    return result
 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
@@ -106,11 +123,16 @@ class DocumentSplitter(ComponentIterator):
         separators: separators to use to chunk the document. Options are:
              `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
              This may be a comma-separated string, e.g., `'heading,token_limit'`.
+        elements: list of elements to extract from the document. Options are:
+            `'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
+            for the `'page'` separator on PDF documents.
         limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
              or `'char_limit'` is specified.
         metadata: additional metadata fields to include in the output. Options are:
              `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
              (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+        image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
+        image_format: format to use when extracting images from PDFs. Defaults to 'png'.
     """
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -121,34 +143,41 @@ class DocumentSplitter(ComponentIterator):
         ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
+    _doc_handle: Any
+    _separators: list[Separator]
+    _elements: list[Element]
+    _metadata_fields: list[ChunkMetadata]
+    _doc_title: str
+    _limit: int
+    _skip_tags: list[str]
+    _overlap: int
+    _tiktoken_encoding: str | None
+    _tiktoken_target_model: str | None
+    _image_dpi: int
+    _image_format: str
+    _sections: Iterator[DocumentSection]
     def __init__(
         self,
         document: str,
         *,
         separators: str,
-        limit: Optional[int] = None,
-        overlap: Optional[int] = None,
+        elements: list[Literal['text', 'image']] | None = None,
+        limit: int | None = None,
+        overlap: int | None = None,
         metadata: str = '',
-        html_skip_tags: Optional[list[str]] = None,
-        tiktoken_encoding: Optional[str] = 'cl100k_base',
-        tiktoken_target_model: Optional[str] = None,
-        # (PDF-processing-only)
-        include_page_image: bool = False,
-        page_image_dpi: int = 300,
-        page_image_format: str = 'png',
+        html_skip_tags: list[str] | None = None,
+        tiktoken_encoding: str | None = 'cl100k_base',
+        tiktoken_target_model: str | None = None,
+        image_dpi: int = 300,
+        image_format: str = 'png',
     ):
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
+        self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
         assert self._doc_handle is not None
-        # calling the output_schema method to validate the input arguments
-        self.output_schema(
-            separators=separators,
-            metadata=metadata,
-            limit=limit,
-            overlap=overlap,
-            include_page_image=include_page_image,
-        )
         self._separators = _parse_separators(separators)
         self._metadata_fields = _parse_metadata(metadata)
         if self._doc_handle.bs_doc is not None:
@@ -164,10 +193,8 @@ class DocumentSplitter(ComponentIterator):
         self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
-        self._include_page_image = include_page_image
-        self._page_image_dpi = page_image_dpi
-        self._page_image_format = page_image_format
+        self._image_dpi = image_dpi
+        self._image_format = image_format
         # set up processing pipeline
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -197,23 +224,28 @@ class DocumentSplitter(ComponentIterator):
         return {
             'document': DocumentType(nullable=False),
             'separators': StringType(nullable=False),
+            'elements': JsonType(nullable=False),
             'metadata': StringType(nullable=False),
             'limit': IntType(nullable=True),
             'overlap': IntType(nullable=True),
             'skip_tags': StringType(nullable=True),
             'tiktoken_encoding': StringType(nullable=True),
             'tiktoken_target_model': StringType(nullable=True),
-            # PDF options must be declared so validation accepts them:
-            'include_page_image': BoolType(nullable=True),
-            'page_image_dpi': IntType(nullable=True),
-            'page_image_format': StringType(nullable=True),
+            'image_dpi': IntType(nullable=True),
+            'image_format': StringType(nullable=True),
         }
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema: dict[str, ColumnType] = {'text': StringType()}
-        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
+        schema: dict[str, ColumnType] = {}
+        elements = _parse_elements(kwargs.get('elements', ['text']))
+        for element in elements:
+            if element == Element.TEXT:
+                schema['text'] = StringType(nullable=False)
+            elif element == Element.IMAGE:
+                schema['image'] = ImageType(nullable=False)
+        md_fields = _parse_metadata(kwargs.get('metadata', ''))
         for md_field in md_fields:
             schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
@@ -223,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
         limit = kwargs.get('limit')
         overlap = kwargs.get('overlap')
+        if Element.IMAGE in elements and separators != [Separator.PAGE]:
+            raise Error('Image elements are only supported for the "page" separator on PDF documents')
         if limit is not None or overlap is not None:
             if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -236,23 +270,25 @@ class DocumentSplitter(ComponentIterator):
             if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
-        # check dependencies at the end
         if Separator.SENTENCE in separators:
             _ = Env.get().spacy_nlp
         if Separator.TOKEN_LIMIT in separators:
             Env.get().require_package('tiktoken')
-        if kwargs.get('include_page_image'):
-            schema['image'] = ImageType(nullable=True)
         return schema, []
     def __next__(self) -> dict[str, Any]:
         while True:
             section = next(self._sections)
-            if section.text is None:
+            if section.text is None and section.image is None:
                 continue
-            result: dict[str, Any] = {'text': section.text}
+            result: dict[str, Any] = {}
+            for element in self._elements:
+                if element == Element.TEXT:
+                    result['text'] = section.text
+                elif element == Element.IMAGE:
+                    result['image'] = section.image
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -265,10 +301,6 @@ class DocumentSplitter(ComponentIterator):
                 elif md_field == ChunkMetadata.BOUNDING_BOX:
                     result[md_field.name.lower()] = section.metadata.bounding_box
-            # FIX: only include image if schema supports it
-            if self._include_page_image:
-                result['image'] = section.image
             return result
     def _html_sections(self) -> Iterator[DocumentSection]:
@@ -411,11 +443,10 @@ class DocumentSplitter(ComponentIterator):
             return txt
         for page_idx, page in enumerate(doc.pages()):
-            # render once per page if requested
-            page_image = None
-            if self._include_page_image:
-                pix = page.get_pixmap(dpi=self._page_image_dpi)  # ← single render
-                page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))
+            img: PIL.Image.Image | None = None
+            if Element.IMAGE in self._elements:
+                pix = page.get_pixmap(dpi=self._image_dpi)
+                img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))
             for block in page.get_text('blocks'):
                 x1, y1, x2, y2, text, *_ = block
@@ -423,14 +454,14 @@ class DocumentSplitter(ComponentIterator):
                 if accumulated_text and emit_on_paragraph:
                     bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
                     md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
-                    yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)
+                    yield DocumentSection(text=_emit_text(), metadata=md)
             if accumulated_text and emit_on_page and not emit_on_paragraph:
                 md = DocumentSectionMetadata(page=page_idx)
-                yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)
+                yield DocumentSection(text=_emit_text(), image=img, metadata=md)
         if accumulated_text and not emit_on_page:
-            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(), image=None)
+            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
     def _txt_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections for text files.

pixeltable/iterators/video.py CHANGED Viewed

@@ -251,7 +251,8 @@ class VideoSplitter(ComponentIterator):
     # Input parameters
     video_path: Path
-    segment_duration: float
+    segment_duration: float | None
+    segment_times: list[float] | None
     overlap: float
     min_segment_duration: float
     video_encoder: str | None
@@ -268,25 +269,31 @@ class VideoSplitter(ComponentIterator):
         self,
         video: str,
         *,
-        duration: float,
-        overlap: float = 0.0,
-        min_segment_duration: float = 0.0,
-        mode: Literal['fast', 'accurate'] = 'fast',
+        duration: float | None = None,
+        overlap: float | None = None,
+        min_segment_duration: float | None = None,
+        segment_times: list[float] | None = None,
+        mode: Literal['fast', 'accurate'] = 'accurate',
         video_encoder: str | None = None,
         video_encoder_args: dict[str, Any] | None = None,
     ):
         Env.get().require_binary('ffmpeg')
-        assert duration > 0.0
-        assert duration >= min_segment_duration
-        assert overlap < duration
+        assert (duration is not None) != (segment_times is not None)
+        if segment_times is not None:
+            assert len(segment_times) > 0
+        if duration is not None:
+            assert duration > 0.0
+            assert duration >= min_segment_duration
+            assert overlap is None or overlap < duration
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
         self.video_path = video_path
         self.segment_duration = duration
-        self.overlap = overlap
-        self.min_segment_duration = min_segment_duration
+        self.overlap = overlap if overlap is not None else 0.0
+        self.min_segment_duration = min_segment_duration if min_segment_duration is not None else 0.0
+        self.segment_times = segment_times
         self.video_encoder = video_encoder
         self.video_encoder_args = video_encoder_args
@@ -304,6 +311,7 @@ class VideoSplitter(ComponentIterator):
             'duration': ts.FloatType(nullable=True),
             'overlap': ts.FloatType(nullable=True),
             'min_segment_duration': ts.FloatType(nullable=True),
+            'segment_times': ts.JsonType(nullable=True),
             'mode': ts.StringType(nullable=False),
             'video_encoder': ts.StringType(nullable=True),
             'video_encoder_args': ts.JsonType(nullable=True),
@@ -311,23 +319,34 @@ class VideoSplitter(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        param_names = ['duration', 'overlap', 'min_segment_duration']
+        param_names = ['duration', 'overlap', 'min_segment_duration', 'segment_times']
         params = dict(zip(param_names, args))
         params.update(kwargs)
-        segment_duration = params['duration']
-        min_segment_duration = params.get('min_segment_duration', 0.0)
-        overlap = params.get('overlap', 0.0)
+        segment_duration = params.get('duration')
+        segment_times = params.get('segment_times')
+        overlap = params.get('overlap')
+        min_segment_duration = params.get('min_segment_duration')
         mode = params.get('mode', 'fast')
-        if segment_duration <= 0.0:
-            raise excs.Error('duration must be a positive number')
-        if segment_duration < min_segment_duration:
-            raise excs.Error('duration must be at least min_segment_duration')
-        if mode == 'accurate' and overlap > 0:
+        if segment_duration is None and segment_times is None:
+            raise excs.Error('Must specify either duration or segment_times')
+        if segment_duration is not None and segment_times is not None:
+            raise excs.Error('duration and segment_times cannot both be specified')
+        if segment_times is not None:
+            if len(segment_times) == 0:
+                raise excs.Error('segment_times cannot be empty')
+            if overlap is not None:
+                raise excs.Error('overlap cannot be specified with segment_times')
+        if segment_duration is not None:
+            if segment_duration <= 0.0:
+                raise excs.Error('duration must be a positive number')
+            if min_segment_duration is not None and segment_duration < min_segment_duration:
+                raise excs.Error('duration must be at least min_segment_duration')
+            if overlap is not None and overlap >= segment_duration:
+                raise excs.Error('overlap must be less than duration')
+        if mode == 'accurate' and overlap is not None:
             raise excs.Error("Cannot specify overlap for mode='accurate'")
-        if overlap >= segment_duration:
-            raise excs.Error('overlap must be less than duration')
         if mode == 'fast':
             if params.get('video_encoder') is not None:
                 raise excs.Error("Cannot specify video_encoder for mode='fast'")
@@ -343,13 +362,22 @@ class VideoSplitter(ComponentIterator):
         }, []
     def fast_iter(self) -> Iterator[dict[str, Any]]:
-        segment_path: str
+        segment_path: str = ''
         try:
             start_time = 0.0
             start_pts = 0
+            segment_idx = 0
             while True:
+                target_duration: float | None
+                if self.segment_duration is not None:
+                    target_duration = self.segment_duration
+                elif self.segment_times is not None and segment_idx < len(self.segment_times):
+                    target_duration = self.segment_times[segment_idx] - start_time
+                else:
+                    target_duration = None  # the rest of the video
                 segment_path = str(TempStore.create_path(extension='.mp4'))
-                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, self.segment_duration)
+                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, target_duration)
                 _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
                 # use the actual duration
@@ -373,8 +401,13 @@ class VideoSplitter(ComponentIterator):
                 start_time = segment_end - self.overlap
                 start_pts = segment_end_pts - round(self.overlap / self.video_time_base)
+                segment_idx += 1
+                if self.segment_times is not None and segment_idx > len(self.segment_times):
+                    # We've created all segments including the final segment after the last segment_time
+                    break
         except subprocess.CalledProcessError as e:
-            if Path(segment_path).exists():
+            if segment_path and Path(segment_path).exists():
                 Path(segment_path).unlink()
             error_msg = f'ffmpeg failed with return code {e.returncode}'
             if e.stderr:
@@ -389,6 +422,7 @@ class VideoSplitter(ComponentIterator):
             str(self.video_path),
             output_pattern,
             segment_duration=self.segment_duration,
+            segment_times=self.segment_times,
             video_encoder=self.video_encoder,
             video_encoder_args=self.video_encoder_args,
         )

pixeltable/plan.py CHANGED Viewed

@@ -93,18 +93,13 @@ class SampleClause:
     seed: Optional[int]
     stratify_exprs: Optional[list[exprs.Expr]]
-    # This seed value is used if one is not supplied
-    DEFAULT_SEED = 0
     # The version of the hashing algorithm used for ordering and fractional sampling.
     CURRENT_VERSION = 1
     def __post_init__(self) -> None:
-        """If no version was provided, provide the default version"""
+        # If no version was provided, provide the default version
         if self.version is None:
             self.version = self.CURRENT_VERSION
-        if self.seed is None:
-            self.seed = self.DEFAULT_SEED
     @property
     def is_stratified(self) -> bool:
@@ -1006,6 +1001,7 @@ class Planner:
             analyzer.window_fn_calls
         )
         ctx = exec.ExecContext(row_builder)
         combined_ordering = cls._create_combined_ordering(analyzer, verify_agg=is_python_agg)
         cls._verify_join_clauses(analyzer)

pixeltable 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl

Potentially problematic release.

pixeltable 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl