pixeltable 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (82)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/column.py +37 -11
  4. pixeltable/catalog/globals.py +18 -0
  5. pixeltable/catalog/insertable_table.py +6 -4
  6. pixeltable/catalog/table.py +19 -3
  7. pixeltable/catalog/table_version.py +34 -14
  8. pixeltable/catalog/view.py +16 -17
  9. pixeltable/dataframe.py +7 -8
  10. pixeltable/env.py +5 -0
  11. pixeltable/exec/__init__.py +0 -1
  12. pixeltable/exec/aggregation_node.py +6 -3
  13. pixeltable/exec/cache_prefetch_node.py +1 -1
  14. pixeltable/exec/data_row_batch.py +2 -19
  15. pixeltable/exec/exec_node.py +2 -1
  16. pixeltable/exec/expr_eval_node.py +17 -10
  17. pixeltable/exec/in_memory_data_node.py +6 -3
  18. pixeltable/exec/sql_node.py +24 -25
  19. pixeltable/exprs/arithmetic_expr.py +3 -1
  20. pixeltable/exprs/array_slice.py +7 -7
  21. pixeltable/exprs/column_property_ref.py +37 -10
  22. pixeltable/exprs/column_ref.py +93 -14
  23. pixeltable/exprs/comparison.py +5 -5
  24. pixeltable/exprs/compound_predicate.py +8 -7
  25. pixeltable/exprs/data_row.py +27 -18
  26. pixeltable/exprs/expr.py +53 -52
  27. pixeltable/exprs/expr_set.py +5 -0
  28. pixeltable/exprs/function_call.py +32 -16
  29. pixeltable/exprs/globals.py +4 -1
  30. pixeltable/exprs/in_predicate.py +8 -7
  31. pixeltable/exprs/inline_expr.py +4 -4
  32. pixeltable/exprs/is_null.py +4 -4
  33. pixeltable/exprs/json_mapper.py +11 -12
  34. pixeltable/exprs/json_path.py +5 -10
  35. pixeltable/exprs/literal.py +5 -5
  36. pixeltable/exprs/method_ref.py +5 -4
  37. pixeltable/exprs/object_ref.py +2 -1
  38. pixeltable/exprs/row_builder.py +88 -36
  39. pixeltable/exprs/rowid_ref.py +12 -11
  40. pixeltable/exprs/similarity_expr.py +12 -7
  41. pixeltable/exprs/sql_element_cache.py +7 -5
  42. pixeltable/exprs/type_cast.py +8 -6
  43. pixeltable/exprs/variable.py +5 -4
  44. pixeltable/func/aggregate_function.py +1 -1
  45. pixeltable/func/function.py +11 -10
  46. pixeltable/functions/__init__.py +2 -2
  47. pixeltable/functions/globals.py +5 -7
  48. pixeltable/functions/huggingface.py +19 -20
  49. pixeltable/functions/llama_cpp.py +106 -0
  50. pixeltable/functions/ollama.py +147 -0
  51. pixeltable/functions/replicate.py +72 -0
  52. pixeltable/functions/string.py +9 -0
  53. pixeltable/globals.py +12 -20
  54. pixeltable/index/btree.py +16 -3
  55. pixeltable/index/embedding_index.py +4 -4
  56. pixeltable/io/__init__.py +1 -2
  57. pixeltable/io/fiftyone.py +178 -0
  58. pixeltable/io/globals.py +96 -2
  59. pixeltable/iterators/base.py +3 -2
  60. pixeltable/iterators/document.py +1 -1
  61. pixeltable/iterators/video.py +120 -63
  62. pixeltable/metadata/__init__.py +1 -1
  63. pixeltable/metadata/converters/convert_21.py +34 -0
  64. pixeltable/metadata/converters/util.py +45 -4
  65. pixeltable/metadata/notes.py +1 -0
  66. pixeltable/metadata/schema.py +8 -0
  67. pixeltable/plan.py +16 -14
  68. pixeltable/py.typed +0 -0
  69. pixeltable/store.py +7 -2
  70. pixeltable/tool/create_test_video.py +1 -1
  71. pixeltable/tool/embed_udf.py +1 -1
  72. pixeltable/tool/mypy_plugin.py +28 -5
  73. pixeltable/type_system.py +17 -1
  74. pixeltable/utils/documents.py +15 -1
  75. pixeltable/utils/formatter.py +9 -10
  76. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/METADATA +46 -10
  77. pixeltable-0.2.22.dist-info/RECORD +153 -0
  78. pixeltable/exec/media_validation_node.py +0 -43
  79. pixeltable-0.2.21.dist-info/RECORD +0 -148
  80. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  81. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  82. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
pixeltable/iterators/video.py CHANGED
@@ -1,13 +1,15 @@
 import logging
 import math
+from fractions import Fraction
 from pathlib import Path
 from typing import Any, Optional, Sequence
 
-import cv2
+import av # type: ignore[import-untyped]
+import pandas as pd
 import PIL.Image
 
-from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType, FloatType, ImageType, IntType, VideoType
+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
 
 from .base import ComponentIterator
 
@@ -30,108 +32,163 @@ class FrameIterator(ComponentIterator):
     `num_frames` is greater than the number of frames in the video, all frames will be extracted.
     """
 
+    # Input parameters
     video_path: Path
-    video_reader: cv2.VideoCapture
     fps: Optional[float]
     num_frames: Optional[int]
-    frames_to_extract: Sequence[int]
-    frames_set: set[int]
-    next_frame_idx: int
+
+    # Video info
+    container: av.container.input.InputContainer
+    video_framerate: Fraction
+    video_time_base: Fraction
+    video_frame_count: int
+    video_start_time: int
+
+    # List of frame indices to be extracted, or None to extract all frames
+    frames_to_extract: Optional[list[int]]
+
+    # Next frame to extract, as an iterator `pos` index. If `frames_to_extract` is None, this is the same as the
+    # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
+    next_pos: int
 
     def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
         if fps is not None and num_frames is not None:
-            raise Error('At most one of `fps` or `num_frames` may be specified')
+            raise excs.Error('At most one of `fps` or `num_frames` may be specified')
 
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
         self.video_path = video_path
-        self.video_reader = cv2.VideoCapture(str(video_path))
+        self.container = av.open(str(video_path))
         self.fps = fps
         self.num_frames = num_frames
-        if not self.video_reader.isOpened():
-            raise Error(f'Failed to open video: {video}')
 
-        video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
-        if fps is not None and fps > video_fps:
-            raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
-        num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
-        if num_video_frames == 0:
-            raise Error(f'Video {video}: failed to get number of frames')
+        self.video_framerate = self.container.streams.video[0].average_rate
+        self.video_time_base = self.container.streams.video[0].time_base
+        self.video_start_time = self.container.streams.video[0].start_time or 0
+
+        # Determine the number of frames in the video
+        self.video_frame_count = self.container.streams.video[0].frames
+        if self.video_frame_count == 0:
+            # The video codec does not provide a frame count in the standard `frames` field. Try some other methods.
+            metadata: dict = self.container.streams.video[0].metadata
+            if 'NUMBER_OF_FRAMES' in metadata:
+                self.video_frame_count = int(metadata['NUMBER_OF_FRAMES'])
+            elif 'DURATION' in metadata:
+                # As a last resort, calculate the frame count from the stream duration.
+                duration = metadata['DURATION']
+                assert isinstance(duration, str)
+                seconds = pd.to_timedelta(duration).total_seconds()
+                # Usually the duration and framerate are precise enough for this calculation to be accurate, but if
+                # we encounter a case where it's off by one due to a rounding error, that's ok; we only use this
+                # to determine the positions of the sampled frames when `fps` or `num_frames` is specified.
+                self.video_frame_count = round(seconds * self.video_framerate)
+            else:
+                raise excs.Error(f'Video {video}: failed to get number of frames')
 
         if num_frames is not None:
             # specific number of frames
-            if num_frames > num_video_frames:
+            if num_frames > self.video_frame_count:
                 # Extract all frames
-                self.frames_to_extract = range(num_video_frames)
+                self.frames_to_extract = None
             else:
-                spacing = float(num_video_frames) / float(num_frames)
+                spacing = float(self.video_frame_count) / float(num_frames)
                 self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
                 assert len(self.frames_to_extract) == num_frames
         else:
             if fps is None or fps == 0.0:
                 # Extract all frames
-                self.frames_to_extract = range(num_video_frames)
+                self.frames_to_extract = None
+            elif fps > float(self.video_framerate):
+                raise excs.Error(
+                    f'Video {video}: requested fps ({fps}) exceeds that of the video ({float(self.video_framerate)})'
+                )
             else:
                 # Extract frames at the implied frequency
-                freq = fps / video_fps
-                n = math.ceil(num_video_frames * freq) # number of frames to extract
+                freq = fps / float(self.video_framerate)
+                n = math.ceil(self.video_frame_count * freq) # number of frames to extract
                 self.frames_to_extract = list(round(i / freq) for i in range(n))
 
-        # We need the list of frames as both a list (for set_pos) and a set (for fast lookups when
-        # there are lots of frames)
-        self.frames_set = set(self.frames_to_extract)
         _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
-        self.next_frame_idx = 0
+        self.next_pos = 0
 
     @classmethod
-    def input_schema(cls) -> dict[str, ColumnType]:
+    def input_schema(cls) -> dict[str, ts.ColumnType]:
         return {
-            'video': VideoType(nullable=False),
-            'fps': FloatType(nullable=True),
-            'num_frames': IntType(nullable=True),
+            'video': ts.VideoType(nullable=False),
+            'fps': ts.FloatType(nullable=True),
+            'num_frames': ts.IntType(nullable=True),
         }
 
     @classmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
         return {
-            'frame_idx': IntType(),
-            'pos_msec': FloatType(),
-            'pos_frame': FloatType(),
-            'frame': ImageType(),
+            'frame_idx': ts.IntType(),
+            'pos_msec': ts.FloatType(),
+            'pos_frame': ts.IntType(),
+            'frame': ts.ImageType(),
         }, ['frame']
 
     def __next__(self) -> dict[str, Any]:
-        # jumping to the target frame here with video_reader.set() is far slower than just
-        # skipping the unwanted frames
+        # Determine the frame index in the video corresponding to the iterator index `next_pos`;
+        # the frame at this index is the one we want to extract next
+        if self.frames_to_extract is None:
+            next_video_idx = self.next_pos # we're extracting all frames
+        elif self.next_pos >= len(self.frames_to_extract):
+            raise StopIteration
+        else:
+            next_video_idx = self.frames_to_extract[self.next_pos]
+
+        # We are searching for the frame at the index implied by `next_pos`. Step through the video until we
+        # find it. There are two reasons why it might not be the immediate next frame in the video:
+        # (1) `fps` or `num_frames` was specified as an iterator argument; or
+        # (2) we just did a seek, and the desired frame is not a keyframe.
+        # TODO: In case (1) it will usually be fastest to step through the frames until we find the one we're
+        #     looking for. But in some cases it may be faster to do a seek; for example, when `fps` is very
+        #     low and there are multiple keyframes in between each frame we want to extract (imagine extracting
+        #     10 frames from an hourlong video).
         while True:
-            pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
-            pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
-            status, img = self.video_reader.read()
-            if not status:
-                _logger.debug(f'releasing video reader for {self.video_path}')
-                self.video_reader.release()
-                self.video_reader = None
+            try:
+                frame = next(self.container.decode(video=0))
+            except EOFError:
                 raise StopIteration
-            if pos_frame in self.frames_set:
-                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-                result = {
-                    'frame_idx': self.next_frame_idx,
-                    'pos_msec': pos_msec,
-                    'pos_frame': pos_frame,
-                    'frame': PIL.Image.fromarray(img),
-                }
-                self.next_frame_idx += 1
-                return result
+            # Compute the index of the current frame in the video based on the presentation timestamp (pts);
+            # this ensures we have a canonical understanding of frame index, regardless of how we got here
+            # (seek or iteration)
+            pts = frame.pts - self.video_start_time
+            video_idx = round(pts * self.video_time_base * self.video_framerate)
+            assert isinstance(video_idx, int)
+            if video_idx < next_video_idx:
+                # We haven't reached the desired frame yet
+                continue
+
+            # Sanity check that we're at the right frame.
+            if video_idx != next_video_idx:
+                raise excs.Error(f'Frame {next_video_idx} is missing from the video (video file is corrupt)')
+            img = frame.to_image()
+            assert isinstance(img, PIL.Image.Image)
+            pos_msec = float(pts * self.video_time_base * 1000)
+            result = {
+                'frame_idx': self.next_pos,
+                'pos_msec': pos_msec,
+                'pos_frame': video_idx,
+                'frame': img,
+            }
+            self.next_pos += 1
+            return result
 
     def close(self) -> None:
-        if self.video_reader is not None:
-            self.video_reader.release()
-            self.video_reader = None
+        self.container.close()
 
     def set_pos(self, pos: int) -> None:
         """Seek to frame idx"""
-        if pos == self.next_frame_idx:
-            return
-        _logger.debug(f'seeking to frame {pos}')
-        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, self.frames_to_extract[pos])
-        self.next_frame_idx = pos
+        if pos == self.next_pos:
+            return # already there
+
+        video_idx = pos if self.frames_to_extract is None else self.frames_to_extract[pos]
+        _logger.debug(f'seeking to frame number {video_idx} (at iterator index {pos})')
+        # compute the frame position in time_base units
+        seek_pos = int(video_idx / self.video_framerate / self.video_time_base + self.video_start_time)
+        # This will seek to the nearest keyframe before the desired frame. If the frame being sought is not a keyframe,
+        # then the iterator will step forward to the desired frame on the subsequent call to next().
+        self.container.seek(seek_pos, backward=True, stream=self.container.streams.video[0])
+        self.next_pos = pos
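The pts-to-frame-index conversion used in `__next__` and `set_pos` above is plain unit arithmetic: a frame's presentation timestamp (minus the stream's start time) is measured in `time_base` ticks, so multiplying by `time_base` yields seconds and multiplying by the average frame rate yields a frame number; the seek target is the same expression inverted. A minimal standalone sketch, with stream parameters that are purely illustrative (not taken from this diff):

from fractions import Fraction

framerate = Fraction(30000, 1001)  # ~29.97 fps, an assumed example value
time_base = Fraction(1, 90000)     # an assumed example tick size
start_time = 0

# forward direction, as in __next__: pts -> frame index
pts = 3003  # pts of the second frame in this hypothetical stream
video_idx = round((pts - start_time) * time_base * framerate)
assert video_idx == 1

# inverse direction, as in set_pos: frame index -> pts to seek to
seek_pos = int(video_idx / framerate / time_base + start_time)
assert seek_pos == 3003

Because `average_rate` is usually a non-integer Fraction (29.97 above), the forward conversion rounds rather than truncates, which is what lets the iterator treat the result as a canonical frame index whether it reached a frame by seeking or by sequential decoding.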
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 21
+VERSION = 22
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_21.py ADDED
@@ -0,0 +1,34 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_schema_version_md, convert_table_md
+
+
+@register_converter(version=21)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_schema_version_md(
+        engine,
+        table_schema_version_md_updater=__update_table_schema_version,
+        schema_column_updater=__update_schema_column
+    )
+    convert_table_md(
+        engine,
+        substitution_fn=__substitute_md
+    )
+
+
+def __update_table_schema_version(table_schema_version_md: dict) -> None:
+    table_schema_version_md['media_validation'] = 'on_write' # MediaValidation.ON_WRITE
+
+
+def __update_schema_column(schema_column: dict) -> None:
+    schema_column['media_validation'] = None
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
+        if 'perform_validation' not in v:
+            v['perform_validation'] = False
+        return k, v
+    return None
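For reference, the only effect of `__substitute_md` is to backfill the new `perform_validation` field on serialized `ColumnRef` exprs and to leave everything else untouched. A small sketch of that behavior on a hypothetical metadata fragment (the field values below are invented for illustration):

# hypothetical v21 expr metadata; only '_classname' matters to the converter
column_ref_md = {'_classname': 'ColumnRef', 'col_id': 7}
assert __substitute_md(None, column_ref_md) == (None, {'_classname': 'ColumnRef', 'col_id': 7, 'perform_validation': False})

# non-ColumnRef entries are not substituted: the converter returns None for them
assert __substitute_md('select_list', {'_classname': 'Literal'}) is None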
pixeltable/metadata/converters/util.py CHANGED
@@ -4,7 +4,7 @@ from typing import Any, Callable, Optional
 
 import sqlalchemy as sql
 
-from pixeltable.metadata.schema import Table
+from pixeltable.metadata.schema import Table, TableSchemaVersion
 
 __logger = logging.getLogger('pixeltable')
 
@@ -17,12 +17,12 @@ def convert_table_md(
     substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None
 ) -> None:
     """
-    Converts table metadata based on the specified conversion functions.
+    Converts schema.TableMd dicts based on the specified conversion functions.
 
     Args:
         engine: The SQLAlchemy engine.
-        table_md_updater: A function that updates the table metadata in place.
-        column_md_updater: A function that updates the column metadata in place.
+        table_md_updater: A function that updates schema.TableMd dicts in place.
+        column_md_updater: A function that updates schema.ColumnMd dicts in place.
         external_store_md_updater: A function that updates the external store metadata in place.
         substitution_fn: A function that substitutes metadata values. If specified, all metadata will be traversed
            recursively, and `substitution_fn` will be called once for each metadata entry. If the entry appears in
@@ -90,3 +90,44 @@ def __substitute_md_rec(
         return updated_list
     else:
         return md
+
+
+def convert_table_schema_version_md(
+    engine: sql.engine.Engine,
+    table_schema_version_md_updater: Optional[Callable[[dict], None]] = None,
+    schema_column_updater: Optional[Callable[[dict], None]] = None
+) -> None:
+    """
+    Converts schema.TableSchemaVersionMd dicts based on the specified conversion functions.
+
+    Args:
+        engine: The SQLAlchemy engine.
+        table_schema_version_md_updater: A function that updates schema.TableSchemaVersionMd dicts in place.
+        schema_column_updater: A function that updates schema.SchemaColumn dicts in place.
+    """
+    with engine.begin() as conn:
+        stmt = sql.select(TableSchemaVersion.tbl_id, TableSchemaVersion.schema_version, TableSchemaVersion.md)
+        for row in conn.execute(stmt):
+            tbl_id, schema_version, md = row[0], row[1], row[2]
+            assert isinstance(md, dict)
+            updated_md = copy.deepcopy(md)
+            if table_schema_version_md_updater is not None:
+                table_schema_version_md_updater(updated_md)
+            if schema_column_updater is not None:
+                __update_schema_column(updated_md, schema_column_updater)
+            if updated_md != md:
+                __logger.info(f'Updating TableSchemaVersion(tbl_id={tbl_id}, schema_version={schema_version})')
+                update_stmt = (
+                    sql.update(TableSchemaVersion)
+                    .where(TableSchemaVersion.tbl_id == tbl_id)
+                    .where(TableSchemaVersion.schema_version == schema_version)
+                    .values(md=updated_md)
+                )
+                conn.execute(update_stmt)
+
+
+def __update_schema_column(table_schema_version_md: dict, schema_column_updater: Callable[[dict], None]) -> None:
+    cols = table_schema_version_md['columns']
+    assert isinstance(cols, dict)
+    for schema_col in cols.values():
+        schema_column_updater(schema_col)
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    22: 'TableMd/ColumnMd.media_validation',
     21: 'Separate InlineArray and InlineList',
     20: 'Store DB timestamps in UTC',
     19: 'UDF renames; ImageMemberAccess removal',
pixeltable/metadata/schema.py CHANGED
@@ -202,6 +202,10 @@ class SchemaColumn:
     pos: int
     name: str
 
+    # media validation strategy of this particular media column; if not set, TableMd.media_validation applies
+    # stores column.MediaValiation.name.lower()
+    media_validation: Optional[str]
+
 
 @dataclasses.dataclass
 class TableSchemaVersionMd:
@@ -214,6 +218,10 @@ class TableSchemaVersionMd:
     num_retained_versions: int
     comment: str
 
+    # default validation strategy for any media column of this table
+    # stores column.MediaValiation.name.lower()
+    media_validation: str
+
 
 # versioning: each table schema change results in a new record
 class TableSchemaVersion(Base):
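Taken together with the converter above, these fields mean that after the v22 migration each stored TableSchemaVersionMd dict carries a table-wide default while every SchemaColumn dict carries a per-column override that starts out unset. A rough sketch of the resulting shape (column ids and names are invented; unrelated fields are elided):

# hypothetical TableSchemaVersionMd dict after convert_21 has run
table_schema_version_md = {
    'media_validation': 'on_write',  # table-wide default, i.e. MediaValidation.ON_WRITE
    'columns': {
        '1': {'pos': 0, 'name': 'video', 'media_validation': None},  # None: fall back to the table default
    },
    # ... remaining TableSchemaVersionMd fields elided
}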
pixeltable/plan.py CHANGED
@@ -225,27 +225,28 @@ class Planner:
         assert not tbl.is_view()
         # stored_cols: all cols we need to store, incl computed cols (and indices)
         stored_cols = [c for c in tbl.cols if c.is_stored]
-        assert len(stored_cols) > 0
-
+        assert len(stored_cols) > 0 # there needs to be something to store
         row_builder = exprs.RowBuilder([], stored_cols, [])
 
         # create InMemoryDataNode for 'rows'
-        stored_col_info = row_builder.output_slot_idxs()
-        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
-        input_col_info = [info for info in stored_col_info if not info.col.is_computed]
         plan: exec.ExecNode = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
 
-        media_input_cols = [info for info in input_col_info if info.col.col_type.is_media_type()]
-        if len(media_input_cols) > 0:
-            # prefetch external files for all input column refs for validation
-            plan = exec.CachePrefetchNode(tbl.id, media_input_cols, input=plan)
-            plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)
+        media_input_col_info = [
+            exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
+            for col_ref in row_builder.input_exprs
+            if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
+        ]
+        if len(media_input_col_info) > 0:
+            # prefetch external files for all input column refs
+            plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
 
-        computed_exprs = [e for e in row_builder.default_eval_ctx.target_exprs if not isinstance(e, exprs.ColumnRef)]
+        computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         if len(computed_exprs) > 0:
             # add an ExprEvalNode when there are exprs to compute
             plan = exec.ExprEvalNode(row_builder, computed_exprs, plan.output_exprs, input=plan)
 
+        stored_col_info = row_builder.output_slot_idxs()
+        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
         plan.set_stored_img_cols(stored_img_col_info)
         plan.set_ctx(
             exec.ExecContext(
@@ -621,7 +622,8 @@
         assert isinstance(tbl, catalog.TableVersionPath)
         sql_elements = analyzer.sql_elements
         is_python_agg = (
-            not sql_elements.contains(analyzer.agg_fn_calls) or not sql_elements.contains(analyzer.window_fn_calls)
+            not sql_elements.contains_all(analyzer.agg_fn_calls)
+            or not sql_elements.contains_all(analyzer.window_fn_calls)
         )
         ctx = exec.ExecContext(row_builder)
         cls._verify_ordering(analyzer, verify_agg=is_python_agg)
@@ -671,8 +673,8 @@
             ctx.batch_size = 16
 
         # do aggregation in SQL if all agg exprs can be translated
-        if (sql_elements.contains(analyzer.select_list)
-                and sql_elements.contains(analyzer.grouping_exprs)
+        if (sql_elements.contains_all(analyzer.select_list)
+                and sql_elements.contains_all(analyzer.grouping_exprs)
                 and isinstance(plan, exec.SqlNode)
                 and plan.to_cte() is not None):
            plan = exec.SqlAggregationNode(
pixeltable/py.typed ADDED
File without changes
pixeltable/store.py CHANGED
@@ -303,7 +303,7 @@ class StoreBase:
 
    def insert_rows(
        self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
-        show_progress: bool = True, rowids: Optional[Iterator[int]] = None
+        show_progress: bool = True, rowids: Optional[Iterator[int]] = None, abort_on_exc: bool = False
    ) -> tuple[int, int, set[int]]:
        """Insert rows into the store table and update the catalog table's md
        Returns:
@@ -325,8 +325,13 @@
        for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
            # compute batch of rows and convert them into table rows
            table_rows: list[dict[str, Any]] = []
-            for row_idx in range(batch_start_idx, min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))):
+            batch_stop_idx = min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))
+            for row_idx in range(batch_start_idx, batch_stop_idx):
                row = row_batch[row_idx]
+                # if abort_on_exc == True, we need to check for media validation exceptions
+                if abort_on_exc and row.has_exc():
+                    exc = row.get_first_exc()
+                    raise exc
 
                rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
                pk = rowid + (v_min,)
pixeltable/tool/create_test_video.py CHANGED
@@ -1,4 +1,4 @@
-import av
+import av # type: ignore[import-untyped]
 import PIL.Image
 import PIL.ImageDraw
 import PIL.ImageFont
pixeltable/tool/embed_udf.py CHANGED
@@ -6,4 +6,4 @@ import pixeltable as pxt
 # TODO This can go away once we have the ability to inline expr_udf's
 @pxt.expr_udf
 def clip_text_embed(txt: str) -> np.ndarray:
-    return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32')
+    return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32') # type: ignore[return-value]
pixeltable/tool/mypy_plugin.py CHANGED
@@ -1,12 +1,15 @@
 from typing import Callable, Optional
 
-from mypy.plugin import AnalyzeTypeContext, Plugin
-from mypy.types import Type
+from mypy import nodes
+from mypy.plugin import AnalyzeTypeContext, ClassDefContext, Plugin
+from mypy.plugins.common import add_method_to_class
+from mypy.types import AnyType, Type, TypeOfAny
 
 import pixeltable as pxt
 
 
 class PxtPlugin(Plugin):
+    __UDA_FULLNAME = f'{pxt.uda.__module__}.{pxt.uda.__name__}'
     __TYPE_MAP = {
         pxt.Json: 'typing.Any',
         pxt.Array: 'numpy.ndarray',
@@ -20,13 +23,33 @@ class PxtPlugin(Plugin):
         for k, v in __TYPE_MAP.items()
     }
 
-    def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], type]]:
+    def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], Type]]:
         if fullname in self.__FULLNAME_MAP:
             subst_name = self.__FULLNAME_MAP[fullname]
             return lambda ctx: pxt_hook(ctx, subst_name)
+        return None
 
-def plugin(version: str):
+    def get_class_decorator_hook_2(self, fullname: str) -> Optional[Callable[[ClassDefContext], bool]]:
+        if fullname == self.__UDA_FULLNAME:
+            return pxt_decorator_hook
+        return None
+
+def plugin(version: str) -> type:
     return PxtPlugin
 
 def pxt_hook(ctx: AnalyzeTypeContext, subst_name: str) -> Type:
-    return ctx.api.named_type(subst_name)
+    if subst_name == 'typing.Any':
+        return AnyType(TypeOfAny.special_form)
+    return ctx.api.named_type(subst_name, [])
+
+def pxt_decorator_hook(ctx: ClassDefContext) -> bool:
+    arg = nodes.Argument(nodes.Var('fn'), AnyType(TypeOfAny.special_form), None, nodes.ARG_POS)
+    add_method_to_class(
+        ctx.api,
+        ctx.cls,
+        "to_sql",
+        args=[arg],
+        return_type=AnyType(TypeOfAny.special_form),
+        is_staticmethod=True,
+    )
+    return True
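The net effect of the two hooks: annotations written with Pixeltable's type aliases resolve to ordinary Python types (per `__TYPE_MAP`, e.g. `pxt.Json` becomes `Any`), and classes decorated with `@pxt.uda` gain a static `to_sql` member in mypy's view, matching what exists at runtime. A hedged sketch of a UDF signature that should now type-check cleanly, assuming the plugin is enabled through mypy's standard `plugins` setting pointing at `pixeltable.tool.mypy_plugin` (the UDF itself is invented for illustration):

import pixeltable as pxt

@pxt.udf
def title_of(metadata: pxt.Json) -> str:
    # pxt.Json is treated as typing.Any by the plugin, so dict-style access is accepted
    return str(metadata.get('title', ''))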
pixeltable/type_system.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import abc
 import datetime
 import enum
+import io
 import json
 import typing
 import urllib.parse
@@ -10,9 +11,9 @@ import urllib.request
 from pathlib import Path
 from typing import Any, Iterable, Mapping, Optional, Sequence, Union
 
+import PIL.Image
 import av # type: ignore
 import numpy as np
-import PIL.Image
 import sqlalchemy as sql
 from typing import _GenericAlias # type: ignore[attr-defined]
 from typing_extensions import _AnnotatedAlias
@@ -798,6 +799,20 @@ class ImageType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, str) and val.startswith('data:'):
+            # try parsing this as a `data:` URL, and if successful, decode the image immediately
+            try:
+                with urllib.request.urlopen(val) as response:
+                    b = response.read()
+                img = PIL.Image.open(io.BytesIO(b))
+                img.load()
+                return img
+            except Exception as exc:
+                errormsg_val = val if len(val) < 50 else val[:50] + '...'
+                raise excs.Error(f'data URL could not be decoded into a valid image: {errormsg_val}') from exc
+        return val
+
     def _validate_literal(self, val: Any) -> None:
         if isinstance(val, PIL.Image.Image):
             return
@@ -876,6 +891,7 @@ class DocumentType(ColumnType):
         HTML = 0
         MD = 1
         PDF = 2
+        XML = 3
 
     def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
         super().__init__(self.Type.DOCUMENT, nullable=nullable)
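With `_create_literal` in place, a base64 `data:` URL can stand in for an image value; the decoding it performs is ordinary urllib plus PIL work, since `urllib.request.urlopen` handles the `data:` scheme natively. A self-contained sketch of the same round trip (the tiny PNG is generated on the spot rather than taken from real data):

import base64
import io
import urllib.request

import PIL.Image

# build a data: URL for a 4x4 red PNG, standing in for a user-supplied literal
buf = io.BytesIO()
PIL.Image.new('RGB', (4, 4), color='red').save(buf, format='png')
data_url = 'data:image/png;base64,' + base64.b64encode(buf.getvalue()).decode()

# this mirrors what ImageType._create_literal does with a 'data:' string
with urllib.request.urlopen(data_url) as response:
    img = PIL.Image.open(io.BytesIO(response.read()))
    img.load()
assert img.size == (4, 4)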
pixeltable/utils/documents.py CHANGED
@@ -35,6 +35,11 @@ def get_document_handle(path: str) -> Optional[DocumentHandle]:
         if md_ast is not None:
             return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
 
+    if doc_format == '.xml':
+        bs_doc = get_xml_handle(path)
+        if bs_doc is not None:
+            return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
+
     return None
 
 
@@ -54,7 +59,16 @@ def get_pdf_handle(path: str) -> Optional[fitz.Document]:
 def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
     try:
         with open(path, 'r', encoding='utf8') as fp:
-            doc = bs4.BeautifulSoup(fp, 'html.parser')
+            doc = bs4.BeautifulSoup(fp, 'lxml')
+        return doc if doc.find() is not None else None
+    except Exception:
+        return None
+
+
+def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
+    try:
+        with open(path, 'r', encoding='utf8') as fp:
+            doc = bs4.BeautifulSoup(fp, 'xml')
         return doc if doc.find() is not None else None
     except Exception:
         return None
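`get_xml_handle` leans on BeautifulSoup's `'xml'` parser (which requires `lxml` to be installed, the same backend the HTML path now uses), and the `doc.find()` check is what rejects content that parses to an empty tree. A quick standalone illustration with made-up inputs:

import bs4

# well-formed XML yields at least one element, so the handle is accepted
doc = bs4.BeautifulSoup('<catalog><item id="1">hello</item></catalog>', 'xml')
assert doc.find() is not None

# plain text produces no elements, so doc.find() is None and the handle is rejected
assert bs4.BeautifulSoup('not xml at all', 'xml').find() is None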