pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (120)
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/column.py +37 -11
  5. pixeltable/catalog/globals.py +21 -0
  6. pixeltable/catalog/insertable_table.py +6 -4
  7. pixeltable/catalog/table.py +227 -148
  8. pixeltable/catalog/table_version.py +66 -28
  9. pixeltable/catalog/table_version_path.py +0 -8
  10. pixeltable/catalog/view.py +18 -19
  11. pixeltable/dataframe.py +16 -32
  12. pixeltable/env.py +6 -1
  13. pixeltable/exec/__init__.py +1 -2
  14. pixeltable/exec/aggregation_node.py +27 -17
  15. pixeltable/exec/cache_prefetch_node.py +1 -1
  16. pixeltable/exec/data_row_batch.py +9 -26
  17. pixeltable/exec/exec_node.py +36 -7
  18. pixeltable/exec/expr_eval_node.py +19 -11
  19. pixeltable/exec/in_memory_data_node.py +14 -11
  20. pixeltable/exec/sql_node.py +266 -138
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/arithmetic_expr.py +3 -1
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +93 -14
  26. pixeltable/exprs/comparison.py +5 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +56 -36
  29. pixeltable/exprs/expr.py +65 -63
  30. pixeltable/exprs/expr_dict.py +55 -0
  31. pixeltable/exprs/expr_set.py +26 -15
  32. pixeltable/exprs/function_call.py +53 -24
  33. pixeltable/exprs/globals.py +4 -1
  34. pixeltable/exprs/in_predicate.py +8 -7
  35. pixeltable/exprs/inline_expr.py +4 -4
  36. pixeltable/exprs/is_null.py +4 -4
  37. pixeltable/exprs/json_mapper.py +11 -12
  38. pixeltable/exprs/json_path.py +5 -10
  39. pixeltable/exprs/literal.py +5 -5
  40. pixeltable/exprs/method_ref.py +5 -4
  41. pixeltable/exprs/object_ref.py +2 -1
  42. pixeltable/exprs/row_builder.py +88 -36
  43. pixeltable/exprs/rowid_ref.py +14 -13
  44. pixeltable/exprs/similarity_expr.py +12 -7
  45. pixeltable/exprs/sql_element_cache.py +12 -6
  46. pixeltable/exprs/type_cast.py +8 -6
  47. pixeltable/exprs/variable.py +5 -4
  48. pixeltable/ext/functions/whisperx.py +7 -2
  49. pixeltable/func/aggregate_function.py +1 -1
  50. pixeltable/func/callable_function.py +2 -2
  51. pixeltable/func/function.py +11 -10
  52. pixeltable/func/function_registry.py +6 -7
  53. pixeltable/func/query_template_function.py +11 -12
  54. pixeltable/func/signature.py +17 -15
  55. pixeltable/func/udf.py +0 -4
  56. pixeltable/functions/__init__.py +2 -2
  57. pixeltable/functions/audio.py +4 -6
  58. pixeltable/functions/globals.py +84 -42
  59. pixeltable/functions/huggingface.py +31 -34
  60. pixeltable/functions/image.py +59 -45
  61. pixeltable/functions/json.py +0 -1
  62. pixeltable/functions/llama_cpp.py +106 -0
  63. pixeltable/functions/mistralai.py +2 -2
  64. pixeltable/functions/ollama.py +147 -0
  65. pixeltable/functions/openai.py +22 -25
  66. pixeltable/functions/replicate.py +72 -0
  67. pixeltable/functions/string.py +59 -50
  68. pixeltable/functions/timestamp.py +20 -20
  69. pixeltable/functions/together.py +2 -2
  70. pixeltable/functions/video.py +11 -20
  71. pixeltable/functions/whisper.py +2 -20
  72. pixeltable/globals.py +65 -74
  73. pixeltable/index/base.py +2 -2
  74. pixeltable/index/btree.py +20 -7
  75. pixeltable/index/embedding_index.py +12 -14
  76. pixeltable/io/__init__.py +1 -2
  77. pixeltable/io/external_store.py +11 -5
  78. pixeltable/io/fiftyone.py +178 -0
  79. pixeltable/io/globals.py +98 -2
  80. pixeltable/io/hf_datasets.py +1 -1
  81. pixeltable/io/label_studio.py +6 -6
  82. pixeltable/io/parquet.py +14 -13
  83. pixeltable/iterators/base.py +3 -2
  84. pixeltable/iterators/document.py +10 -8
  85. pixeltable/iterators/video.py +126 -60
  86. pixeltable/metadata/__init__.py +4 -3
  87. pixeltable/metadata/converters/convert_14.py +4 -2
  88. pixeltable/metadata/converters/convert_15.py +1 -1
  89. pixeltable/metadata/converters/convert_19.py +1 -0
  90. pixeltable/metadata/converters/convert_20.py +1 -1
  91. pixeltable/metadata/converters/convert_21.py +34 -0
  92. pixeltable/metadata/converters/util.py +54 -12
  93. pixeltable/metadata/notes.py +1 -0
  94. pixeltable/metadata/schema.py +40 -21
  95. pixeltable/plan.py +149 -165
  96. pixeltable/py.typed +0 -0
  97. pixeltable/store.py +57 -37
  98. pixeltable/tool/create_test_db_dump.py +6 -6
  99. pixeltable/tool/create_test_video.py +1 -1
  100. pixeltable/tool/doc_plugins/griffe.py +3 -34
  101. pixeltable/tool/embed_udf.py +1 -1
  102. pixeltable/tool/mypy_plugin.py +55 -0
  103. pixeltable/type_system.py +260 -61
  104. pixeltable/utils/arrow.py +10 -9
  105. pixeltable/utils/coco.py +4 -4
  106. pixeltable/utils/documents.py +16 -2
  107. pixeltable/utils/filecache.py +9 -9
  108. pixeltable/utils/formatter.py +10 -11
  109. pixeltable/utils/http_server.py +2 -5
  110. pixeltable/utils/media_store.py +6 -6
  111. pixeltable/utils/pytorch.py +10 -11
  112. pixeltable/utils/sql.py +2 -1
  113. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
  114. pixeltable-0.2.22.dist-info/RECORD +153 -0
  115. pixeltable/exec/media_validation_node.py +0 -43
  116. pixeltable/utils/help.py +0 -11
  117. pixeltable-0.2.20.dist-info/RECORD +0 -147
  118. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  119. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  120. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
@@ -1,13 +1,15 @@
  import logging
  import math
+ from fractions import Fraction
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Optional, Sequence

- import cv2
+ import av  # type: ignore[import-untyped]
+ import pandas as pd
  import PIL.Image

- from pixeltable.exceptions import Error
- from pixeltable.type_system import ColumnType, FloatType, ImageType, IntType, VideoType
+ import pixeltable.exceptions as excs
+ import pixeltable.type_system as ts

  from .base import ComponentIterator

@@ -29,100 +31,164 @@ class FrameIterator(ComponentIterator):
          num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
              `num_frames` is greater than the number of frames in the video, all frames will be extracted.
      """
+
+     # Input parameters
+     video_path: Path
+     fps: Optional[float]
+     num_frames: Optional[int]
+
+     # Video info
+     container: av.container.input.InputContainer
+     video_framerate: Fraction
+     video_time_base: Fraction
+     video_frame_count: int
+     video_start_time: int
+
+     # List of frame indices to be extracted, or None to extract all frames
+     frames_to_extract: Optional[list[int]]
+
+     # Next frame to extract, as an iterator `pos` index. If `frames_to_extract` is None, this is the same as the
+     # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
+     next_pos: int
+
      def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
          if fps is not None and num_frames is not None:
-             raise Error('At most one of `fps` or `num_frames` may be specified')
+             raise excs.Error('At most one of `fps` or `num_frames` may be specified')

          video_path = Path(video)
          assert video_path.exists() and video_path.is_file()
          self.video_path = video_path
-         self.video_reader = cv2.VideoCapture(str(video_path))
+         self.container = av.open(str(video_path))
          self.fps = fps
          self.num_frames = num_frames
-         if not self.video_reader.isOpened():
-             raise Error(f'Failed to open video: {video}')

-         video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
-         if fps is not None and fps > video_fps:
-             raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
-         num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
-         if num_video_frames == 0:
-             raise Error(f'Video {video}: failed to get number of frames')
+         self.video_framerate = self.container.streams.video[0].average_rate
+         self.video_time_base = self.container.streams.video[0].time_base
+         self.video_start_time = self.container.streams.video[0].start_time or 0
+
+         # Determine the number of frames in the video
+         self.video_frame_count = self.container.streams.video[0].frames
+         if self.video_frame_count == 0:
+             # The video codec does not provide a frame count in the standard `frames` field. Try some other methods.
+             metadata: dict = self.container.streams.video[0].metadata
+             if 'NUMBER_OF_FRAMES' in metadata:
+                 self.video_frame_count = int(metadata['NUMBER_OF_FRAMES'])
+             elif 'DURATION' in metadata:
+                 # As a last resort, calculate the frame count from the stream duration.
+                 duration = metadata['DURATION']
+                 assert isinstance(duration, str)
+                 seconds = pd.to_timedelta(duration).total_seconds()
+                 # Usually the duration and framerate are precise enough for this calculation to be accurate, but if
+                 # we encounter a case where it's off by one due to a rounding error, that's ok; we only use this
+                 # to determine the positions of the sampled frames when `fps` or `num_frames` is specified.
+                 self.video_frame_count = round(seconds * self.video_framerate)
+             else:
+                 raise excs.Error(f'Video {video}: failed to get number of frames')

          if num_frames is not None:
              # specific number of frames
-             if num_frames > num_video_frames:
+             if num_frames > self.video_frame_count:
                  # Extract all frames
-                 self.frames_to_extract = range(num_video_frames)
+                 self.frames_to_extract = None
              else:
-                 spacing = float(num_video_frames) / float(num_frames)
+                 spacing = float(self.video_frame_count) / float(num_frames)
                  self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
                  assert len(self.frames_to_extract) == num_frames
          else:
              if fps is None or fps == 0.0:
                  # Extract all frames
-                 self.frames_to_extract = range(num_video_frames)
+                 self.frames_to_extract = None
+             elif fps > float(self.video_framerate):
+                 raise excs.Error(
+                     f'Video {video}: requested fps ({fps}) exceeds that of the video ({float(self.video_framerate)})'
+                 )
              else:
                  # Extract frames at the implied frequency
-                 freq = fps / video_fps
-                 n = math.ceil(num_video_frames * freq)  # number of frames to extract
+                 freq = fps / float(self.video_framerate)
+                 n = math.ceil(self.video_frame_count * freq)  # number of frames to extract
                  self.frames_to_extract = list(round(i / freq) for i in range(n))

-         # We need the list of frames as both a list (for set_pos) and a set (for fast lookups when
-         # there are lots of frames)
-         self.frames_set = set(self.frames_to_extract)
          _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
-         self.next_frame_idx = 0
+         self.next_pos = 0

      @classmethod
-     def input_schema(cls) -> dict[str, ColumnType]:
+     def input_schema(cls) -> dict[str, ts.ColumnType]:
          return {
-             'video': VideoType(nullable=False),
-             'fps': FloatType(nullable=True),
-             'num_frames': IntType(nullable=True),
+             'video': ts.VideoType(nullable=False),
+             'fps': ts.FloatType(nullable=True),
+             'num_frames': ts.IntType(nullable=True),
          }

      @classmethod
-     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
+     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
          return {
-             'frame_idx': IntType(),
-             'pos_msec': FloatType(),
-             'pos_frame': FloatType(),
-             'frame': ImageType(),
+             'frame_idx': ts.IntType(),
+             'pos_msec': ts.FloatType(),
+             'pos_frame': ts.IntType(),
+             'frame': ts.ImageType(),
          }, ['frame']

      def __next__(self) -> dict[str, Any]:
-         # jumping to the target frame here with video_reader.set() is far slower than just
-         # skipping the unwanted frames
+         # Determine the frame index in the video corresponding to the iterator index `next_pos`;
+         # the frame at this index is the one we want to extract next
+         if self.frames_to_extract is None:
+             next_video_idx = self.next_pos  # we're extracting all frames
+         elif self.next_pos >= len(self.frames_to_extract):
+             raise StopIteration
+         else:
+             next_video_idx = self.frames_to_extract[self.next_pos]
+
+         # We are searching for the frame at the index implied by `next_pos`. Step through the video until we
+         # find it. There are two reasons why it might not be the immediate next frame in the video:
+         # (1) `fps` or `num_frames` was specified as an iterator argument; or
+         # (2) we just did a seek, and the desired frame is not a keyframe.
+         # TODO: In case (1) it will usually be fastest to step through the frames until we find the one we're
+         #     looking for. But in some cases it may be faster to do a seek; for example, when `fps` is very
+         #     low and there are multiple keyframes in between each frame we want to extract (imagine extracting
+         #     10 frames from an hourlong video).
          while True:
-             pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
-             pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
-             status, img = self.video_reader.read()
-             if not status:
-                 _logger.debug(f'releasing video reader for {self.video_path}')
-                 self.video_reader.release()
-                 self.video_reader = None
+             try:
+                 frame = next(self.container.decode(video=0))
+             except EOFError:
                  raise StopIteration
-             if pos_frame in self.frames_set:
-                 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-                 result = {
-                     'frame_idx': self.next_frame_idx,
-                     'pos_msec': pos_msec,
-                     'pos_frame': pos_frame,
-                     'frame': PIL.Image.fromarray(img),
-                 }
-                 self.next_frame_idx += 1
-                 return result
+             # Compute the index of the current frame in the video based on the presentation timestamp (pts);
+             # this ensures we have a canonical understanding of frame index, regardless of how we got here
+             # (seek or iteration)
+             pts = frame.pts - self.video_start_time
+             video_idx = round(pts * self.video_time_base * self.video_framerate)
+             assert isinstance(video_idx, int)
+             if video_idx < next_video_idx:
+                 # We haven't reached the desired frame yet
+                 continue
+
+             # Sanity check that we're at the right frame.
+             if video_idx != next_video_idx:
+                 raise excs.Error(f'Frame {next_video_idx} is missing from the video (video file is corrupt)')
+             img = frame.to_image()
+             assert isinstance(img, PIL.Image.Image)
+             pos_msec = float(pts * self.video_time_base * 1000)
+             result = {
+                 'frame_idx': self.next_pos,
+                 'pos_msec': pos_msec,
+                 'pos_frame': video_idx,
+                 'frame': img,
+             }
+             self.next_pos += 1
+             return result

      def close(self) -> None:
-         if self.video_reader is not None:
-             self.video_reader.release()
-             self.video_reader = None
+         self.container.close()

      def set_pos(self, pos: int) -> None:
          """Seek to frame idx"""
-         if pos == self.next_frame_idx:
-             return
-         _logger.debug(f'seeking to frame {pos}')
-         self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, self.frames_to_extract[pos])
-         self.next_frame_idx = pos
+         if pos == self.next_pos:
+             return  # already there
+
+         video_idx = pos if self.frames_to_extract is None else self.frames_to_extract[pos]
+         _logger.debug(f'seeking to frame number {video_idx} (at iterator index {pos})')
+         # compute the frame position in time_base units
+         seek_pos = int(video_idx / self.video_framerate / self.video_time_base + self.video_start_time)
+         # This will seek to the nearest keyframe before the desired frame. If the frame being sought is not a keyframe,
+         # then the iterator will step forward to the desired frame on the subsequent call to next().
+         self.container.seek(seek_pos, backward=True, stream=self.container.streams.video[0])
+         self.next_pos = pos
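
The rewritten FrameIterator above converts between PyAV presentation timestamps (`pts`, expressed in `time_base` units) and frame numbers in both directions: `__next__()` derives `video_idx` from `pts`, and `set_pos()` inverts that calculation to get `seek_pos`. A minimal standalone sketch of the same arithmetic, using made-up stream parameters (25 fps, 1/12800 time base; not values taken from this diff):

    from fractions import Fraction

    framerate = Fraction(25, 1)     # hypothetical stream framerate
    time_base = Fraction(1, 12800)  # hypothetical time base (seconds per pts tick)
    start_time = 0                  # pts of the first frame

    def frame_index(pts: int) -> int:
        # pts * time_base is the timestamp in seconds; multiplying by the framerate gives the frame number
        return round((pts - start_time) * time_base * framerate)

    def seek_pts(video_idx: int) -> int:
        # inverse conversion, as used by set_pos(): frame number -> position in time_base units
        return int(video_idx / framerate / time_base + start_time)

    assert seek_pts(100) == 51200            # frame 100 sits 4 s into the stream: 4 / (1/12800) = 51200 ticks
    assert frame_index(seek_pts(100)) == 100

Because `container.seek()` only lands on the nearest preceding keyframe, `__next__()` still has to decode forward until the frame index computed from `pts` matches the requested one.
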
@@ -2,7 +2,7 @@ import dataclasses
  import importlib
  import os
  import pkgutil
- from typing import Callable, Dict
+ from typing import Callable

  import sqlalchemy as sql
  import sqlalchemy.orm as orm
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
  from .schema import SystemInfo, SystemInfoMd

  # current version of the metadata; this is incremented whenever the metadata schema changes
- VERSION = 21
+ VERSION = 22


  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -24,7 +24,7 @@ def create_system_info(engine: sql.engine.Engine) -> None:

  # conversion functions for upgrading the metadata schema from one version to the following
  # key: old schema version
- converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}
+ converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}

  def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
      def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
@@ -41,6 +41,7 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
      with orm.Session(engine) as session:
          system_info = session.query(SystemInfo).one().md
          md_version = system_info['schema_version']
+         assert isinstance(md_version, int)
          if md_version == VERSION:
              return
          while md_version < VERSION:
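
For orientation, `converter_cbs` and `register_converter` form a version-keyed registry: each converter upgrades the stored metadata by exactly one schema version, and `upgrade_md` applies them one step at a time until `VERSION` is reached. A simplified sketch of that pattern (illustrative only, not the actual pixeltable implementation; the real `upgrade_md` reads `schema_version` out of the persisted `SystemInfo` record, as the hunk above shows):

    from typing import Callable

    VERSION = 22
    converter_cbs: dict[int, Callable[[object], None]] = {}  # key: old schema version

    def register_converter(version: int) -> Callable[[Callable[[object], None]], None]:
        def decorator(fn: Callable[[object], None]) -> None:
            converter_cbs[version] = fn
        return decorator

    @register_converter(version=21)
    def _(engine: object) -> None:
        pass  # stand-in for the real 21 -> 22 conversion shown further below

    def upgrade_md(engine: object, md_version: int) -> int:
        # apply one converter per version step until the metadata is current
        while md_version < VERSION:
            converter_cbs[md_version](engine)
            md_version += 1
        return md_version

    assert upgrade_md(None, 21) == VERSION  # runs the registered 21 -> 22 converter once
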
@@ -1,11 +1,13 @@
+ from typing import Any
+
  import sqlalchemy as sql

- from pixeltable.metadata.schema import Table
  from pixeltable.metadata import register_converter
+ from pixeltable.metadata.schema import Table


  @register_converter(version=14)
  def _(engine: sql.engine.Engine) -> None:
-     default_remotes = {'remotes': []}
+     default_remotes: dict[str, Any] = {'remotes': []}
      with engine.begin() as conn:
          conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
@@ -3,7 +3,7 @@ import inspect
  import logging
  from typing import Any

- import cloudpickle
+ import cloudpickle  # type: ignore[import-untyped]
  import sqlalchemy as sql

  import pixeltable.func as func
@@ -44,3 +44,4 @@ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
          dt_utc = dt.astimezone(datetime.timezone.utc)
          v['val'] = dt_utc.isoformat()
          return k, v
+     return None
@@ -35,7 +35,7 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
          # but it might actually be transformed into an InlineList when it is instantiated
          # (unfortunately, there is no way to disambiguate at this stage; see comments in
          # InlineArray._from_dict() for more details).
-         updated_v = {'_classname': 'InlineList' if v.get('is_json') else 'InlineArray'}
+         updated_v: dict[str, Any] = {'_classname': 'InlineList' if v.get('is_json') else 'InlineArray'}
          if len(updated_components) > 0:
              updated_v['components'] = updated_components
          return k, updated_v
@@ -0,0 +1,34 @@
+ from typing import Any, Optional
+ import sqlalchemy as sql
+
+ from pixeltable.metadata import register_converter
+ from pixeltable.metadata.converters.util import convert_table_schema_version_md, convert_table_md
+
+
+ @register_converter(version=21)
+ def _(engine: sql.engine.Engine) -> None:
+     convert_table_schema_version_md(
+         engine,
+         table_schema_version_md_updater=__update_table_schema_version,
+         schema_column_updater=__update_schema_column
+     )
+     convert_table_md(
+         engine,
+         substitution_fn=__substitute_md
+     )
+
+
+ def __update_table_schema_version(table_schema_version_md: dict) -> None:
+     table_schema_version_md['media_validation'] = 'on_write'  # MediaValidation.ON_WRITE
+
+
+ def __update_schema_column(schema_column: dict) -> None:
+     schema_column['media_validation'] = None
+
+
+ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+     if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
+         if 'perform_validation' not in v:
+             v['perform_validation'] = False
+         return k, v
+     return None
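
The effect of this new converter (schema version 21 -> 22) on stored metadata can be illustrated with hypothetical fragments (field names and values other than those shown in the converter are invented for illustration): every table schema version gets a default `media_validation` strategy, every schema column gets an optional per-column override, and every serialized `ColumnRef` gains a `perform_validation` flag.

    # Hypothetical TableSchemaVersionMd dict before the conversion (only relevant fields shown)
    tsv_md = {'comment': '', 'columns': {'0': {'pos': 0, 'name': 'img'}}}

    # __update_table_schema_version / __update_schema_column
    tsv_md['media_validation'] = 'on_write'       # table-wide default (MediaValidation.ON_WRITE)
    for schema_col in tsv_md['columns'].values():
        schema_col['media_validation'] = None     # no per-column override

    # __substitute_md: serialized ColumnRef exprs get an explicit perform_validation flag
    expr_md = {'_classname': 'ColumnRef'}         # other ColumnRef fields omitted
    if expr_md.get('_classname') == 'ColumnRef' and 'perform_validation' not in expr_md:
        expr_md['perform_validation'] = False
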
@@ -4,7 +4,7 @@ from typing import Any, Callable, Optional

  import sqlalchemy as sql

- from pixeltable.metadata.schema import Table
+ from pixeltable.metadata.schema import Table, TableSchemaVersion

  __logger = logging.getLogger('pixeltable')

@@ -17,12 +17,12 @@ def convert_table_md(
      substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None
  ) -> None:
      """
-     Converts table metadata based on the specified conversion functions.
+     Converts schema.TableMd dicts based on the specified conversion functions.

      Args:
          engine: The SQLAlchemy engine.
-         table_md_updater: A function that updates the table metadata in place.
-         column_md_updater: A function that updates the column metadata in place.
+         table_md_updater: A function that updates schema.TableMd dicts in place.
+         column_md_updater: A function that updates schema.ColumnMd dicts in place.
          external_store_md_updater: A function that updates the external store metadata in place.
          substitution_fn: A function that substitutes metadata values. If specified, all metadata will be traversed
              recursively, and `substitution_fn` will be called once for each metadata entry. If the entry appears in
@@ -68,24 +68,66 @@ def __substitute_md_rec(
      substitution_fn: Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]
  ) -> Any:
      if isinstance(md, dict):
-         updated_md = {}
+         updated_dict: dict[str, Any] = {}
          for k, v in md.items():
+             assert isinstance(k, str)
              substitute = substitution_fn(k, v)
              if substitute is not None:
                  updated_k, updated_v = substitute
-                 updated_md[updated_k] = __substitute_md_rec(updated_v, substitution_fn)
+                 updated_dict[updated_k] = __substitute_md_rec(updated_v, substitution_fn)
              else:
-                 updated_md[k] = __substitute_md_rec(v, substitution_fn)
-         return updated_md
+                 updated_dict[k] = __substitute_md_rec(v, substitution_fn)
+         return updated_dict
      elif isinstance(md, list):
-         updated_md = []
+         updated_list: list[Any] = []
          for v in md:
              substitute = substitution_fn(None, v)
              if substitute is not None:
                  _, updated_v = substitute
-                 updated_md.append(__substitute_md_rec(updated_v, substitution_fn))
+                 updated_list.append(__substitute_md_rec(updated_v, substitution_fn))
              else:
-                 updated_md.append(__substitute_md_rec(v, substitution_fn))
-         return updated_md
+                 updated_list.append(__substitute_md_rec(v, substitution_fn))
+         return updated_list
      else:
          return md
+
+
+ def convert_table_schema_version_md(
+     engine: sql.engine.Engine,
+     table_schema_version_md_updater: Optional[Callable[[dict], None]] = None,
+     schema_column_updater: Optional[Callable[[dict], None]] = None
+ ) -> None:
+     """
+     Converts schema.TableSchemaVersionMd dicts based on the specified conversion functions.
+
+     Args:
+         engine: The SQLAlchemy engine.
+         table_schema_version_md_updater: A function that updates schema.TableSchemaVersionMd dicts in place.
+         schema_column_updater: A function that updates schema.SchemaColumn dicts in place.
+     """
+     with engine.begin() as conn:
+         stmt = sql.select(TableSchemaVersion.tbl_id, TableSchemaVersion.schema_version, TableSchemaVersion.md)
+         for row in conn.execute(stmt):
+             tbl_id, schema_version, md = row[0], row[1], row[2]
+             assert isinstance(md, dict)
+             updated_md = copy.deepcopy(md)
+             if table_schema_version_md_updater is not None:
+                 table_schema_version_md_updater(updated_md)
+             if schema_column_updater is not None:
+                 __update_schema_column(updated_md, schema_column_updater)
+             if updated_md != md:
+                 __logger.info(f'Updating TableSchemaVersion(tbl_id={tbl_id}, schema_version={schema_version})')
+                 update_stmt = (
+                     sql.update(TableSchemaVersion)
+                     .where(TableSchemaVersion.tbl_id == tbl_id)
+                     .where(TableSchemaVersion.schema_version == schema_version)
+                     .values(md=updated_md)
+                 )
+                 conn.execute(update_stmt)
+
+
+ def __update_schema_column(table_schema_version_md: dict, schema_column_updater: Callable[[dict], None]) -> None:
+     cols = table_schema_version_md['columns']
+     assert isinstance(cols, dict)
+     for schema_col in cols.values():
+         schema_column_updater(schema_col)
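
To make the `substitution_fn` contract concrete: `__substitute_md_rec` visits every dict entry and list element, hands each `(key, value)` pair to the callback, and splices in whatever replacement it returns (a `None` return leaves the entry unchanged), recursing into the result. A self-contained sketch of that traversal, decoupled from the SQLAlchemy plumbing above:

    from typing import Any, Callable, Optional

    SubstFn = Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]

    def substitute_rec(md: Any, fn: SubstFn) -> Any:
        # rebuild dicts and lists, letting fn replace any entry it recognizes
        if isinstance(md, dict):
            out: dict[str, Any] = {}
            for k, v in md.items():
                sub = fn(k, v)
                if sub is not None:
                    k, v = sub
                out[k] = substitute_rec(v, fn)
            return out
        if isinstance(md, list):
            out_list: list[Any] = []
            for v in md:
                sub = fn(None, v)
                out_list.append(substitute_rec(sub[1] if sub is not None else v, fn))
            return out_list
        return md

    # Example callback in the spirit of the version-21 converter: tag serialized ColumnRefs
    def add_flag(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
        if isinstance(v, dict) and v.get('_classname') == 'ColumnRef':
            return k, {**v, 'perform_validation': False}
        return None

    print(substitute_rec({'expr': {'_classname': 'ColumnRef'}}, add_flag))
    # {'expr': {'_classname': 'ColumnRef', 'perform_validation': False}}
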
@@ -2,6 +2,7 @@
  # rather than as a comment, so that the existence of a description can be enforced by
  # the unit tests when new versions are added.
  VERSION_NOTES = {
+     22: 'TableMd/ColumnMd.media_validation',
      21: 'Separate InlineArray and InlineList',
      20: 'Store DB timestamps in UTC',
      19: 'UDF renames; ImageMemberAccess removal',
@@ -1,37 +1,48 @@
  import dataclasses
+ import typing
  import uuid
- from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
+ from typing import Any, Optional, TypeVar, Union, get_type_hints

  import sqlalchemy as sql
  import sqlalchemy.orm as orm
- from sqlalchemy import ForeignKey
- from sqlalchemy import Integer, BigInteger, LargeBinary
- from sqlalchemy.dialects.postgresql import UUID, JSONB
+ from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary
+ from sqlalchemy.dialects.postgresql import JSONB, UUID
  from sqlalchemy.orm import declarative_base
+ from sqlalchemy.orm.decl_api import DeclarativeMeta

- Base = declarative_base()
+ # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
+ # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
+ # outside of the module in a typesafe way.
+ Base: type = declarative_base()
+ assert isinstance(Base, DeclarativeMeta)
+ base_metadata = Base.metadata

  T = TypeVar('T')

- def md_from_dict(data_class_type: Type[T], data: Any) -> T:
+ def md_from_dict(data_class_type: type[T], data: Any) -> T:
      """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
      if dataclasses.is_dataclass(data_class_type):
          fieldtypes = {f: t for f, t in get_type_hints(data_class_type).items()}
-         return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
-     elif hasattr(data_class_type, '__origin__'):
-         if data_class_type.__origin__ is Union and type(None) in data_class_type.__args__:
+         return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})  # type: ignore[return-value]
+
+     origin = typing.get_origin(data_class_type)
+     if origin is not None:
+         type_args = typing.get_args(data_class_type)
+         if origin is Union and type(None) in type_args:
              # Handling Optional types
-             non_none_args = [arg for arg in data_class_type.__args__ if arg is not type(None)]
-             if len(non_none_args) == 1:
-                 return md_from_dict(non_none_args[0], data) if data is not None else None
-         elif data_class_type.__origin__ is list:
-             return [md_from_dict(data_class_type.__args__[0], elem) for elem in data]
-         elif data_class_type.__origin__ is dict:
-             key_type = data_class_type.__args__[0]
-             val_type = data_class_type.__args__[1]
-             return {key_type(key): md_from_dict(val_type, val) for key, val in data.items()}
-         elif data_class_type.__origin__ is tuple:
-             return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(data_class_type.__args__, data))
+             non_none_args = [arg for arg in type_args if arg is not type(None)]
+             assert len(non_none_args) == 1
+             return md_from_dict(non_none_args[0], data) if data is not None else None
+         elif origin is list:
+             return [md_from_dict(type_args[0], elem) for elem in data]  # type: ignore[return-value]
+         elif origin is dict:
+             key_type = type_args[0]
+             val_type = type_args[1]
+             return {key_type(key): md_from_dict(val_type, val) for key, val in data.items()}  # type: ignore[return-value]
+         elif origin is tuple:
+             return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(type_args, data))  # type: ignore[return-value]
+         else:
+             assert False
      else:
          return data

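The rewritten `md_from_dict` keeps its original behavior (rebuilding nested dataclasses, `Optional`s, lists, dicts and tuples from plain dict data) while using `typing.get_origin`/`get_args` instead of the private `__origin__`/`__args__` attributes. A small usage sketch with made-up dataclasses (not the real pixeltable metadata types), assuming `md_from_dict` as defined above is in scope:

    import dataclasses
    from typing import Optional

    @dataclasses.dataclass
    class ColumnInfo:  # hypothetical example type
        name: str
        media_validation: Optional[str]

    @dataclasses.dataclass
    class SchemaInfo:  # hypothetical example type
        comment: str
        columns: dict[int, ColumnInfo]

    data = {'comment': 'demo', 'columns': {'0': {'name': 'img', 'media_validation': None}}}
    md = md_from_dict(SchemaInfo, data)
    # -> SchemaInfo(comment='demo', columns={0: ColumnInfo(name='img', media_validation=None)})
    #    note that the dict key '0' is coerced back to an int via key_type(key)
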
@@ -115,7 +126,7 @@ class ViewMd:
      is_snapshot: bool

      # (table id, version); for mutable views, all versions are None
-     base_versions: List[Tuple[str, Optional[int]]]
+     base_versions: list[tuple[str, Optional[int]]]

      # filter predicate applied to the base table; view-only
      predicate: Optional[dict[str, Any]]
@@ -191,6 +202,10 @@ class SchemaColumn:
      pos: int
      name: str

+     # media validation strategy of this particular media column; if not set, TableMd.media_validation applies
+     # stores column.MediaValidation.name.lower()
+     media_validation: Optional[str]
+

  @dataclasses.dataclass
  class TableSchemaVersionMd:
@@ -203,6 +218,10 @@ class TableSchemaVersionMd:
      num_retained_versions: int
      comment: str

+     # default validation strategy for any media column of this table
+     # stores column.MediaValidation.name.lower()
+     media_validation: str
+

  # versioning: each table schema change results in a new record
  class TableSchemaVersion(Base):