pixeltable 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff shows the changes between two package versions as they were published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (58)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +8 -3
  4. pixeltable/catalog/globals.py +8 -0
  5. pixeltable/catalog/table.py +25 -9
  6. pixeltable/catalog/table_version.py +30 -55
  7. pixeltable/catalog/view.py +1 -1
  8. pixeltable/env.py +4 -4
  9. pixeltable/exec/__init__.py +2 -1
  10. pixeltable/exec/row_update_node.py +61 -0
  11. pixeltable/exec/{sql_scan_node.py → sql_node.py} +120 -56
  12. pixeltable/exprs/__init__.py +1 -1
  13. pixeltable/exprs/arithmetic_expr.py +41 -16
  14. pixeltable/exprs/expr.py +72 -22
  15. pixeltable/exprs/function_call.py +64 -29
  16. pixeltable/exprs/globals.py +5 -1
  17. pixeltable/exprs/inline_array.py +18 -11
  18. pixeltable/exprs/method_ref.py +63 -0
  19. pixeltable/ext/__init__.py +9 -0
  20. pixeltable/ext/functions/__init__.py +8 -0
  21. pixeltable/ext/functions/whisperx.py +45 -5
  22. pixeltable/ext/functions/yolox.py +60 -14
  23. pixeltable/func/callable_function.py +12 -4
  24. pixeltable/func/expr_template_function.py +1 -1
  25. pixeltable/func/function.py +12 -2
  26. pixeltable/func/function_registry.py +24 -9
  27. pixeltable/func/udf.py +32 -4
  28. pixeltable/functions/__init__.py +1 -1
  29. pixeltable/functions/fireworks.py +33 -0
  30. pixeltable/functions/huggingface.py +96 -6
  31. pixeltable/functions/image.py +226 -41
  32. pixeltable/functions/json.py +46 -0
  33. pixeltable/functions/openai.py +214 -0
  34. pixeltable/functions/string.py +195 -218
  35. pixeltable/functions/timestamp.py +210 -0
  36. pixeltable/functions/together.py +106 -0
  37. pixeltable/functions/video.py +2 -2
  38. pixeltable/functions/{eval.py → vision.py} +170 -27
  39. pixeltable/functions/whisper.py +32 -0
  40. pixeltable/io/__init__.py +1 -1
  41. pixeltable/io/external_store.py +2 -2
  42. pixeltable/io/globals.py +133 -1
  43. pixeltable/io/pandas.py +82 -31
  44. pixeltable/iterators/video.py +55 -23
  45. pixeltable/metadata/__init__.py +1 -1
  46. pixeltable/metadata/converters/convert_18.py +39 -0
  47. pixeltable/metadata/notes.py +10 -0
  48. pixeltable/plan.py +76 -1
  49. pixeltable/store.py +65 -28
  50. pixeltable/tool/create_test_db_dump.py +8 -9
  51. pixeltable/tool/doc_plugins/griffe.py +4 -0
  52. pixeltable/type_system.py +84 -63
  53. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/METADATA +2 -2
  54. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/RECORD +57 -51
  55. pixeltable/exprs/image_member_access.py +0 -96
  56. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/WHEEL +0 -0
  58. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/entry_points.txt +0 -0
@@ -244,7 +244,7 @@ class Project(ExternalStore, abc.ABC):
             if ext_col in export_cols:
                 # Validate that the table column can be assigned to the external column
                 ext_col_type = export_cols[ext_col]
-                if not ext_col_type.is_supertype_of(t_col_type):
+                if not ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True):
                     raise excs.Error(
                         f'Column `{t_col}` cannot be exported to external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
                     )
@@ -255,7 +255,7 @@ class Project(ExternalStore, abc.ABC):
                     f'Column `{t_col}` is a computed column, which cannot be populated from an external column'
                 )
             ext_col_type = import_cols[ext_col]
-            if not t_col_type.is_supertype_of(ext_col_type):
+            if not t_col_type.is_supertype_of(ext_col_type, ignore_nullable=True):
                 raise excs.Error(
                     f'Column `{t_col}` cannot be imported from external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
                 )
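
The practical effect of the new `ignore_nullable=True` flag, sketched with `pixeltable.type_system` types (a minimal sketch of the intended 0.2.15 behavior, not a verified test):

    from pixeltable.type_system import StringType

    ext_col_type = StringType(nullable=True)   # external column that accepts nulls
    t_col_type = StringType(nullable=False)    # table column that never produces nulls

    # Nullability no longer blocks the link; only the underlying types must be compatible.
    assert ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True)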
pixeltable/io/globals.py CHANGED
@@ -1,5 +1,7 @@
-from typing import Any, Optional, Literal
+from typing import Any, Literal, Optional, Union
+import urllib.request
 
+import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import Table
 from pixeltable.io.external_store import SyncStatus
@@ -134,3 +136,133 @@ def create_label_studio_project(
         return t.sync()
     else:
         return SyncStatus.empty()
+
+
+def import_rows(
+    tbl_path: str,
+    rows: list[dict[str, Any]],
+    *,
+    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    num_retained_versions: int = 10,
+    comment: str = ''
+) -> Table:
+    """
+    Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
+    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    supplied data, using the most specific type that can represent all the values in a column.
+
+    If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
+    Pixeltable will force the specified column to the specified type (and will not attempt any type inference
+    for that column).
+
+    All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
+    `schema_overrides`.
+
+    Args:
+        tbl_path: The qualified name of the table to create.
+        rows: The list of dictionaries to import.
+        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+            as described above.
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
+        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
+        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
+
+    Returns:
+        The newly created `Table`.
+    """
+    if schema_overrides is None:
+        schema_overrides = {}
+    schema: dict[str, pxt.ColumnType] = {}
+    cols_with_nones: set[str] = set()
+
+    for n, row in enumerate(rows):
+        for col_name, value in row.items():
+            if col_name in schema_overrides:
+                # We do the insertion here; this will ensure that the column order matches the order
+                # in which the column names are encountered in the input data, even if `schema_overrides`
+                # is specified.
+                if col_name not in schema:
+                    schema[col_name] = schema_overrides[col_name]
+            elif value is not None:
+                # If `col_name` is not in `schema_overrides`, then we infer its type from the data.
+                # The column type will always be nullable by default.
+                col_type = pxt.ColumnType.infer_literal_type(value).copy(nullable=True)
+                if col_name not in schema:
+                    schema[col_name] = col_type
+                else:
+                    supertype = schema[col_name].supertype(col_type)
+                    if supertype is None:
+                        raise excs.Error(
+                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
+                            'Consider specifying the type explicitly in `schema_overrides`.'
+                        )
+                    schema[col_name] = supertype
+            else:
+                cols_with_nones.add(col_name)
+
+    extraneous_keys = schema_overrides.keys() - schema.keys()
+    if len(extraneous_keys) > 0:
+        raise excs.Error(f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}')
+
+    entirely_none_cols = cols_with_nones - schema.keys()
+    if len(entirely_none_cols) > 0:
+        # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
+        # was not encountered in any row with a non-None value.
+        raise excs.Error(
+            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
+            'Consider specifying the type(s) explicitly in `schema_overrides`.'
+        )
+
+    t = pxt.create_table(tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    t.insert(rows)
+    return t
+
+
+def import_json(
+    tbl_path: str,
+    filepath_or_url: str,
+    *,
+    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    num_retained_versions: int = 10,
+    comment: str = '',
+    **kwargs: Any
+) -> Table:
+    """
+    Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
+    to calling `import_rows(tbl_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    is the contents of the specified `filepath_or_url`.
+
+    Args:
+        tbl_path: The qualified name of the table to create.
+        filepath_or_url: The path or URL of the JSON file.
+        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+            (see [`import_rows()`][pixeltable.io.import_rows]).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
+        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
+        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
+        kwargs: Additional keyword arguments to pass to `json.loads`.
+
+    Returns:
+        The newly created `Table`.
+    """
+    import json
+    import urllib.parse
+    import urllib.request
+
+    # TODO: Consolidate this logic with other places where files/URLs are parsed
+    parsed = urllib.parse.urlparse(filepath_or_url)
+    if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
+        # local file path
+        if len(parsed.scheme) <= 1:
+            filepath = filepath_or_url
+        else:
+            filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
+        with open(filepath) as fp:
+            contents = fp.read()
+    else:
+        # URL
+        contents = urllib.request.urlopen(filepath_or_url).read()
+    data = json.loads(contents, **kwargs)
+    return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
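
A usage sketch for the two new entry points (table names, data, and the URL are illustrative):

    import pixeltable as pxt
    from pixeltable.io import import_rows, import_json

    # Schema is inferred: `year` becomes a nullable IntType and `score` a nullable FloatType;
    # `title` is forced to a non-nullable StringType so it can serve as the primary key.
    t = import_rows(
        'films',
        [
            {'title': 'Jaws', 'year': 1975, 'score': 7.9},
            {'title': 'Alien', 'year': 1979, 'score': None},
        ],
        schema_overrides={'title': pxt.StringType(nullable=False)},
        primary_key='title',
    )

    # Same inference path, but reading the rows from a JSON file or URL first
    t2 = import_json('films_json', 'https://example.com/films.json')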
pixeltable/io/pandas.py CHANGED
@@ -1,7 +1,9 @@
-from typing import Optional, Any, Iterable, Union
+import datetime
+from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
+import PIL.Image
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -15,7 +17,7 @@ def import_pandas(
     comment: str = ''
 ) -> pxt.catalog.InsertableTable:
     """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
-    will be inferred from the `DataFrame`, unless `schema` is specified.
+    will be inferred from the `DataFrame`.
 
     The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
@@ -32,9 +34,16 @@ def import_pandas(
     `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
     Pixeltable identifiers).
     """
-    schema = _df_to_pxt_schema(df, schema_overrides)
-    tbl_rows = (dict(_df_row_to_pxt_row(row, schema)) for row in df.itertuples())
-    table = pxt.create_table(tbl_name, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    if schema_overrides is None:
+        schema_overrides = {}
+    if primary_key is None:
+        primary_key = []
+    elif isinstance(primary_key, str):
+        primary_key = [primary_key]
+
+    schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
+    tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
+    table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
     table.insert(tbl_rows)
     return table
 
@@ -71,22 +80,44 @@ def import_excel(
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
 
 
-def _df_to_pxt_schema(
-    df: pd.DataFrame, schema_overrides: Optional[dict[str, pxt.ColumnType]]
-) -> dict[str, pxt.ColumnType]:
-    if schema_overrides is not None:
-        for pd_name in schema_overrides:
-            if pd_name not in df.columns:
-                raise excs.Error(
-                    f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
-                )
-    schema = {}
+def __df_to_pxt_schema(
+    df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
+) -> tuple[dict[str, pxt.ColumnType], list[str]]:
+    """
+    Infers a Pixeltable schema from a Pandas DataFrame.
+
+    Returns:
+        A tuple containing a Pixeltable schema and a list of primary key column names.
+    """
+    for pd_name in schema_overrides:
+        if pd_name not in df.columns:
+            raise excs.Error(
+                f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
+            )
+    for pd_name in primary_key:
+        if pd_name not in df.columns:
+            raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
+
+    schema: dict[str, pxt.ColumnType] = {}
+    col_mapping: dict[str, str] = {}  # Maps Pandas column names to Pixeltable column names
+
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
-        if schema_overrides is not None and pd_name in schema_overrides:
+        if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
         else:
-            pxt_type = _np_dtype_to_pxt_type(pd_dtype, df[pd_name])
-        pxt_name = _normalize_pxt_col_name(pd_name)
+            # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
+            # general objects, so we need to check for nulls in the specific cases where we might expect them.
+            # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
+            # in object columns (where Pandas uses NaN as a general null).
+            # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
+            has_na = any(
+                (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
+                for val in df[pd_name]
+            )
+            if has_na and pd_name in primary_key:
+                raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
+            pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
+        pxt_name = __normalize_pxt_col_name(pd_name)
         # Ensure that column names are unique by appending a distinguishing suffix
         # to any collisions
         if pxt_name in schema:
@@ -95,10 +126,13 @@ def _df_to_pxt_schema(
                 n += 1
                 pxt_name = f'{pxt_name}_{n}'
         schema[pxt_name] = pxt_type
-    return schema
+        col_mapping[pd_name] = pxt_name
 
+    pxt_pk = [col_mapping[pk] for pk in primary_key]
+    return schema, pxt_pk
 
-def _normalize_pxt_col_name(pd_name: str) -> str:
+
+def __normalize_pxt_col_name(pd_name: str) -> str:
     """
     Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
     - replacing any non-ascii or non-alphanumeric characters with an underscore _
@@ -113,26 +147,43 @@ def _normalize_pxt_col_name(pd_name: str) -> str:
     return id
 
 
-def _np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series) -> pxt.ColumnType:
+def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
     """
     Infers a Pixeltable type based on a Numpy dtype.
     """
     if np.issubdtype(np_dtype, np.integer):
-        return pxt.IntType()
+        return pxt.IntType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.floating):
-        return pxt.FloatType()
+        return pxt.FloatType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.bool_):
-        return pxt.BoolType()
-    if np_dtype == np.object_ or np.issubdtype(np_dtype, np.character):
-        has_nan = any(isinstance(val, float) and np.isnan(val) for val in data_col)
-        return pxt.StringType(nullable=has_nan)
+        return pxt.BoolType(nullable=nullable)
+
+    if np.issubdtype(np_dtype, np.character):
+        return pxt.StringType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.datetime64):
-        has_nat = any(pd.isnull(val) for val in data_col)
-        return pxt.TimestampType(nullable=has_nat)
-    raise excs.Error(f'Unsupported dtype: {np_dtype}')
+        return pxt.TimestampType(nullable=nullable)
+
+    if np_dtype == np.object_:
+        # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
+        # based on the actual data in `data_col`.
+        # First drop any null values (they don't contribute to type inference).
+        data_col = data_col.dropna()
+
+        if len(data_col) == 0:
+            # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
+            return pxt.FloatType(nullable=nullable)
+
+        inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+        if inferred_type is not None:
+            return inferred_type.copy(nullable=nullable)
+
+    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
 
 
-def _df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
+def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
     rows = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
         if pxt_type.is_float_type():
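
A sketch of how the reworked inference interacts with the new `primary_key` handling (table name and data are illustrative):

    import pandas as pd
    import pixeltable as pxt
    from pixeltable.io import import_pandas

    df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', None]})
    # `id` is inferred as a non-nullable IntType because it is a primary key column;
    # `name` is an object column containing a null, so it becomes a nullable StringType.
    t = import_pandas('people', df, primary_key='id')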
pixeltable/iterators/video.py CHANGED
@@ -1,57 +1,89 @@
 import logging
 import math
 from pathlib import Path
-from typing import Dict, Any, List, Tuple
+from typing import Any, Optional
 
-import PIL.Image
 import cv2
+import PIL.Image
 
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+from pixeltable.type_system import ColumnType, FloatType, ImageType, IntType, VideoType
+
 from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
 
 class FrameIterator(ComponentIterator):
-    """Iterator over frames of a video.
+    """
+    Iterator over frames of a video. At most one of `fps` or `num_frames` may be specified. If `fps` is specified,
+    then frames will be extracted at the specified rate (frames per second). If `num_frames` is specified, then the
+    exact number of frames will be extracted. If neither is specified, then all frames will be extracted. The first
+    frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
 
     Args:
-        video: URL or file of the video to use for frame extraction
-        fps: number of frames to extract per second of video. This may be a fractional value, such as 0.5.
-            If set to 0.0, then the native framerate of the video will be used (all frames will be extracted).
-            Default: 0.0
+        video: URL or path of the video to use for frame extraction.
+        fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
+            If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
+            extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
+        num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
+            `num_frames` is greater than the number of frames in the video, all frames will be extracted.
     """
-    def __init__(self, video: str, *, fps: float = 0.0):
+    def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
+        if fps is not None and num_frames is not None:
+            raise Error('At most one of `fps` or `num_frames` may be specified')
+
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
        self.video_path = video_path
-        self.fps = fps
         self.video_reader = cv2.VideoCapture(str(video_path))
+        self.fps = fps
+        self.num_frames = num_frames
         if not self.video_reader.isOpened():
             raise Error(f'Failed to open video: {video}')
+
         video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
-        if fps > video_fps:
+        if fps is not None and fps > video_fps:
             raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
-        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
         num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
         if num_video_frames == 0:
             raise Error(f'Video {video}: failed to get number of frames')
-        # ceil: round up to ensure we count frame 0
-        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
-        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')
 
+        if num_frames is not None:
+            # specific number of frames
+            if num_frames > num_video_frames:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                spacing = float(num_video_frames) / float(num_frames)
+                self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
+                assert len(self.frames_to_extract) == num_frames
+        else:
+            if fps is None or fps == 0.0:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                # Extract frames at the implied frequency
+                freq = fps / video_fps
+                n = math.ceil(num_video_frames * freq)  # number of frames to extract
+                self.frames_to_extract = list(round(i / freq) for i in range(n))
+
+        # We need the list of frames as both a list (for set_pos) and a set (for fast lookups when
+        # there are lots of frames)
+        self.frames_set = set(self.frames_to_extract)
+        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
         self.next_frame_idx = 0
 
     @classmethod
-    def input_schema(cls) -> Dict[str, ColumnType]:
+    def input_schema(cls) -> dict[str, ColumnType]:
         return {
             'video': VideoType(nullable=False),
-            'fps': FloatType()
+            'fps': FloatType(nullable=True),
+            'num_frames': IntType(nullable=True),
         }
 
     @classmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         return {
             'frame_idx': IntType(),
             'pos_msec': FloatType(),
@@ -59,7 +91,9 @@ class FrameIterator(ComponentIterator):
             'frame': ImageType(),
         }, ['frame']
 
-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
+        # jumping to the target frame here with video_reader.set() is far slower than just
+        # skipping the unwanted frames
         while True:
             pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
             pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
@@ -69,7 +103,7 @@ class FrameIterator(ComponentIterator):
                 self.video_reader.release()
                 self.video_reader = None
                 raise StopIteration
-            if pos_frame % self.frame_freq == 0:
+            if pos_frame in self.frames_set:
                 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                 result = {
                     'frame_idx': self.next_frame_idx,
@@ -78,8 +112,6 @@ class FrameIterator(ComponentIterator):
                     'frame': PIL.Image.fromarray(img),
                 }
                 self.next_frame_idx += 1
-                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
-                # skipping the unwanted frames
                 return result
 
     def close(self) -> None:
@@ -92,5 +124,5 @@ class FrameIterator(ComponentIterator):
         if pos == self.next_frame_idx:
             return
         _logger.debug(f'seeking to frame {pos}')
-        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
+        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, self.frames_to_extract[pos])
         self.next_frame_idx = pos
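
A sketch of the new iterator parameter in action (table and view names are illustrative; `FrameIterator.create` is the usual way to bind iterator arguments when creating a view):

    import pixeltable as pxt
    from pixeltable.iterators import FrameIterator

    videos = pxt.create_table('videos', {'video': pxt.VideoType()})
    # Extract exactly 8 evenly spaced frames per video; frame 0 is always included.
    frames = pxt.create_view(
        'frames',
        videos,
        iterator=FrameIterator.create(video=videos.video, num_frames=8),
    )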
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 18
+VERSION = 19
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_18.py ADDED
@@ -0,0 +1,39 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=18)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(
+        engine,
+        substitution_fn=__substitute_md
+    )
+
+
+def __substitute_md(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
+    # Migrate a few changed function names
+    if k == 'path' and v == 'pixeltable.functions.string.str_format':
+        return 'path', 'pixeltable.functions.string.format'
+    if k == 'path' and v.startswith('pixeltable.functions.pil.image'):
+        return 'path', v.replace('pixeltable.functions.pil.image', 'pixeltable.functions.image')
+    # Migrate deprecated `ImageMemberAccess` expressions to `FunctionCall`s
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ImageMemberAccess':
+        member_name = v['member_name']
+        new_v = {
+            'fn': {
+                'path': f'pixeltable.functions.image.{member_name}',
+                '_classpath': 'pixeltable.func.callable_function.CallableFunction',
+            },
+            'args': [[0, None]],
+            'kwargs': {},
+            '_classname': 'FunctionCall',
+            'components': v['components'],
+            'group_by_stop_idx': 0,
+            'group_by_start_idx': 0,
+            'order_by_start_idx': 1,
+        }
+        return k, new_v
+    return None
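
For reference, this is what the migration does to a serialized `img.width` expression. Both dicts are illustrative reconstructions based on the converter above, not actual stored metadata:

    # Before: a dedicated ImageMemberAccess expression
    before = {
        '_classname': 'ImageMemberAccess',
        'member_name': 'width',
        'components': [{'_classname': 'ColumnRef'}],  # the image operand (schematic)
    }

    # After: an ordinary FunctionCall against the new pixeltable.functions.image UDF
    after = {
        '_classname': 'FunctionCall',
        'fn': {
            'path': 'pixeltable.functions.image.width',
            '_classpath': 'pixeltable.func.callable_function.CallableFunction',
        },
        'args': [[0, None]],  # argument 0 is taken from component slot 0
        'kwargs': {},
        'components': before['components'],
        'group_by_start_idx': 0,
        'group_by_stop_idx': 0,
        'order_by_start_idx': 1,
    }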
pixeltable/metadata/notes.py ADDED
@@ -0,0 +1,10 @@
+# Descriptive notes for each new metadata version. These are stored in a Python dict
+# rather than as a comment, so that the existence of a description can be enforced by
+# the unit tests when new versions are added.
+VERSION_NOTES = {
+    19: 'UDF renames; ImageMemberAccess removal',
+    18: 'Restructured index metadata',
+    17: 'Renamed remotes to external_stores',
+    16: 'Query functions; deferred Expr deserialization',
+    15: 'Remotes in table metadata',
+}
pixeltable/plan.py CHANGED
@@ -107,7 +107,7 @@ class Analyzer:
         for e in self.group_by_clause:
             if e.sql_expr() is None:
                 raise excs.Error(f'Invalid grouping expression, needs to be expressible in SQL: {e}')
-            if e.contains(filter=lambda e: _is_agg_fn_call(e)):
+            if e._contains(filter=lambda e: _is_agg_fn_call(e)):
                 raise excs.Error(f'Grouping expression contains aggregate function: {e}')
 
         # check that agg fn calls don't have contradicting ordering requirements
@@ -288,6 +288,81 @@ class Planner:
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
+    @classmethod
+    def create_batch_update_plan(
+        cls, tbl: catalog.TableVersionPath,
+        batch: list[dict[catalog.Column, exprs.Expr]], rowids: list[tuple[int, ...]],
+        cascade: bool
+    ) -> Tuple[exec.ExecNode, exec.RowUpdateNode, sql.ClauseElement, List[catalog.Column], List[catalog.Column]]:
+        """
+        Returns:
+            - root node of the plan to produce the updated rows
+            - RowUpdateNode of the plan
+            - Where clause for deleting the current versions of updated rows
+            - list of columns that are getting updated
+            - list of user-visible columns that are being recomputed
+        """
+        assert isinstance(tbl, catalog.TableVersionPath)
+        target = tbl.tbl_version  # the one we need to update
+        sa_key_cols: list[sql.Column] = []
+        key_vals: list[tuple] = []
+        if len(rowids) > 0:
+            sa_key_cols = target.store_tbl.rowid_columns()
+            key_vals = rowids
+        else:
+            pk_cols = target.primary_key_columns()
+            sa_key_cols = [c.sa_col for c in pk_cols]
+            key_vals = [tuple(row[col].val for col in pk_cols) for row in batch]
+
+        # retrieve all stored cols and all target exprs
+        updated_cols = batch[0].keys() - target.primary_key_columns()
+        recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
+        # regardless of cascade, we need to update all indices on any updated column
+        idx_val_cols = target.get_idx_val_columns(updated_cols)
+        recomputed_cols.update(idx_val_cols)
+        # we only need to recompute stored columns (unstored ones are substituted away)
+        recomputed_cols = {c for c in recomputed_cols if c.is_stored}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+        copied_cols = [
+            col for col in target.cols if col.is_stored and col not in updated_cols and col not in recomputed_base_cols
+        ]
+        select_list = [exprs.ColumnRef(col) for col in copied_cols]
+        select_list.extend([exprs.ColumnRef(col) for col in updated_cols])
+
+        recomputed_exprs = \
+            [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
+        # the RowUpdateNode updates columns in-place, ie, in the original ColumnRef; no further substitution is needed
+        select_list.extend(recomputed_exprs)
+
+        # ExecNode tree (from bottom to top):
+        # - SqlLookupNode to retrieve the existing rows
+        # - RowUpdateNode to update the retrieved rows
+        # - ExprEvalNode to evaluate the remaining output exprs
+        analyzer = Analyzer(tbl, select_list)
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], analyzer.sql_exprs)
+        analyzer.finalize(row_builder)
+        plan = exec.SqlLookupNode(tbl, row_builder, analyzer.sql_exprs, sa_key_cols, key_vals)
+        delete_where_clause = plan.where_clause
+        col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
+        plan = row_update_node = exec.RowUpdateNode(tbl, key_vals, len(rowids) > 0, col_vals, row_builder, plan)
+        if not cls._is_contained_in(analyzer.select_list, analyzer.sql_exprs):
+            # we need an ExprEvalNode to evaluate the remaining output exprs
+            plan = exec.ExprEvalNode(row_builder, analyzer.select_list, analyzer.sql_exprs, input=plan)
+        # update row builder with column information
+        all_base_cols = copied_cols + list(updated_cols) + list(recomputed_base_cols)  # same order as select_list
+        row_builder.substitute_exprs(select_list, remove_duplicates=False)
+        for i, col in enumerate(all_base_cols):
+            plan.row_builder.add_table_column(col, select_list[i].slot_idx)
+
+        ctx = exec.ExecContext(row_builder)
+        # we're returning everything to the user, so we might as well do it in a single batch
+        ctx.batch_size = 0
+        plan.set_ctx(ctx)
+        recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
+        return (
+            plan, row_update_node, delete_where_clause, list(updated_cols) + recomputed_user_cols, recomputed_user_cols
+        )
+
     @classmethod
     def create_view_update_plan(
         cls, view: catalog.TableVersionPath, recompute_targets: List[catalog.Column]
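
The user-facing counterpart of this plan appears to be the batch-update path in `catalog/table.py` (also touched in this release). A hedged usage sketch, assuming `Table.batch_update` matches rows by primary key:

    import pixeltable as pxt

    t = pxt.create_table(
        'users',
        {'id': pxt.IntType(), 'name': pxt.StringType(nullable=True)},
        primary_key='id',
    )
    t.insert([{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bob'}])
    # Each dict carries the primary key plus the columns to update; dependent
    # computed columns are recomputed when cascade is enabled.
    t.batch_update([{'id': 1, 'name': 'Alicia'}, {'id': 2, 'name': 'Robert'}])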