pixeltable 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +6 -3
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/globals.py +15 -6
- pixeltable/catalog/insertable_table.py +23 -8
- pixeltable/catalog/named_function.py +1 -1
- pixeltable/catalog/path_dict.py +4 -4
- pixeltable/catalog/schema_object.py +30 -18
- pixeltable/catalog/table.py +87 -104
- pixeltable/catalog/table_version.py +35 -24
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +15 -8
- pixeltable/dataframe.py +56 -56
- pixeltable/env.py +10 -9
- pixeltable/exec/__init__.py +3 -3
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/expr_eval_node.py +3 -3
- pixeltable/exec/in_memory_data_node.py +4 -4
- pixeltable/exec/sql_node.py +4 -1
- pixeltable/exprs/arithmetic_expr.py +41 -16
- pixeltable/exprs/array_slice.py +3 -4
- pixeltable/exprs/column_ref.py +20 -4
- pixeltable/exprs/comparison.py +11 -6
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/expr.py +88 -23
- pixeltable/exprs/function_call.py +12 -1
- pixeltable/exprs/globals.py +3 -1
- pixeltable/exprs/inline_array.py +4 -4
- pixeltable/exprs/json_path.py +36 -20
- pixeltable/exprs/row_builder.py +4 -4
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/functions/__init__.py +1 -2
- pixeltable/functions/audio.py +32 -0
- pixeltable/functions/huggingface.py +4 -4
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +46 -0
- pixeltable/functions/video.py +5 -1
- pixeltable/functions/{eval.py → vision.py} +166 -27
- pixeltable/globals.py +57 -28
- pixeltable/io/external_store.py +6 -6
- pixeltable/io/globals.py +13 -14
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/pandas.py +60 -19
- pixeltable/io/parquet.py +14 -14
- pixeltable/iterators/document.py +7 -7
- pixeltable/iterators/video.py +55 -23
- pixeltable/plan.py +58 -29
- pixeltable/store.py +97 -59
- pixeltable/tool/create_test_db_dump.py +17 -11
- pixeltable/type_system.py +155 -143
- pixeltable/utils/pytorch.py +12 -10
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/METADATA +10 -10
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/RECORD +56 -54
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/entry_points.txt +0 -0
pixeltable/io/globals.py
CHANGED
@@ -1,5 +1,4 @@
 from typing import Any, Literal, Optional, Union
-import urllib.request
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -19,7 +18,7 @@ def create_label_studio_project(
     **kwargs: Any
 ) -> SyncStatus:
     """
-    Create a new Label Studio project and link it to the specified `Table
+    Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
 
     - A tutorial notebook with fully worked examples can be found here:
     [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
@@ -34,7 +33,7 @@ def create_label_studio_project(
     then the linked project will have a column named `image`. In addition, the linked project
     will always have a JSON-typed column `annotations` representing the output.
 
-    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    By default, Pixeltable will link each of these columns to a column of the specified [`Table`][pixeltable.Table]
     with the same name. If any of the data fields are missing, an exception will be raised. If
     the `annotations` column is missing, it will be created. The default names can be overridden
     by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
@@ -52,7 +51,7 @@ def create_label_studio_project(
     - `pip install boto3` (if using S3 import storage)
 
     Args:
-        t: The
+        t: The table to link to.
         label_config: The Label Studio project configuration, in XML format.
         name: An optional name for the new project in Pixeltable. If specified, must be a valid
            Pixeltable identifier and must not be the name of any other external data store
@@ -73,7 +72,7 @@ def create_label_studio_project(
            The default is `post`.
         col_mapping: An optional mapping of local column names to Label Studio fields.
         sync_immediately: If `True`, immediately perform an initial synchronization by
-            exporting all rows of the
+            exporting all rows of the table as Label Studio tasks.
         s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
            be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
            referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
@@ -148,15 +147,15 @@ def import_rows(
     comment: str = ''
 ) -> Table:
     """
-    Creates a new
-    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    Creates a new base table from a list of dictionaries. The dictionaries must be of the
+    form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
     supplied data, using the most specific type that can represent all the values in a column.
 
     If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
     Pixeltable will force the specified column to the specified type (and will not attempt any type inference
     for that column).
 
-    All column types of the new
+    All column types of the new table will be nullable unless explicitly specified as non-nullable in
     `schema_overrides`.
 
     Args:
@@ -169,7 +168,7 @@ def import_rows(
         comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
 
     Returns:
-
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -187,11 +186,11 @@ def import_rows(
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
             # The column type will always be nullable by default.
-            col_type = pxt.ColumnType.infer_literal_type(value
+            col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
             if col_name not in schema:
                 schema[col_name] = col_type
             else:
-                supertype =
+                supertype = schema[col_name].supertype(col_type)
                 if supertype is None:
                     raise excs.Error(
                         f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
@@ -230,8 +229,8 @@ def import_json(
     **kwargs: Any
 ) -> Table:
     """
-    Creates a new
-    to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    Creates a new base table from a JSON file. This is a convenience method and is
+    equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
     is the contents of the specified `filepath_or_url`.
 
     Args:
@@ -245,7 +244,7 @@ def import_json(
         kwargs: Additional keyword arguments to pass to `json.loads`.
 
     Returns:
-
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     import json
     import urllib.parse
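Reviewer note: the `import_rows()` changes above tighten type inference — literal types are now inferred as nullable, and successive rows are folded together with `supertype()`. A minimal usage sketch, assuming `import_rows` is exposed as `pxt.io.import_rows` in this release and that Int and Float fold to Float; the table name and rows are illustrative, not from the package:

```python
import pixeltable as pxt

# Illustrative rows: `score` mixes int and float literals, plus a None.
# Per the diff, each literal type is inferred as nullable and folded across
# rows via supertype() -- presumably widening Int to Float here. The None in
# the last row is fine because inferred column types are nullable by default.
rows = [
    {'name': 'a', 'score': 1},
    {'name': 'b', 'score': 2.5},
    {'name': 'c', 'score': None},
]
t = pxt.io.import_rows('demo_scores', rows)
```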
pixeltable/io/label_studio.py
CHANGED
@@ -105,7 +105,7 @@ class LabelStudioProject(Project):
         return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
 
     def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
-        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.
+        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
                      f' (export: {export_data}, import: {import_data}).')
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
@@ -396,15 +396,15 @@ class LabelStudioProject(Project):
         updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
         if len(updates) > 0:
             _logger.info(
-                f'Updating table `{t.
+                f'Updating table `{t._name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
             )
             # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
             # batch_update on the actual ancestor table that holds the annotations column.
             # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
             ancestor = t
             while local_annotations_col not in ancestor._tbl_version.cols:
-                assert ancestor.
-                ancestor = ancestor.
+                assert ancestor._base is not None
+                ancestor = ancestor._base
             update_status = ancestor.batch_update(updates)
             print(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
             return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
@@ -565,7 +565,7 @@ class LabelStudioProject(Project):
 
         if title is None:
             # `title` defaults to table name
-            title = t.
+            title = t._name
 
         # Create a column to hold the annotations, if one does not yet exist
         if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
@@ -573,7 +573,7 @@ class LabelStudioProject(Project):
             local_annotations_column = ANNOTATIONS_COLUMN
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t.
+        if local_annotations_column not in t._schema.keys():
            t[local_annotations_column] = pxt.JsonType(nullable=True)
 
        resolved_col_mapping = cls.validate_columns(
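Reviewer note: the substantive change in `sync()` is the workaround that walks from a view up to the base table that owns the annotations column before calling `batch_update`. A self-contained sketch of that walk; the `Tbl` class and its `cols`/`base` fields are toy stand-ins for Pixeltable's `_tbl_version.cols` and `_base`, not the library's actual types:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Tbl:
    cols: set[str]                 # stand-in for `_tbl_version.cols`
    base: Optional['Tbl'] = None   # stand-in for `_base`

def find_column_owner(t: Tbl, col: str) -> Tbl:
    # Mirror the `while ... ancestor = ancestor._base` loop above: follow the
    # base chain until we reach the table that actually holds `col`.
    ancestor = t
    while col not in ancestor.cols:
        assert ancestor.base is not None  # col must exist somewhere up the chain
        ancestor = ancestor.base
    return ancestor

base = Tbl(cols={'annotations'})
view = Tbl(cols=set(), base=base)
assert find_column_owner(view, 'annotations') is base
```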
pixeltable/io/pandas.py
CHANGED
@@ -1,7 +1,9 @@
+import datetime
 from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
+import PIL.Image
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -13,11 +15,12 @@ def import_pandas(
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = ''
-) -> pxt.
-    """Creates a new
-
+) -> pxt.Table:
+    """Creates a new base table from a Pandas
+    [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
+    specified name. The schema of the table will be inferred from the DataFrame.
 
-    The column names of the new
+    The column names of the new table will be identical to those in the DataFrame, as long as they are valid
     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
     the following procedure:
     - first replace any non-alphanumeric characters with underscores;
@@ -31,6 +34,9 @@ def import_pandas(
         name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
         `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
         Pixeltable identifiers).
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -52,11 +58,15 @@ def import_csv(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.
+) -> pxt.Table:
     """
-    Creates a new
+    Creates a new base table from a csv file. This is a convenience method and is equivalent
     to calling `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_csv`
+    See the Pandas documentation for [`read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_csv(filepath_or_buffer, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -68,11 +78,15 @@ def import_excel(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.
+) -> pxt.Table:
     """
-    Creates a new
-    to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_excel`
+    Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
+    equivalent to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
+    See the Pandas documentation for [`read_excel`](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_excel(io, *args, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -103,6 +117,17 @@ def __df_to_pxt_schema(
         if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
         else:
+            # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
+            # general objects, so we need to check for nulls in the specific cases where we might expect them.
+            # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
+            # in object columns (where Pandas uses NaN as a general null).
+            # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
+            has_na = any(
+                (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
+                for val in df[pd_name]
+            )
+            if has_na and pd_name in primary_key:
+                raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
             pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
         pxt_name = __normalize_pxt_col_name(pd_name)
         # Ensure that column names are unique by appending a distinguishing suffix
@@ -140,21 +165,37 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
     """
     if np.issubdtype(np_dtype, np.integer):
         return pxt.IntType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.floating):
         return pxt.FloatType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.bool_):
         return pxt.BoolType(nullable=nullable)
-
-
-        if has_nan and not nullable:
-            raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
+
+    if np.issubdtype(np_dtype, np.character):
         return pxt.StringType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.datetime64):
-        has_nat = any(pd.isnull(val) for val in data_col)
-        if has_nat and not nullable:
-            raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
         return pxt.TimestampType(nullable=nullable)
-
+
+    if np_dtype == np.object_:
+        # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
+        # based on the actual data in `data_col`.
+        # First drop any null values (they don't contribute to type inference).
+        data_col = data_col.dropna()
+
+        if len(data_col) == 0:
+            # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
+            return pxt.FloatType(nullable=nullable)
+
+        inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+        if inferred_type is None:
+            # Fallback on StringType if everything else fails
+            return pxt.StringType(nullable=nullable)
+        else:
+            return inferred_type.copy(nullable=nullable)
+
+    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
 
 
 def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
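Reviewer note: the new null check in `__df_to_pxt_schema` is the subtle part of this diff. A self-contained sketch of the same condition (the helper name `column_has_na` is ours, not the library's):

```python
import numpy as np
import pandas as pd

def column_has_na(col: pd.Series) -> bool:
    # pd.isna() is not safe to call on arbitrary objects, so restrict the check
    # to the two cases where nulls are expected: floats (NaN, which Pandas also
    # uses as the generic null in object columns) and datetime64 columns (NaT).
    return any(
        (isinstance(val, float) or np.issubdtype(col.dtype, np.datetime64)) and pd.isna(val)
        for val in col
    )

df = pd.DataFrame({'a': [1.0, float('nan')], 'b': ['x', 'y']})
assert column_has_na(df['a']) and not column_has_na(df['b'])
```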
pixeltable/io/parquet.py
CHANGED
@@ -19,12 +19,14 @@ from pixeltable.utils.transactional_directory import transactional_directory
 if typing.TYPE_CHECKING:
     import pixeltable as pxt
     import pyarrow as pa
+    from pyarrow import parquet
 
 _logger = logging.getLogger(__name__)
 
 
 def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
     import pyarrow as pa
+    from pyarrow import parquet
 
     pydict = {}
     for field in schema:
@@ -35,7 +37,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path:
         pydict[field.name] = value_batch[field.name]
 
     tab = pa.Table.from_pydict(pydict, schema=schema)
-
+    parquet.write_table(tab, output_path)
 
 
 def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -55,23 +57,21 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
     """
     from pixeltable.utils.arrow import to_arrow_schema
 
-
-
-    type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
-    arrow_schema = to_arrow_schema(dict(zip(column_names, column_types)))
+    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
+    arrow_schema = to_arrow_schema(df.schema)
 
     # store the changes atomically
     with transactional_directory(dest_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
-        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
 
         batch_num = 0
-        current_value_batch: Dict[str, deque] = {k: deque() for k in
+        current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
         current_byte_estimate = 0
 
-        for data_row in df._exec():
-            for col_name, col_type, e in zip(
+        for data_row in df._exec():
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
                 val = data_row[e.slot_idx]
                 if val is None:
                     current_value_batch[col_name].append(val)
@@ -122,7 +122,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
             assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
             _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
             batch_num += 1
-            current_value_batch = {k: deque() for k in
+            current_value_batch = {k: deque() for k in df.schema.keys()}
             current_byte_estimate = 0
 
     _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -130,11 +130,11 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
 
 def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-
+    from pyarrow import parquet
     from pixeltable.utils.arrow import to_pixeltable_schema
 
     input_path = Path(parquet_path).expanduser()
-    parquet_dataset =
+    parquet_dataset = parquet.ParquetDataset(input_path)
     return to_pixeltable_schema(parquet_dataset.schema)
 
 
@@ -159,11 +159,11 @@ def import_parquet(
         The newly created table. The table will have loaded the data from the Parquet file(s).
     """
     import pixeltable as pxt
-
+    from pyarrow import parquet
     from pixeltable.utils.arrow import iter_tuples
 
     input_path = Path(parquet_path).expanduser()
-    parquet_dataset =
+    parquet_dataset = parquet.ParquetDataset(input_path)
 
     schema = parquet_schema_to_pixeltable_schema(parquet_path)
     if schema_override is None:
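Reviewer note: the parquet changes mostly amount to importing `pyarrow.parquet` explicitly (including under `typing.TYPE_CHECKING`) and deriving metadata from `df.schema`. A minimal sketch of the two `pyarrow.parquet` calls the diff relies on, with toy data and a temp directory:

```python
import pathlib
import tempfile

import pyarrow as pa
from pyarrow import parquet

# Write one batch the way _write_batch() now does, then open the result as a
# dataset the way parquet_schema_to_pixeltable_schema() does.
tmp_dir = pathlib.Path(tempfile.mkdtemp())
tab = pa.Table.from_pydict({'id': [1, 2], 'name': ['a', 'b']})
parquet.write_table(tab, tmp_dir / 'part-00000.parquet')

dataset = parquet.ParquetDataset(tmp_dir)
print(dataset.schema)  # arrow schema; pixeltable converts it via to_pixeltable_schema()
```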
pixeltable/iterators/document.py
CHANGED
@@ -38,7 +38,7 @@ class DocumentSectionMetadata:
     sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[Dict[
+    heading: Optional[Dict[str, str]] = None
 
     # pdf-specific metadata
     page: Optional[int] = None
@@ -236,7 +236,7 @@ class DocumentSplitter(ComponentIterator):
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
 
-        headings: Dict[
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline
 
         def update_metadata(el: bs4.Tag) -> None:
@@ -244,12 +244,11 @@ class DocumentSplitter(ComponentIterator):
             nonlocal headings, sourceline
             sourceline = el.sourceline
             if el.name in _HTML_HEADINGS:
-                level = int(el.name[1])
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings if l >
+                lower_levels = [l for l in headings if l > el.name]
                 for l in lower_levels:
                     del headings[l]
-                headings[
+                headings[el.name] = el.get_text().strip()
 
         def emit() -> None:
             nonlocal accumulated_text, headings, sourceline
@@ -295,13 +294,14 @@ class DocumentSplitter(ComponentIterator):
         # current state
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: Dict[
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
 
         def update_headings(heading: Dict) -> None:
             # update current state
             nonlocal headings
             assert 'type' in heading and heading['type'] == 'heading'
-
+            lint = heading['attrs']['level']
+            level = f'h{lint}'
             text = heading['children'][0]['raw'].strip()
             # remove the previously seen lower levels
             lower_levels = [l for l in headings.keys() if l > level]
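Reviewer note: the heading bookkeeping now keys on tag names (`'h1'`..`'h6'`) instead of integer levels, so the `l > el.name` comparison orders levels lexicographically, which is correct for single-digit heading tags. A self-contained sketch of that update step (function and variable names are ours):

```python
def update_headings(headings: dict[str, str], tag: str, text: str) -> None:
    # Drop any previously seen deeper levels, then record the new heading.
    # 'h3' > 'h2' holds as a string comparison, which is why tag-name keys work.
    for lower in [k for k in headings if k > tag]:
        del headings[lower]
    headings[tag] = text

state: dict[str, str] = {}
update_headings(state, 'h1', 'Intro')
update_headings(state, 'h2', 'Background')
update_headings(state, 'h1', 'Methods')  # resets the h2 entry
assert state == {'h1': 'Methods'}
```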
pixeltable/iterators/video.py
CHANGED
@@ -1,57 +1,89 @@
 import logging
 import math
 from pathlib import Path
-from typing import
+from typing import Any, Optional
 
-import PIL.Image
 import cv2
+import PIL.Image
 
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType,
+from pixeltable.type_system import ColumnType, FloatType, ImageType, IntType, VideoType
+
 from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
 
 class FrameIterator(ComponentIterator):
-    """
+    """
+    Iterator over frames of a video. At most one of `fps` or `num_frames` may be specified. If `fps` is specified,
+    then frames will be extracted at the specified rate (frames per second). If `num_frames` is specified, then the
+    exact number of frames will be extracted. If neither is specified, then all frames will be extracted. The first
+    frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
 
     Args:
-        video: URL or
-        fps:
-            If set to 0.0, then the native framerate of the video will be used (all frames will be
-
+        video: URL or path of the video to use for frame extraction.
+        fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
+            If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
+            extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
+        num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
+            `num_frames` is greater than the number of frames in the video, all frames will be extracted.
     """
-    def __init__(self, video: str, *, fps: float =
+    def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
+        if fps is not None and num_frames is not None:
+            raise Error('At most one of `fps` or `num_frames` may be specified')
+
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
         self.video_path = video_path
-        self.fps = fps
         self.video_reader = cv2.VideoCapture(str(video_path))
+        self.fps = fps
+        self.num_frames = num_frames
         if not self.video_reader.isOpened():
             raise Error(f'Failed to open video: {video}')
+
         video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
-        if fps > video_fps:
+        if fps is not None and fps > video_fps:
             raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
-        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
         num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
         if num_video_frames == 0:
             raise Error(f'Video {video}: failed to get number of frames')
-        # ceil: round up to ensure we count frame 0
-        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
-        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')
 
+        if num_frames is not None:
+            # specific number of frames
+            if num_frames > num_video_frames:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                spacing = float(num_video_frames) / float(num_frames)
+                self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
+                assert len(self.frames_to_extract) == num_frames
+        else:
+            if fps is None or fps == 0.0:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                # Extract frames at the implied frequency
+                freq = fps / video_fps
+                n = math.ceil(num_video_frames * freq)  # number of frames to extract
+                self.frames_to_extract = list(round(i / freq) for i in range(n))
+
+        # We need the list of frames as both a list (for set_pos) and a set (for fast lookups when
+        # there are lots of frames)
+        self.frames_set = set(self.frames_to_extract)
+        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
         self.next_frame_idx = 0
 
     @classmethod
-    def input_schema(cls) ->
+    def input_schema(cls) -> dict[str, ColumnType]:
         return {
             'video': VideoType(nullable=False),
-            'fps': FloatType()
+            'fps': FloatType(nullable=True),
+            'num_frames': IntType(nullable=True),
         }
 
     @classmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) ->
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         return {
             'frame_idx': IntType(),
             'pos_msec': FloatType(),
@@ -59,7 +91,9 @@ class FrameIterator(ComponentIterator):
             'frame': ImageType(),
         }, ['frame']
 
-    def __next__(self) ->
+    def __next__(self) -> dict[str, Any]:
+        # jumping to the target frame here with video_reader.set() is far slower than just
+        # skipping the unwanted frames
         while True:
             pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
             pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
@@ -69,7 +103,7 @@ class FrameIterator(ComponentIterator):
                 self.video_reader.release()
                 self.video_reader = None
                 raise StopIteration
-            if pos_frame
+            if pos_frame in self.frames_set:
                 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                 result = {
                     'frame_idx': self.next_frame_idx,
@@ -78,8 +112,6 @@ class FrameIterator(ComponentIterator):
                     'frame': PIL.Image.fromarray(img),
                 }
                 self.next_frame_idx += 1
-                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
-                # skipping the unwanted frames
                 return result
 
     def close(self) -> None:
@@ -92,5 +124,5 @@ class FrameIterator(ComponentIterator):
         if pos == self.next_frame_idx:
             return
         _logger.debug(f'seeking to frame {pos}')
-        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES,
+        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, self.frames_to_extract[pos])
         self.next_frame_idx = pos