pixeltable 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (64)
  1. pixeltable/__init__.py +6 -1
  2. pixeltable/catalog/catalog.py +107 -45
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +5 -0
  6. pixeltable/catalog/table_version.py +100 -106
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/catalog/update_status.py +12 -0
  9. pixeltable/config.py +6 -0
  10. pixeltable/dataframe.py +11 -5
  11. pixeltable/env.py +52 -19
  12. pixeltable/exec/__init__.py +2 -0
  13. pixeltable/exec/cell_materialization_node.py +231 -0
  14. pixeltable/exec/cell_reconstruction_node.py +135 -0
  15. pixeltable/exec/exec_node.py +1 -1
  16. pixeltable/exec/expr_eval/evaluators.py +1 -0
  17. pixeltable/exec/expr_eval/expr_eval_node.py +14 -0
  18. pixeltable/exec/expr_eval/globals.py +2 -0
  19. pixeltable/exec/globals.py +32 -0
  20. pixeltable/exec/object_store_save_node.py +1 -4
  21. pixeltable/exec/row_update_node.py +16 -9
  22. pixeltable/exec/sql_node.py +107 -14
  23. pixeltable/exprs/__init__.py +1 -1
  24. pixeltable/exprs/arithmetic_expr.py +10 -11
  25. pixeltable/exprs/column_property_ref.py +10 -10
  26. pixeltable/exprs/column_ref.py +2 -2
  27. pixeltable/exprs/data_row.py +106 -37
  28. pixeltable/exprs/expr.py +9 -0
  29. pixeltable/exprs/expr_set.py +14 -7
  30. pixeltable/exprs/inline_expr.py +2 -19
  31. pixeltable/exprs/json_path.py +45 -12
  32. pixeltable/exprs/row_builder.py +54 -22
  33. pixeltable/functions/__init__.py +1 -0
  34. pixeltable/functions/bedrock.py +7 -0
  35. pixeltable/functions/deepseek.py +11 -4
  36. pixeltable/functions/llama_cpp.py +7 -0
  37. pixeltable/functions/math.py +1 -1
  38. pixeltable/functions/ollama.py +7 -0
  39. pixeltable/functions/openai.py +4 -4
  40. pixeltable/functions/openrouter.py +143 -0
  41. pixeltable/functions/video.py +123 -9
  42. pixeltable/functions/whisperx.py +2 -0
  43. pixeltable/functions/yolox.py +2 -0
  44. pixeltable/globals.py +56 -31
  45. pixeltable/io/__init__.py +1 -0
  46. pixeltable/io/globals.py +16 -15
  47. pixeltable/io/table_data_conduit.py +46 -21
  48. pixeltable/iterators/__init__.py +1 -0
  49. pixeltable/metadata/__init__.py +1 -1
  50. pixeltable/metadata/converters/convert_40.py +73 -0
  51. pixeltable/metadata/notes.py +1 -0
  52. pixeltable/plan.py +175 -46
  53. pixeltable/share/publish.py +0 -1
  54. pixeltable/store.py +2 -2
  55. pixeltable/type_system.py +5 -3
  56. pixeltable/utils/console_output.py +4 -1
  57. pixeltable/utils/exception_handler.py +5 -28
  58. pixeltable/utils/image.py +7 -0
  59. pixeltable/utils/misc.py +5 -0
  60. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/METADATA +2 -1
  61. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/RECORD +64 -57
  62. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/WHEEL +0 -0
  63. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/entry_points.txt +0 -0
  64. {pixeltable-0.4.14.dist-info → pixeltable-0.4.16.dist-info}/licenses/LICENSE +0 -0
pixeltable/functions/video.py CHANGED
@@ -4,7 +4,6 @@ Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
 
 import logging
 import pathlib
-import shutil
 import subprocess
 from typing import Literal, NoReturn
 
@@ -327,6 +326,7 @@ def clip(
     Returns:
         New video containing only the specified time range or None if start_time is beyond the end of the video.
     """
+    Env.get().require_binary('ffmpeg')
     if start_time < 0:
         raise pxt.Error(f'start_time must be non-negative, got {start_time}')
     if end_time is not None and end_time <= start_time:
@@ -335,8 +335,6 @@ def clip(
         raise pxt.Error(f'duration must be positive, got {duration}')
     if end_time is not None and duration is not None:
         raise pxt.Error('end_time and duration cannot both be specified')
-    if not shutil.which('ffmpeg'):
-        raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use get_clip().')
 
     video_duration = av_utils.get_video_duration(video)
     if video_duration is not None and start_time > video_duration:
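The per-UDF `shutil.which('ffmpeg')` guards are replaced by a single `Env.get().require_binary('ffmpeg')` call. That helper lives in pixeltable/env.py (also changed in this release); a minimal sketch of the assumed behavior, for illustration only:

    import shutil
    import pixeltable as pxt

    def require_binary(name: str) -> None:
        # Assumed behavior of Env.require_binary(): fail fast with a pixeltable error
        # if the executable is not on PATH (sketch; not the actual implementation).
        if shutil.which(name) is None:
            raise pxt.Error(f'{name} is not installed or not in PATH.')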
@@ -388,10 +386,9 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
         >>> duration = tbl.video.get_duration()
         >>> tbl.select(segment_paths=tbl.video.segment_video(duration=duration / 2 + 1)).collect()
     """
+    Env.get().require_binary('ffmpeg')
     if duration <= 0:
         raise pxt.Error(f'duration must be positive, got {duration}')
-    if not shutil.which('ffmpeg'):
-        raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use segment_video().')
 
     base_path = TempStore.create_path(extension='')
 
@@ -436,10 +433,9 @@ def concat_videos(videos: list[pxt.Video]) -> pxt.Video:
     Returns:
         A new video containing the merged videos.
     """
+    Env.get().require_binary('ffmpeg')
     if len(videos) == 0:
         raise pxt.Error('concat_videos(): empty argument list')
-    if not shutil.which('ffmpeg'):
-        raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use concat_videos().')
 
     # Check that all videos have the same resolution
     resolutions: list[tuple[int, int]] = []
@@ -528,6 +524,125 @@ def concat_videos(videos: list[pxt.Video]) -> pxt.Video:
         filelist_path.unlink()
 
 
+@pxt.udf
+def with_audio(
+    video: pxt.Video,
+    audio: pxt.Audio,
+    *,
+    video_start_time: float = 0.0,
+    video_duration: float | None = None,
+    audio_start_time: float = 0.0,
+    audio_duration: float | None = None,
+) -> pxt.Video:
+    """
+    Creates a new video that combines the video stream from `video` and the audio stream from `audio`.
+    The `start_time` and `duration` parameters can be used to select a specific time range from each input.
+    If the audio input (or selected time range) is longer than the video, the audio will be truncated.
+
+
+    __Requirements:__
+
+    - `ffmpeg` needs to be installed and in PATH
+
+    Args:
+        video: Input video.
+        audio: Input audio.
+        video_start_time: Start time in the video input (in seconds).
+        video_duration: Duration of video segment (in seconds). If None, uses the remainder of the video after
+            `video_start_time`. `video_duration` determines the duration of the output video.
+        audio_start_time: Start time in the audio input (in seconds).
+        audio_duration: Duration of audio segment (in seconds). If None, uses the remainder of the audio after
+            `audio_start_time`. If the audio is longer than the output video, it will be truncated.
+
+    Returns:
+        A new video file with the audio track added.
+
+    Examples:
+        Add background music to a video:
+
+        >>> tbl.select(tbl.video.with_audio(tbl.music_track)).collect()
+
+        Add audio starting 5 seconds into both files:
+
+        >>> tbl.select(
+        ...     tbl.video.with_audio(
+        ...         tbl.music_track,
+        ...         video_start_time=5.0,
+        ...         audio_start_time=5.0
+        ...     )
+        ... ).collect()
+
+        Use a 10-second clip from the middle of both files:
+
+        >>> tbl.select(
+        ...     tbl.video.with_audio(
+        ...         tbl.music_track,
+        ...         video_start_time=30.0,
+        ...         video_duration=10.0,
+        ...         audio_start_time=15.0,
+        ...         audio_duration=10.0
+        ...     )
+        ... ).collect()
+    """
+    Env.get().require_binary('ffmpeg')
+    if video_start_time < 0:
+        raise pxt.Error(f'video_offset must be non-negative, got {video_start_time}')
+    if audio_start_time < 0:
+        raise pxt.Error(f'audio_offset must be non-negative, got {audio_start_time}')
+    if video_duration is not None and video_duration <= 0:
+        raise pxt.Error(f'video_duration must be positive, got {video_duration}')
+    if audio_duration is not None and audio_duration <= 0:
+        raise pxt.Error(f'audio_duration must be positive, got {audio_duration}')
+
+    output_path = str(TempStore.create_path(extension='.mp4'))
+
+    cmd = ['ffmpeg']
+    if video_start_time > 0:
+        # fast seek, must precede -i
+        cmd.extend(['-ss', str(video_start_time)])
+    if video_duration is not None:
+        cmd.extend(['-t', str(video_duration)])
+    else:
+        video_duration = av_utils.get_video_duration(video)
+    cmd.extend(['-i', str(video)])
+
+    if audio_start_time > 0:
+        cmd.extend(['-ss', str(audio_start_time)])
+    if audio_duration is not None:
+        cmd.extend(['-t', str(audio_duration)])
+    cmd.extend(['-i', str(audio)])
+
+    cmd.extend(
+        [
+            '-map',
+            '0:v:0',  # video from first input
+            '-map',
+            '1:a:0',  # audio from second input
+            '-c:v',
+            'copy',  # avoid re-encoding
+            '-c:a',
+            'copy',  # avoid re-encoding
+            '-t',
+            str(video_duration),  # limit output duration to video duration
+            '-loglevel',
+            'error',  # only show errors
+            output_path,
+        ]
+    )
+
+    _logger.debug(f'with_audio(): {" ".join(cmd)}')
+
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        output_file = pathlib.Path(output_path)
+        if not output_file.exists() or output_file.stat().st_size == 0:
+            stderr_output = result.stderr.strip() if result.stderr is not None else ''
+            raise pxt.Error(f'ffmpeg failed to create output file for commandline: {" ".join(cmd)}\n{stderr_output}')
+        return output_path
+    except subprocess.CalledProcessError as e:
+        _handle_ffmpeg_error(e)
+
+
 @pxt.udf(is_method=True)
 def overlay_text(
     video: pxt.Video,
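For reference, with the default offsets and no explicit durations, the argv that `with_audio()` assembles reduces to roughly the following (file names and the probed duration are hypothetical):

    cmd = [
        'ffmpeg',
        '-i', 'input_video.mp4',
        '-i', 'music.mp3',
        '-map', '0:v:0', '-map', '1:a:0',
        '-c:v', 'copy', '-c:a', 'copy',
        '-t', '93.5',           # duration probed via av_utils.get_video_duration()
        '-loglevel', 'error',
        '/tmp/out.mp4',
    ]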
@@ -614,8 +729,7 @@ def overlay_text(
        ...     )
        ... ).collect()
    """
-    if not shutil.which('ffmpeg'):
-        raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use overlay_text().')
+    Env.get().require_binary('ffmpeg')
    if font_size <= 0:
        raise pxt.Error(f'font_size must be positive, got {font_size}')
    if opacity < 0.0 or opacity > 1.0:
pixeltable/functions/whisperx.py CHANGED
@@ -1,3 +1,5 @@
+"""WhisperX audio transcription and diarization functions."""
+
 from typing import TYPE_CHECKING, Any, Optional
 
 import numpy as np
pixeltable/functions/yolox.py CHANGED
@@ -1,3 +1,5 @@
+"""YOLOX object detection functions."""
+
 import logging
 from typing import TYPE_CHECKING
 
pixeltable/globals.py CHANGED
@@ -179,7 +179,7 @@ def create_table(
            'Unable to create a proper schema from supplied `source`. Please use appropriate `schema_overrides`.'
        )
 
-    table = Catalog.get().create_table(
+    table, was_created = Catalog.get().create_table(
        path_obj,
        schema,
        data_source.pxt_df if isinstance(data_source, DFTableDataConduit) else None,
@@ -189,7 +189,7 @@ def create_table(
        media_validation=media_validation_,
        num_retained_versions=num_retained_versions,
    )
-    if data_source is not None and not is_direct_df:
+    if was_created and data_source is not None and not is_direct_df:
        fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
        table.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
 
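Returning a `was_created` flag lets `create_table()` skip the `source` insert when it hands back an already-existing table. A hedged usage sketch; the table name, file name, and `if_exists` behavior shown here are assumptions for illustration:

    import pixeltable as pxt

    tbl = pxt.create_table('films', source='films.csv', if_exists='ignore')  # creates and loads
    tbl = pxt.create_table('films', source='films.csv', if_exists='ignore')  # reuses; no second insert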
@@ -397,47 +397,66 @@ def create_snapshot(
    )
 
 
-def create_replica(
-    destination: str,
+def publish(
    source: str | catalog.Table,
+    destination_uri: str,
    bucket_name: str | None = None,
    access: Literal['public', 'private'] = 'private',
-) -> Optional[catalog.Table]:
+) -> None:
    """
-    Create a replica of a table. Can be used either to create a remote replica of a local table, or to create a local
-    replica of a remote table. A given table can have at most one replica per Pixeltable instance.
+    Publishes a replica of a local Pixeltable table to Pixeltable cloud. A given table can be published to at most one
+    URI per Pixeltable cloud database.
 
    Args:
-        destination: Path where the replica will be created. Can be either a local path such as `'my_dir.my_table'`, or
-            a remote URI such as `'pxt://username/mydir.my_table'`.
-        source: Path to the source table, or (if the source table is a local table) a handle to the source table.
-        bucket_name: The name of the pixeltable cloud-registered bucket to use to store replica's data.
-            If no `bucket_name` is provided, the default Pixeltable storage bucket will be used.
+        source: Path or table handle of the local table to be published.
+        destination_uri: Remote URI where the replica will be published, such as `'pxt://org_name/my_dir/my_table'`.
+        bucket_name: The name of the bucket to use to store replica's data. The bucket must be registered with
+            Pixeltable cloud. If no `bucket_name` is provided, the default storage bucket for the destination
+            database will be used.
        access: Access control for the replica.
 
            - `'public'`: Anyone can access this replica.
-            - `'private'`: Only the owner can access.
+            - `'private'`: Only the host organization can access.
    """
-    remote_dest = destination.startswith('pxt://')
-    remote_source = isinstance(source, str) and source.startswith('pxt://')
-    if remote_dest == remote_source:
-        raise excs.Error('Exactly one of `destination` or `source` must be a remote URI.')
-
-    if remote_dest:
-        if isinstance(source, str):
-            source = get_table(source)
-        share.push_replica(destination, source, bucket_name, access)
-        return None
-    else:
-        assert isinstance(source, str)
-        return share.pull_replica(destination, source)
+    if not destination_uri.startswith('pxt://'):
+        raise excs.Error("`destination_uri` must be a remote Pixeltable URI with the prefix 'pxt://'")
 
+    if isinstance(source, str):
+        source = get_table(source)
 
-def get_table(path: str) -> catalog.Table:
+    share.push_replica(destination_uri, source, bucket_name, access)
+
+
+def replicate(remote_uri: str, local_path: str) -> catalog.Table:
+    """
+    Retrieve a replica from Pixeltable cloud as a local table. This will create a full local copy of the replica in a
+    way that preserves the table structure of the original source data. Once replicated, the local table can be
+    queried offline just as any other Pixeltable table.
+
+    Args:
+        remote_uri: Remote URI of the table to be replicated, such as `'pxt://org_name/my_dir/my_table'`.
+        local_path: Local table path where the replica will be created, such as `'my_new_dir.my_new_tbl'`. It can be
+            the same or different from the cloud table name.
+
+    Returns:
+        A handle to the newly created local replica table.
+    """
+    if not remote_uri.startswith('pxt://'):
+        raise excs.Error("`remote_uri` must be a remote Pixeltable URI with the prefix 'pxt://'")
+
+    return share.pull_replica(local_path, remote_uri)
+
+
+def get_table(path: str, if_not_exists: Literal['error', 'ignore'] = 'error') -> catalog.Table | None:
    """Get a handle to an existing table, view, or snapshot.
 
    Args:
        path: Path to the table.
+        if_not_exists: Directive regarding how to handle if the path does not exist.
+            Must be one of the following:
+
+            - `'error'`: raise an error
+            - `'ignore'`: do nothing and return `None`
 
    Returns:
        A handle to the [`Table`][pixeltable.Table].
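`create_replica()` is split into `publish()` (local to cloud) and `replicate()` (cloud to local). A sketch of the round trip, assuming both are re-exported at the package level like the other globals (organization and paths are hypothetical):

    import pixeltable as pxt

    tbl = pxt.get_table('my_dir.my_table')
    pxt.publish(tbl, 'pxt://my_org/my_dir/my_table')                        # local -> cloud
    replica = pxt.replicate('pxt://my_org/my_dir/my_table', 'local.copy')   # cloud -> local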
@@ -462,8 +481,9 @@ def get_table(path: str) -> catalog.Table:
 
        >>> tbl = pxt.get_table('my_table:722')
    """
+    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
    path_obj = catalog.Path.parse(path, allow_versioned_path=True)
-    tbl = Catalog.get().get_table(path_obj)
+    tbl = Catalog.get().get_table(path_obj, if_not_exists_)
    return tbl
 
 
@@ -498,10 +518,11 @@ def move(path: str, new_path: str) -> None:
 def drop_table(
    table: str | catalog.Table, force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error'
 ) -> None:
-    """Drop a table, view, or snapshot.
+    """Drop a table, view, snapshot, or replica.
 
    Args:
-        table: Fully qualified name, or handle, of the table to be dropped.
+        table: Fully qualified name or table handle of the table to be dropped; or a remote URI of a cloud replica to
+            be deleted.
        force: If `True`, will also drop all views and sub-views of this table.
        if_not_exists: Directive regarding how to handle if the path does not exist.
            Must be one of the following:
@@ -541,13 +562,17 @@ def drop_table(
        assert isinstance(table, str)
        tbl_path = table
 
+    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
+
    if tbl_path.startswith('pxt://'):
        # Remote table
+        if force:
+            raise excs.Error('Cannot use `force=True` with a cloud replica URI.')
+        # TODO: Handle if_not_exists properly
        share.delete_replica(tbl_path)
    else:
        # Local table
        path_obj = catalog.Path.parse(tbl_path)
-        if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
        Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
 
 
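With these changes, `drop_table()` also accepts a cloud replica URI (with `force=True` rejected), and `get_table()` can return `None` instead of raising. A short sketch; all paths are hypothetical:

    import pixeltable as pxt

    pxt.drop_table('pxt://my_org/my_dir/my_table')                        # deletes the cloud replica
    tbl = pxt.get_table('my_dir.maybe_missing', if_not_exists='ignore')
    if tbl is None:
        print('no such table')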
pixeltable/io/__init__.py CHANGED
@@ -1,3 +1,4 @@
+"""Functions for importing and exporting Pixeltable data."""
 # ruff: noqa: F401
 
 from .datarows import import_json, import_rows
pixeltable/io/globals.py CHANGED
@@ -103,25 +103,26 @@ def create_label_studio_project(
        column of the table `tbl`:
 
        >>> config = \"\"\"
-        <View>
-        <Video name="video_obj" value="$video_col"/>
-        <Choices name="video-category" toName="video" showInLine="true">
-        <Choice value="city"/>
-        <Choice value="food"/>
-        <Choice value="sports"/>
-        </Choices>
-        </View>\"\"\"
-        create_label_studio_project(tbl, config)
+        ... <View>
+        ... <Video name="video_obj" value="$video_col"/>
+        ... <Choices name="video-category" toName="video" showInLine="true">
+        ... <Choice value="city"/>
+        ... <Choice value="food"/>
+        ... <Choice value="sports"/>
+        ... </Choices>
+        ... </View>
+        ... \"\"\"
+        >>> create_label_studio_project(tbl, config)
 
        Create a Label Studio project with the same configuration, using `media_import_method='url'`,
        whose media are stored in an S3 bucket:
 
        >>> create_label_studio_project(
-        tbl,
-        config,
-        media_import_method='url',
-        s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
-        )
+        ...     tbl,
+        ...     config,
+        ...     media_import_method='url',
+        ...     s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
+        ... )
    """
    Env.get().require_package('label_studio_sdk')
 
@@ -204,7 +205,7 @@ def export_images_as_fo_dataset(
        Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
        labels from `tbl.classifications`:
 
-        >>> export_as_fiftyone(
+        >>> export_images_as_fo_dataset(
        ...     tbl,
        ...     tbl.image,
        ...     classifications=tbl.classifications
pixeltable/io/table_data_conduit.py CHANGED
@@ -10,7 +10,9 @@ from dataclasses import dataclass, field, fields
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, cast
 
+import numpy as np
 import pandas as pd
+import PIL
 from pyarrow.parquet import ParquetDataset
 
 import pixeltable as pxt
@@ -325,7 +327,11 @@ class JsonTableDataConduit(TableDataConduit):
 
 
 class HFTableDataConduit(TableDataConduit):
-    hf_ds: datasets.Dataset | datasets.DatasetDict | None = None
+    """
+    TODO:
+    - use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
+    """
+
    column_name_for_split: Optional[str] = None
    categorical_features: dict[str, dict[int, str]]
    dataset_dict: dict[str, datasets.Dataset] = None
@@ -339,9 +345,19 @@
        import datasets
 
        assert isinstance(tds.source, (datasets.Dataset, datasets.DatasetDict))
-        t.hf_ds = tds.source
        if 'column_name_for_split' in t.extra_fields:
            t.column_name_for_split = t.extra_fields['column_name_for_split']
+
+        # make sure we get numpy arrays for arrays, not Python lists
+        source = tds.source.with_format(type='numpy')
+        if isinstance(source, datasets.Dataset):
+            # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
+            raw_name = source.split._name
+            split_name = raw_name.split('[')[0] if raw_name is not None else None
+            t.dataset_dict = {split_name: source}
+        else:
+            assert isinstance(source, datasets.DatasetDict)
+            t.dataset_dict = source
        return t
 
    @classmethod
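`with_format(type='numpy')` is a standard Hugging Face `datasets` call that makes rows come back as numpy scalars and arrays, which the `_translate_val()` method added in the following hunks then normalizes. A small standalone example (the dataset name is illustrative):

    from datasets import load_dataset

    ds = load_dataset('mnist', split='train[:100]').with_format(type='numpy')
    row = ds[0]   # e.g. {'image': numpy array, 'label': numpy integer scalar}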
@@ -361,7 +377,7 @@
        if self.source_column_map is None:
            if self.src_schema_overrides is None:
                self.src_schema_overrides = {}
-            self.hf_schema_source = _get_hf_schema(self.hf_ds)
+            self.hf_schema_source = _get_hf_schema(self.source)
            self.src_schema = huggingface_schema_to_pxt_schema(
                self.hf_schema_source, self.src_schema_overrides, self.src_pk
            )
@@ -396,15 +412,6 @@
    def prepare_insert(self) -> None:
        import datasets
 
-        if isinstance(self.source, datasets.Dataset):
-            # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-            raw_name = self.source.split._name
-            split_name = raw_name.split('[')[0] if raw_name is not None else None
-            self.dataset_dict = {split_name: self.source}
-        else:
-            assert isinstance(self.source, datasets.DatasetDict)
-            self.dataset_dict = self.source
-
        # extract all class labels from the dataset to translate category ints to strings
        self.categorical_features = {
            feature_name: feature_type.names
@@ -415,26 +422,44 @@
        self.source_column_map = {}
        self.check_source_columns_are_insertable(self.hf_schema_source.keys())
 
-    def _translate_row(self, row: dict[str, Any], split_name: str) -> dict[str, Any]:
+    def _translate_row(self, row: dict[str, Any], split_name: str, features: datasets.Features) -> dict[str, Any]:
        output_row: dict[str, Any] = {}
        for col_name, val in row.items():
            # translate category ints to strings
            new_val = self.categorical_features[col_name][val] if col_name in self.categorical_features else val
            mapped_col_name = self.source_column_map.get(col_name, col_name)
 
-            # Convert values to the appropriate type if needed
-            try:
-                checked_val = self.pxt_schema[mapped_col_name].create_literal(new_val)
-            except TypeError as e:
-                msg = str(e)
-                raise excs.Error(f'Error in column {col_name}: {msg[0].lower() + msg[1:]}\nRow: {row}') from e
-            output_row[mapped_col_name] = checked_val
+            new_val = self._translate_val(new_val, features[col_name])
+            output_row[mapped_col_name] = new_val
 
        # add split name to output row
        if self.column_name_for_split is not None:
            output_row[self.column_name_for_split] = split_name
        return output_row
 
+    def _translate_val(self, val: Any, feature: datasets.Feature) -> Any:
+        """Convert numpy scalars to Python types and images to PIL.Image.Image"""
+        import datasets
+
+        if isinstance(feature, datasets.Value):
+            if isinstance(val, (np.generic, np.ndarray)):
+                # a scalar, which we want as a standard Python type
+                assert np.ndim(val) == 0
+                return val.item()
+            else:
+                # a standard Python object
+                return val
+        elif isinstance(feature, datasets.Sequence):
+            assert np.ndim(val) > 0
+            return val
+        elif isinstance(feature, datasets.Image):
+            return PIL.Image.fromarray(val)
+        elif isinstance(feature, dict):
+            assert isinstance(val, dict)
+            return {k: self._translate_val(v, feature[k]) for k, v in val.items()}
+        else:
+            return val
+
    def valid_row_batch(self) -> Iterator[RowData]:
        for split_name, split_dataset in self.dataset_dict.items():
            num_batches = split_dataset.size_in_bytes / self._K_BATCH_SIZE_BYTES
@@ -443,7 +468,7 @@
 
            batch = []
            for row in split_dataset:
-                batch.append(self._translate_row(row, split_name))
+                batch.append(self._translate_row(row, split_name, split_dataset.features))
                if len(batch) >= tuples_per_batch:
                    yield batch
                    batch = []
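A quick illustration of what `_translate_val()` produces for the main feature kinds (sketch only):

    import numpy as np
    import PIL.Image

    np.int64(7).item()                                      # datasets.Value    -> plain Python 7
    np.array([1.0, 2.0])                                    # datasets.Sequence -> kept as an ndarray
    PIL.Image.fromarray(np.zeros((32, 32, 3), np.uint8))    # datasets.Image    -> PIL.Image.Image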
pixeltable/iterators/__init__.py CHANGED
@@ -1,3 +1,4 @@
+"""Iterators for splitting media and documents into components."""
 # ruff: noqa: F401
 
 from .audio import AudioSplitter
pixeltable/metadata/__init__.py CHANGED
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 _logger = logging.getLogger('pixeltable')
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 40
+VERSION = 41
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_40.py ADDED
@@ -0,0 +1,73 @@
+import logging
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=40)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_modifier=__table_modifier)
+
+
+def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+    store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+    store_name = f'{store_prefix}_{tbl_id.hex}'
+
+    # Get the list of column names that need _cellmd columns
+    _logger.info(f'Checking table {orig_table_md["name"]} ({store_name})')
+    col_ids = find_target_columns(orig_table_md)
+    if len(col_ids) == 0:
+        _logger.info(f'No Array or Json columns found in table {orig_table_md["name"]}. Skipping migration.')
+        return
+
+    # Check which columns already exist in the table
+    check_columns_sql = sql.text(f"""
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = '{store_name}'
+    """)
+    existing_columns = {row[0] for row in conn.execute(check_columns_sql)}
+
+    # Filter out columns that already have _cellmd
+    col_ids_to_add: list[int] = []
+    for col_id in col_ids:
+        cellmd_col = f'col_{col_id}_cellmd'
+        if cellmd_col not in existing_columns:
+            col_ids_to_add.append(col_id)
+        else:
+            _logger.info(f'Column {cellmd_col} already exists in table {orig_table_md["name"]}. Skipping.')
+
+    if len(col_ids_to_add) == 0:
+        _logger.info(f'All _cellmd columns already exist in table {orig_table_md["name"]}. Skipping migration.')
+        return
+
+    return add_cellmd_columns(conn, store_name, col_ids_to_add)
+
+
+def find_target_columns(table_md: dict) -> list[int]:
+    """Returns ids of stored array and json columns"""
+    result: list[int] = []
+    for col_id, col_md in table_md['column_md'].items():
+        col_type = col_md['col_type']
+        classname = col_type.get('_classname')
+        if classname in ['ArrayType', 'JsonType'] and col_md.get('stored', False):
+            result.append(col_id)
+            _logger.info(f'Found {classname} column: {col_id}')
+    return result
+
+
+def add_cellmd_columns(conn: sql.Connection, store_name: str, col_ids: list[int]) -> None:
+    try:
+        # Add new columns
+        add_column_str = ', '.join(f'ADD COLUMN col_{col_id}_cellmd JSONB DEFAULT NULL' for col_id in col_ids)
+        add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+        conn.execute(add_column_sql)
+        _logger.info(f'Added columns to {store_name}: {", ".join(f"col_{col_id}_cellmd" for col_id in col_ids)}')
+    except sql.exc.SQLAlchemyError as e:
+        _logger.error(f'Migration for table {store_name} failed: {e}')
+        raise
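For a table whose stored Array/Json columns have ids 3 and 5, `add_cellmd_columns()` builds a single statement along these lines (the store name is hypothetical):

    col_ids = [3, 5]
    store_name = 'tbl_0123456789abcdef0123456789abcdef'  # hypothetical
    stmt = f'ALTER TABLE {store_name} ' + ', '.join(
        f'ADD COLUMN col_{c}_cellmd JSONB DEFAULT NULL' for c in col_ids
    )
    # -> ALTER TABLE tbl_0123... ADD COLUMN col_3_cellmd JSONB DEFAULT NULL, ADD COLUMN col_5_cellmd JSONB DEFAULT NULL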
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    41: 'Cellmd columns for array and json columns',
     40: 'Convert error property columns to cellmd columns',
     39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',