pixeltable 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (68) hide show
  1. pixeltable/__init__.py +4 -0
  2. pixeltable/catalog/catalog.py +125 -63
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +4 -0
  6. pixeltable/catalog/table_version.py +174 -117
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/catalog/table_version_path.py +0 -11
  9. pixeltable/catalog/view.py +6 -0
  10. pixeltable/config.py +7 -0
  11. pixeltable/dataframe.py +10 -5
  12. pixeltable/env.py +56 -19
  13. pixeltable/exec/__init__.py +2 -0
  14. pixeltable/exec/cell_materialization_node.py +231 -0
  15. pixeltable/exec/cell_reconstruction_node.py +135 -0
  16. pixeltable/exec/exec_node.py +1 -1
  17. pixeltable/exec/expr_eval/evaluators.py +1 -0
  18. pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
  19. pixeltable/exec/expr_eval/globals.py +2 -0
  20. pixeltable/exec/globals.py +32 -0
  21. pixeltable/exec/object_store_save_node.py +1 -4
  22. pixeltable/exec/row_update_node.py +16 -9
  23. pixeltable/exec/sql_node.py +107 -14
  24. pixeltable/exprs/__init__.py +1 -1
  25. pixeltable/exprs/arithmetic_expr.py +23 -18
  26. pixeltable/exprs/column_property_ref.py +10 -10
  27. pixeltable/exprs/column_ref.py +2 -2
  28. pixeltable/exprs/data_row.py +106 -37
  29. pixeltable/exprs/expr.py +9 -0
  30. pixeltable/exprs/expr_set.py +14 -7
  31. pixeltable/exprs/inline_expr.py +2 -19
  32. pixeltable/exprs/json_path.py +45 -12
  33. pixeltable/exprs/row_builder.py +54 -22
  34. pixeltable/functions/__init__.py +1 -0
  35. pixeltable/functions/bedrock.py +7 -0
  36. pixeltable/functions/deepseek.py +11 -4
  37. pixeltable/functions/llama_cpp.py +7 -0
  38. pixeltable/functions/math.py +1 -1
  39. pixeltable/functions/ollama.py +7 -0
  40. pixeltable/functions/openai.py +4 -4
  41. pixeltable/functions/openrouter.py +143 -0
  42. pixeltable/functions/video.py +110 -28
  43. pixeltable/globals.py +10 -4
  44. pixeltable/io/globals.py +18 -17
  45. pixeltable/io/parquet.py +1 -1
  46. pixeltable/io/table_data_conduit.py +47 -22
  47. pixeltable/iterators/document.py +61 -23
  48. pixeltable/iterators/video.py +126 -53
  49. pixeltable/metadata/__init__.py +1 -1
  50. pixeltable/metadata/converters/convert_40.py +73 -0
  51. pixeltable/metadata/notes.py +1 -0
  52. pixeltable/plan.py +175 -46
  53. pixeltable/share/packager.py +155 -26
  54. pixeltable/store.py +2 -3
  55. pixeltable/type_system.py +5 -3
  56. pixeltable/utils/arrow.py +6 -6
  57. pixeltable/utils/av.py +65 -0
  58. pixeltable/utils/console_output.py +4 -1
  59. pixeltable/utils/exception_handler.py +5 -28
  60. pixeltable/utils/image.py +7 -0
  61. pixeltable/utils/misc.py +5 -0
  62. pixeltable/utils/object_stores.py +16 -1
  63. pixeltable/utils/s3_store.py +44 -11
  64. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
  65. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
  66. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
  67. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
  68. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
@@ -1,3 +1,10 @@
1
+ """
2
+ Pixeltable UDFs for Ollama local models.
3
+
4
+ Provides integration with Ollama for running large language models locally,
5
+ including chat completions and embeddings.
6
+ """
7
+
1
8
  from typing import TYPE_CHECKING, Optional
2
9
 
3
10
  import numpy as np
@@ -395,10 +395,10 @@ async def chat_completions(
395
395
  of the table `tbl`:
396
396
 
397
397
  >>> messages = [
398
- {'role': 'system', 'content': 'You are a helpful assistant.'},
399
- {'role': 'user', 'content': tbl.prompt}
400
- ]
401
- tbl.add_computed_column(response=chat_completions(messages, model='gpt-4o-mini'))
398
+ ... {'role': 'system', 'content': 'You are a helpful assistant.'},
399
+ ... {'role': 'user', 'content': tbl.prompt}
400
+ ... ]
401
+ >>> tbl.add_computed_column(response=chat_completions(messages, model='gpt-4o-mini'))
402
402
  """
403
403
  if model_kwargs is None:
404
404
  model_kwargs = {}
@@ -0,0 +1,143 @@
1
+ """
2
+ Pixeltable UDFs that wrap the OpenRouter API.
3
+
4
+ OpenRouter provides a unified interface to multiple LLM providers. In order to use it,
5
+ you must first sign up at https://openrouter.ai, create an API key, and configure it
6
+ as described in the Working with OpenRouter tutorial.
7
+ """
8
+
9
+ from typing import TYPE_CHECKING, Any, Optional
10
+
11
+ import pixeltable as pxt
12
+ from pixeltable.env import Env, register_client
13
+ from pixeltable.utils.code import local_public_names
14
+
15
+ if TYPE_CHECKING:
16
+ import openai
17
+
18
+
19
+ @register_client('openrouter')
20
+ def _(api_key: str, site_url: Optional[str] = None, app_name: Optional[str] = None) -> 'openai.AsyncOpenAI':
21
+ import openai
22
+
23
+ # Create default headers for OpenRouter
24
+ default_headers: dict[str, Any] = {}
25
+ if site_url:
26
+ default_headers['HTTP-Referer'] = site_url
27
+ if app_name:
28
+ default_headers['X-Title'] = app_name
29
+
30
+ return openai.AsyncOpenAI(base_url='https://openrouter.ai/api/v1', api_key=api_key, default_headers=default_headers)
31
+
32
+
33
+ def _openrouter_client() -> 'openai.AsyncOpenAI':
34
+ return Env.get().get_client('openrouter')
35
+
36
+
37
+ @pxt.udf(resource_pool='request-rate:openrouter')
38
+ async def chat_completions(
39
+ messages: list,
40
+ *,
41
+ model: str,
42
+ model_kwargs: Optional[dict[str, Any]] = None,
43
+ tools: Optional[list[dict[str, Any]]] = None,
44
+ tool_choice: Optional[dict[str, Any]] = None,
45
+ provider: Optional[dict[str, Any]] = None,
46
+ transforms: Optional[list[str]] = None,
47
+ ) -> dict:
48
+ """
49
+ Chat Completion API via OpenRouter.
50
+
51
+ OpenRouter provides access to multiple LLM providers through a unified API.
52
+ For additional details, see: <https://openrouter.ai/docs>
53
+
54
+ Supported models can be found at: <https://openrouter.ai/models>
55
+
56
+ Request throttling:
57
+ Applies the rate limit set in the config (section `openrouter`, key `rate_limit`). If no rate
58
+ limit is configured, uses a default of 600 RPM.
59
+
60
+ __Requirements:__
61
+
62
+ - `pip install openai`
63
+
64
+ Args:
65
+ messages: A list of messages comprising the conversation so far.
66
+ model: ID of the model to use (e.g., 'anthropic/claude-3.5-sonnet', 'openai/gpt-4').
67
+ model_kwargs: Additional OpenAI-compatible parameters.
68
+ tools: List of tools available to the model.
69
+ tool_choice: Controls which (if any) tool is called by the model.
70
+ provider: OpenRouter-specific provider preferences (e.g., {'order': ['Anthropic', 'OpenAI']}).
71
+ transforms: List of message transforms to apply (e.g., ['middle-out']).
72
+
73
+ Returns:
74
+ A dictionary containing the response in OpenAI format.
75
+
76
+ Examples:
77
+ Basic chat completion:
78
+
79
+ >>> messages = [{'role': 'user', 'content': tbl.prompt}]
80
+ ... tbl.add_computed_column(
81
+ ... response=chat_completions(
82
+ ... messages,
83
+ ... model='anthropic/claude-3.5-sonnet'
84
+ ... )
85
+ ... )
86
+
87
+ With provider routing:
88
+
89
+ >>> tbl.add_computed_column(
90
+ ... response=chat_completions(
91
+ ... messages,
92
+ ... model='anthropic/claude-3.5-sonnet',
93
+ ... provider={'require_parameters': True, 'order': ['Anthropic']}
94
+ ... )
95
+ ... )
96
+
97
+ With transforms:
98
+
99
+ >>> tbl.add_computed_column(
100
+ ... response=chat_completions(
101
+ ... messages,
102
+ ... model='openai/gpt-4',
103
+ ... transforms=['middle-out'] # Optimize for long contexts
104
+ ... )
105
+ ... )
106
+ """
107
+ if model_kwargs is None:
108
+ model_kwargs = {}
109
+
110
+ Env.get().require_package('openai')
111
+
112
+ # Handle tools if provided
113
+ if tools is not None:
114
+ model_kwargs['tools'] = [{'type': 'function', 'function': tool} for tool in tools]
115
+
116
+ if tool_choice is not None:
117
+ if tool_choice['auto']:
118
+ model_kwargs['tool_choice'] = 'auto'
119
+ elif tool_choice['required']:
120
+ model_kwargs['tool_choice'] = 'required'
121
+ else:
122
+ assert tool_choice['tool'] is not None
123
+ model_kwargs['tool_choice'] = {'type': 'function', 'function': {'name': tool_choice['tool']}}
124
+
125
+ # Prepare OpenRouter-specific parameters for extra_body
126
+ extra_body: dict[str, Any] = {}
127
+ if provider is not None:
128
+ extra_body['provider'] = provider
129
+ if transforms is not None:
130
+ extra_body['transforms'] = transforms
131
+
132
+ # Make the API call
133
+ result = await _openrouter_client().chat.completions.create(
134
+ messages=messages, model=model, extra_body=extra_body if extra_body else None, **model_kwargs
135
+ )
136
+ return result.model_dump()
137
+
138
+
139
+ __all__ = local_public_names(__name__)
140
+
141
+
142
+ def __dir__() -> list[str]:
143
+ return __all__
@@ -2,10 +2,11 @@
2
2
  Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
3
3
  """
4
4
 
5
+ import glob
5
6
  import logging
6
7
  import pathlib
7
8
  import subprocess
8
- from typing import Literal, NoReturn
9
+ from typing import Any, Literal, NoReturn
9
10
 
10
11
  import av
11
12
  import av.stream
@@ -358,9 +359,17 @@ def clip(
358
359
 
359
360
 
360
361
  @pxt.udf(is_method=True)
361
- def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
362
+ def segment_video(
363
+ video: pxt.Video,
364
+ *,
365
+ duration: float | None = None,
366
+ segment_times: list[float] | None = None,
367
+ mode: Literal['fast', 'accurate'] = 'fast',
368
+ video_encoder: str | None = None,
369
+ video_encoder_args: dict[str, Any] | None = None,
370
+ ) -> list[str]:
362
371
  """
363
- Split a video into fixed-size segments.
372
+ Split a video into segments.
364
373
 
365
374
  __Requirements:__
366
375
 
@@ -368,7 +377,19 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
368
377
 
369
378
  Args:
370
379
  video: Input video file to segment
371
- duration: Approximate duration of each segment (in seconds).
380
+ duration: Duration of each segment (in seconds). For `mode='fast'`, this is approximate;
381
+ for `mode='accurate'`, segments will have exact durations. Cannot be specified together with
382
+ `segment_times`.
383
+ segment_times: List of timestamps (in seconds) in the video where segments should be split. Note that these are not
384
+ segment durations. If all segment times are less than the duration of the video, produces exactly
385
+ `len(segment_times) + 1` segments. Cannot be empty or be specified together with `duration`.
386
+ mode: Segmentation mode:
387
+
388
+ - `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
389
+ - `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
390
+ video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
391
+ Only available for `mode='accurate'`.
392
+ video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
372
393
 
373
394
  Returns:
374
395
  List of file paths for the generated video segments.
@@ -377,45 +398,106 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
377
398
  pxt.Error: If the video is missing timing information.
378
399
 
379
400
  Examples:
380
- Split a video at 1 minute intervals
401
+ Split a video at 1 minute intervals using fast mode:
381
402
 
382
403
  >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()
383
404
 
405
+ Split video into exact 10-second segments with accurate mode, using the libx264 encoder with a CRF of 23 and
406
+ slow preset (for smaller output files):
407
+
408
+ >>> tbl.select(
409
+ ... segment_paths=tbl.video.segment_video(
410
+ ... duration=10,
411
+ ... mode='accurate',
412
+ ... video_encoder='libx264',
413
+ ... video_encoder_args={'crf': 23, 'preset': 'slow'}
414
+ ... )
415
+ ... ).collect()
416
+
384
417
  Split video into two parts at the midpoint:
385
418
 
386
419
  >>> duration = tbl.video.get_duration()
387
- >>> tbl.select(segment_paths=tbl.video.segment_video(duration=duration / 2 + 1)).collect()
420
+ >>> tbl.select(segment_paths=tbl.video.segment_video(segment_times=[duration / 2])).collect()
388
421
  """
389
422
  Env.get().require_binary('ffmpeg')
390
- if duration <= 0:
423
+ if duration is not None and segment_times is not None:
424
+ raise pxt.Error('duration and segment_times cannot both be specified')
425
+ if duration is not None and duration <= 0:
391
426
  raise pxt.Error(f'duration must be positive, got {duration}')
427
+ if segment_times is not None and len(segment_times) == 0:
428
+ raise pxt.Error('segment_times cannot be empty')
429
+ if mode == 'fast':
430
+ if video_encoder is not None:
431
+ raise pxt.Error("video_encoder is not supported for mode='fast'")
432
+ if video_encoder_args is not None:
433
+ raise pxt.Error("video_encoder_args is not supported for mode='fast'")
392
434
 
393
435
  base_path = TempStore.create_path(extension='')
394
436
 
395
- # we extract consecutive clips instead of running ffmpeg -f segment, which is inexplicably much slower
396
- start_time = 0.0
397
- result: list[str] = []
398
- try:
399
- while True:
400
- segment_path = f'{base_path}_segment_{len(result)}.mp4'
401
- cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, duration)
437
+ output_paths: list[str] = []
438
+ if mode == 'accurate':
439
+ # Use ffmpeg -f segment for accurate segmentation with re-encoding
440
+ output_pattern = f'{base_path}_segment_%04d.mp4'
441
+ cmd = av_utils.ffmpeg_segment_cmd(
442
+ str(video),
443
+ output_pattern,
444
+ segment_duration=duration,
445
+ segment_times=segment_times,
446
+ video_encoder=video_encoder,
447
+ video_encoder_args=video_encoder_args,
448
+ )
402
449
 
450
+ try:
403
451
  _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
404
- segment_duration = av_utils.get_video_duration(segment_path)
405
- if segment_duration == 0.0:
406
- # we're done
407
- pathlib.Path(segment_path).unlink()
408
- return result
409
- result.append(segment_path)
410
- start_time += segment_duration # use the actual segment duration here, it won't match duration exactly
452
+ output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
453
+ # TODO: is this actually an error?
454
+ # if len(output_paths) == 0:
455
+ # stderr_output = result.stderr.strip() if result.stderr is not None else ''
456
+ # raise pxt.Error(
457
+ # f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
458
+ # )
459
+ return output_paths
460
+
461
+ except subprocess.CalledProcessError as e:
462
+ _handle_ffmpeg_error(e)
411
463
 
412
- return result
413
-
414
- except subprocess.CalledProcessError as e:
415
- # clean up partial results
416
- for segment_path in result:
417
- pathlib.Path(segment_path).unlink()
418
- _handle_ffmpeg_error(e)
464
+ else:
465
+ # Fast mode: extract consecutive clips using stream copy (no re-encoding)
466
+ # This is faster but can only split at keyframes, leading to approximate durations
467
+ start_time = 0.0
468
+ segment_idx = 0
469
+ try:
470
+ while True:
471
+ target_duration: float | None
472
+ if duration is not None:
473
+ target_duration = duration
474
+ elif segment_idx < len(segment_times):
475
+ target_duration = segment_times[segment_idx] - start_time
476
+ else:
477
+ target_duration = None # the rest
478
+ segment_path = f'{base_path}_segment_{len(output_paths)}.mp4'
479
+ cmd = av_utils.ffmpeg_clip_cmd(str(video), segment_path, start_time, target_duration)
480
+
481
+ _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
482
+ segment_duration = av_utils.get_video_duration(segment_path)
483
+ if segment_duration == 0.0:
484
+ # we're done
485
+ pathlib.Path(segment_path).unlink()
486
+ return output_paths
487
+ output_paths.append(segment_path)
488
+ start_time += segment_duration # use the actual segment duration here, it won't match duration exactly
489
+
490
+ segment_idx += 1
491
+ if segment_times is not None and segment_idx > len(segment_times):
492
+ break
493
+
494
+ return output_paths
495
+
496
+ except subprocess.CalledProcessError as e:
497
+ # clean up partial results
498
+ for segment_path in output_paths:
499
+ pathlib.Path(segment_path).unlink()
500
+ _handle_ffmpeg_error(e)
419
501
 
420
502
 
421
503
  @pxt.udf(is_method=True)
pixeltable/globals.py CHANGED
@@ -179,7 +179,7 @@ def create_table(
179
179
  'Unable to create a proper schema from supplied `source`. Please use appropriate `schema_overrides`.'
180
180
  )
181
181
 
182
- table = Catalog.get().create_table(
182
+ table, was_created = Catalog.get().create_table(
183
183
  path_obj,
184
184
  schema,
185
185
  data_source.pxt_df if isinstance(data_source, DFTableDataConduit) else None,
@@ -189,7 +189,7 @@ def create_table(
189
189
  media_validation=media_validation_,
190
190
  num_retained_versions=num_retained_versions,
191
191
  )
192
- if data_source is not None and not is_direct_df:
192
+ if was_created and data_source is not None and not is_direct_df:
193
193
  fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
194
194
  table.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
195
195
 
@@ -447,11 +447,16 @@ def replicate(remote_uri: str, local_path: str) -> catalog.Table:
447
447
  return share.pull_replica(local_path, remote_uri)
448
448
 
449
449
 
450
- def get_table(path: str) -> catalog.Table:
450
+ def get_table(path: str, if_not_exists: Literal['error', 'ignore'] = 'error') -> catalog.Table | None:
451
451
  """Get a handle to an existing table, view, or snapshot.
452
452
 
453
453
  Args:
454
454
  path: Path to the table.
455
+ if_not_exists: Directive specifying how to handle the case where the path does not exist.
456
+ Must be one of the following:
457
+
458
+ - `'error'`: raise an error
459
+ - `'ignore'`: do nothing and return `None`
455
460
 
456
461
  Returns:
457
462
  A handle to the [`Table`][pixeltable.Table].
@@ -476,8 +481,9 @@ def get_table(path: str) -> catalog.Table:
476
481
 
477
482
  >>> tbl = pxt.get_table('my_table:722')
478
483
  """
484
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
479
485
  path_obj = catalog.Path.parse(path, allow_versioned_path=True)
480
- tbl = Catalog.get().get_table(path_obj)
486
+ tbl = Catalog.get().get_table(path_obj, if_not_exists_)
481
487
  return tbl
482
488
 
483
489
 
pixeltable/io/globals.py CHANGED
@@ -103,25 +103,26 @@ def create_label_studio_project(
103
103
  column of the table `tbl`:
104
104
 
105
105
  >>> config = \"\"\"
106
- <View>
107
- <Video name="video_obj" value="$video_col"/>
108
- <Choices name="video-category" toName="video" showInLine="true">
109
- <Choice value="city"/>
110
- <Choice value="food"/>
111
- <Choice value="sports"/>
112
- </Choices>
113
- </View>\"\"\"
114
- create_label_studio_project(tbl, config)
106
+ ... <View>
107
+ ... <Video name="video_obj" value="$video_col"/>
108
+ ... <Choices name="video-category" toName="video" showInLine="true">
109
+ ... <Choice value="city"/>
110
+ ... <Choice value="food"/>
111
+ ... <Choice value="sports"/>
112
+ ... </Choices>
113
+ ... </View>
114
+ ... \"\"\"
115
+ >>> create_label_studio_project(tbl, config)
115
116
 
116
117
  Create a Label Studio project with the same configuration, using `media_import_method='url'`,
117
118
  whose media are stored in an S3 bucket:
118
119
 
119
120
  >>> create_label_studio_project(
120
- tbl,
121
- config,
122
- media_import_method='url',
123
- s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
124
- )
121
+ ... tbl,
122
+ ... config,
123
+ ... media_import_method='url',
124
+ ... s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
125
+ ... )
125
126
  """
126
127
  Env.get().require_package('label_studio_sdk')
127
128
 
@@ -151,7 +152,7 @@ def export_images_as_fo_dataset(
151
152
  (or expression) containing image data, along with optional additional columns containing labels. Currently, only
152
153
  classification and detection labels are supported.
153
154
 
154
- The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
155
+ The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial contains a
155
156
  fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.
156
157
 
157
158
  Images in the dataset that already exist on disk will be exported directly, in whatever format they
@@ -204,13 +205,13 @@ def export_images_as_fo_dataset(
204
205
  Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
205
206
  labels from `tbl.classifications`:
206
207
 
207
- >>> export_as_fiftyone(
208
+ >>> export_images_as_fo_dataset(
208
209
  ... tbl,
209
210
  ... tbl.image,
210
211
  ... classifications=tbl.classifications
211
212
  ... )
212
213
 
213
- See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
214
+ See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial
214
215
  for a fully worked example.
215
216
  """
216
217
  Env.get().require_package('fiftyone')
pixeltable/io/parquet.py CHANGED
@@ -62,7 +62,7 @@ def export_parquet(
62
62
  with Catalog.get().begin_xact(for_write=False):
63
63
  for record_batch in to_record_batches(df, partition_size_bytes):
64
64
  output_path = temp_path / f'part-{batch_num:05d}.parquet'
65
- arrow_tbl = pa.Table.from_batches([record_batch]) # type: ignore
65
+ arrow_tbl = pa.Table.from_batches([record_batch])
66
66
  pa.parquet.write_table(arrow_tbl, str(output_path))
67
67
  batch_num += 1
68
68
 
@@ -10,7 +10,9 @@ from dataclasses import dataclass, field, fields
10
10
  from pathlib import Path
11
11
  from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, cast
12
12
 
13
+ import numpy as np
13
14
  import pandas as pd
15
+ import PIL
14
16
  from pyarrow.parquet import ParquetDataset
15
17
 
16
18
  import pixeltable as pxt
@@ -325,7 +327,11 @@ class JsonTableDataConduit(TableDataConduit):
325
327
 
326
328
 
327
329
  class HFTableDataConduit(TableDataConduit):
328
- hf_ds: datasets.Dataset | datasets.DatasetDict | None = None
330
+ """
331
+ TODO:
332
+ - use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
333
+ """
334
+
329
335
  column_name_for_split: Optional[str] = None
330
336
  categorical_features: dict[str, dict[int, str]]
331
337
  dataset_dict: dict[str, datasets.Dataset] = None
@@ -339,9 +345,19 @@ class HFTableDataConduit(TableDataConduit):
339
345
  import datasets
340
346
 
341
347
  assert isinstance(tds.source, (datasets.Dataset, datasets.DatasetDict))
342
- t.hf_ds = tds.source
343
348
  if 'column_name_for_split' in t.extra_fields:
344
349
  t.column_name_for_split = t.extra_fields['column_name_for_split']
350
+
351
+ # make sure we get numpy arrays for arrays, not Python lists
352
+ source = tds.source.with_format(type='numpy')
353
+ if isinstance(source, datasets.Dataset):
354
+ # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
355
+ raw_name = source.split._name
356
+ split_name = raw_name.split('[')[0] if raw_name is not None else None
357
+ t.dataset_dict = {split_name: source}
358
+ else:
359
+ assert isinstance(source, datasets.DatasetDict)
360
+ t.dataset_dict = source
345
361
  return t
346
362
 
347
363
  @classmethod
@@ -361,7 +377,7 @@ class HFTableDataConduit(TableDataConduit):
361
377
  if self.source_column_map is None:
362
378
  if self.src_schema_overrides is None:
363
379
  self.src_schema_overrides = {}
364
- self.hf_schema_source = _get_hf_schema(self.hf_ds)
380
+ self.hf_schema_source = _get_hf_schema(self.source)
365
381
  self.src_schema = huggingface_schema_to_pxt_schema(
366
382
  self.hf_schema_source, self.src_schema_overrides, self.src_pk
367
383
  )
@@ -396,15 +412,6 @@ class HFTableDataConduit(TableDataConduit):
396
412
  def prepare_insert(self) -> None:
397
413
  import datasets
398
414
 
399
- if isinstance(self.source, datasets.Dataset):
400
- # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
401
- raw_name = self.source.split._name
402
- split_name = raw_name.split('[')[0] if raw_name is not None else None
403
- self.dataset_dict = {split_name: self.source}
404
- else:
405
- assert isinstance(self.source, datasets.DatasetDict)
406
- self.dataset_dict = self.source
407
-
408
415
  # extract all class labels from the dataset to translate category ints to strings
409
416
  self.categorical_features = {
410
417
  feature_name: feature_type.names
@@ -415,26 +422,44 @@ class HFTableDataConduit(TableDataConduit):
415
422
  self.source_column_map = {}
416
423
  self.check_source_columns_are_insertable(self.hf_schema_source.keys())
417
424
 
418
- def _translate_row(self, row: dict[str, Any], split_name: str) -> dict[str, Any]:
425
+ def _translate_row(self, row: dict[str, Any], split_name: str, features: datasets.Features) -> dict[str, Any]:
419
426
  output_row: dict[str, Any] = {}
420
427
  for col_name, val in row.items():
421
428
  # translate category ints to strings
422
429
  new_val = self.categorical_features[col_name][val] if col_name in self.categorical_features else val
423
430
  mapped_col_name = self.source_column_map.get(col_name, col_name)
424
431
 
425
- # Convert values to the appropriate type if needed
426
- try:
427
- checked_val = self.pxt_schema[mapped_col_name].create_literal(new_val)
428
- except TypeError as e:
429
- msg = str(e)
430
- raise excs.Error(f'Error in column {col_name}: {msg[0].lower() + msg[1:]}\nRow: {row}') from e
431
- output_row[mapped_col_name] = checked_val
432
+ new_val = self._translate_val(new_val, features[col_name])
433
+ output_row[mapped_col_name] = new_val
432
434
 
433
435
  # add split name to output row
434
436
  if self.column_name_for_split is not None:
435
437
  output_row[self.column_name_for_split] = split_name
436
438
  return output_row
437
439
 
440
+ def _translate_val(self, val: Any, feature: datasets.Feature) -> Any:
441
+ """Convert numpy scalars to Python types and images to PIL.Image.Image"""
442
+ import datasets
443
+
444
+ if isinstance(feature, datasets.Value):
445
+ if isinstance(val, (np.generic, np.ndarray)):
446
+ # a scalar, which we want as a standard Python type
447
+ assert np.ndim(val) == 0
448
+ return val.item()
449
+ else:
450
+ # a standard Python object
451
+ return val
452
+ elif isinstance(feature, datasets.Sequence):
453
+ assert np.ndim(val) > 0
454
+ return val
455
+ elif isinstance(feature, datasets.Image):
456
+ return PIL.Image.fromarray(val)
457
+ elif isinstance(feature, dict):
458
+ assert isinstance(val, dict)
459
+ return {k: self._translate_val(v, feature[k]) for k, v in val.items()}
460
+ else:
461
+ return val
462
+
438
463
  def valid_row_batch(self) -> Iterator[RowData]:
439
464
  for split_name, split_dataset in self.dataset_dict.items():
440
465
  num_batches = split_dataset.size_in_bytes / self._K_BATCH_SIZE_BYTES
@@ -443,7 +468,7 @@ class HFTableDataConduit(TableDataConduit):
443
468
 
444
469
  batch = []
445
470
  for row in split_dataset:
446
- batch.append(self._translate_row(row, split_name))
471
+ batch.append(self._translate_row(row, split_name, split_dataset.features))
447
472
  if len(batch) >= tuples_per_batch:
448
473
  yield batch
449
474
  batch = []
@@ -503,7 +528,7 @@ class ParquetTableDataConduit(TableDataConduit):
503
528
  from pixeltable.utils.arrow import iter_tuples2
504
529
 
505
530
  try:
506
- for fragment in self.pq_ds.fragments: # type: ignore[attr-defined]
531
+ for fragment in self.pq_ds.fragments:
507
532
  for batch in fragment.to_batches():
508
533
  dict_batch = list(iter_tuples2(batch, self.source_column_map, self.pxt_schema))
509
534
  self.total_rows += len(dict_batch)