PyPI - pixeltable - Versions diffs - 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

pixeltable: 0.2.26 (py3-none-any.whl) → 0.5.7 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (245)

pixeltable/__init__.py +83 -19
pixeltable/_query.py +1444 -0
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +7 -4
pixeltable/catalog/catalog.py +2394 -119
pixeltable/catalog/column.py +225 -104
pixeltable/catalog/dir.py +38 -9
pixeltable/catalog/globals.py +53 -34
pixeltable/catalog/insertable_table.py +265 -115
pixeltable/catalog/path.py +80 -17
pixeltable/catalog/schema_object.py +28 -43
pixeltable/catalog/table.py +1270 -677
pixeltable/catalog/table_metadata.py +103 -0
pixeltable/catalog/table_version.py +1270 -751
pixeltable/catalog/table_version_handle.py +109 -0
pixeltable/catalog/table_version_path.py +137 -42
pixeltable/catalog/tbl_ops.py +53 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +251 -134
pixeltable/config.py +215 -0
pixeltable/env.py +736 -285
pixeltable/exceptions.py +26 -2
pixeltable/exec/__init__.py +7 -2
pixeltable/exec/aggregation_node.py +39 -21
pixeltable/exec/cache_prefetch_node.py +87 -109
pixeltable/exec/cell_materialization_node.py +268 -0
pixeltable/exec/cell_reconstruction_node.py +168 -0
pixeltable/exec/component_iteration_node.py +25 -28
pixeltable/exec/data_row_batch.py +11 -46
pixeltable/exec/exec_context.py +26 -11
pixeltable/exec/exec_node.py +35 -27
pixeltable/exec/expr_eval/__init__.py +3 -0
pixeltable/exec/expr_eval/evaluators.py +365 -0
pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
pixeltable/exec/expr_eval/globals.py +200 -0
pixeltable/exec/expr_eval/row_buffer.py +74 -0
pixeltable/exec/expr_eval/schedulers.py +413 -0
pixeltable/exec/globals.py +35 -0
pixeltable/exec/in_memory_data_node.py +35 -27
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +44 -29
pixeltable/exec/sql_node.py +414 -115
pixeltable/exprs/__init__.py +8 -5
pixeltable/exprs/arithmetic_expr.py +79 -45
pixeltable/exprs/array_slice.py +5 -5
pixeltable/exprs/column_property_ref.py +40 -26
pixeltable/exprs/column_ref.py +254 -61
pixeltable/exprs/comparison.py +14 -9
pixeltable/exprs/compound_predicate.py +9 -10
pixeltable/exprs/data_row.py +213 -72
pixeltable/exprs/expr.py +270 -104
pixeltable/exprs/expr_dict.py +6 -5
pixeltable/exprs/expr_set.py +20 -11
pixeltable/exprs/function_call.py +383 -284
pixeltable/exprs/globals.py +18 -5
pixeltable/exprs/in_predicate.py +7 -7
pixeltable/exprs/inline_expr.py +37 -37
pixeltable/exprs/is_null.py +8 -4
pixeltable/exprs/json_mapper.py +120 -54
pixeltable/exprs/json_path.py +90 -60
pixeltable/exprs/literal.py +61 -16
pixeltable/exprs/method_ref.py +7 -6
pixeltable/exprs/object_ref.py +19 -8
pixeltable/exprs/row_builder.py +238 -75
pixeltable/exprs/rowid_ref.py +53 -15
pixeltable/exprs/similarity_expr.py +65 -50
pixeltable/exprs/sql_element_cache.py +5 -5
pixeltable/exprs/string_op.py +107 -0
pixeltable/exprs/type_cast.py +25 -13
pixeltable/exprs/variable.py +2 -2
pixeltable/func/__init__.py +9 -5
pixeltable/func/aggregate_function.py +197 -92
pixeltable/func/callable_function.py +119 -35
pixeltable/func/expr_template_function.py +101 -48
pixeltable/func/function.py +375 -62
pixeltable/func/function_registry.py +20 -19
pixeltable/func/globals.py +6 -5
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +151 -35
pixeltable/func/signature.py +178 -49
pixeltable/func/tools.py +164 -0
pixeltable/func/udf.py +176 -53
pixeltable/functions/__init__.py +44 -4
pixeltable/functions/anthropic.py +226 -47
pixeltable/functions/audio.py +148 -11
pixeltable/functions/bedrock.py +137 -0
pixeltable/functions/date.py +188 -0
pixeltable/functions/deepseek.py +113 -0
pixeltable/functions/document.py +81 -0
pixeltable/functions/fal.py +76 -0
pixeltable/functions/fireworks.py +72 -20
pixeltable/functions/gemini.py +249 -0
pixeltable/functions/globals.py +208 -53
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1088 -95
pixeltable/functions/image.py +155 -84
pixeltable/functions/json.py +8 -11
pixeltable/functions/llama_cpp.py +31 -19
pixeltable/functions/math.py +169 -0
pixeltable/functions/mistralai.py +50 -75
pixeltable/functions/net.py +70 -0
pixeltable/functions/ollama.py +29 -36
pixeltable/functions/openai.py +548 -160
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +15 -14
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +310 -85
pixeltable/functions/timestamp.py +37 -19
pixeltable/functions/together.py +77 -120
pixeltable/functions/twelvelabs.py +188 -0
pixeltable/functions/util.py +7 -2
pixeltable/functions/uuid.py +30 -0
pixeltable/functions/video.py +1528 -117
pixeltable/functions/vision.py +26 -26
pixeltable/functions/voyageai.py +289 -0
pixeltable/functions/whisper.py +19 -10
pixeltable/functions/whisperx.py +179 -0
pixeltable/functions/yolox.py +112 -0
pixeltable/globals.py +716 -236
pixeltable/index/__init__.py +3 -1
pixeltable/index/base.py +17 -21
pixeltable/index/btree.py +32 -22
pixeltable/index/embedding_index.py +155 -92
pixeltable/io/__init__.py +12 -7
pixeltable/io/datarows.py +140 -0
pixeltable/io/external_store.py +83 -125
pixeltable/io/fiftyone.py +24 -33
pixeltable/io/globals.py +47 -182
pixeltable/io/hf_datasets.py +96 -127
pixeltable/io/label_studio.py +171 -156
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +136 -115
pixeltable/io/parquet.py +40 -153
pixeltable/io/table_data_conduit.py +702 -0
pixeltable/io/utils.py +100 -0
pixeltable/iterators/__init__.py +8 -4
pixeltable/iterators/audio.py +207 -0
pixeltable/iterators/base.py +9 -3
pixeltable/iterators/document.py +144 -87
pixeltable/iterators/image.py +17 -38
pixeltable/iterators/string.py +15 -12
pixeltable/iterators/video.py +523 -127
pixeltable/metadata/__init__.py +33 -8
pixeltable/metadata/converters/convert_10.py +2 -3
pixeltable/metadata/converters/convert_13.py +2 -2
pixeltable/metadata/converters/convert_15.py +15 -11
pixeltable/metadata/converters/convert_16.py +4 -5
pixeltable/metadata/converters/convert_17.py +4 -5
pixeltable/metadata/converters/convert_18.py +4 -6
pixeltable/metadata/converters/convert_19.py +6 -9
pixeltable/metadata/converters/convert_20.py +3 -6
pixeltable/metadata/converters/convert_21.py +6 -8
pixeltable/metadata/converters/convert_22.py +3 -2
pixeltable/metadata/converters/convert_23.py +33 -0
pixeltable/metadata/converters/convert_24.py +55 -0
pixeltable/metadata/converters/convert_25.py +19 -0
pixeltable/metadata/converters/convert_26.py +23 -0
pixeltable/metadata/converters/convert_27.py +29 -0
pixeltable/metadata/converters/convert_28.py +13 -0
pixeltable/metadata/converters/convert_29.py +110 -0
pixeltable/metadata/converters/convert_30.py +63 -0
pixeltable/metadata/converters/convert_31.py +11 -0
pixeltable/metadata/converters/convert_32.py +15 -0
pixeltable/metadata/converters/convert_33.py +17 -0
pixeltable/metadata/converters/convert_34.py +21 -0
pixeltable/metadata/converters/convert_35.py +9 -0
pixeltable/metadata/converters/convert_36.py +38 -0
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/convert_41.py +12 -0
pixeltable/metadata/converters/convert_42.py +9 -0
pixeltable/metadata/converters/convert_43.py +44 -0
pixeltable/metadata/converters/util.py +44 -18
pixeltable/metadata/notes.py +21 -0
pixeltable/metadata/schema.py +185 -42
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +616 -225
pixeltable/share/__init__.py +3 -0
pixeltable/share/packager.py +797 -0
pixeltable/share/protocol/__init__.py +33 -0
pixeltable/share/protocol/common.py +165 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +119 -0
pixeltable/share/publish.py +349 -0
pixeltable/store.py +398 -232
pixeltable/type_system.py +730 -267
pixeltable/utils/__init__.py +40 -0
pixeltable/utils/arrow.py +201 -29
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +346 -0
pixeltable/utils/coco.py +26 -27
pixeltable/utils/code.py +4 -4
pixeltable/utils/console_output.py +46 -0
pixeltable/utils/coroutine.py +24 -0
pixeltable/utils/dbms.py +92 -0
pixeltable/utils/description_helper.py +11 -12
pixeltable/utils/documents.py +60 -61
pixeltable/utils/exception_handler.py +36 -0
pixeltable/utils/filecache.py +38 -22
pixeltable/utils/formatter.py +88 -51
pixeltable/utils/gcs_store.py +295 -0
pixeltable/utils/http.py +133 -0
pixeltable/utils/http_server.py +14 -13
pixeltable/utils/iceberg.py +13 -0
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +90 -0
pixeltable/utils/local_store.py +322 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +573 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +20 -20
pixeltable/utils/s3_store.py +527 -0
pixeltable/utils/sql.py +32 -5
pixeltable/utils/system.py +30 -0
pixeltable/utils/transactional_directory.py +4 -3
pixeltable-0.5.7.dist-info/METADATA +579 -0
pixeltable-0.5.7.dist-info/RECORD +227 -0
{pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/catalog/named_function.py +0 -36
pixeltable/catalog/path_dict.py +0 -141
pixeltable/dataframe.py +0 -894
pixeltable/exec/expr_eval_node.py +0 -232
pixeltable/ext/__init__.py +0 -14
pixeltable/ext/functions/__init__.py +0 -8
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/ext/functions/yolox.py +0 -157
pixeltable/tool/create_test_db_dump.py +0 -311
pixeltable/tool/create_test_video.py +0 -81
pixeltable/tool/doc_plugins/griffe.py +0 -50
pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
pixeltable/tool/embed_udf.py +0 -9
pixeltable/tool/mypy_plugin.py +0 -55
pixeltable/utils/media_store.py +0 -76
pixeltable/utils/s3.py +0 -16
pixeltable-0.2.26.dist-info/METADATA +0 -400
pixeltable-0.2.26.dist-info/RECORD +0 -156
pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
{pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0

pixeltable/io/globals.py CHANGED Viewed

@@ -1,31 +1,33 @@
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any, Literal
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import Table, exprs
+from pixeltable.catalog.update_status import UpdateStatus
 from pixeltable.env import Env
-from pixeltable.io.external_store import SyncStatus
 if TYPE_CHECKING:
     import fiftyone as fo  # type: ignore[import-untyped]
 def create_label_studio_project(
-        t: Table,
-        label_config: str,
-        name: Optional[str] = None,
-        title: Optional[str] = None,
-        media_import_method: Literal['post', 'file', 'url'] = 'post',
-        col_mapping: Optional[dict[str, str]] = None,
-        sync_immediately: bool = True,
-        s3_configuration: Optional[dict[str, Any]] = None,
-        **kwargs: Any
-) -> SyncStatus:
+    t: Table,
+    label_config: str,
+    name: str | None = None,
+    title: str | None = None,
+    media_import_method: Literal['post', 'file', 'url'] = 'post',
+    col_mapping: dict[str, str] | None = None,
+    sync_immediately: bool = True,
+    s3_configuration: dict[str, Any] | None = None,
+    **kwargs: Any,
+) -> UpdateStatus:
     """
     Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
     - A tutorial notebook with fully worked examples can be found here:
-      [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
+      [Using Label Studio for Annotations with Pixeltable](https://docs.pixeltable.com/notebooks/integrations/using-label-studio-with-pixeltable)
     The required parameter `label_config` specifies the Label Studio project configuration,
     in XML format, as described in the Label Studio documentation. The linked project will
@@ -86,53 +88,48 @@ def create_label_studio_project(
             parameters of the Label Studio `connect_s3_import_storage` method, as described in the
             [Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
             `bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
-            Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`). If a title is not
-            specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`. All other parameters use their Label
-            Studio defaults.
+            Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`).
+            If a title is not specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`.
+            All other parameters use their Label Studio defaults.
         kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
             Studio SDK, as described in the
             [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
     Returns:
-        A `SyncStatus` representing the status of any synchronization operations that occurred.
+        An `UpdateStatus` representing the status of any synchronization operations that occurred.
     Examples:
-        Create a Label Studio project whose tasks correspond to videos stored in the `video_col` column of the table `tbl`:
+        Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
+        column of the table `tbl`:
         >>> config = \"\"\"
-            <View>
-                <Video name="video_obj" value="$video_col"/>
-                <Choices name="video-category" toName="video" showInLine="true">
-                    <Choice value="city"/>
-                    <Choice value="food"/>
-                    <Choice value="sports"/>
-                </Choices>
-            </View>\"\"\"
-            create_label_studio_project(tbl, config)
+        ... <View>
+        ...     <Video name="video_obj" value="$video_col"/>
+        ...     <Choices name="video-category" toName="video" showInLine="true">
+        ...         <Choice value="city"/>
+        ...         <Choice value="food"/>
+        ...         <Choice value="sports"/>
+        ...     </Choices>
+        ... </View>
+        ... \"\"\"
+        >>> create_label_studio_project(tbl, config)
         Create a Label Studio project with the same configuration, using `media_import_method='url'`,
         whose media are stored in an S3 bucket:
         >>> create_label_studio_project(
-                tbl,
-                config,
-                media_import_method='url',
-                s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
-            )
+        ...     tbl,
+        ...     config,
+        ...     media_import_method='url',
+        ...     s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
+        ... )
     """
     Env.get().require_package('label_studio_sdk')
     from pixeltable.io.label_studio import LabelStudioProject
     ls_project = LabelStudioProject.create(
-        t,
-        label_config,
-        name,
-        title,
-        media_import_method,
-        col_mapping,
-        s3_configuration,
-        **kwargs
+        t, label_config, name, title, media_import_method, col_mapping, s3_configuration, **kwargs
     )
     # Link the project to `t`, and sync if appropriate.
@@ -140,159 +137,27 @@ def create_label_studio_project(
     if sync_immediately:
         return t.sync()
     else:
-        return SyncStatus.empty()
-def import_rows(
-    tbl_path: str,
-    rows: list[dict[str, Any]],
-    *,
-    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
-    primary_key: Optional[Union[str, list[str]]] = None,
-    num_retained_versions: int = 10,
-    comment: str = ''
-    ) -> Table:
-    """
-    Creates a new base table from a list of dictionaries. The dictionaries must be of the
-    form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
-    supplied data, using the most specific type that can represent all the values in a column.
-    If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
-    Pixeltable will force the specified column to the specified type (and will not attempt any type inference
-    for that column).
-    All column types of the new table will be nullable unless explicitly specified as non-nullable in
-    `schema_overrides`.
-    Args:
-        tbl_path: The qualified name of the table to create.
-        rows: The list of dictionaries to import.
-        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
-            as described above.
-        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
-        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
-        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
-    Returns:
-        A handle to the newly created [`Table`][pixeltable.Table].
-    """
-    if schema_overrides is None:
-        schema_overrides = {}
-    schema: dict[str, pxt.ColumnType] = {}
-    cols_with_nones: set[str] = set()
-    for n, row in enumerate(rows):
-        for col_name, value in row.items():
-            if col_name in schema_overrides:
-                # We do the insertion here; this will ensure that the column order matches the order
-                # in which the column names are encountered in the input data, even if `schema_overrides`
-                # is specified.
-                if col_name not in schema:
-                    schema[col_name] = schema_overrides[col_name]
-            elif value is not None:
-                # If `key` is not in `schema_overrides`, then we infer its type from the data.
-                # The column type will always be nullable by default.
-                col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
-                if col_type is None:
-                    raise excs.Error(f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}')
-                if col_name not in schema:
-                    schema[col_name] = col_type
-                else:
-                    supertype = schema[col_name].supertype(col_type)
-                    if supertype is None:
-                        raise excs.Error(
-                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
-                            'Consider specifying the type explicitly in `schema_overrides`.'
-                        )
-                    schema[col_name] = supertype
-            else:
-                cols_with_nones.add(col_name)
-    extraneous_keys = schema_overrides.keys() - schema.keys()
-    if len(extraneous_keys) > 0:
-        raise excs.Error(f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}')
-    entirely_none_cols = cols_with_nones - schema.keys()
-    if len(entirely_none_cols) > 0:
-        # A column can only end up in `entirely_null_cols` if it was not in `schema_overrides` and
-        # was not encountered in any row with a non-None value.
-        raise excs.Error(
-            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
-            'Consider specifying the type(s) explicitly in `schema_overrides`.'
-        )
-    t = pxt.create_table(tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
-    t.insert(rows)
-    return t
-def import_json(
-    tbl_path: str,
-    filepath_or_url: str,
-    *,
-    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
-    primary_key: Optional[Union[str, list[str]]] = None,
-    num_retained_versions: int = 10,
-    comment: str = '',
-    **kwargs: Any
-) -> Table:
-    """
-    Creates a new base table from a JSON file. This is a convenience method and is
-    equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
-    is the contents of the specified `filepath_or_url`.
-    Args:
-        tbl_path: The name of the table to create.
-        filepath_or_url: The path or URL of the JSON file.
-        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
-            (see [`import_rows()`][pixeltable.io.import_rows]).
-        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
-        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
-        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
-        kwargs: Additional keyword arguments to pass to `json.loads`.
-    Returns:
-        A handle to the newly created [`Table`][pixeltable.Table].
-    """
-    import json
-    import urllib.parse
-    import urllib.request
-    # TODO Consolidate this logic with other places where files/URLs are parsed
-    parsed = urllib.parse.urlparse(filepath_or_url)
-    if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
-        # local file path
-        if len(parsed.scheme) <= 1:
-            filepath = filepath_or_url
-        else:
-            filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-        with open(filepath) as fp:
-            contents = fp.read()
-    else:
-        # URL
-        contents = urllib.request.urlopen(filepath_or_url).read()
-    data = json.loads(contents, **kwargs)
-    return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+        return UpdateStatus()
 def export_images_as_fo_dataset(
     tbl: pxt.Table,
     images: exprs.Expr,
     image_format: str = 'webp',
-    classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
-    detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
+    classifications: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
+    detections: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
 ) -> 'fo.Dataset':
     """
     Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
     (or expression) containing image data, along with optional additional columns containing labels. Currently, only
     classification and detection labels are supported.
-    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
+    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial contains a
     fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.
     Images in the dataset that already exist on disk will be exported directly, in whatever format they
     are stored in. Images that are not already on disk (such as frames extracted using a
-    [`FrameIterator`][pixeltable.iterators.FrameIterator]) will first be written to disk in the specified
+    [`frame_iterator`][pixeltable.functions.video.frame_iterator]) will first be written to disk in the specified
     `image_format`.
     The label parameters accept one or more sets of labels of each type. If a single `Expr` is provided, then it will
@@ -340,13 +205,13 @@ def export_images_as_fo_dataset(
         Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
         labels from `tbl.classifications`:
-        >>> export_as_fiftyone(
+        >>> export_images_as_fo_dataset(
         ...     tbl,
         ...     tbl.image,
         ...     classifications=tbl.classifications
         ... )
-        See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
+        See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial
         for a fully worked example.
     """
     Env.get().require_package('fiftyone')
@@ -358,6 +223,6 @@ def export_images_as_fo_dataset(
     if not images.col_type.is_image_type():
         raise excs.Error(f'`images` must be an expression of type Image (got {images.col_type._to_base_str()})')
-    return fo.Dataset.from_importer(PxtImageDatasetImporter(
-        tbl, images, image_format, classifications=classifications, detections=detections
-    ))
+    return fo.Dataset.from_importer(
+        PxtImageDatasetImporter(tbl, images, image_format, classifications=classifications, detections=detections)
+    )

pixeltable/io/hf_datasets.py CHANGED Viewed

@@ -1,190 +1,159 @@
 from __future__ import annotations
-import logging
-import math
-import random
 import typing
-from typing import Union, Optional, Any
+from typing import Any
 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exceptions as excs
 if typing.TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]
-_logger = logging.getLogger(__name__)
-# use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
-# The primary goal is to bound memory use, regardless of dataset size.
-# Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
-_K_BATCH_SIZE_BYTES = 100_000_000
-# note, there are many more types. we allow overrides in the schema_override parameter
+# note, there are many more types. we allow overrides in the schema_overrides parameter
 # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
 # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
 _hf_to_pxt: dict[str, ts.ColumnType] = {
-    'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
-    'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
+    'int8': ts.IntType(nullable=True),
+    'int16': ts.IntType(nullable=True),
+    'int32': ts.IntType(nullable=True),
+    'int64': ts.IntType(nullable=True),
+    'uint8': ts.IntType(nullable=True),
+    'uint16': ts.IntType(nullable=True),
+    'uint32': ts.IntType(nullable=True),
+    'uint64': ts.IntType(nullable=True),
+    'float16': ts.FloatType(nullable=True),
     'float32': ts.FloatType(nullable=True),
+    'float64': ts.FloatType(nullable=True),
     'string': ts.StringType(nullable=True),
+    'large_string': ts.StringType(nullable=True),
     'timestamp[s]': ts.TimestampType(nullable=True),
     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
+    'timestamp[us]': ts.TimestampType(nullable=True),
+    'timestamp[ns]': ts.TimestampType(nullable=True),
+    'date32': ts.DateType(nullable=True),
+    'date64': ts.DateType(nullable=True),
 }
-def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
+def _to_pixeltable_type(feature_type: Any, nullable: bool) -> ts.ColumnType | None:
     """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
     import datasets
     if isinstance(feature_type, datasets.ClassLabel):
         # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
-        return ts.StringType(nullable=True)
+        return ts.StringType(nullable=nullable)
     elif isinstance(feature_type, datasets.Value):
         # example: Value(dtype='int64', id=None)
-        return _hf_to_pxt.get(feature_type.dtype, None)
-    elif isinstance(feature_type, datasets.Sequence):
+        pt = _hf_to_pxt.get(feature_type.dtype, None)
+        return pt.copy(nullable=nullable) if pt is not None else None
+    elif isinstance(feature_type, (datasets.Sequence, datasets.LargeList)):
         # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
-        dtype = _to_pixeltable_type(feature_type.feature)
-        length = feature_type.length if feature_type.length != -1 else None
-        return ts.ArrayType(shape=(length,), dtype=dtype)
+        dtype = _to_pixeltable_type(feature_type.feature, nullable)
+        if dtype is None:
+            return None
+        if dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type():
+            length = feature_type.length if feature_type.length != -1 else None
+            return ts.ArrayType(shape=(length,), dtype=dtype, nullable=nullable)
+        else:
+            # Sequence of dicts must be cast as Json
+            return ts.JsonType(nullable=nullable)
     elif isinstance(feature_type, datasets.Image):
-        return ts.ImageType(nullable=True)
+        return ts.ImageType(nullable=nullable)
+    elif isinstance(feature_type, datasets.Audio):
+        return ts.AudioType(nullable=nullable)
+    elif isinstance(feature_type, datasets.Video):
+        return ts.VideoType(nullable=nullable)
+    elif isinstance(feature_type, (datasets.Array2D, datasets.Array3D, datasets.Array4D, datasets.Array5D)):
+        # Multi-dimensional arrays with fixed shape and dtype
+        inner_dtype = _hf_to_pxt.get(feature_type.dtype, None)
+        if inner_dtype is None:
+            return None
+        return ts.ArrayType(shape=feature_type.shape, dtype=inner_dtype, nullable=nullable)
+    elif isinstance(feature_type, (datasets.Translation, datasets.TranslationVariableLanguages)):
+        # Translation types are dict-like structures
+        return ts.JsonType(nullable=nullable)
+    elif isinstance(feature_type, (list, dict)):
+        return ts.JsonType(nullable=nullable)
     else:
         return None
-def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
+def _get_hf_schema(dataset: datasets.Dataset | datasets.DatasetDict) -> datasets.Features:
     """Get the schema of a huggingface dataset as a dictionary."""
     import datasets
-    first_dataset = dataset if isinstance(dataset, datasets.Dataset) else next(iter(dataset.values()))
+    first_dataset = (
+        dataset if isinstance(dataset, (datasets.Dataset, datasets.IterableDataset)) else next(iter(dataset.values()))
+    )
     return first_dataset.features
-def huggingface_schema_to_pixeltable_schema(
-    hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
-) -> dict[str, Optional[ts.ColumnType]]:
+def huggingface_schema_to_pxt_schema(
+    hf_schema: datasets.Features, schema_overrides: dict[str, Any], primary_key: list[str]
+) -> dict[str, ts.ColumnType | None]:
     """Generate a pixeltable schema from a huggingface dataset schema.
     Columns without a known mapping are mapped to None
     """
-    hf_schema = _get_hf_schema(hf_dataset)
     pixeltable_schema = {
-        column_name: _to_pixeltable_type(feature_type) for column_name, feature_type in hf_schema.items()
+        column_name: _to_pixeltable_type(feature_type, column_name not in primary_key)
+        if column_name not in schema_overrides
+        else schema_overrides[column_name]
+        for column_name, feature_type in hf_schema.items()
     }
     return pixeltable_schema
 def import_huggingface_dataset(
     table_path: str,
-    dataset: Union[datasets.Dataset, datasets.DatasetDict],
+    dataset: datasets.Dataset | datasets.DatasetDict | datasets.IterableDataset | datasets.IterableDatasetDict,
     *,
-    column_name_for_split: Optional[str] = None,
-    schema_overrides: Optional[dict[str, Any]] = None,
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     **kwargs: Any,
 ) -> pxt.Table:
-    """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
-        Requires `datasets` library to be installed.
+    """
+    Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
+    Requires `datasets` library to be installed.
+    HuggingFace feature types are mapped to Pixeltable column types as follows:
+    - `Value(bool)`: `Bool`<br/>
+      `Value(int*/uint*)`: `Int`<br/>
+      `Value(float*)`: `Float`<br/>
+      `Value(string/large_string)`: `String`<br/>
+      `Value(timestamp*)`: `Timestamp`<br/>
+      `Value(date*)`: `Date`
+    - `ClassLabel`: `String` (converted to label names)
+    - `Sequence`/`LargeList` of numeric or bool types: `Array`
+    - `Sequence`/`LargeList` of string: `Array`
+    - `Sequence`/`LargeList` of dicts: `Json`
+    - `Array2D`-`Array5D`: `Array` (preserves shape)
+    - `Image`: `Image`
+    - `Audio`: `Audio`
+    - `Video`: `Video`
+    - `Translation`/`TranslationVariableLanguages`: `Json`
     Args:
         table_path: Path to the table.
-        dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
-            or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
-            to insert into the table.
-        column_name_for_split: column name to use for split information. If None, no split information will be stored.
+        dataset: An instance of any of the Huggingface dataset classes:
+            [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset),
+            [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict),
+            [`datasets.IterableDataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.IterableDataset),
+            [`datasets.IterableDatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.IterableDatasetDict)
         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
-            name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
-            `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
-            Pixeltable identifiers).
+            name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
+            The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
+            they are valid Pixeltable identifiers).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
         kwargs: Additional arguments to pass to `create_table`.
+            An argument of `column_name_for_split` must be provided if the source is a DatasetDict.
+            The column with this name will contain the split information. If None, no split information will be stored.
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    import datasets
-    import pixeltable as pxt
-    if table_path in pxt.list_tables():
-        raise excs.Error(f'table {table_path} already exists')
-    if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
-        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-    if isinstance(dataset, datasets.Dataset):
-        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-        raw_name = dataset.split._name
-        split_name = raw_name.split('[')[0] if raw_name is not None else None
-        dataset_dict = {split_name: dataset}
-    else:
-        dataset_dict = dataset
-    pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
-    if schema_overrides is not None:
-        pixeltable_schema.update(schema_overrides)
-    if column_name_for_split is not None:
-        if column_name_for_split in pixeltable_schema:
-            raise excs.Error(
-                f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
-            )
-        pixeltable_schema[column_name_for_split] = ts.StringType(nullable=True)
-    for field, column_type in pixeltable_schema.items():
-        if column_type is None:
-            raise excs.Error(f'Could not infer pixeltable type for feature `{field}` in huggingface dataset')
-    if isinstance(dataset, datasets.Dataset):
-        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-        raw_name = dataset.split._name
-        split_name = raw_name.split('[')[0] if raw_name is not None else None
-        dataset_dict = {split_name: dataset}
-    elif isinstance(dataset, datasets.DatasetDict):
-        dataset_dict = dataset
-    else:
-        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-    # extract all class labels from the dataset to translate category ints to strings
-    hf_schema = _get_hf_schema(dataset)
-    categorical_features = {
-        feature_name: feature_type.names
-        for (feature_name, feature_type) in hf_schema.items()
-        if isinstance(feature_type, datasets.ClassLabel)
-    }
-    try:
-        # random tmp name
-        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, pixeltable_schema, **kwargs)
-        def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
-            output_row = row.copy()
-            # map all class labels to strings
-            for field, values in categorical_features.items():
-                output_row[field] = values[row[field]]
-            # add split name to row
-            if column_name_for_split is not None:
-                output_row[column_name_for_split] = split_name
-            return output_row
-        for split_name, split_dataset in dataset_dict.items():
-            num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
-            tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
-            assert tuples_per_batch > 0
-            batch = []
-            for row in split_dataset:
-                batch.append(_translate_row(row, split_name))
-                if len(batch) >= tuples_per_batch:
-                    tab.insert(batch)
-                    batch = []
-            # last batch
-            if len(batch) > 0:
-                tab.insert(batch)
-    except Exception as e:
-        _logger.error(f'Error while inserting dataset into table: {tmp_name}')
-        raise e
-    pxt.move(tmp_name, table_path)
-    return pxt.get_table(table_path)
+    return pxt.create_table(
+        table_path, source=dataset, schema_overrides=schema_overrides, primary_key=primary_key, extra_args=kwargs
+    )

pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

pixeltable 0.2.26py3-none-any.whl → 0.5.7py3-none-any.whl