pixeltable 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable has been flagged as potentially problematic.
- pixeltable/__init__.py +1 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +509 -103
- pixeltable/catalog/column.py +5 -0
- pixeltable/catalog/dir.py +15 -6
- pixeltable/catalog/globals.py +16 -0
- pixeltable/catalog/insertable_table.py +82 -41
- pixeltable/catalog/path.py +15 -0
- pixeltable/catalog/schema_object.py +7 -12
- pixeltable/catalog/table.py +81 -67
- pixeltable/catalog/table_version.py +23 -7
- pixeltable/catalog/view.py +9 -6
- pixeltable/env.py +15 -9
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exprs/__init__.py +2 -1
- pixeltable/exprs/arithmetic_expr.py +2 -0
- pixeltable/exprs/column_ref.py +38 -2
- pixeltable/exprs/expr.py +61 -12
- pixeltable/exprs/function_call.py +1 -4
- pixeltable/exprs/globals.py +12 -0
- pixeltable/exprs/json_mapper.py +4 -4
- pixeltable/exprs/json_path.py +10 -11
- pixeltable/exprs/similarity_expr.py +5 -20
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/ext/functions/yolox.py +21 -64
- pixeltable/func/callable_function.py +5 -2
- pixeltable/func/query_template_function.py +6 -18
- pixeltable/func/tools.py +2 -2
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/globals.py +16 -5
- pixeltable/globals.py +172 -262
- pixeltable/io/__init__.py +3 -2
- pixeltable/io/datarows.py +138 -0
- pixeltable/io/external_store.py +8 -5
- pixeltable/io/globals.py +7 -160
- pixeltable/io/hf_datasets.py +21 -98
- pixeltable/io/pandas.py +29 -43
- pixeltable/io/parquet.py +17 -42
- pixeltable/io/table_data_conduit.py +569 -0
- pixeltable/io/utils.py +6 -21
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_30.py +50 -0
- pixeltable/metadata/converters/util.py +26 -1
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +3 -0
- pixeltable/utils/arrow.py +32 -7
- pixeltable/utils/coroutine.py +41 -0
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/METADATA +1 -1
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/RECORD +52 -47
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/entry_points.txt +0 -0
pixeltable/io/datarows.py
ADDED
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+from typing import Any, Iterable, Optional, Union
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs
+
+
+def _infer_schema_from_rows(
+    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
+) -> dict[str, pxt.ColumnType]:
+    schema: dict[str, pxt.ColumnType] = {}
+    cols_with_nones: set[str] = set()
+
+    for n, row in enumerate(rows):
+        for col_name, value in row.items():
+            if col_name in schema_overrides:
+                # We do the insertion here; this will ensure that the column order matches the order
+                # in which the column names are encountered in the input data, even if `schema_overrides`
+                # is specified.
+                if col_name not in schema:
+                    schema[col_name] = schema_overrides[col_name]
+            elif value is not None:
+                # If `key` is not in `schema_overrides`, then we infer its type from the data.
+                # The column type will always be nullable by default.
+                col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
+                if col_type is None:
+                    raise excs.Error(
+                        f'Could not infer type for column `{col_name}`; the value in row {n} '
+                        f'has an unsupported type: {type(value)}'
+                    )
+                if col_name not in schema:
+                    schema[col_name] = col_type
+                else:
+                    supertype = schema[col_name].supertype(col_type)
+                    if supertype is None:
+                        raise excs.Error(
+                            f'Could not infer type of column `{col_name}`; the value in row {n} '
+                            f'does not match preceding type {schema[col_name]}: {value!r}\n'
+                            'Consider specifying the type explicitly in `schema_overrides`.'
+                        )
+                    schema[col_name] = supertype
+            else:
+                cols_with_nones.add(col_name)
+
+    entirely_none_cols = cols_with_nones - schema.keys()
+    if len(entirely_none_cols) > 0:
+        # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
+        # was not encountered in any row with a non-None value.
+        raise excs.Error(
+            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
+            'Consider specifying the type(s) explicitly in `schema_overrides`.'
+        )
+    return schema
+
+
+def import_rows(
+    tbl_path: str,
+    rows: list[dict[str, Any]],
+    *,
+    schema_overrides: Optional[dict[str, Any]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    num_retained_versions: int = 10,
+    comment: str = '',
+) -> pxt.Table:
+    """
+    Creates a new base table from a list of dictionaries. The dictionaries must be of the
+    form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    supplied data, using the most specific type that can represent all the values in a column.
+
+    If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
+    Pixeltable will force the specified column to the specified type (and will not attempt any type inference
+    for that column).
+
+    All column types of the new table will be nullable unless explicitly specified as non-nullable in
+    `schema_overrides`.
+
+    Args:
+        tbl_path: The qualified name of the table to create.
+        rows: The list of dictionaries to import.
+        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+            as described above.
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
+        num_retained_versions: The number of retained versions of the table
+            (see [`create_table()`][pixeltable.create_table]).
+        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
+    """
+    return pxt.create_table(
+        tbl_path,
+        source=rows,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
+    )
+
+
+def import_json(
+    tbl_path: str,
+    filepath_or_url: str,
+    *,
+    schema_overrides: Optional[dict[str, Any]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    num_retained_versions: int = 10,
+    comment: str = '',
+    **kwargs: Any,
+) -> pxt.Table:
+    """
+    Creates a new base table from a JSON file. This is a convenience method and is
+    equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    is the contents of the specified `filepath_or_url`.
+
+    Args:
+        tbl_path: The name of the table to create.
+        filepath_or_url: The path or URL of the JSON file.
+        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+            (see [`import_rows()`][pixeltable.io.import_rows]).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
+        num_retained_versions: The number of retained versions of the table
+            (see [`create_table()`][pixeltable.create_table]).
+        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
+        kwargs: Additional keyword arguments to pass to `json.loads`.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
+    """
+    return pxt.create_table(
+        tbl_path,
+        source=filepath_or_url,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
+        extra_args=kwargs,
+    )
pixeltable/io/external_store.py
CHANGED
@@ -97,7 +97,7 @@ class Project(ExternalStore, abc.ABC):
         # This ensures that the media in those columns resides in the media store.
         # First determine which columns (if any) need stored proxies, but don't have one yet.
         stored_proxies_needed: list[Column] = []
-        for col in self.col_mapping
+        for col in self.col_mapping:
             if col.col_type.is_media_type() and not (col.is_stored and col.is_computed):
                 # If this column is already proxied in some other Project, use the existing proxy to avoid
                 # duplication. Otherwise, we'll create a new one.
@@ -234,7 +234,8 @@ class Project(ExternalStore, abc.ABC):
             else:
                 raise excs.Error(
                     f'Column `{t_col}` does not exist in Table `{table._name}`. Either add a column `{t_col}`, '
-                    f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
+                    f'or specify a `col_mapping` to associate a different column with '
+                    f'the external field `{ext_col}`.'
                 )
             if ext_col not in export_cols and ext_col not in import_cols:
                 raise excs.Error(
@@ -253,7 +254,8 @@ class Project(ExternalStore, abc.ABC):
                 ext_col_type = export_cols[ext_col]
                 if not ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True):
                     raise excs.Error(
-                        f'Column `{t_col}` cannot be exported to external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
+                        f'Column `{t_col}` cannot be exported to external column `{ext_col}` '
+                        f'(incompatible types; expecting `{ext_col_type}`)'
                     )
                 if ext_col in import_cols:
                     # Validate that the external column can be assigned to the table column
@@ -264,7 +266,8 @@ class Project(ExternalStore, abc.ABC):
                 ext_col_type = import_cols[ext_col]
                 if not t_col_type.is_supertype_of(ext_col_type, ignore_nullable=True):
                     raise excs.Error(
-                        f'Column `{t_col}` cannot be imported from external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
+                        f'Column `{t_col}` cannot be imported from external column `{ext_col}` '
+                        f'(incompatible types; expecting `{ext_col_type}`)'
                     )
         return resolved_col_mapping

@@ -368,7 +371,7 @@ class MockProject(Project):
            {cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']},
        )

-    def __eq__(self, other:
+    def __eq__(self, other: object) -> bool:
        if not isinstance(other, MockProject):
            return False
        return self.name == other.name
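
The three message fixes in this file restore a trailing space where an f-string is continued on the next line: Python implicitly concatenates adjacent string literals, so without the space the last word of one fragment fuses with the first word of the next. A standalone illustration of the pitfall (not code from the package):

    col, ext = 'frame', 'image'

    # adjacent literals concatenate with no separator inserted
    broken = f'Column `{col}` cannot be exported to `{ext}`' f'(incompatible types)'
    fixed = f'Column `{col}` cannot be exported to `{ext}` ' f'(incompatible types)'

    assert '`image`(incompatible' in broken  # words run together
    assert '`image` (incompatible' in fixed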
pixeltable/io/globals.py
CHANGED
@@ -1,7 +1,5 @@
-import json
-
-import urllib.request
-from pathlib import Path
+from __future__ import annotations
+
 from typing import TYPE_CHECKING, Any, Literal, Optional, Union

 import pixeltable as pxt
@@ -9,61 +7,11 @@ import pixeltable.exceptions as excs
 from pixeltable import Table, exprs
 from pixeltable.env import Env
 from pixeltable.io.external_store import SyncStatus
-from pixeltable.utils import parse_local_file_path

 if TYPE_CHECKING:
     import fiftyone as fo  # type: ignore[import-untyped]


-from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
-
-
-def _infer_schema_from_rows(
-    rows: list[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
-) -> dict[str, pxt.ColumnType]:
-    schema: dict[str, pxt.ColumnType] = {}
-    cols_with_nones: set[str] = set()
-
-    for n, row in enumerate(rows):
-        for col_name, value in row.items():
-            if col_name in schema_overrides:
-                # We do the insertion here; this will ensure that the column order matches the order
-                # in which the column names are encountered in the input data, even if `schema_overrides`
-                # is specified.
-                if col_name not in schema:
-                    schema[col_name] = schema_overrides[col_name]
-            elif value is not None:
-                # If `key` is not in `schema_overrides`, then we infer its type from the data.
-                # The column type will always be nullable by default.
-                col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
-                if col_type is None:
-                    raise excs.Error(
-                        f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
-                    )
-                if col_name not in schema:
-                    schema[col_name] = col_type
-                else:
-                    supertype = schema[col_name].supertype(col_type)
-                    if supertype is None:
-                        raise excs.Error(
-                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
-                            'Consider specifying the type explicitly in `schema_overrides`.'
-                        )
-                    schema[col_name] = supertype
-            else:
-                cols_with_nones.add(col_name)
-
-    entirely_none_cols = cols_with_nones - schema.keys()
-    if len(entirely_none_cols) > 0:
-        # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
-        # was not encountered in any row with a non-None value.
-        raise excs.Error(
-            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
-            'Consider specifying the type(s) explicitly in `schema_overrides`.'
-        )
-    return schema
-
-
 def create_label_studio_project(
     t: Table,
     label_config: str,
@@ -140,9 +88,9 @@ def create_label_studio_project(
            parameters of the Label Studio `connect_s3_import_storage` method, as described in the
            [Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
            `bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
-           Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`).
-           specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`.
-           Studio defaults.
+           Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`).
+           If a title is not specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`.
+           All other parameters use their Label Studio defaults.
        kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
            Studio SDK, as described in the
            [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
@@ -151,7 +99,8 @@ def create_label_studio_project(
        A `SyncStatus` representing the status of any synchronization operations that occurred.

    Examples:
-       Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
+       Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
+       column of the table `tbl`:

        >>> config = \"\"\"
            <View>
@@ -190,108 +139,6 @@ def create_label_studio_project(
     return SyncStatus.empty()


-def import_rows(
-    tbl_path: str,
-    rows: list[dict[str, Any]],
-    *,
-    schema_overrides: Optional[dict[str, Any]] = None,
-    primary_key: Optional[Union[str, list[str]]] = None,
-    num_retained_versions: int = 10,
-    comment: str = '',
-) -> Table:
-    """
-    Creates a new base table from a list of dictionaries. The dictionaries must be of the
-    form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
-    supplied data, using the most specific type that can represent all the values in a column.
-
-    If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
-    Pixeltable will force the specified column to the specified type (and will not attempt any type inference
-    for that column).
-
-    All column types of the new table will be nullable unless explicitly specified as non-nullable in
-    `schema_overrides`.
-
-    Args:
-        tbl_path: The qualified name of the table to create.
-        rows: The list of dictionaries to import.
-        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
-            as described above.
-        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
-        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
-        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
-
-    Returns:
-        A handle to the newly created [`Table`][pixeltable.Table].
-    """
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
-    schema, pxt_pk, _ = normalize_schema_names(row_schema, primary_key, schema_overrides, True)
-
-    table = find_or_create_table(
-        tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
-    )
-    table.insert(rows)
-    return table
-
-
-def import_json(
-    tbl_path: str,
-    filepath_or_url: str,
-    *,
-    schema_overrides: Optional[dict[str, Any]] = None,
-    primary_key: Optional[Union[str, list[str]]] = None,
-    num_retained_versions: int = 10,
-    comment: str = '',
-    **kwargs: Any,
-) -> Table:
-    """
-    Creates a new base table from a JSON file. This is a convenience method and is
-    equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
-    is the contents of the specified `filepath_or_url`.
-
-    Args:
-        tbl_path: The name of the table to create.
-        filepath_or_url: The path or URL of the JSON file.
-        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
-            (see [`import_rows()`][pixeltable.io.import_rows]).
-        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
-        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
-        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
-        kwargs: Additional keyword arguments to pass to `json.loads`.
-
-    Returns:
-        A handle to the newly created [`Table`][pixeltable.Table].
-    """
-    path = parse_local_file_path(filepath_or_url)
-    if path is None:  # it's a URL
-        # TODO: This should read from S3 as well.
-        contents = urllib.request.urlopen(filepath_or_url).read()
-    else:
-        with open(path) as fp:
-            contents = fp.read()
-
-    rows = json.loads(contents, **kwargs)
-
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
-    schema, pxt_pk, col_mapping = normalize_schema_names(row_schema, primary_key, schema_overrides, False)
-
-    # Convert all rows to insertable format - not needed, misnamed columns and types are errors in the incoming row format
-    if col_mapping is not None:
-        tbl_rows = [
-            {field if col_mapping is None else col_mapping[field]: val for field, val in row.items()} for row in rows
-        ]
-    else:
-        tbl_rows = rows
-
-    table = find_or_create_table(
-        tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
-    )
-
-    table.insert(tbl_rows)
-    return table
-
-
 def export_images_as_fo_dataset(
     tbl: pxt.Table,
     images: exprs.Expr,
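
With the implementation removed here, the file/URL fetch and `json.loads()` call that used to live in this module are handled inside `pxt.create_table()` when it is given a path or URL as `source` (see the wrapper in `pixeltable/io/datarows.py` above). A sketch of the equivalence, using a hypothetical table name and file path:

    import pixeltable as pxt

    # these two calls take the same code path in 0.3.10
    t1 = pxt.io.import_json('demo.events', 'data/events.json', primary_key='id')
    t2 = pxt.create_table('demo.events_v2', source='data/events.json', primary_key='id')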
pixeltable/io/hf_datasets.py
CHANGED
@@ -1,41 +1,38 @@
 from __future__ import annotations

-import logging
-import math
-import random
 import typing
 from typing import Any, Optional, Union

 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exceptions as excs
-
-from .utils import normalize_import_parameters, normalize_schema_names

 if typing.TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]

-_logger = logging.getLogger('pixeltable')
-
-# use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
-# The primary goal is to bound memory use, regardless of dataset size.
-# Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
-_K_BATCH_SIZE_BYTES = 100_000_000

-# note, there are many more types. we allow overrides in the
+# note, there are many more types. we allow overrides in the schema_overrides parameter
 # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
 # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
 _hf_to_pxt: dict[str, ts.ColumnType] = {
-    'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
-    'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
+    'int8': ts.IntType(nullable=True),
+    'int16': ts.IntType(nullable=True),
+    'int32': ts.IntType(nullable=True),
+    'int64': ts.IntType(nullable=True),
+    'uint8': ts.IntType(nullable=True),
+    'uint16': ts.IntType(nullable=True),
+    'uint32': ts.IntType(nullable=True),
+    'uint64': ts.IntType(nullable=True),
+    'float16': ts.FloatType(nullable=True),
     'float32': ts.FloatType(nullable=True),
     'float64': ts.FloatType(nullable=True),
-    'large_string': ts.StringType(nullable=True),
     'string': ts.StringType(nullable=True),
+    'large_string': ts.StringType(nullable=True),
     'timestamp[s]': ts.TimestampType(nullable=True),
     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
     'timestamp[us]': ts.TimestampType(nullable=True),
+    'date32': ts.StringType(nullable=True),  # date32 is not supported in pixeltable, use string
+    'date64': ts.StringType(nullable=True),  # date64 is not supported in pixeltable, use string
 }


@@ -88,7 +85,6 @@ def import_huggingface_dataset(
     table_path: str,
     dataset: Union[datasets.Dataset, datasets.DatasetDict],
     *,
-    column_name_for_split: Optional[str] = None,
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     **kwargs: Any,
@@ -101,91 +97,18 @@ def import_huggingface_dataset(
        dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
            or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
            to insert into the table.
-       column_name_for_split: column name to use for split information. If None, no split information will be stored.
        schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
-           name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
-           `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
-           Pixeltable identifiers).
+           name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
+           The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
+           they are valid Pixeltable identifiers).
        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
        kwargs: Additional arguments to pass to `create_table`.
+           An argument of `column_name_for_split` must be provided if the source is a DatasetDict.
+           This column name will contain the split information. If None, no split information will be stored.

    Returns:
        A handle to the newly created [`Table`][pixeltable.Table].
    """
-
-
-
-
-    if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
-        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-
-    # Create the pixeltable schema from the huggingface schema
-    hf_schema_source = _get_hf_schema(dataset)
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
-
-    # Add the split column to the schema if requested
-    if column_name_for_split is not None:
-        if column_name_for_split in hf_schema:
-            raise excs.Error(
-                f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
-            )
-        hf_schema[column_name_for_split] = ts.StringType(nullable=True)
-
-    schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
-
-    # Prepare to create table and insert data
-    if table_path in pxt.list_tables():
-        raise excs.Error(f'table {table_path} already exists')
-
-    if isinstance(dataset, datasets.Dataset):
-        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-        raw_name = dataset.split._name
-        split_name = raw_name.split('[')[0] if raw_name is not None else None
-        dataset_dict = {split_name: dataset}
-    else:
-        dataset_dict = dataset
-
-    # extract all class labels from the dataset to translate category ints to strings
-    categorical_features = {
-        feature_name: feature_type.names
-        for (feature_name, feature_type) in hf_schema_source.items()
-        if isinstance(feature_type, datasets.ClassLabel)
-    }
-
-    try:
-        # random tmp name
-        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-
-        def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
-            output_row = row.copy()
-            # map all class labels to strings
-            for field, values in categorical_features.items():
-                output_row[field] = values[row[field]]
-            # add split name to row
-            if column_name_for_split is not None:
-                output_row[column_name_for_split] = split_name
-            return output_row
-
-        for split_name, split_dataset in dataset_dict.items():
-            num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
-            tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
-            assert tuples_per_batch > 0
-
-            batch = []
-            for row in split_dataset:
-                batch.append(_translate_row(row, split_name))
-                if len(batch) >= tuples_per_batch:
-                    tab.insert(batch)
-                    batch = []
-            # last batch
-            if len(batch) > 0:
-                tab.insert(batch)
-
-    except Exception as e:
-        _logger.error(f'Error while inserting dataset into table: {tmp_name}')
-        raise e
-
-    pxt.move(tmp_name, table_path)
-    return pxt.get_table(table_path)
+    return pxt.create_table(
+        table_path, source=dataset, schema_overrides=schema_overrides, primary_key=primary_key, extra_args=kwargs
+    )