PyPI - pixeltable - Versions diffs - 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl - Mend

pixeltable 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (63)

pixeltable/catalog/column.py +26 -49
pixeltable/catalog/insertable_table.py +7 -4
pixeltable/catalog/table.py +163 -57
pixeltable/catalog/table_version.py +416 -140
pixeltable/catalog/table_version_path.py +2 -2
pixeltable/client.py +72 -6
pixeltable/dataframe.py +65 -21
pixeltable/env.py +52 -53
pixeltable/exec/cache_prefetch_node.py +1 -1
pixeltable/exec/in_memory_data_node.py +11 -7
pixeltable/exprs/comparison.py +3 -3
pixeltable/exprs/data_row.py +5 -1
pixeltable/exprs/literal.py +16 -4
pixeltable/exprs/row_builder.py +8 -40
pixeltable/ext/__init__.py +5 -0
pixeltable/ext/functions/yolox.py +92 -0
pixeltable/func/aggregate_function.py +15 -15
pixeltable/func/expr_template_function.py +9 -1
pixeltable/func/globals.py +24 -14
pixeltable/func/signature.py +18 -12
pixeltable/func/udf.py +7 -2
pixeltable/functions/__init__.py +9 -9
pixeltable/functions/eval.py +7 -8
pixeltable/functions/fireworks.py +10 -37
pixeltable/functions/huggingface.py +47 -19
pixeltable/functions/openai.py +192 -24
pixeltable/functions/together.py +104 -9
pixeltable/functions/util.py +11 -0
pixeltable/index/__init__.py +2 -0
pixeltable/index/base.py +49 -0
pixeltable/index/embedding_index.py +95 -0
pixeltable/metadata/schema.py +45 -22
pixeltable/plan.py +15 -34
pixeltable/store.py +38 -41
pixeltable/tests/conftest.py +8 -14
pixeltable/tests/ext/test_yolox.py +21 -0
pixeltable/tests/functions/test_fireworks.py +43 -0
pixeltable/tests/functions/test_functions.py +60 -0
pixeltable/tests/{test_functions.py → functions/test_huggingface.py} +7 -143
pixeltable/tests/functions/test_openai.py +162 -0
pixeltable/tests/functions/test_together.py +112 -0
pixeltable/tests/test_component_view.py +14 -5
pixeltable/tests/test_dataframe.py +23 -22
pixeltable/tests/test_exprs.py +99 -102
pixeltable/tests/test_function.py +51 -43
pixeltable/tests/test_index.py +138 -0
pixeltable/tests/test_migration.py +2 -1
pixeltable/tests/test_snapshot.py +24 -1
pixeltable/tests/test_table.py +205 -26
pixeltable/tests/test_types.py +30 -0
pixeltable/tests/test_video.py +16 -16
pixeltable/tests/test_view.py +5 -0
pixeltable/tests/utils.py +171 -14
pixeltable/tool/create_test_db_dump.py +16 -0
pixeltable/type_system.py +77 -128
pixeltable/utils/arrow.py +98 -0
pixeltable/utils/hf_datasets.py +157 -0
pixeltable/utils/parquet.py +68 -27
pixeltable/utils/pytorch.py +16 -97
{pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/METADATA +35 -28
{pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/RECORD +63 -50
{pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
{pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0

pixeltable/type_system.py CHANGED Viewed

@@ -6,9 +6,10 @@ import enum
 import json
 import typing
 import urllib.parse
+import urllib.request
 from copy import copy
 from pathlib import Path
-from typing import Any, Optional, Tuple, Dict, Callable, List, Union
+from typing import Any, Optional, Tuple, Dict, Callable, List, Union, Sequence, Mapping
 import PIL.Image
 import av
@@ -240,19 +241,38 @@ class ColumnType:
     @classmethod
     def from_python_type(cls, t: type) -> Optional[ColumnType]:
-        if t in _python_type_to_column_type:
-            return _python_type_to_column_type[t]
-        elif isinstance(t, typing._UnionGenericAlias) and t.__args__[1] is type(None):
-            # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
-            # We treat it as the underlying type but with nullable=True.
-            if t.__args__[0] in _python_type_to_column_type:
-                underlying = copy(_python_type_to_column_type[t.__args__[0]])
-                underlying.nullable = True
-                return underlying
+        if typing.get_origin(t) is typing.Union:
+            union_args = typing.get_args(t)
+            if union_args[1] is type(None):
+                # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
+                # We treat it as the underlying type but with nullable=True.
+                underlying = cls.from_python_type(union_args[0])
+                if underlying is not None:
+                    underlying.nullable = True
+                    return underlying
+        else:
+            # Discard type parameters to ensure that parameterized types such as `list[T]`
+            # are correctly mapped to Pixeltable types.
+            base = typing.get_origin(t)
+            if base is None:
+                # No type parameters; the base type is just `t` itself
+                base = t
+            if base is str:
+                return StringType()
+            if base is int:
+                return IntType()
+            if base is float:
+                return FloatType()
+            if base is bool:
+                return BoolType()
+            if base is datetime.date or base is datetime.datetime:
+                return TimestampType()
+            if issubclass(base, Sequence) or issubclass(base, Mapping):
+                return JsonType()
+            if issubclass(base, PIL.Image.Image):
+                return ImageType()
         return None
     def validate_literal(self, val: Any) -> None:
         """Raise TypeError if val is not a valid literal for this type"""
         if val is None:
@@ -275,7 +295,7 @@ class ColumnType:
             parsed = urllib.parse.urlparse(val)
             if parsed.scheme != '' and parsed.scheme != 'file':
                 return
-            path = Path(urllib.parse.unquote(parsed.path))
+            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             if not path.is_file():
                 raise TypeError(f'File not found: {str(path)}')
         else:
@@ -358,35 +378,12 @@ class ColumnType:
         pass
     @abc.abstractmethod
-    def to_sa_type(self) -> Any:
+    def to_sa_type(self) -> sql.types.TypeEngine:
         """
         Return corresponding SQLAlchemy type.
-        return type Any: there doesn't appear to be a superclass for the sqlalchemy types
         """
-        assert self._type != self.Type.INVALID
-        if self._type == self.Type.STRING:
-            return sql.String
-        if self._type == self.Type.INT:
-            return sql.Integer
-        if self._type == self.Type.FLOAT:
-            return sql.Float
-        if self._type == self.Type.BOOL:
-            return sql.Boolean
-        if self._type == self.Type.TIMESTAMP:
-            return sql.TIMESTAMP
-        if self._type == self.Type.IMAGE:
-            # the URL
-            return sql.String
-        if self._type == self.Type.JSON:
-            return sql.dialects.postgresql.JSONB
-        if self._type == self.Type.ARRAY:
-            return sql.VARBINARY
-        assert False
+        pass
-    @abc.abstractmethod
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        assert False, f'Have not implemented {self.__class__.__name__} to Arrow'
     @staticmethod
     def no_conversion(v: Any) -> Any:
         """
@@ -410,10 +407,7 @@ class InvalidType(ColumnType):
     def to_sql(self) -> str:
         assert False
-    def to_sa_type(self) -> Any:
-        assert False
-    def to_arrow_type(self) -> 'pyarrow.DataType':
+    def to_sa_type(self) -> sql.types.TypeEngine:
         assert False
     def print_value(self, val: Any) -> str:
@@ -422,6 +416,7 @@ class InvalidType(ColumnType):
     def _validate_literal(self, val: Any) -> None:
         assert False
 class StringType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.STRING, nullable=nullable)
@@ -440,12 +435,8 @@ class StringType(ColumnType):
     def to_sql(self) -> str:
         return 'VARCHAR'
-    def to_sa_type(self) -> str:
-        return sql.String
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa # pylint: disable=import-outside-toplevel
-        return pa.string()
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.String()
     def print_value(self, val: Any) -> str:
         return f"'{val}'"
@@ -454,6 +445,14 @@ class StringType(ColumnType):
         if not isinstance(val, str):
             raise TypeError(f'Expected string, got {val.__class__.__name__}')
+    def _create_literal(self, val: Any) -> Any:
+        # Replace null byte within python string with space to avoid issues with Postgres.
+        # Use a space to avoid merging words.
+        # TODO(orm): this will also be an issue with JSON inputs, would space still be a good replacement?
+        if isinstance(val, str) and '\x00' in val:
+            return val.replace('\x00', ' ')
+        return val
 class IntType(ColumnType):
     def __init__(self, nullable: bool = False):
@@ -462,12 +461,8 @@ class IntType(ColumnType):
     def to_sql(self) -> str:
         return 'BIGINT'
-    def to_sa_type(self) -> str:
-        return sql.BigInteger
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa # pylint: disable=import-outside-toplevel
-        return pa.int64() # to be consistent with bigint above
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.BigInteger()
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, int):
@@ -481,12 +476,8 @@ class FloatType(ColumnType):
     def to_sql(self) -> str:
         return 'FLOAT'
-    def to_sa_type(self) -> str:
-        return sql.Float
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa
-        return pa.float32()
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.Float()
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, float):
@@ -497,6 +488,7 @@ class FloatType(ColumnType):
             return float(val)
         return val
 class BoolType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.BOOL, nullable=nullable)
@@ -504,12 +496,8 @@ class BoolType(ColumnType):
     def to_sql(self) -> str:
         return 'BOOLEAN'
-    def to_sa_type(self) -> str:
-        return sql.Boolean
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa # pylint: disable=import-outside-toplevel
-        return pa.bool_()
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.Boolean()
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, bool):
@@ -520,6 +508,7 @@ class BoolType(ColumnType):
             return bool(val)
         return val
 class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)
@@ -527,12 +516,8 @@ class TimestampType(ColumnType):
     def to_sql(self) -> str:
         return 'INTEGER'
-    def to_sa_type(self) -> str:
-        return sql.TIMESTAMP
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa # pylint: disable=import-outside-toplevel
-        return pa.timestamp('us') # postgres timestamp is microseconds
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.TIMESTAMP()
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, datetime.datetime) and not isinstance(val, datetime.date):
@@ -543,6 +528,7 @@ class TimestampType(ColumnType):
             return datetime.datetime.fromisoformat(val)
         return val
 class JsonType(ColumnType):
     # TODO: type_spec also needs to be able to express lists
     def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None, nullable: bool = False):
@@ -568,12 +554,8 @@ class JsonType(ColumnType):
     def to_sql(self) -> str:
         return 'JSONB'
-    def to_sa_type(self) -> str:
-        return sql.dialects.postgresql.JSONB
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa # pylint: disable=import-outside-toplevel
-        return pa.string() # TODO: weight advantage of pa.struct type.
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.dialects.postgresql.JSONB()
     def print_value(self, val: Any) -> str:
         val_type = self.infer_literal_type(val)
@@ -594,6 +576,7 @@ class JsonType(ColumnType):
             val = list(val)
         return val
 class ArrayType(ColumnType):
     def __init__(
             self, shape: Tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
@@ -669,20 +652,16 @@ class ArrayType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, (list,tuple)):
-            return np.array(val)
+            # map python float to whichever numpy float is
+            # declared for this type, rather than assume float64
+            return np.array(val, dtype=self.numpy_dtype())
         return val
     def to_sql(self) -> str:
         return 'BYTEA'
-    def to_sa_type(self) -> str:
-        return sql.LargeBinary
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa # pylint: disable=import-outside-toplevel
-        if any([n is None for n in self.shape]):
-            raise TypeError(f'Cannot convert array with unknown shape to Arrow')
-        return pa.fixed_shape_tensor(pa.from_numpy_dtype(self.numpy_dtype()), self.shape)
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.LargeBinary()
     def numpy_dtype(self) -> np.dtype:
         if self.dtype == self.Type.INT:
@@ -786,12 +765,8 @@ class ImageType(ColumnType):
     def to_sql(self) -> str:
         return 'VARCHAR'
-    def to_sa_type(self) -> str:
-        return sql.String
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa # pylint: disable=import-outside-toplevel
-        return pa.binary()
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.String()
     def _validate_literal(self, val: Any) -> None:
         if isinstance(val, PIL.Image.Image):
@@ -805,6 +780,7 @@ class ImageType(ColumnType):
         except PIL.UnidentifiedImageError:
             raise excs.Error(f'Not a valid image: {val}') from None
 class VideoType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.VIDEO, nullable=nullable)
@@ -813,12 +789,8 @@ class VideoType(ColumnType):
         # stored as a file path
         return 'VARCHAR'
-    def to_sa_type(self) -> str:
-        return sql.String
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa # pylint: disable=import-outside-toplevel
-        return pa.string()
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.String()
     def _validate_literal(self, val: Any) -> None:
         self._validate_file_path(val)
@@ -843,6 +815,7 @@ class VideoType(ColumnType):
         except av.AVError:
             raise excs.Error(f'Not a valid video: {val}') from None
 class AudioType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.AUDIO, nullable=nullable)
@@ -851,12 +824,8 @@ class AudioType(ColumnType):
         # stored as a file path
         return 'VARCHAR'
-    def to_sa_type(self) -> str:
-        return sql.String
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa  # pylint: disable=import-outside-toplevel
-        return pa.string()
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.String()
     def _validate_literal(self, val: Any) -> None:
         self._validate_file_path(val)
@@ -876,6 +845,7 @@ class AudioType(ColumnType):
         except av.AVError as e:
             raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
 class DocumentType(ColumnType):
     @enum.unique
     class DocumentFormat(enum.Enum):
@@ -898,12 +868,8 @@ class DocumentType(ColumnType):
         # stored as a file path
         return 'VARCHAR'
-    def to_sa_type(self) -> str:
-        return sql.String
-    def to_arrow_type(self) -> 'pyarrow.DataType':
-        import pyarrow as pa  # pylint: disable=import-outside-toplevel
-        return pa.string()
+    def to_sa_type(self) -> sql.types.TypeEngine:
+        return sql.String()
     def _validate_literal(self, val: Any) -> None:
         self._validate_file_path(val)
@@ -919,20 +885,3 @@ class DocumentType(ColumnType):
                     raise excs.Error(f'Not a recognized document format: {val}')
             except Exception as e:
                 raise excs.Error(f'Not a recognized document format: {val}') from None
-# A dictionary mapping various Python types to their respective ColumnTypes.
-# This can be used to infer Pixeltable ColumnTypes from type hints on Python
-# functions. (Since Python functions do not necessarily have type hints, this
-# should always be an optional/convenience inference.)
-_python_type_to_column_type: dict[type, ColumnType] = {
-    str: StringType(),
-    int: IntType(),
-    float: FloatType(),
-    bool: BoolType(),
-    datetime.datetime: TimestampType(),
-    datetime.date: TimestampType(),
-    list: JsonType(),
-    dict: JsonType(),
-    PIL.Image.Image: ImageType()
-}

pixeltable/utils/arrow.py ADDED Viewed

@@ -0,0 +1,98 @@
+import logging
+from typing import Any, Dict, Iterable, Iterator, Optional
+import pyarrow as pa
+import pixeltable.type_system as ts
+_logger = logging.getLogger(__name__)
+_pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
+    pa.string(): ts.StringType(nullable=True),
+    pa.timestamp('us'): ts.TimestampType(nullable=True),
+    pa.bool_(): ts.BoolType(nullable=True),
+    pa.uint8(): ts.IntType(nullable=True),
+    pa.int8(): ts.IntType(nullable=True),
+    pa.uint32(): ts.IntType(nullable=True),
+    pa.uint64(): ts.IntType(nullable=True),
+    pa.int32(): ts.IntType(nullable=True),
+    pa.int64(): ts.IntType(nullable=True),
+    pa.float32(): ts.FloatType(nullable=True),
+}
+_pt_to_pa: Dict[ts.ColumnType, pa.DataType] = {
+    ts.StringType: pa.string(),
+    ts.TimestampType: pa.timestamp('us'),  # postgres timestamp is microseconds
+    ts.BoolType: pa.bool_(),
+    ts.IntType: pa.int64(),
+    ts.FloatType: pa.float32(),
+    ts.JsonType: pa.string(),  # TODO(orm) pa.struct() is possible
+    ts.ImageType: pa.binary(),  # inline image
+    ts.AudioType: pa.string(),  # path
+    ts.VideoType: pa.string(),  # path
+    ts.DocumentType: pa.string(),  # path
+}
+def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
+    """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
+    Returns None if no conversion is currently implemented.
+    """
+    if arrow_type in _pa_to_pt:
+        return _pa_to_pt[arrow_type]
+    elif isinstance(arrow_type, pa.FixedShapeTensorType):
+        dtype = to_pixeltable_type(arrow_type.value_type)
+        if dtype is None:
+            return None
+        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype)
+    else:
+        return None
+def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
+    """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
+    Returns None if no conversion is currently implemented.
+    """
+    if pixeltable_type.__class__ in _pt_to_pa:
+        return _pt_to_pa[pixeltable_type.__class__]
+    elif isinstance(pixeltable_type, ts.ArrayType):
+        return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.numpy_dtype()), pixeltable_type.shape)
+    else:
+        return None
+def to_pixeltable_schema(arrow_schema: pa.Schema) -> Dict[str, ts.ColumnType]:
+    return {field.name: to_pixeltable_type(field.type) for field in arrow_schema}
+def to_arrow_schema(pixeltable_schema: Dict[str, Any]) -> pa.Schema:
+    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
+def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
+    """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
+    this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
+    """
+    out = {}
+    for k, name in enumerate(batch.schema.names):
+        col = batch.column(k)
+        if isinstance(col.type, pa.FixedShapeTensorType):
+            # treat array columns as numpy arrays to easily preserve numpy type
+            out[name] = col.to_numpy(zero_copy_only=False)
+        else:
+            # for the rest, use pydict to preserve python types
+            out[name] = col.to_pylist()
+    return out
+def iter_tuples(batch: pa.RecordBatch) -> Iterator[Dict[str, Any]]:
+    """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
+    pydict = to_pydict(batch)
+    assert len(pydict) > 0, 'empty record batch'
+    for _, v in pydict.items():
+        batch_size = len(v)
+        break
+    for i in range(batch_size):
+        yield {col_name: values[i] for col_name, values in pydict.items()}

pixeltable/utils/hf_datasets.py ADDED Viewed

@@ -0,0 +1,157 @@
+import datasets
+from typing import Union, Optional, List, Dict, Any
+import pixeltable.type_system as ts
+from pixeltable import exceptions as excs
+import math
+import logging
+import pixeltable
+import random
+_logger = logging.getLogger(__name__)
+# use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
+# The primary goal is to bound memory use, regardless of dataset size.
+# Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
+_K_BATCH_SIZE_BYTES = 100_000_000
+# note, there are many more types. we allow overrides in the schema_override parameter
+# to handle cases where the appropriate type is not yet mapped, or to override this mapping.
+# https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
+_hf_to_pxt: Dict[str, ts.ColumnType] = {
+    'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
+    'int64': ts.IntType(nullable=True),
+    'bool': ts.BoolType(nullable=True),
+    'float32': ts.FloatType(nullable=True),
+    'string': ts.StringType(nullable=True),
+    'timestamp[s]': ts.TimestampType(nullable=True),
+    'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
+}
+def _to_pixeltable_type(
+    feature_type: Union[datasets.ClassLabel, datasets.Value, datasets.Sequence],
+) -> Optional[ts.ColumnType]:
+    """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
+    if isinstance(feature_type, datasets.ClassLabel):
+        # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
+        return ts.StringType(nullable=True)
+    elif isinstance(feature_type, datasets.Value):
+        # example: Value(dtype='int64', id=None)
+        return _hf_to_pxt.get(feature_type.dtype, None)
+    elif isinstance(feature_type, datasets.Sequence):
+        # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
+        dtype = _to_pixeltable_type(feature_type.feature)
+        length = feature_type.length if feature_type.length != -1 else None
+        return ts.ArrayType(shape=(length,), dtype=dtype)
+    else:
+        return None
+def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
+    """Get the schema of a huggingface dataset as a dictionary."""
+    first_dataset = dataset if isinstance(dataset, datasets.Dataset) else next(iter(dataset.values()))
+    return first_dataset.features
+def huggingface_schema_to_pixeltable_schema(
+    hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
+) -> Dict[str, Optional[ts.ColumnType]]:
+    """Generate a pixeltable schema from a huggingface dataset schema.
+    Columns without a known mapping are mapped to None
+    """
+    hf_schema = _get_hf_schema(hf_dataset)
+    pixeltable_schema = {
+        column_name: _to_pixeltable_type(feature_type) for column_name, feature_type in hf_schema.items()
+    }
+    return pixeltable_schema
+def import_huggingface_dataset(
+    cl: 'pixeltable.Client',
+    table_path: str,
+    dataset: Union[datasets.Dataset, datasets.DatasetDict],
+    *,
+    column_name_for_split: Optional[str],
+    schema_override: Optional[Dict[str, Any]],
+    **kwargs,
+) -> 'pixeltable.InsertableTable':
+    """See `pixeltable.Client.import_huggingface_dataset` for documentation"""
+    if table_path in cl.list_tables():
+        raise excs.Error(f'table {table_path} already exists')
+    if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
+        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
+    if isinstance(dataset, datasets.Dataset):
+        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
+        raw_name = dataset.split._name
+        split_name = raw_name.split('[')[0] if raw_name is not None else None
+        dataset_dict = {split_name: dataset}
+    else:
+        dataset_dict = dataset
+    pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
+    if schema_override is not None:
+        pixeltable_schema.update(schema_override)
+    if column_name_for_split is not None:
+        if column_name_for_split in pixeltable_schema:
+            raise excs.Error(
+                f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
+            )
+        pixeltable_schema[column_name_for_split] = ts.StringType(nullable=True)
+    for field, column_type in pixeltable_schema.items():
+        if column_type is None:
+            raise excs.Error(f'Could not infer pixeltable type for feature `{field}` in huggingface dataset')
+    if isinstance(dataset, datasets.Dataset):
+        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
+        raw_name = dataset.split._name
+        split_name = raw_name.split('[')[0] if raw_name is not None else None
+        dataset_dict = {split_name: dataset}
+    elif isinstance(dataset, datasets.DatasetDict):
+        dataset_dict = dataset
+    else:
+        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
+    # extract all class labels from the dataset to translate category ints to strings
+    hf_schema = _get_hf_schema(dataset)
+    categorical_features = {
+        feature_name: feature_type.names
+        for (feature_name, feature_type) in hf_schema.items()
+        if isinstance(feature_type, datasets.ClassLabel)
+    }
+    try:
+        # random tmp name
+        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
+        tab = cl.create_table(tmp_name, pixeltable_schema, **kwargs)
+        def _translate_row(row: Dict[str, Any], split_name: str) -> Dict[str, Any]:
+            output_row = row.copy()
+            # map all class labels to strings
+            for field, values in categorical_features.items():
+                output_row[field] = values[row[field]]
+            # add split name to row
+            if column_name_for_split is not None:
+                output_row[column_name_for_split] = split_name
+            return output_row
+        for split_name, split_dataset in dataset_dict.items():
+            num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
+            tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
+            assert tuples_per_batch > 0
+            batch = []
+            for row in split_dataset:
+                batch.append(_translate_row(row, split_name))
+                if len(batch) >= tuples_per_batch:
+                    tab.insert(batch)
+                    batch = []
+            # last batch
+            if len(batch) > 0:
+                tab.insert(batch)
+    except Exception as e:
+        _logger.error(f'Error while inserting dataset into table: {tmp_name}')
+        raise e
+    cl.move(tmp_name, table_path)
+    return cl.get_table(table_path)

pixeltable 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

Potentially problematic release.

pixeltable 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl