datachain 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- datachain/__init__.py +17 -8
- datachain/catalog/catalog.py +5 -5
- datachain/cli.py +0 -2
- datachain/data_storage/schema.py +5 -5
- datachain/data_storage/sqlite.py +1 -1
- datachain/data_storage/warehouse.py +7 -7
- datachain/lib/arrow.py +25 -8
- datachain/lib/clip.py +6 -11
- datachain/lib/convert/__init__.py +0 -0
- datachain/lib/convert/flatten.py +67 -0
- datachain/lib/convert/type_converter.py +96 -0
- datachain/lib/convert/unflatten.py +69 -0
- datachain/lib/convert/values_to_tuples.py +85 -0
- datachain/lib/data_model.py +74 -0
- datachain/lib/dc.py +225 -168
- datachain/lib/file.py +41 -41
- datachain/lib/gpt4_vision.py +1 -9
- datachain/lib/hf_image_to_text.py +9 -17
- datachain/lib/hf_pipeline.py +4 -12
- datachain/lib/image.py +2 -18
- datachain/lib/image_transform.py +0 -1
- datachain/lib/iptc_exif_xmp.py +8 -15
- datachain/lib/meta_formats.py +1 -5
- datachain/lib/model_store.py +77 -0
- datachain/lib/pytorch.py +9 -21
- datachain/lib/signal_schema.py +139 -60
- datachain/lib/text.py +5 -16
- datachain/lib/udf.py +114 -30
- datachain/lib/udf_signature.py +5 -5
- datachain/lib/webdataset.py +3 -3
- datachain/lib/webdataset_laion.py +2 -3
- datachain/node.py +4 -4
- datachain/query/batch.py +1 -1
- datachain/query/dataset.py +51 -178
- datachain/query/dispatch.py +43 -30
- datachain/query/udf.py +46 -26
- datachain/remote/studio.py +1 -9
- datachain/torch/__init__.py +21 -0
- datachain/utils.py +39 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/METADATA +14 -12
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/RECORD +45 -43
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/WHEEL +1 -1
- datachain/image/__init__.py +0 -3
- datachain/lib/cached_stream.py +0 -38
- datachain/lib/claude.py +0 -69
- datachain/lib/feature.py +0 -412
- datachain/lib/feature_registry.py +0 -51
- datachain/lib/feature_utils.py +0 -154
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/LICENSE +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/top_level.txt +0 -0
datachain/lib/signal_schema.py
CHANGED

@@ -1,20 +1,30 @@
 import copy
 from collections.abc import Iterator, Sequence
+from dataclasses import dataclass
 from datetime import datetime
-from typing import …
+from typing import (
+    TYPE_CHECKING,
+    Annotated,
+    Any,
+    Callable,
+    Literal,
+    Optional,
+    Union,
+    get_args,
+    get_origin,
+)

-from pydantic import create_model
+from pydantic import BaseModel, create_model
+from typing_extensions import Literal as LiteralEx

-from datachain.lib. …
-…
-…
-…
-    FeatureType,
-    convert_type_to_datachain,
-)
-from datachain.lib.feature_registry import Registry
+from datachain.lib.convert.flatten import DATACHAIN_TO_TYPE
+from datachain.lib.convert.type_converter import convert_to_db_type
+from datachain.lib.convert.unflatten import unflatten_to_json_pos
+from datachain.lib.data_model import DataModel, DataType
 from datachain.lib.file import File
+from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
+from datachain.query.schema import DEFAULT_DELIMITER

 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -56,10 +66,16 @@ class SignalResolvingTypeError(SignalResolvingError):
         )


+@dataclass
 class SignalSchema:
+    values: dict[str, DataType]
+    tree: dict[str, Any]
+    setup_func: dict[str, Callable]
+    setup_values: Optional[dict[str, Callable]]
+
     def __init__(
         self,
-        values: dict[str, …
+        values: dict[str, DataType],
         setup: Optional[dict[str, Callable]] = None,
     ):
         self.values = values
@@ -85,7 +101,7 @@ class SignalSchema:

     @staticmethod
     def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
-        signals: dict[str, …
+        signals: dict[str, DataType] = {}
         for field, type_ in col_types.items():
             type_ = DATACHAIN_TO_TYPE.get(type_, None)
             if type_ is None:
@@ -99,15 +115,16 @@ class SignalSchema:
     def serialize(self) -> dict[str, str]:
         signals = {}
         for name, fr_type in self.values.items():
-            if …
-…
+            if (fr := ModelStore.to_pydantic(fr_type)) is not None:
+                ModelStore.add(fr)
+                signals[name] = ModelStore.get_name(fr)
             else:
                 orig = get_origin(fr_type)
                 args = get_args(fr_type)
                 # Check if fr_type is Optional
                 if orig == Union and len(args) == 2 and (type(None) in args):
                     fr_type = args[0]
-                signals[name] = fr_type.__name__
+                signals[name] = str(fr_type.__name__)  # type: ignore[union-attr]
         return signals

     @staticmethod
@@ -115,80 +132,93 @@ class SignalSchema:
         if not isinstance(schema, dict):
             raise SignalSchemaError(f"cannot deserialize signal schema: {schema}")

-        signals: dict[str, …
+        signals: dict[str, DataType] = {}
         for signal, type_name in schema.items():
             try:
                 fr = NAMES_TO_TYPES.get(type_name)
                 if not fr:
-                    type_name, version = …
-                    fr = …
+                    type_name, version = ModelStore.parse_name_version(type_name)
+                    fr = ModelStore.get(type_name, version)
+
+                    if not fr:
+                        raise SignalSchemaError(
+                            f"cannot deserialize '{signal}': "
+                            f"unknown type '{type_name}'."
+                            f" Try to add it with `ModelStore.add({type_name})`."
+                        )
             except TypeError as err:
                 raise SignalSchemaError(
                     f"cannot deserialize '{signal}': {err}"
                 ) from err
-
-            if not fr:
-                raise SignalSchemaError(
-                    f"cannot deserialize '{signal}': unsupported type '{type_name}'"
-                )
             signals[signal] = fr

         return SignalSchema(signals)

-    def to_udf_spec(self) -> dict[str, …
+    def to_udf_spec(self) -> dict[str, type]:
         res = {}
         for path, type_, has_subtree, _ in self.get_flat_tree():
             if path[0] in self.setup_func:
                 continue
             if not has_subtree:
                 db_name = DEFAULT_DELIMITER.join(path)
-                res[db_name] = …
+                res[db_name] = convert_to_db_type(type_)
         return res

-    def row_to_objs(self, row: Sequence[Any]) -> list[ …
+    def row_to_objs(self, row: Sequence[Any]) -> list[DataType]:
         self._init_setup_values()

         objs = []
         pos = 0
         for name, fr_type in self.values.items():
-            if val := self.setup_values.get(name, None):
+            if self.setup_values and (val := self.setup_values.get(name, None)):
                 objs.append(val)
-            elif …
-                j, pos = …
-                objs.append( …
+            elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
+                j, pos = unflatten_to_json_pos(fr, row, pos)
+                objs.append(fr(**j))  # type: ignore[arg-type]
             else:
                 objs.append(row[pos])
                 pos += 1
         return objs  # type: ignore[return-value]

     def contains_file(self) -> bool:
-…
-… fr. …
-…
-…
-…
+        for type_ in self.values.values():
+            if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass(
+                fr, File
+            ):
+                return True
+
+        return False

     def slice(
         self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None
     ) -> "SignalSchema":
+        # Make new schema that combines current schema and setup signals
         setup = setup or {}
         setup_no_types = dict.fromkeys(setup.keys(), str)
-        union = self.values | setup_no_types
-…
+        union = SignalSchema(self.values | setup_no_types)
+        # Slice combined schema by keys
+        schema = {}
+        for k in keys:
+            try:
+                schema[k] = union._find_in_tree(k.split("."))
+            except SignalResolvingError:
+                pass
         return SignalSchema(schema, setup)

-    def row_to_features( …
+    def row_to_features(
+        self, row: Sequence, catalog: "Catalog", cache: bool = False
+    ) -> list[DataType]:
         res = []
         pos = 0
         for fr_cls in self.values.values():
-            if …
+            if (fr := ModelStore.to_pydantic(fr_cls)) is None:
                 res.append(row[pos])
                 pos += 1
             else:
-                json, pos = …
-                obj = …
+                json, pos = unflatten_to_json_pos(fr, row, pos)  # type: ignore[union-attr]
+                obj = fr(**json)
                 if isinstance(obj, File):
-                    obj._set_stream(catalog)
+                    obj._set_stream(catalog, caching_enabled=cache)
                 res.append(obj)
         return res

@@ -208,7 +238,7 @@ class SignalSchema:

         return SignalSchema(schema)

-    def _find_in_tree(self, path: list[str]) -> …
+    def _find_in_tree(self, path: list[str]) -> DataType:
        curr_tree = self.tree
        curr_type = None
        i = 0
@@ -265,24 +295,23 @@ class SignalSchema:
             if has_subtree and issubclass(type_, File):
                 yield ".".join(path)

-    def create_model(self, name: str) -> type[ …
+    def create_model(self, name: str) -> type[DataModel]:
         fields = {key: (value, None) for key, value in self.values.items()}

         return create_model(
             name,
-            __base__=( …
+            __base__=(DataModel,),  # type: ignore[call-overload]
             **fields,
         )

     @staticmethod
-    def _build_tree( …
-…
-…
-…
-…
-…
-…
-        return res
+    def _build_tree(
+        values: dict[str, DataType],
+    ) -> dict[str, tuple[DataType, Optional[dict]]]:
+        return {
+            name: (val, SignalSchema._build_tree_for_type(val))
+            for name, val in values.items()
+        }

     def get_flat_tree(self) -> Iterator[tuple[list[str], type, bool, int]]:
         yield from self._get_flat_tree(self.tree, [], 0)
@@ -305,27 +334,77 @@ class SignalSchema:

         if get_origin(type_) is list:
             args = get_args(type_)
-            if len(args) > 0 and …
+            if len(args) > 0 and ModelStore.is_pydantic(args[0]):
                 sub_schema = SignalSchema({"* list of": args[0]})
                 sub_schema.print_tree(indent=indent, start_at=total_indent + indent)

+    def get_headers_with_length(self):
+        paths = [
+            path for path, _, has_subtree, _ in self.get_flat_tree() if not has_subtree
+        ]
+        max_length = max([len(path) for path in paths], default=0)
+        return [
+            path + [""] * (max_length - len(path)) if len(path) < max_length else path
+            for path in paths
+        ], max_length
+
+    def __or__(self, other):
+        return self.__class__(self.values | other.values)
+
+    def __contains__(self, name: str):
+        return name in self.values
+
+    def remove(self, name: str):
+        return self.values.pop(name)
+
     @staticmethod
-    def _type_to_str(type_):
-…
+    def _type_to_str(type_):  # noqa: PLR0911
+        origin = get_origin(type_)
+
+        if origin == Union:
             args = get_args(type_)
             formatted_types = ", ".join(SignalSchema._type_to_str(arg) for arg in args)
             return f"Union[{formatted_types}]"
-        if …
+        if origin == Optional:
             args = get_args(type_)
             type_str = SignalSchema._type_to_str(args[0])
             return f"Optional[{type_str}]"
-        if …
+        if origin is list:
             args = get_args(type_)
             type_str = SignalSchema._type_to_str(args[0])
             return f"list[{type_str}]"
-        if …
+        if origin is dict:
             args = get_args(type_)
-            type_str = SignalSchema._type_to_str(args[0])
+            type_str = SignalSchema._type_to_str(args[0]) if len(args) > 0 else ""
             vals = f", {SignalSchema._type_to_str(args[1])}" if len(args) > 1 else ""
             return f"dict[{type_str}{vals}]"
+        if origin == Annotated:
+            args = get_args(type_)
+            return SignalSchema._type_to_str(args[0])
+        if origin in (Literal, LiteralEx):
+            return "Literal"
         return type_.__name__
+
+    @staticmethod
+    def _build_tree_for_type(
+        model: DataType,
+    ) -> Optional[dict[str, tuple[DataType, Optional[dict]]]]:
+        if (fr := ModelStore.to_pydantic(model)) is not None:
+            return SignalSchema._build_tree_for_model(fr)
+        return None
+
+    @staticmethod
+    def _build_tree_for_model(
+        model: type[BaseModel],
+    ) -> Optional[dict[str, tuple[DataType, Optional[dict]]]]:
+        res: dict[str, tuple[DataType, Optional[dict]]] = {}
+
+        for name, f_info in model.model_fields.items():
+            anno = f_info.annotation
+            if (fr := ModelStore.to_pydantic(anno)) is not None:
+                subtree = SignalSchema._build_tree_for_model(fr)
+            else:
+                subtree = None
+            res[name] = (anno, subtree)  # type: ignore[assignment]
+
+        return res
datachain/lib/text.py
CHANGED

@@ -1,7 +1,7 @@
-from typing import …
+from typing import Any, Callable, Optional, Union

-…
-…
+import torch
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase


 def convert_text(
@@ -9,7 +9,7 @@ def convert_text(
     tokenizer: Optional[Callable] = None,
     tokenizer_kwargs: Optional[dict[str, Any]] = None,
     encoder: Optional[Callable] = None,
-) -> Union[str, list[str], …
+) -> Union[str, list[str], torch.Tensor]:
     """
     Tokenize and otherwise transform text.

@@ -29,21 +29,10 @@ def convert_text(
         res = tokenizer(text, **tokenizer_kwargs)
     else:
         res = tokenizer(text)
-    try:
-        from transformers.tokenization_utils_base import PreTrainedTokenizerBase

-…
-            res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
-        )
-    except ImportError:
-        tokens = res
+    tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res

     if not encoder:
         return tokens

-    try:
-        import torch
-    except ImportError:
-        … "Missing dependency 'torch' needed to encode text." …
-…
     return encoder(torch.tensor(tokens))
datachain/lib/udf.py
CHANGED

@@ -1,16 +1,29 @@
-import inspect
 import sys
 import traceback
-from …
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING, Callable, Optional

-from …
+from fsspec.callbacks import DEFAULT_CALLBACK, Callback
+from pydantic import BaseModel
+
+from datachain.dataset import RowDict
+from datachain.lib.convert.flatten import flatten
+from datachain.lib.convert.unflatten import unflatten_to_json
+from datachain.lib.data_model import FileBasic
+from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query import …
+from datachain.query.batch import RowBatch
+from datachain.query.schema import ColumnParameter
+from datachain.query.udf import UDFBase as _UDFBase
+from datachain.query.udf import UDFProperties, UDFResult

 if TYPE_CHECKING:
-    from …
+    from typing_extensions import Self
+
+    from datachain.catalog import Catalog
+    from datachain.query.batch import BatchingResult


 class UdfError(DataChainParamsError):
@@ -18,10 +31,67 @@ class UdfError(DataChainParamsError):
         super().__init__(f"UDF error: {msg}")


+class UDFAdapter(_UDFBase):
+    def __init__(
+        self,
+        inner: "UDFBase",
+        properties: UDFProperties,
+    ):
+        self.inner = inner
+        super().__init__(properties)
+
+    def run(
+        self,
+        udf_inputs: "Iterable[BatchingResult]",
+        catalog: "Catalog",
+        is_generator: bool,
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable["UDFResult"]]:
+        self.inner._catalog = catalog
+        if hasattr(self.inner, "setup") and callable(self.inner.setup):
+            self.inner.setup()
+
+        for batch in udf_inputs:
+            n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
+            output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
+            processed_cb.relative_update(n_rows)
+            yield output
+
+        if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
+            self.inner.teardown()
+
+    def run_once(
+        self,
+        catalog: "Catalog",
+        arg: "BatchingResult",
+        is_generator: bool = False,
+        cache: bool = False,
+        cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterable[UDFResult]:
+        if isinstance(arg, RowBatch):
+            udf_inputs = [
+                self.bind_parameters(catalog, row, cache=cache, cb=cb)
+                for row in arg.rows
+            ]
+            udf_outputs = self.inner(udf_inputs, cache=cache, download_cb=cb)
+            return self._process_results(arg.rows, udf_outputs, is_generator)
+        if isinstance(arg, RowDict):
+            udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
+            udf_outputs = self.inner(*udf_inputs, cache=cache, download_cb=cb)
+            if not is_generator:
+                # udf_outputs is generator already if is_generator=True
+                udf_outputs = [udf_outputs]
+            return self._process_results([arg], udf_outputs, is_generator)
+        raise ValueError(f"Unexpected UDF argument: {arg}")
+
+
 class UDFBase(AbstractUDF):
     is_input_batched = False
     is_output_batched = False
     is_input_grouped = False
+    params_spec: Optional[list[str]]

     def __init__(self):
         self.params = None
@@ -48,7 +118,12 @@ class UDFBase(AbstractUDF):
         This is needed for tasks like closing connections to end-points.
         """

-    def _init( …
+    def _init(
+        self,
+        sign: UdfSignature,
+        params: SignalSchema,
+        func: Callable,
+    ):
         self.params = params
         self.output = sign.output_schema

@@ -61,20 +136,19 @@ class UDFBase(AbstractUDF):
     @classmethod
     def _create(
         cls,
-        target_class: type["UDFBase"],
         sign: UdfSignature,
         params: SignalSchema,
-    ) -> " …
+    ) -> "Self":
         if isinstance(sign.func, AbstractUDF):
-            if not isinstance(sign.func, …
+            if not isinstance(sign.func, cls):  # type: ignore[unreachable]
                 raise UdfError(
                     f"cannot create UDF: provided UDF '{sign.func.__name__}'"
-                    f" must be a child of target class '{ …
+                    f" must be a child of target class '{cls.__name__}'",
                 )
             result = sign.func
             func = None
         else:
-            result = …
+            result = cls()
             func = sign.func

         result._init(sign, params, func)
@@ -91,18 +165,21 @@ class UDFBase(AbstractUDF):
     def catalog(self):
         return self._catalog

-    def to_udf_wrapper(self, batch=1) -> …
-…
-…
+    def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
+        assert self.params_spec is not None
+        properties = UDFProperties(
+            [ColumnParameter(p) for p in self.params_spec], self.output_spec, batch
+        )
+        return UDFAdapter(self, properties)

     def validate_results(self, results, *args, **kwargs):
         return results

-    def __call__(self, *rows):
+    def __call__(self, *rows, cache, download_cb):
         if self.is_input_grouped:
-            objs = self._parse_grouped_rows(rows)
+            objs = self._parse_grouped_rows(rows[0], cache, download_cb)
         else:
-            objs = self._parse_rows(rows)
+            objs = self._parse_rows(rows, cache, download_cb)

         if not self.is_input_batched:
             objs = objs[0]
@@ -117,15 +194,19 @@ class UDFBase(AbstractUDF):
             for tuple_ in result_objs:
                 flat = []
                 for obj in tuple_:
-                    if isinstance(obj, …
-                        flat.extend( …
+                    if isinstance(obj, BaseModel):
+                        flat.extend(flatten(obj))
                     else:
                         flat.append(obj)
                 res.append(flat)
         else:
             # Generator expression is required, otherwise the value will be materialized
             res = (
-…
+                flatten(obj)
+                if isinstance(obj, BaseModel)
+                else obj
+                if isinstance(obj, tuple)
+                else (obj,)
                 for obj in result_objs
             )

@@ -139,24 +220,25 @@ class UDFBase(AbstractUDF):

         return res

-    def _parse_rows(self, rows):
+    def _parse_rows(self, rows, cache, download_cb):
         if not self.is_input_batched:
             rows = [rows]
         objs = []
         for row in rows:
             obj_row = self.params.row_to_objs(row)
             for obj in obj_row:
-                if isinstance(obj, …
-                    obj._set_stream( …
+                if isinstance(obj, FileBasic):
+                    obj._set_stream(
+                        self._catalog, caching_enabled=cache, download_cb=download_cb
+                    )
             objs.append(obj_row)
         return objs

-    def _parse_grouped_rows(self, …
-        group = rows[0]
+    def _parse_grouped_rows(self, group, cache, download_cb):
         spec_map = {}
         output_map = {}
         for name, (anno, subtree) in self.params.tree.items():
-            if …
+            if ModelStore.is_pydantic(anno):
                 length = sum(1 for _ in self.params._get_flat_tree(subtree, [], 0))
             else:
                 length = 1
@@ -169,13 +251,15 @@ class UDFBase(AbstractUDF):
                 slice = flat_obj[position : position + length]
                 position += length

-                if …
-                    obj = cls(**cls …
+                if ModelStore.is_pydantic(cls):
+                    obj = cls(**unflatten_to_json(cls, slice))
                 else:
                     obj = slice[0]

-                if isinstance(obj, …
-                    obj._set_stream( …
+                if isinstance(obj, FileBasic):
+                    obj._set_stream(
+                        self._catalog, caching_enabled=cache, download_cb=download_cb
+                    )
                 output_map[signal].append(obj)

         return list(output_map.values())
datachain/lib/udf_signature.py
CHANGED

@@ -3,7 +3,7 @@ from collections.abc import Generator, Iterator, Sequence
 from dataclasses import dataclass
 from typing import Callable, Optional, Union, get_args, get_origin

-from datachain.lib. …
+from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.utils import AbstractUDF, DataChainParamsError

@@ -29,7 +29,7 @@ class UdfSignature:
         signal_map: dict[str, Callable],
         func: Optional[Callable] = None,
         params: Union[None, str, Sequence[str]] = None,
-        output: Union[None, …
+        output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
         is_generator: bool = True,
     ) -> "UdfSignature":
         keys = ", ".join(signal_map.keys())
@@ -127,15 +127,15 @@ class UdfSignature:
                    f"output signal '{key}' has type '{type(key)}'"
                    " while 'str' is expected",
                )
-                if not …
+                if not is_chain_type(value):
                    raise UdfSignatureError(
                        chain,
                        f"output type '{value.__name__}' of signal '{key}' is not"
-                        f" supported. Please use Feature types: { …
+                        f" supported. Please use Feature types: {DataTypeNames}",
                    )

            udf_output_map = output
-        elif …
+        elif is_chain_type(output):
            udf_output_map = {signal_name: output}
        else:
            raise UdfSignatureError(
datachain/lib/webdataset.py
CHANGED

@@ -15,7 +15,7 @@ from typing import (

 from pydantic import Field

-from datachain.lib. …
+from datachain.lib.data_model import DataModel
 from datachain.lib.file import File, TarVFile
 from datachain.lib.utils import DataChainError

@@ -46,7 +46,7 @@ class UnknownFileExtensionError(WDSError):
         super().__init__(tar_stream, f"unknown extension '{ext}' for file '{name}'")


-class WDSBasic( …
+class WDSBasic(DataModel):
     file: File


@@ -75,7 +75,7 @@ class WDSAllFile(WDSBasic):
     cbor: Optional[bytes] = Field(default=None)


-class WDSReadableSubclass( …
+class WDSReadableSubclass(DataModel):
     @staticmethod
     def _reader(builder, item: tarfile.TarInfo) -> "WDSReadableSubclass":
         raise NotImplementedError