datachain 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)
  1. datachain/__init__.py +3 -4
  2. datachain/cache.py +10 -4
  3. datachain/catalog/catalog.py +35 -15
  4. datachain/cli.py +37 -32
  5. datachain/data_storage/metastore.py +24 -0
  6. datachain/data_storage/warehouse.py +3 -1
  7. datachain/job.py +56 -0
  8. datachain/lib/arrow.py +19 -7
  9. datachain/lib/clip.py +89 -66
  10. datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
  11. datachain/lib/convert/sql_to_python.py +23 -0
  12. datachain/lib/convert/values_to_tuples.py +51 -33
  13. datachain/lib/data_model.py +6 -27
  14. datachain/lib/dataset_info.py +70 -0
  15. datachain/lib/dc.py +646 -152
  16. datachain/lib/file.py +117 -15
  17. datachain/lib/image.py +1 -1
  18. datachain/lib/meta_formats.py +14 -2
  19. datachain/lib/model_store.py +3 -2
  20. datachain/lib/pytorch.py +10 -7
  21. datachain/lib/signal_schema.py +39 -14
  22. datachain/lib/text.py +2 -1
  23. datachain/lib/udf.py +56 -5
  24. datachain/lib/udf_signature.py +1 -1
  25. datachain/lib/webdataset.py +4 -3
  26. datachain/node.py +11 -8
  27. datachain/query/dataset.py +66 -147
  28. datachain/query/dispatch.py +15 -13
  29. datachain/query/schema.py +2 -0
  30. datachain/query/session.py +4 -4
  31. datachain/sql/functions/array.py +12 -0
  32. datachain/sql/functions/string.py +8 -0
  33. datachain/torch/__init__.py +1 -1
  34. datachain/utils.py +45 -0
  35. datachain-0.2.12.dist-info/METADATA +412 -0
  36. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/RECORD +40 -45
  37. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
  38. datachain/lib/feature_registry.py +0 -77
  39. datachain/lib/gpt4_vision.py +0 -97
  40. datachain/lib/hf_image_to_text.py +0 -97
  41. datachain/lib/hf_pipeline.py +0 -90
  42. datachain/lib/image_transform.py +0 -103
  43. datachain/lib/iptc_exif_xmp.py +0 -76
  44. datachain/lib/unstructured.py +0 -41
  45. datachain/text/__init__.py +0 -3
  46. datachain-0.2.10.dist-info/METADATA +0 -430
  47. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
  48. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
  49. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
datachain/lib/file.py CHANGED
@@ -1,5 +1,7 @@
  import io
  import json
+ import os
+ import posixpath
  from abc import ABC, abstractmethod
  from contextlib import contextmanager
  from datetime import datetime
@@ -16,7 +18,7 @@ from pydantic import Field, field_validator

  from datachain.cache import UniqueId
  from datachain.client.fileslice import FileSlice
- from datachain.lib.data_model import DataModel, FileBasic
+ from datachain.lib.data_model import DataModel
  from datachain.lib.utils import DataChainError
  from datachain.sql.types import JSON, Int, String
  from datachain.utils import TIME_ZERO
@@ -24,6 +26,9 @@ from datachain.utils import TIME_ZERO
  if TYPE_CHECKING:
      from datachain.catalog import Catalog

+ # how to create file path when exporting
+ ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
+

  class VFileError(DataChainError):
      def __init__(self, file: "File", message: str, vtype: str = ""):
@@ -49,12 +54,15 @@ class VFile(ABC):


  class TarVFile(VFile):
+     """Virtual file model for files extracted from tar archives."""
+
      @classmethod
      def get_vtype(cls) -> str:
          return "tar"

      @classmethod
      def open(cls, file: "File", location: list[dict]):
+         """Stream file from tar archive based on location in archive."""
          if len(location) > 1:
              VFileError(file, "multiple 'location's are not supported yet")

@@ -100,7 +108,9 @@ class VFileRegistry:
          return reader.open(file, location)


- class File(FileBasic):
+ class File(DataModel):
+     """`DataModel` for reading binary files."""
+
      source: str = Field(default="")
      parent: str = Field(default="")
      name: str
@@ -127,14 +137,17 @@ class File(FileBasic):
          "source",
          "parent",
          "name",
-         "etag",
          "size",
+         "etag",
+         "version",
+         "is_latest",
          "vtype",
          "location",
+         "last_modified",
      ]

      @staticmethod
-     def to_dict(
+     def _validate_dict(
          v: Optional[Union[str, dict, list[dict]]],
      ) -> Optional[Union[str, dict, list[dict]]]:
          if v is None or v == "":
@@ -152,7 +165,7 @@ class File(FileBasic):
      @field_validator("location", mode="before")
      @classmethod
      def validate_location(cls, v):
-         return File.to_dict(v)
+         return File._validate_dict(v)

      @field_validator("parent", mode="before")
      @classmethod
@@ -172,9 +185,10 @@ class File(FileBasic):
          self._caching_enabled = False

      @contextmanager
-     def open(self):
+     def open(self, mode: Literal["rb", "r"] = "rb"):
+         """Open the file and return a file object."""
          if self.location:
-             with VFileRegistry.resolve(self, self.location) as f:
+             with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                  yield f

          uid = self.get_uid()
@@ -184,7 +198,41 @@ class File(FileBasic):
          with client.open_object(
              uid, use_cache=self._caching_enabled, cb=self._download_cb
          ) as f:
-             yield f
+             yield io.TextIOWrapper(f) if mode == "r" else f
+
+     def read(self, length: int = -1):
+         """Returns file contents."""
+         with self.open() as stream:
+             return stream.read(length)
+
+     def read_bytes(self):
+         """Returns file contents as bytes."""
+         return self.read()
+
+     def read_text(self):
+         """Returns file contents as text."""
+         with self.open(mode="r") as stream:
+             return stream.read()
+
+     def write(self, destination: str):
+         """Writes it's content to destination"""
+         with open(destination, mode="wb") as f:
+             f.write(self.read())
+
+     def export(
+         self,
+         output: str,
+         placement: ExportPlacement = "fullpath",
+         use_cache: bool = True,
+     ) -> None:
+         """Export file to new location."""
+         if use_cache:
+             self._caching_enabled = use_cache
+         dst = self.get_destination_path(output, placement)
+         dst_dir = os.path.dirname(dst)
+         os.makedirs(dst_dir, exist_ok=True)
+
+         self.write(dst)

      def _set_stream(
          self,
@@ -197,11 +245,12 @@ class File(FileBasic):
          self._download_cb = download_cb

      def get_uid(self) -> UniqueId:
+         """Returns unique ID for file."""
          dump = self.model_dump()
          return UniqueId(*(dump[k] for k in self._unique_id_keys))

      def get_local_path(self) -> Optional[str]:
-         """Get path to a file in a local cache.
+         """Returns path to a file in a local cache.
          Return None if file is not cached. Throws an exception if cache is not setup."""
          if self._catalog is None:
              raise RuntimeError(
@@ -210,21 +259,27 @@ class File(FileBasic):
          return self._catalog.cache.get_path(self.get_uid())

      def get_file_suffix(self):
+         """Returns last part of file name with `.`."""
          return Path(self.name).suffix

      def get_file_ext(self):
+         """Returns last part of file name without `.`."""
          return Path(self.name).suffix.strip(".")

      def get_file_stem(self):
+         """Returns file name without extension."""
          return Path(self.name).stem

      def get_full_name(self):
+         """Returns name with parent directories."""
          return (Path(self.parent) / self.name).as_posix()

      def get_uri(self):
+         """Returns file URI."""
          return f"{self.source}/{self.get_full_name()}"

      def get_path(self) -> str:
+         """Returns file path."""
          path = unquote(self.get_uri())
          fs = self.get_fs()
          if isinstance(fs, LocalFileSystem):
@@ -233,21 +288,65 @@ class File(FileBasic):
          path = url2pathname(path)
          return path

+     def get_destination_path(self, output: str, placement: ExportPlacement) -> str:
+         """
+         Returns full destination path of a file for exporting to some output
+         based on export placement
+         """
+         if placement == "filename":
+             path = unquote(self.name)
+         elif placement == "etag":
+             path = f"{self.etag}{self.get_file_suffix()}"
+         elif placement == "fullpath":
+             fs = self.get_fs()
+             if isinstance(fs, LocalFileSystem):
+                 path = unquote(self.get_full_name())
+             else:
+                 path = (
+                     Path(urlparse(self.source).netloc) / unquote(self.get_full_name())
+                 ).as_posix()
+         elif placement == "checksum":
+             raise NotImplementedError("Checksum placement not implemented yet")
+         else:
+             raise ValueError(f"Unsupported file export placement: {placement}")
+         return posixpath.join(output, path)  # type: ignore[union-attr]
+
      def get_fs(self):
+         """Returns `fsspec` filesystem for the file."""
          return self._catalog.get_client(self.source).fs


  class TextFile(File):
+     """`DataModel` for reading text files."""
+
      @contextmanager
      def open(self):
-         with super().open() as binary:
-             yield io.TextIOWrapper(binary)
+         """Open the file and return a file object in text mode."""
+         with super().open(mode="r") as stream:
+             yield stream
+
+     def read_text(self):
+         """Returns file contents as text."""
+         with self.open() as stream:
+             return stream.read()
+
+     def write(self, destination: str):
+         """Writes it's content to destination"""
+         with open(destination, mode="w") as f:
+             f.write(self.read_text())


  class ImageFile(File):
-     def get_value(self):
-         value = super().get_value()
-         return Image.open(BytesIO(value))
+     """`DataModel` for reading image files."""
+
+     def read(self):
+         """Returns `PIL.Image.Image` object."""
+         fobj = super().read()
+         return Image.open(BytesIO(fobj))
+
+     def write(self, destination: str):
+         """Writes it's content to destination"""
+         self.read().save(destination)


  def get_file(type_: Literal["binary", "text", "image"] = "binary"):
@@ -282,7 +381,10 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):


  class IndexedFile(DataModel):
-     """File source info for tables."""
+     """Metadata indexed from tabular files.
+
+     Includes `file` and `index` signals.
+     """

      file: File
      index: int
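Note: the `File` model above replaces the old `get_value()`-style access with explicit `read()`, `read_bytes()`, `read_text()`, `write()`, and `export()` helpers. A minimal illustrative sketch of the export path follows; the bucket URI is borrowed from the UDF example later in this diff, while the output directory and the assumption that the file object is the row's only signal are hypothetical, not taken from this release.

```py
from datachain.lib.dc import DataChain

# Sketch only: copy the first few files out of a bucket with File.export().
chain = DataChain.from_storage("gs://datachain-demo/fashion-product-images/images")

for row in chain.limit(5).collect():
    file = row[0]  # assumes the File object is the only signal in each row
    # placement="filename" keeps just the file name; "fullpath" (the default)
    # keeps the bucket netloc plus the full path; "etag" names it after the etag.
    file.export("./exported", placement="filename")
```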
datachain/lib/image.py CHANGED
@@ -53,7 +53,7 @@ def convert_images(
      Resize, transform, and otherwise convert one or more images.

      Args:
-         img (Image, list[Image]): PIL.Image object or list of objects.
+         images (Image, list[Image]): PIL.Image object or list of objects.
          mode (str): PIL.Image mode.
          size (tuple[int, int]): Size in (width, height) pixels for resizing.
          transform (Callable): Torchvision transform or huggingface processor to apply.
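For reference, a hedged sketch of calling `convert_images()` with the renamed `images` argument; the in-memory images and target size below are made up for illustration.

```py
from PIL import Image

from datachain.lib.image import convert_images

# Illustrative only: resize two blank in-memory images to a common size.
imgs = [Image.new("RGB", (640, 480)), Image.new("RGB", (800, 600))]
resized = convert_images(imgs, size=(224, 224))
```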
datachain/lib/meta_formats.py CHANGED
@@ -13,6 +13,7 @@ from typing import Any, Callable
  import jmespath as jsp
  from pydantic import ValidationError

+ from datachain.lib.data_model import ModelStore  # noqa: F401
  from datachain.lib.file import File


@@ -86,6 +87,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
      except subprocess.CalledProcessError as e:
          model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
      print(f"{model_output}")
+     print("\n" + f"ModelStore.register({model_name})" + "\n")
+     print("\n" + f"spec={model_name}" + "\n")
      return model_output


@@ -99,6 +102,7 @@ def read_meta( # noqa: C901
      jmespath=None,
      show_schema=False,
      model_name=None,
+     nrows=None,
  ) -> Callable:
      from datachain.lib.dc import DataChain

@@ -118,8 +122,7 @@ def read_meta( # noqa: C901
                      output=str,
                  )
              )
-             # dummy executor (#1616)
-             chain.save()
+             chain.exec()
          finally:
              sys.stdout = current_stdout
          model_output = captured_output.getvalue()
@@ -147,6 +150,7 @@ def read_meta( # noqa: C901
          DataModel=spec,  # noqa: N803
          meta_type=meta_type,
          jmespath=jmespath,
+         nrows=nrows,
      ) -> Iterator[spec]:
          def validator(json_object: dict) -> spec:
              json_string = json.dumps(json_object)
@@ -175,14 +179,22 @@ def read_meta( # noqa: C901
                  yield from validator(json_object)

              else:
+                 nrow = 0
                  for json_dict in json_object:
+                     nrow = nrow + 1
+                     if nrows is not None and nrow > nrows:
+                         return
                      yield from validator(json_dict)

          if meta_type == "jsonl":
              try:
+                 nrow = 0
                  with file.open() as fd:
                      data_string = fd.readline().replace("\r", "")
                      while data_string:
+                         nrow = nrow + 1
+                         if nrows is not None and nrow > nrows:
+                             return
                          json_object = process_json(data_string, jmespath)
                          data_string = fd.readline()
                          yield from validator(json_object)
datachain/lib/model_store.py CHANGED
@@ -22,7 +22,8 @@ class ModelStore:
          return model.__name__

      @classmethod
-     def add(cls, fr: type):
+     def register(cls, fr: type):
+         """Register a class as a data model for deserialization."""
          if (model := ModelStore.to_pydantic(fr)) is None:
              return

@@ -34,7 +35,7 @@

          for f_info in model.model_fields.values():
              if (anno := ModelStore.to_pydantic(f_info.annotation)) is not None:
-                 cls.add(anno)
+                 cls.register(anno)

      @classmethod
      def get(cls, name: str, version: Optional[int] = None) -> Optional[type]:
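The rename from `ModelStore.add()` to `ModelStore.register()` matches the hint now printed by the schema reader and by the deserialization error in `signal_schema.py`. A minimal sketch of registering a custom model follows; the `BBox` class is invented for illustration.

```py
from pydantic import BaseModel

from datachain.lib.model_store import ModelStore


class BBox(BaseModel):
    x: float
    y: float
    width: float
    height: float


# Register the model so a saved signal schema that references "BBox"
# can be deserialized back into this class.
ModelStore.register(BBox)
```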
datachain/lib/pytorch.py CHANGED
@@ -3,7 +3,6 @@ from collections.abc import Iterator
  from typing import TYPE_CHECKING, Any, Callable, Optional

  from PIL import Image
- from pydantic import BaseModel
  from torch import float32
  from torch.distributed import get_rank, get_world_size
  from torch.utils.data import IterableDataset, get_worker_info
@@ -11,6 +10,7 @@ from torchvision.transforms import v2

  from datachain.catalog import Catalog, get_catalog
  from datachain.lib.dc import DataChain
+ from datachain.lib.file import File
  from datachain.lib.text import convert_text

  if TYPE_CHECKING:
@@ -24,6 +24,7 @@ DEFAULT_TRANSFORM = v2.Compose([v2.ToImage(), v2.ToDtype(float32, scale=True)])


  def label_to_int(value: str, classes: list) -> int:
+     """Given a value and list of classes, return the index of the value's class."""
      return classes.index(value)


@@ -33,7 +34,7 @@ class PytorchDataset(IterableDataset):
          name: str,
          version: Optional[int] = None,
          catalog: Optional["Catalog"] = None,
-         transform: Optional["Transform"] = DEFAULT_TRANSFORM,
+         transform: Optional["Transform"] = None,
          tokenizer: Optional[Callable] = None,
          tokenizer_kwargs: Optional[dict[str, Any]] = None,
          num_samples: int = 0,
@@ -41,6 +42,9 @@
          """
          Pytorch IterableDataset that streams DataChain datasets.

+         See Also:
+             `DataChain.to_pytorch()` - convert chain to PyTorch Dataset.
+
          Args:
              name (str): Name of DataChain dataset to stream.
              version (int): Version of DataChain dataset to stream.
@@ -53,7 +57,7 @@ class PytorchDataset(IterableDataset):
          """
          self.name = name
          self.version = version
-         self.transform = transform
+         self.transform = transform or DEFAULT_TRANSFORM
          self.tokenizer = tokenizer
          self.tokenizer_kwargs = tokenizer_kwargs or {}
          self.num_samples = num_samples
@@ -90,12 +94,11 @@ class PytorchDataset(IterableDataset):
          if self.num_samples > 0:
              ds = ds.sample(self.num_samples)
          ds = ds.chunk(total_rank, total_workers)
-         stream = ds.iterate()
-         for row_features in stream:
+         for row_features in ds.collect():
              row = []
              for fr in row_features:
-                 if isinstance(fr, BaseModel):
-                     row.append(fr.get_value())  # type: ignore[unreachable]
+                 if isinstance(fr, File):
+                     row.append(fr.read())  # type: ignore[unreachable]
                  else:
                      row.append(fr)
              # Apply transforms
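With `transform` now defaulting to `None` (and falling back to `DEFAULT_TRANSFORM` inside `__init__`), a `PytorchDataset` can be wrapped in a standard `DataLoader`. A sketch under assumptions: the dataset name, batch size, and worker count below are placeholders, and the docstring above points at `DataChain.to_pytorch()` as the higher-level entry point.

```py
from torch.utils.data import DataLoader

from datachain.lib.pytorch import PytorchDataset

# Placeholder dataset name; PytorchDataset streams saved dataset rows and
# calls File.read() on any File signals before applying the transform.
ds = PytorchDataset("fashion-images", version=1)
loader = DataLoader(ds, batch_size=16, num_workers=2)

for batch in loader:
    ...  # training / inference step
```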
datachain/lib/signal_schema.py CHANGED
@@ -18,7 +18,8 @@ from pydantic import BaseModel, create_model
  from typing_extensions import Literal as LiteralEx

  from datachain.lib.convert.flatten import DATACHAIN_TO_TYPE
- from datachain.lib.convert.type_converter import convert_to_db_type
+ from datachain.lib.convert.python_to_sql import python_to_sql
+ from datachain.lib.convert.sql_to_python import sql_to_python
  from datachain.lib.convert.unflatten import unflatten_to_json_pos
  from datachain.lib.data_model import DataModel, DataType
  from datachain.lib.file import File
@@ -102,21 +103,20 @@ class SignalSchema:
      @staticmethod
      def from_column_types(col_types: dict[str, Any]) -> "SignalSchema":
          signals: dict[str, DataType] = {}
-         for field, type_ in col_types.items():
-             type_ = DATACHAIN_TO_TYPE.get(type_, None)
-             if type_ is None:
+         for field, col_type in col_types.items():
+             if (py_type := DATACHAIN_TO_TYPE.get(col_type, None)) is None:
                  raise SignalSchemaError(
                      f"signal schema cannot be obtained for column '{field}':"
-                     f" unsupported type '{type_}'"
+                     f" unsupported type '{py_type}'"
                  )
-             signals[field] = type_
+             signals[field] = py_type
          return SignalSchema(signals)

      def serialize(self) -> dict[str, str]:
          signals = {}
          for name, fr_type in self.values.items():
              if (fr := ModelStore.to_pydantic(fr_type)) is not None:
-                 ModelStore.add(fr)
+                 ModelStore.register(fr)
                  signals[name] = ModelStore.get_name(fr)
              else:
                  orig = get_origin(fr_type)
@@ -143,8 +143,8 @@
                  if not fr:
                      raise SignalSchemaError(
                          f"cannot deserialize '{signal}': "
-                         f"unregistered type '{type_name}'."
-                         f" Try to register it with `Registry.add({type_name})`."
+                         f"unknown type '{type_name}'."
+                         f" Try to add it with `ModelStore.register({type_name})`."
                      )
              except TypeError as err:
                  raise SignalSchemaError(
@@ -161,7 +161,7 @@
                  continue
              if not has_subtree:
                  db_name = DEFAULT_DELIMITER.join(path)
-                 res[db_name] = convert_to_db_type(type_)
+                 res[db_name] = python_to_sql(type_)
          return res

      def row_to_objs(self, row: Sequence[Any]) -> list[DataType]:
@@ -192,10 +192,17 @@
      def slice(
          self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None
      ) -> "SignalSchema":
+         # Make new schema that combines current schema and setup signals
          setup = setup or {}
          setup_no_types = dict.fromkeys(setup.keys(), str)
-         union = self.values | setup_no_types
-         schema = {k: union[k] for k in keys if k in union}
+         union = SignalSchema(self.values | setup_no_types)
+         # Slice combined schema by keys
+         schema = {}
+         for k in keys:
+             try:
+                 schema[k] = union._find_in_tree(k.split("."))
+             except SignalResolvingError:
+                 pass
          return SignalSchema(schema, setup)

      def row_to_features(
@@ -271,6 +278,14 @@
              del schema[signal]
          return SignalSchema(schema)

+     def mutate(self, args_map: dict) -> "SignalSchema":
+         return SignalSchema(self.values | sql_to_python(args_map))
+
+     def clone_without_sys_signals(self) -> "SignalSchema":
+         schema = copy.deepcopy(self.values)
+         schema.pop("sys", None)
+         return SignalSchema(schema)
+
      def merge(
          self,
          right_schema: "SignalSchema",
@@ -283,9 +298,9 @@

          return SignalSchema(self.values | schema_right)

-     def get_file_signals(self) -> Iterator[str]:
+     def get_signals(self, target_type: type[DataModel]) -> Iterator[str]:
          for path, type_, has_subtree, _ in self.get_flat_tree():
-             if has_subtree and issubclass(type_, File):
+             if has_subtree and issubclass(type_, target_type):
                  yield ".".join(path)

      def create_model(self, name: str) -> type[DataModel]:
@@ -331,6 +346,16 @@
                      sub_schema = SignalSchema({"* list of": args[0]})
                      sub_schema.print_tree(indent=indent, start_at=total_indent + indent)

+     def get_headers_with_length(self):
+         paths = [
+             path for path, _, has_subtree, _ in self.get_flat_tree() if not has_subtree
+         ]
+         max_length = max([len(path) for path in paths], default=0)
+         return [
+             path + [""] * (max_length - len(path)) if len(path) < max_length else path
+             for path in paths
+         ], max_length
+
      def __or__(self, other):
          return self.__class__(self.values | other.values)

datachain/lib/text.py CHANGED
@@ -31,8 +31,9 @@ def convert_text(
      res = tokenizer(text)

      tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
+     tokens = torch.tensor(tokens)

      if not encoder:
          return tokens

-     return encoder(torch.tensor(tokens))
+     return encoder(tokens)
datachain/lib/udf.py CHANGED
@@ -9,7 +9,7 @@ from pydantic import BaseModel
  from datachain.dataset import RowDict
  from datachain.lib.convert.flatten import flatten
  from datachain.lib.convert.unflatten import unflatten_to_json
- from datachain.lib.data_model import FileBasic
+ from datachain.lib.file import File
  from datachain.lib.model_store import ModelStore
  from datachain.lib.signal_schema import SignalSchema
  from datachain.lib.udf_signature import UdfSignature
@@ -88,6 +88,53 @@ class UDFAdapter(_UDFBase):


  class UDFBase(AbstractUDF):
+     """Base class for stateful user-defined functions.
+
+     Any class that inherits from it must have a `process()` method that takes input
+     params from one or more rows in the chain and produces the expected output.
+
+     Optionally, the class may include these methods:
+     - `setup()` to run code on each worker before `process()` is called.
+     - `teardown()` to run code on each worker after `process()` completes.
+
+     Example:
+         ```py
+         from datachain import C, DataChain, Mapper
+         import open_clip
+
+         class ImageEncoder(Mapper):
+             def __init__(self, model_name: str, pretrained: str):
+                 self.model_name = model_name
+                 self.pretrained = pretrained
+
+             def setup(self):
+                 self.model, _, self.preprocess = (
+                     open_clip.create_model_and_transforms(
+                         self.model_name, self.pretrained
+                     )
+                 )
+
+             def process(self, file) -> list[float]:
+                 img = file.get_value()
+                 img = self.preprocess(img).unsqueeze(0)
+                 emb = self.model.encode_image(img)
+                 return emb[0].tolist()
+
+         (
+             DataChain.from_storage(
+                 "gs://datachain-demo/fashion-product-images/images", type="image"
+             )
+             .limit(5)
+             .map(
+                 ImageEncoder("ViT-B-32", "laion2b_s34b_b79k"),
+                 params=["file"],
+                 output={"emb": list[float]},
+             )
+             .show()
+         )
+         ```
+     """
+
      is_input_batched = False
      is_output_batched = False
      is_input_grouped = False
@@ -198,7 +245,7 @@
                          flat.extend(flatten(obj))
                      else:
                          flat.append(obj)
-                 res.append(flat)
+                 res.append(tuple(flat))
          else:
              # Generator expression is required, otherwise the value will be materialized
              res = (
@@ -227,7 +274,7 @@
          for row in rows:
              obj_row = self.params.row_to_objs(row)
              for obj in obj_row:
-                 if isinstance(obj, FileBasic):
+                 if isinstance(obj, File):
                      obj._set_stream(
                          self._catalog, caching_enabled=cache, download_cb=download_cb
                      )
@@ -256,7 +303,7 @@
              else:
                  obj = slice[0]

-             if isinstance(obj, FileBasic):
+             if isinstance(obj, File):
                  obj._set_stream(
                      self._catalog, caching_enabled=cache, download_cb=download_cb
                  )
@@ -280,7 +327,7 @@


  class Mapper(UDFBase):
-     pass
+     """Inherit from this class to pass to `DataChain.map()`."""


  class BatchMapper(Mapper):
@@ -289,10 +336,14 @@ class BatchMapper(Mapper):


  class Generator(UDFBase):
+     """Inherit from this class to pass to `DataChain.gen()`."""
+
      is_output_batched = True


  class Aggregator(UDFBase):
+     """Inherit from this class to pass to `DataChain.agg()`."""
+
      is_input_batched = True
      is_output_batched = True
      is_input_grouped = True
datachain/lib/udf_signature.py CHANGED
@@ -131,7 +131,7 @@ class UdfSignature:
                      raise UdfSignatureError(
                          chain,
                          f"output type '{value.__name__}' of signal '{key}' is not"
-                         f" supported. Please use Feature types: {DataTypeNames}",
+                         f" supported. Please use DataModel types: {DataTypeNames}",
                      )

              udf_output_map = output