datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. The information is provided for informational purposes only.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/tar.py
CHANGED
@@ -6,12 +6,11 @@ from datachain.lib.file import File, TarVFile
 
 
 def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
-    new_parent = parent.get_full_name()
     etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
     etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
     return File(
         source=parent.source,
-        path=f"{
+        path=f"{parent.path}/{info.name}",
         version=parent.version,
         size=info.size,
         etag=etag,

datachain/lib/text.py
CHANGED
@@ -1,16 +1,17 @@
-from
+from collections.abc import Callable
+from typing import Any
 
 import torch
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 
 def convert_text(
-    text:
-    tokenizer:
-    tokenizer_kwargs:
-    encoder:
-    device:
-) ->
+    text: str | list[str],
+    tokenizer: Callable | None = None,
+    tokenizer_kwargs: dict[str, Any] | None = None,
+    encoder: Callable | None = None,
+    device: str | torch.device | None = None,
+) -> str | list[str] | torch.Tensor:
     """
     Tokenize and otherwise transform text.
 

datachain/lib/udf.py
CHANGED
@@ -1,9 +1,8 @@
-import
-import traceback
+import hashlib
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
 from contextlib import closing, nullcontext
 from functools import partial
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import attrs
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -12,11 +11,10 @@ from pydantic import BaseModel
 from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
+from datachain.hash_utils import hash_callable
 from datachain.lib.convert.flatten import flatten
-from datachain.lib.
-from datachain.lib.
-from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.progress import CombinedDownloadCallback
+from datachain.lib.file import DataModel, File
+from datachain.lib.utils import AbstractUDF, DataChainParamsError
 from datachain.query.batch import (
     Batch,
     BatchingStrategy,
@@ -42,8 +40,44 @@ T = TypeVar("T", bound=Sequence[Any])
 
 
 class UdfError(DataChainParamsError):
-
-
+    """Exception raised for UDF-related errors."""
+
+    def __init__(self, message: str) -> None:
+        self.message = message
+        super().__init__(message)
+
+    def __str__(self) -> str:
+        return f"{self.__class__.__name__!s}: {self.message!s}"
+
+    def __reduce__(self):
+        """Custom reduce method for pickling."""
+        return self.__class__, (self.message,)
+
+
+class UdfRunError(Exception):
+    """Exception raised when UDF execution fails."""
+
+    def __init__(
+        self,
+        error: Exception | str,
+        stacktrace: str | None = None,
+        udf_name: str | None = None,
+    ) -> None:
+        self.error = error
+        self.stacktrace = stacktrace
+        self.udf_name = udf_name
+        super().__init__(str(error))
+
+    def __str__(self) -> str:
+        if isinstance(self.error, UdfRunError):
+            return str(self.error)
+        if isinstance(self.error, Exception):
+            return f"{self.error.__class__.__name__!s}: {self.error!s}"
+        return f"{self.__class__.__name__!s}: {self.error!s}"
+
+    def __reduce__(self):
+        """Custom reduce method for pickling."""
+        return self.__class__, (self.error, self.stacktrace, self.udf_name)
 
 
 ColumnType = Any
@@ -56,38 +90,26 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]
 
 
-@attrs.define
-class UDFProperties:
-    udf: "UDFAdapter"
-
-    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
-        return self.udf.get_batching(use_partitioning)
-
-    @property
-    def batch(self):
-        return self.udf.batch
-
-
 @attrs.define(slots=False)
 class UDFAdapter:
     inner: "UDFBase"
     output: UDFOutputSpec
+    batch_size: int | None = None
     batch: int = 1
 
+    def hash(self) -> str:
+        return self.inner.hash()
+
     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
         if use_partitioning:
             return Partition()
+
         if self.batch == 1:
             return NoBatching()
         if self.batch > 1:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")
 
-    @property
-    def properties(self):
-        # For backwards compatibility.
-        return UDFProperties(self)
-
     def run(
         self,
         udf_fields: "Sequence[str]",
@@ -164,10 +186,31 @@ class UDFBase(AbstractUDF):
     prefetch: int = 0
 
     def __init__(self):
-        self.params:
+        self.params: SignalSchema | None = None
         self.output = None
         self._func = None
 
+    def hash(self) -> str:
+        """
+        Creates SHA hash of this UDF function. It takes into account function,
+        inputs and outputs.
+
+        For function-based UDFs, hashes self._func.
+        For class-based UDFs, hashes the process method.
+        """
+        # Hash user code: either _func (function-based) or process method (class-based)
+        func_to_hash = self._func if self._func else self.process
+
+        parts = [
+            hash_callable(func_to_hash),
+            self.params.hash() if self.params else "",
+            self.output.hash(),
+        ]
+
+        return hashlib.sha256(
+            b"".join([bytes.fromhex(part) for part in parts])
+        ).hexdigest()
+
     def process(self, *args, **kwargs):
         """Processing function that needs to be defined by user"""
         if not self._func:
@@ -188,7 +231,7 @@ class UDFBase(AbstractUDF):
         self,
         sign: "UdfSignature",
         params: "SignalSchema",
-        func:
+        func: Callable | None,
     ):
         self.params = params
         self.output = sign.output_schema
@@ -219,14 +262,31 @@ class UDFBase(AbstractUDF):
     def name(self):
         return self.__class__.__name__
 
+    @property
+    def verbose_name(self):
+        """Returns the name of the function or class that implements the UDF."""
+        if self._func and callable(self._func):
+            if hasattr(self._func, "__name__"):
+                return self._func.__name__
+            if hasattr(self._func, "__class__") and hasattr(
+                self._func.__class__, "__name__"
+            ):
+                return self._func.__class__.__name__
+        return "<unknown>"
+
     @property
     def signal_names(self) -> Iterable[str]:
         return self.output.to_udf_spec().keys()
 
-    def to_udf_wrapper(
+    def to_udf_wrapper(
+        self,
+        batch_size: int | None = None,
+        batch: int = 1,
+    ) -> UDFAdapter:
         return UDFAdapter(
             self,
             self.output.to_udf_spec(),
+            batch_size,
             batch,
         )
 
@@ -255,38 +315,37 @@ class UDFBase(AbstractUDF):
 
     def _parse_row(
         self, row_dict: RowDict, catalog: "Catalog", cache: bool, download_cb: Callback
-    ) -> list[
+    ) -> list[Any]:
         assert self.params
         row = [row_dict[p] for p in self.params.to_udf_spec()]
         obj_row = self.params.row_to_objs(row)
         for obj in obj_row:
-
-            obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
+            self._set_stream_recursive(obj, catalog, cache, download_cb)
         return obj_row
 
+    def _set_stream_recursive(
+        self, obj: Any, catalog: "Catalog", cache: bool, download_cb: Callback
+    ) -> None:
+        """Recursively set the catalog stream on all File objects within an object."""
+        if isinstance(obj, File):
+            obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
+
+        # Check all fields for nested File objects, but only for DataModel objects
+        if isinstance(obj, DataModel):
+            for field_name in type(obj).model_fields:
+                field_value = getattr(obj, field_name, None)
+                if isinstance(field_value, DataModel):
+                    self._set_stream_recursive(field_value, catalog, cache, download_cb)
+
     def _prepare_row(self, row, udf_fields, catalog, cache, download_cb):
-        row_dict = RowDict(zip(udf_fields, row))
+        row_dict = RowDict(zip(udf_fields, row, strict=False))
        return self._parse_row(row_dict, catalog, cache, download_cb)
 
     def _prepare_row_and_id(self, row, udf_fields, catalog, cache, download_cb):
-        row_dict = RowDict(zip(udf_fields, row))
+        row_dict = RowDict(zip(udf_fields, row, strict=False))
         udf_input = self._parse_row(row_dict, catalog, cache, download_cb)
         return row_dict["sys__id"], *udf_input
 
-    def process_safe(self, obj_rows):
-        try:
-            result_objs = self.process(*obj_rows)
-        except Exception as e:  # noqa: BLE001
-            msg = f"============== Error in user code: '{self.name}' =============="
-            print(msg)
-            exc_type, exc_value, exc_traceback = sys.exc_info()
-            traceback.print_exception(exc_type, exc_value, exc_traceback.tb_next)
-            print("=" * len(msg))
-            raise DataChainError(
-                f"Error in user code in class '{self.name}': {e!s}"
-            ) from None
-        return result_objs
-
 
 def noop(*args, **kwargs):
     pass
@@ -294,11 +353,11 @@ def noop(*args, **kwargs):
 
 async def _prefetch_input(
     row: T,
-    download_cb:
+    download_cb: Callback | None = None,
     after_prefetch: "Callable[[], None]" = noop,
 ) -> T:
     for obj in row:
-        if isinstance(obj, File) and await obj._prefetch(download_cb):
+        if isinstance(obj, File) and obj.path and await obj._prefetch(download_cb):
             after_prefetch()
     return row
 
@@ -317,8 +376,8 @@ def _remove_prefetched(row: T) -> None:
 def _prefetch_inputs(
     prepared_inputs: "Iterable[T]",
     prefetch: int = 0,
-    download_cb:
-    after_prefetch:
+    download_cb: Callback | None = None,
+    after_prefetch: Callable[[], None] | None = None,
     remove_prefetched: bool = False,
 ) -> "abc.Generator[T, None, None]":
     if not prefetch:
@@ -327,8 +386,9 @@ def _prefetch_inputs(
 
     if after_prefetch is None:
         after_prefetch = noop
-        if
-
+        if download_cb and hasattr(download_cb, "increment_file_count"):
+            increment_file_count: Callable[[], None] = download_cb.increment_file_count
+            after_prefetch = increment_file_count
 
     f = partial(_prefetch_input, download_cb=download_cb, after_prefetch=after_prefetch)
     mapper = AsyncMapper(f, prepared_inputs, workers=prefetch)
@@ -384,9 +444,12 @@ class Mapper(UDFBase):
 
         with closing(prepared_inputs):
             for id_, *udf_args in prepared_inputs:
-                result_objs = self.
+                result_objs = self.process(*udf_args)
                 udf_output = self._flatten_row(result_objs)
-                output = [
+                output = [
+                    {"sys__id": id_}
+                    | dict(zip(self.signal_names, udf_output, strict=False))
+                ]
                 processed_cb.relative_update(1)
                 yield output
 
@@ -394,11 +457,27 @@ class Mapper(UDFBase):
 
 
 class BatchMapper(UDFBase):
-    """Inherit from this class to pass to `DataChain.batch_map()`.
+    """Inherit from this class to pass to `DataChain.batch_map()`.
+
+    .. deprecated:: 0.29.0
+        This class is deprecated and will be removed in a future version.
+        Use `Aggregator` instead, which provides the similar functionality.
+    """
 
     is_input_batched = True
     is_output_batched = True
 
+    def __init__(self):
+        import warnings
+
+        warnings.warn(
+            "BatchMapper is deprecated and will be removed in a future version. "
+            "Use Aggregator instead, which provides the similar functionality.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        super().__init__()
+
     def run(
         self,
         udf_fields: Sequence[str],
@@ -411,24 +490,26 @@ class BatchMapper(UDFBase):
         self.setup()
 
         for batch in udf_inputs:
-            n_rows = len(batch
+            n_rows = len(batch)
             row_ids, *udf_args = zip(
                 *[
                     self._prepare_row_and_id(
                        row, udf_fields, catalog, cache, download_cb
                    )
-                    for row in batch
-                ]
+                    for row in batch
+                ],
+                strict=False,
             )
-            result_objs = list(self.
+            result_objs = list(self.process(*udf_args))
             n_objs = len(result_objs)
             assert n_objs == n_rows, (
                 f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
            )
             udf_outputs = (self._flatten_row(row) for row in result_objs)
             output = [
-                {"sys__id": row_id}
-
+                {"sys__id": row_id}
+                | dict(zip(self.signal_names, signals, strict=False))
+                for row_id, signals in zip(row_ids, udf_outputs, strict=False)
             ]
             processed_cb.relative_update(n_rows)
             yield output
@@ -461,10 +542,10 @@ class Generator(UDFBase):
         )
 
         def _process_row(row):
-            with safe_closing(self.
+            with safe_closing(self.process(*row)) as result_objs:
                 for result_obj in result_objs:
                     udf_output = self._flatten_row(result_obj)
-                    yield dict(zip(self.signal_names, udf_output))
+                    yield dict(zip(self.signal_names, udf_output, strict=False))
 
         prepared_inputs = _prepare_rows(udf_inputs)
         prepared_inputs = _prefetch_inputs(
@@ -474,8 +555,9 @@ class Generator(UDFBase):
             remove_prefetched=bool(self.prefetch) and not cache,
         )
         with closing(prepared_inputs):
-            for row in
+            for row in prepared_inputs:
                 yield _process_row(row)
+                processed_cb.relative_update(1)
 
         self.teardown()
 
@@ -488,7 +570,7 @@ class Aggregator(UDFBase):
 
     def run(
         self,
-        udf_fields:
+        udf_fields: Sequence[str],
         udf_inputs: Iterable[RowsOutputBatch],
         catalog: "Catalog",
         cache: bool,
@@ -498,16 +580,22 @@ class Aggregator(UDFBase):
         self.setup()
 
         for batch in udf_inputs:
-
-
-
-
-
-
-
+            prepared_rows = [
+                self._prepare_row(row, udf_fields, catalog, cache, download_cb)
+                for row in batch
+            ]
+            batched_args = zip(*prepared_rows, strict=False)
+            # Convert aggregated column values to lists. This keeps behavior
+            # consistent with the type hints promoted in the public API.
+            udf_args = [
+                list(arg) if isinstance(arg, tuple) else arg for arg in batched_args
+            ]
+            result_objs = self.process(*udf_args)
             udf_outputs = (self._flatten_row(row) for row in result_objs)
-            output = (
-
+            output = (
+                dict(zip(self.signal_names, row, strict=False)) for row in udf_outputs
+            )
+            processed_cb.relative_update(len(batch))
             yield output
 
         self.teardown()

CHANGED
@@ -1,12 +1,12 @@
 import inspect
-from collections.abc import Generator, Iterator, Sequence
+from collections.abc import Callable, Generator, Iterator, Sequence
 from dataclasses import dataclass
-from typing import Any,
+from typing import Any, get_args, get_origin
 
 from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import UDFBase
-from datachain.lib.utils import AbstractUDF, DataChainParamsError
+from datachain.lib.utils import AbstractUDF, DataChainParamsError, callable_name
 
 
 class UdfSignatureError(DataChainParamsError):
@@ -16,9 +16,9 @@ class UdfSignatureError(DataChainParamsError):
 
 
 @dataclass
-class UdfSignature:
-    func:
-    params: dict[str,
+class UdfSignature:  # noqa: PLW1641
+    func: Callable | UDFBase
+    params: dict[str, DataType | Any]
     output_schema: SignalSchema
 
     DEFAULT_RETURN_TYPE = str
@@ -28,24 +28,29 @@ class UdfSignature:
         cls,
         chain: str,
         signal_map: dict[str, Callable],
-        func:
-        params:
-        output:
+        func: UDFBase | Callable | None = None,
+        params: str | Sequence[str] | None = None,
+        output: DataType | Sequence[str] | dict[str, DataType] | None = None,
         is_generator: bool = True,
     ) -> "UdfSignature":
         keys = ", ".join(signal_map.keys())
         if len(signal_map) > 1:
             raise UdfSignatureError(
                 chain,
-
-
+                (
+                    f"multiple signals '{keys}' are not supported in processors."
+                    " Chain multiple processors instead.",
+                ),
             )
-        udf_func:
+        udf_func: UDFBase | Callable
         if len(signal_map) == 1:
             if func is not None:
                 raise UdfSignatureError(
                     chain,
-
+                    (
+                        "processor can't have signal "
+                        f"'{keys}' with function '{callable_name(func)}'"
+                    ),
                 )
             signal_name, udf_func = next(iter(signal_map.items()))
         else:
@@ -56,13 +61,16 @@ class UdfSignature:
             signal_name = None
 
         if not isinstance(udf_func, UDFBase) and not callable(udf_func):
-            raise UdfSignatureError(
+            raise UdfSignatureError(
+                chain,
+                f"UDF '{callable_name(udf_func)}' is not callable",
+            )
 
         func_params_map_sign, func_outs_sign, is_iterator = cls._func_signature(
             chain, udf_func
         )
 
-        udf_params: dict[str,
+        udf_params: dict[str, DataType | Any] = {}
         if params:
             udf_params = (
                 {params: Any} if isinstance(params, str) else dict.fromkeys(params, Any)
@@ -76,14 +84,15 @@ class UdfSignature:
             }
 
         if output:
+            # Use the actual resolved function (udf_func) for clearer error messages
             udf_output_map = UdfSignature._validate_output(
-                chain, signal_name,
+                chain, signal_name, udf_func, func_outs_sign, output
             )
         else:
             if not func_outs_sign:
                 raise UdfSignatureError(
                     chain,
-                    f"outputs are not defined in function '{udf_func}'"
+                    f"outputs are not defined in function '{callable_name(udf_func)}'"
                     " hints or 'output'",
                 )
 
@@ -97,9 +106,12 @@ class UdfSignature:
         if is_generator and not is_iterator:
             raise UdfSignatureError(
                 chain,
-
-
-
+                (
+                    f"function '{callable_name(udf_func)}' cannot be used in "
+                    "generator/aggregator because it returns a type that is "
+                    "not Iterator/Generator. "
+                    f"Instead, it returns '{func_outs_sign}'"
+                ),
             )
 
         if isinstance(func_outs_sign, tuple):
@@ -124,11 +136,14 @@ class UdfSignature:
             if len(func_outs_sign) != len(output):
                 raise UdfSignatureError(
                     chain,
-
-
+                    (
+                        f"length of outputs names ({len(output)}) and function "
+                        f"'{callable_name(func)}' return type length "
+                        f"({len(func_outs_sign)}) does not match"
+                    ),
                 )
 
-            udf_output_map = dict(zip(output, func_outs_sign))
+            udf_output_map = dict(zip(output, func_outs_sign, strict=False))
         elif isinstance(output, dict):
             for key, value in output.items():
                 if not isinstance(key, str):
@@ -164,7 +179,7 @@ class UdfSignature:
 
     @staticmethod
     def _func_signature(
-        chain: str, udf_func:
+        chain: str, udf_func: Callable | UDFBase
     ) -> tuple[dict[str, type], Sequence[type], bool]:
         if isinstance(udf_func, AbstractUDF):
             func = udf_func.process  # type: ignore[unreachable]
@@ -183,17 +198,27 @@ class UdfSignature:
         orig = get_origin(anno)
         if inspect.isclass(orig) and issubclass(orig, Iterator):
             args = get_args(anno)
-
-
-
-
-
-
-
-            )
-
-
-
+            # For typing.Iterator without type args, default to DEFAULT_RETURN_TYPE
+            if len(args) == 0:
+                is_iterator = True
+                anno = UdfSignature.DEFAULT_RETURN_TYPE
+                orig = get_origin(anno)
+            else:
+                # typing.Generator[T, S, R] has 3 args; allow that shape
+                if len(args) > 1 and not (
+                    issubclass(orig, Generator) and len(args) == 3
+                ):
+                    raise UdfSignatureError(
+                        chain,
+                        (
+                            f"function '{callable_name(func)}' should return "
+                            "iterator with a single value while "
+                            f"'{args}' are specified"
+                        ),
+                    )
+                is_iterator = True
+                anno = args[0]
+                orig = get_origin(anno)
 
         if orig and orig is tuple:
             output_types = tuple(get_args(anno))  # type: ignore[assignment]