datachain 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/data_storage/metastore.py +0 -4
- datachain/data_storage/schema.py +7 -3
- datachain/data_storage/sqlite.py +22 -4
- datachain/data_storage/warehouse.py +25 -26
- datachain/lib/arrow.py +27 -8
- datachain/lib/convert/flatten.py +10 -5
- datachain/lib/convert/python_to_sql.py +1 -1
- datachain/lib/convert/values_to_tuples.py +4 -1
- datachain/lib/data_model.py +6 -1
- datachain/lib/dc.py +206 -29
- datachain/lib/file.py +6 -11
- datachain/lib/meta_formats.py +12 -11
- datachain/lib/settings.py +1 -17
- datachain/lib/udf.py +18 -10
- datachain/query/dataset.py +24 -65
- datachain/sql/sqlite/base.py +3 -3
- datachain/sql/sqlite/types.py +5 -13
- datachain/sql/types.py +5 -1
- {datachain-0.2.13.dist-info → datachain-0.2.15.dist-info}/METADATA +2 -3
- {datachain-0.2.13.dist-info → datachain-0.2.15.dist-info}/RECORD +24 -24
- {datachain-0.2.13.dist-info → datachain-0.2.15.dist-info}/WHEEL +1 -1
- {datachain-0.2.13.dist-info → datachain-0.2.15.dist-info}/LICENSE +0 -0
- {datachain-0.2.13.dist-info → datachain-0.2.15.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.13.dist-info → datachain-0.2.15.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
@@ -33,6 +33,7 @@ from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import (
     Aggregator,
+    BatchMapper,
     Generator,
     Mapper,
     UDFBase,
@@ -192,6 +193,8 @@ class DataChain(DatasetQuery):
         ```
     """

+    max_row_count: Optional[int] = None
+
     DEFAULT_FILE_RECORD: ClassVar[dict] = {
         "source": "",
         "name": "",
@@ -237,7 +240,6 @@ class DataChain(DatasetQuery):
     def settings(
         self,
         cache=None,
-        batch=None,
         parallel=None,
         workers=None,
         min_task_size=None,
@@ -250,7 +252,6 @@ class DataChain(DatasetQuery):

         Parameters:
             cache : data caching (default=False)
-            batch : size of the batch (default=1000)
             parallel : number of thread for processors. True is a special value to
                 enable all available CPUs (default=1)
             workers : number of distributed workers. Only for Studio mode. (default=1)
@@ -268,7 +269,7 @@ class DataChain(DatasetQuery):
         chain = self.clone()
         if sys is not None:
             chain._sys = sys
-        chain._settings.add(Settings(cache,
+        chain._settings.add(Settings(cache, parallel, workers, min_task_size))
         return chain

     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
@@ -342,9 +343,9 @@ class DataChain(DatasetQuery):
         spec: Optional[DataType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
-        object_name: str = "",
+        object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-
+        print_schema: Optional[bool] = False,
         meta_type: Optional[str] = "json",
         nrows=None,
         **kwargs,
@@ -359,17 +360,17 @@ class DataChain(DatasetQuery):
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-
+            print_schema : print auto-generated schema
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays

         Example:
-            infer JSON schema from data, reduce using JMESPATH
+            infer JSON schema from data, reduce using JMESPATH
             ```py
             chain = DataChain.from_json("gs://json", jmespath="key1.key2")
             ```

-            infer JSON schema from a particular path
+            infer JSON schema from a particular path
             ```py
             chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
             ```
@@ -384,7 +385,67 @@ class DataChain(DatasetQuery):
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name =
+            object_name = meta_type
+        chain = DataChain.from_storage(path=path, type=type, **kwargs)
+        signal_dict = {
+            object_name: read_meta(
+                schema_from=schema_from,
+                meta_type=meta_type,
+                spec=spec,
+                model_name=model_name,
+                print_schema=print_schema,
+                jmespath=jmespath,
+                nrows=nrows,
+            )
+        }
+        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+
+    @classmethod
+    def from_jsonl(
+        cls,
+        path,
+        type: Literal["binary", "text", "image"] = "text",
+        spec: Optional[DataType] = None,
+        schema_from: Optional[str] = "auto",
+        jmespath: Optional[str] = None,
+        object_name: Optional[str] = "",
+        model_name: Optional[str] = None,
+        print_schema: Optional[bool] = False,
+        meta_type: Optional[str] = "jsonl",
+        nrows=None,
+        **kwargs,
+    ) -> "DataChain":
+        """Get data from JSON lines. It returns the chain itself.
+
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            spec : optional Data Model
+            schema_from : path to sample to infer spec (if schema not provided)
+            object_name : generated object column name
+            model_name : optional generated model name
+            print_schema : print auto-generated schema
+            jmespath : optional JMESPATH expression to reduce JSON
+            nrows : optional row limit for jsonl and JSON arrays
+
+        Example:
+            infer JSONl schema from data, limit parsing to 1 row
+            ```py
+            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
+            ```
+        """
+        if schema_from == "auto":
+            schema_from = path
+
+        def jmespath_to_name(s: str):
+            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+            return s[:name_end]
+
+        if (not object_name) and jmespath:
+            object_name = jmespath_to_name(jmespath)
+        if not object_name:
+            object_name = meta_type
         chain = DataChain.from_storage(path=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
@@ -392,12 +453,12 @@ class DataChain(DatasetQuery):
                 meta_type=meta_type,
                 spec=spec,
                 model_name=model_name,
-
+                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
         }
-        return chain.gen(**signal_dict)  # type: ignore[arg-type]
+        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]

     @classmethod
     def datasets(
@@ -428,7 +489,7 @@ class DataChain(DatasetQuery):
             **{object_name: datasets},  # type: ignore[arg-type]
         )

-    def
+    def print_json_schema(  # type: ignore[override]
         self, jmespath: Optional[str] = None, model_name: Optional[str] = None
     ) -> "DataChain":
         """Print JSON data model and save it. It returns the chain itself.
@@ -453,7 +514,7 @@ class DataChain(DatasetQuery):
             output=str,
         )

-    def
+    def print_jsonl_schema(  # type: ignore[override]
         self, jmespath: Optional[str] = None, model_name: Optional[str] = None
     ) -> "DataChain":
         """Print JSON data model and save it. It returns the chain itself.
@@ -538,14 +599,16 @@ class DataChain(DatasetQuery):

         Using func and output as a map:
             ```py
-            chain = chain.map(
+            chain = chain.map(
+                lambda name: name.split("."), output={"stem": str, "ext": str}
+            )
             chain.save("new_dataset")
             ```
         """
         udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)

         chain = self.add_signals(
-            udf_obj.to_udf_wrapper(
+            udf_obj.to_udf_wrapper(),
             **self._settings.to_dict(),
         )

@@ -558,7 +621,7 @@ class DataChain(DatasetQuery):
         output: OutputType = None,
         **signal_map,
     ) -> "Self":
-        """Apply a function to each row to create new rows (with potentially new
+        r"""Apply a function to each row to create new rows (with potentially new
         signals). The function needs to return a new objects for each of the new rows.
         It returns a chain itself with new signals.

@@ -568,11 +631,20 @@ class DataChain(DatasetQuery):
         one key differences: It produces a sequence of rows for each input row (like
         extracting multiple file records from a single tar file or bounding boxes from a
         single image file).
+
+        Example:
+            ```py
+            chain = chain.gen(
+                line=lambda file: [l for l in file.read().split("\n")],
+                output=str,
+            )
+            chain.save("new_dataset")
+            ```
         """
         udf_obj = self._udf_to_obj(Generator, func, params, output, signal_map)
         chain = DatasetQuery.generate(
             self,
-            udf_obj.to_udf_wrapper(
+            udf_obj.to_udf_wrapper(),
             **self._settings.to_dict(),
         )

@@ -592,23 +664,68 @@ class DataChain(DatasetQuery):

         Input-output relationship: N:M

-        This method bears similarity to `gen()` and map()
-        parameters, yet differs in two crucial aspects:
+        This method bears similarity to `gen()` and `map()`, employing a comparable set
+        of parameters, yet differs in two crucial aspects:
         1. The `partition_by` parameter: This specifies the column name or a list of
            column names that determine the grouping criteria for aggregation.
         2. Group-based UDF function input: Instead of individual rows, the function
            receives a list all rows within each group defined by `partition_by`.
+
+        Example:
+            ```py
+            chain = chain.agg(
+                total=lambda category, amount: [sum(amount)],
+                output=float,
+                partition_by="category",
+            )
+            chain.save("new_dataset")
+            ```
         """
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         chain = DatasetQuery.generate(
             self,
-            udf_obj.to_udf_wrapper(
+            udf_obj.to_udf_wrapper(),
             partition_by=partition_by,
             **self._settings.to_dict(),
         )

         return chain.reset_schema(udf_obj.output).reset_settings(self._settings)

+    def batch_map(
+        self,
+        func: Optional[Callable] = None,
+        params: Union[None, str, Sequence[str]] = None,
+        output: OutputType = None,
+        batch: int = 1000,
+        **signal_map,
+    ) -> "Self":
+        """This is a batch version of `map()`.
+
+        Input-output relationship: N:N
+
+        It accepts the same parameters plus an
+        additional parameter:
+
+            batch : Size of each batch passed to `func`. Defaults to 1000.
+
+        Example:
+            ```py
+            chain = chain.batch_map(
+                sqrt=lambda size: np.sqrt(size),
+                output=float
+            )
+            chain.save("new_dataset")
+            ```
+        """
+        udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
+        chain = DatasetQuery.add_signals(
+            self,
+            udf_obj.to_udf_wrapper(batch),
+            **self._settings.to_dict(),
+        )
+
+        return chain.add_schema(udf_obj.output).reset_settings(self._settings)
+
     def _udf_to_obj(
         self,
         target_class: type[UDFBase],
@@ -951,6 +1068,41 @@ class DataChain(DatasetQuery):

         return ds

+    def subtract(  # type: ignore[override]
+        self,
+        other: "DataChain",
+        on: Optional[Union[str, Sequence[str]]] = None,
+    ) -> "Self":
+        """Remove rows that appear in another chain.
+
+        Parameters:
+            other: chain whose rows will be removed from `self`
+            on: columns to consider for determining row equality. If unspecified,
+                defaults to all common columns between `self` and `other`.
+        """
+        if isinstance(on, str):
+            on = [on]
+        if on is None:
+            other_columns = set(other._effective_signals_schema.db_signals())
+            signals = [
+                c
+                for c in self._effective_signals_schema.db_signals()
+                if c in other_columns
+            ]
+            if not signals:
+                raise DataChainParamsError("subtract(): no common columns")
+        elif not isinstance(on, Sequence):
+            raise TypeError(
+                f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
+            )
+        elif not on:
+            raise DataChainParamsError(
+                "'on' cannot be empty",
+            )
+        else:
+            signals = self.signals_schema.resolve(*on).db_signals()
+        return super()._subtract(other, signals)
+
     @classmethod
     def from_values(
         cls,
@@ -1081,6 +1233,7 @@ class DataChain(DatasetQuery):
         output: OutputType = None,
         object_name: str = "",
         model_name: str = "",
+        source: bool = True,
         nrows: Optional[int] = None,
         **kwargs,
     ) -> "DataChain":
@@ -1092,8 +1245,9 @@ class DataChain(DatasetQuery):
                 case types will be inferred.
             object_name : Generated object column name.
             model_name : Generated model name.
-
+            source : Whether to include info about the source file.
             nrows : Optional row limit.
+            kwargs : Parameters to pass to pyarrow.dataset.dataset.

         Example:
             Reading a json lines file:
@@ -1120,18 +1274,24 @@ class DataChain(DatasetQuery):
         except ValueError as e:
             raise DatasetPrepareError(self.name, e) from e

+        if isinstance(output, dict):
+            model_name = model_name or object_name or ""
+            model = DataChain._dict_to_data_model(model_name, output)
+        else:
+            model = output  # type: ignore[assignment]
+
         if object_name:
-
-            model_name = model_name or object_name
-            output = DataChain._dict_to_data_model(model_name, output)
-            output = {object_name: output}  # type: ignore[dict-item]
+            output = {object_name: model}  # type: ignore[dict-item]
         elif isinstance(output, type(BaseModel)):
             output = {
                 name: info.annotation  # type: ignore[misc]
                 for name, info in output.model_fields.items()
             }
-
-
+        if source:
+            output = {"source": IndexedFile} | output  # type: ignore[assignment,operator]
+        return self.gen(
+            ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
+        )

     @staticmethod
     def _dict_to_data_model(
@@ -1150,10 +1310,10 @@ class DataChain(DatasetQuery):
         path,
         delimiter: str = ",",
         header: bool = True,
-        column_names: Optional[list[str]] = None,
         output: OutputType = None,
         object_name: str = "",
         model_name: str = "",
+        source: bool = True,
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -1169,6 +1329,7 @@ class DataChain(DatasetQuery):
                 case types will be inferred.
             object_name : Created object column name.
             model_name : Generated model name.
+            source : Whether to include info about the source file.
             nrows : Optional row limit.

         Example:
@@ -1187,6 +1348,7 @@ class DataChain(DatasetQuery):

         chain = DataChain.from_storage(path, **kwargs)

+        column_names = None
         if not header:
             if not output:
                 msg = "error parsing csv - provide output if no header"
@@ -1208,6 +1370,7 @@ class DataChain(DatasetQuery):
             output=output,
             object_name=object_name,
             model_name=model_name,
+            source=source,
             nrows=nrows,
             format=format,
         )
@@ -1220,6 +1383,7 @@ class DataChain(DatasetQuery):
         output: Optional[dict[str, DataType]] = None,
         object_name: str = "",
         model_name: str = "",
+        source: bool = True,
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -1232,6 +1396,7 @@ class DataChain(DatasetQuery):
         output : Dictionary defining column names and their corresponding types.
             object_name : Created object column name.
             model_name : Generated model name.
+            source : Whether to include info about the source file.
             nrows : Optional row limit.

         Example:
@@ -1250,6 +1415,7 @@ class DataChain(DatasetQuery):
             output=output,
             object_name=object_name,
             model_name=model_name,
+            source=source,
             nrows=None,
             format="parquet",
             partitioning=partitioning,
@@ -1436,7 +1602,18 @@ class DataChain(DatasetQuery):
     @detach
     def limit(self, n: int) -> "Self":
         """Return the first n rows of the chain."""
-
+        n = max(n, 0)
+
+        if self.max_row_count is None:
+            self.max_row_count = n
+            return super().limit(n)
+
+        limit = min(n, self.max_row_count)
+        if limit == self.max_row_count:
+            return self
+
+        self.max_row_count = limit
+        return super().limit(self.max_row_count)

     @detach
     def offset(self, offset: int) -> "Self":
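Taken together, the dc.py changes add `from_jsonl()`, a `print_schema` flag, `batch_map()`, and `subtract()`, and drop `batch` from `settings()`. Below is a minimal sketch of how these might be combined; the bucket paths, signal names, and lambdas are placeholders for illustration, not code from the package.

```py
from datachain.lib.dc import DataChain

# Placeholder path; print_schema=True prints the auto-generated data model.
chain = DataChain.from_jsonl("gs://mybucket/logs", print_schema=True, nrows=100)

# `batch` is no longer accepted by settings(); batching moved to batch_map().
chain = chain.settings(cache=True, parallel=4)

# batch_map() passes batches (default 1000 rows) of the "name" signal to the
# lambda and expects one output value per input row.
chain = chain.batch_map(
    name_len=lambda name: [len(n) for n in name],
    output=int,
    batch=500,
)

# subtract() drops rows that also appear in another chain; with on=None it
# compares all common columns.
other = DataChain.from_jsonl("gs://mybucket/processed")
chain = chain.subtract(other, on="name")
chain.save("new_dataset")
```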
datachain/lib/file.py
CHANGED
@@ -12,7 +12,6 @@ from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname

 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
-from fsspec.implementations.local import LocalFileSystem
 from PIL import Image
 from pydantic import Field, field_validator

@@ -283,9 +282,8 @@ class File(DataModel):
     def get_path(self) -> str:
         """Returns file path."""
         path = unquote(self.get_uri())
-
-        if
-        # Drop file:// protocol
+        source = urlparse(self.source)
+        if source.scheme == "file":
             path = urlparse(path).path
             path = url2pathname(path)
         return path
@@ -300,13 +298,10 @@ class File(DataModel):
         elif placement == "etag":
             path = f"{self.etag}{self.get_file_suffix()}"
         elif placement == "fullpath":
-
-
-
-
-            path = (
-                Path(urlparse(self.source).netloc) / unquote(self.get_full_name())
-            ).as_posix()
+            path = unquote(self.get_full_name())
+            source = urlparse(self.source)
+            if source.scheme and source.scheme != "file":
+                path = posixpath.join(source.netloc, path)
         elif placement == "checksum":
             raise NotImplementedError("Checksum placement not implemented yet")
         else:
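The file.py change replaces the `Path`-based handling with a scheme check: `file://` sources drop the protocol, and only non-local sources get their netloc prepended for the "fullpath" placement. A standalone sketch that mirrors that branch follows; the `fullpath_for` helper is written here for illustration and is not a datachain API.

```py
import posixpath
from urllib.parse import unquote, urlparse

def fullpath_for(source: str, full_name: str) -> str:
    # Mirrors the new "fullpath" placement logic shown in the diff above.
    path = unquote(full_name)
    parsed = urlparse(source)
    if parsed.scheme and parsed.scheme != "file":
        path = posixpath.join(parsed.netloc, path)
    return path

print(fullpath_for("s3://bucket", "images/cat.jpg"))  # bucket/images/cat.jpg
print(fullpath_for("file:///tmp", "images/cat.jpg"))  # images/cat.jpg
```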
datachain/lib/meta_formats.py
CHANGED
@@ -11,9 +11,9 @@ from collections.abc import Iterator
 from typing import Any, Callable

 import jmespath as jsp
-from pydantic import ValidationError
+from pydantic import Field, ValidationError  # noqa: F401

-from datachain.lib.data_model import
+from datachain.lib.data_model import DataModel  # noqa: F401
 from datachain.lib.file import File


@@ -87,7 +87,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     except subprocess.CalledProcessError as e:
         model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
     print(f"{model_output}")
-    print("\n" +
+    print("\n" + "from datachain.lib.data_model import DataModel" + "\n")
+    print("\n" + f"DataModel.register({model_name})" + "\n")
     print("\n" + f"spec={model_name}" + "\n")
     return model_output

@@ -100,7 +101,7 @@ def read_meta(  # noqa: C901
     schema_from=None,
     meta_type="json",
     jmespath=None,
-
+    print_schema=False,
     model_name=None,
     nrows=None,
 ) -> Callable:
@@ -128,7 +129,7 @@ def read_meta(  # noqa: C901
     model_output = captured_output.getvalue()
     captured_output.close()

-    if
+    if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
@@ -147,18 +148,18 @@ def read_meta(  # noqa: C901

     def parse_data(
         file: File,
-
+        data_model=spec,
         meta_type=meta_type,
         jmespath=jmespath,
         nrows=nrows,
     ) -> Iterator[spec]:
-        def validator(json_object: dict) -> spec:
+        def validator(json_object: dict, nrow=0) -> spec:
             json_string = json.dumps(json_object)
             try:
-                data_instance =
+                data_instance = data_model.model_validate_json(json_string)
                 yield data_instance
             except ValidationError as e:
-                print(f"Validation error occurred in file {file.name}:", e)
+                print(f"Validation error occurred in row {nrow} file {file.name}:", e)

         if meta_type == "csv":
             with (
@@ -184,7 +185,7 @@ def read_meta(  # noqa: C901
                 nrow = nrow + 1
                 if nrows is not None and nrow > nrows:
                     return
-                yield from validator(json_dict)
+                yield from validator(json_dict, nrow)

         if meta_type == "jsonl":
             try:
@@ -197,7 +198,7 @@ def read_meta(  # noqa: C901
                     return
                 json_object = process_json(data_string, jmespath)
                 data_string = fd.readline()
-                yield from validator(json_object)
+                yield from validator(json_object, nrow)
         except OSError as e:
             print(f"An unexpected file error occurred in file {file.name}: {e}")

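meta_formats.py now threads a `print_schema` flag through `read_meta()` and passes the row number into the validator so validation errors report which row failed. A minimal sketch of wiring `read_meta` into a chain, following the pattern used in dc.py above; the paths are placeholders.

```py
from datachain.lib.dc import DataChain
from datachain.lib.meta_formats import read_meta

chain = DataChain.from_storage("gs://mybucket/meta", type="text")
chain = chain.gen(
    meta=read_meta(
        schema_from="gs://mybucket/meta/sample.json",
        meta_type="json",
        print_schema=True,  # new flag: echo the auto-generated Pydantic model
        nrows=10,
    )
)
chain.save("meta_dataset")
```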
datachain/lib/settings.py
CHANGED
@@ -7,11 +7,8 @@ class SettingsError(DataChainParamsError):


 class Settings:
-    def __init__(
-        self, cache=None, batch=None, parallel=None, workers=None, min_task_size=None
-    ):
+    def __init__(self, cache=None, parallel=None, workers=None, min_task_size=None):
         self._cache = cache
-        self._batch = batch
         self.parallel = parallel
         self._workers = workers
         self.min_task_size = min_task_size
@@ -22,12 +19,6 @@ class Settings:
                 f" while {cache.__class__.__name__} was given"
             )

-        if not isinstance(batch, int) and batch is not None:
-            raise SettingsError(
-                "'batch' argument must be int or None"
-                f" while {batch.__class__.__name__} was given"
-            )
-
         if not isinstance(parallel, int) and parallel is not None:
             raise SettingsError(
                 "'parallel' argument must be int or None"
@@ -54,10 +45,6 @@ class Settings:
     def cache(self):
         return self._cache if self._cache is not None else False

-    @property
-    def batch(self):
-        return self._batch if self._batch is not None else 1
-
     @property
     def workers(self):
         return self._workers if self._workers is not None else False
@@ -66,8 +53,6 @@ class Settings:
         res = {}
         if self._cache is not None:
             res["cache"] = self.cache
-        if self._batch is not None:
-            res["batch"] = self.batch
         if self.parallel is not None:
             res["parallel"] = self.parallel
         if self._workers is not None:
@@ -78,7 +63,6 @@ class Settings:

     def add(self, settings: "Settings"):
         self._cache = settings._cache or self._cache
-        self._batch = settings._batch or self._batch
         self.parallel = settings.parallel or self.parallel
         self._workers = settings._workers or self._workers
         self.min_task_size = settings.min_task_size or self.min_task_size
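With `batch` removed, `Settings` only validates and carries `cache`, `parallel`, `workers`, and `min_task_size`; batch size is now an argument of `batch_map()`. A minimal sketch based on the constructor and `to_dict()` shown above:

```py
from datachain.lib.settings import Settings

settings = Settings(cache=True, parallel=4, workers=2)
print(settings.to_dict())  # expected: {'cache': True, 'parallel': 4, 'workers': 2}

# Passing batch= now fails with a TypeError, since the keyword no longer exists.
```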
datachain/lib/udf.py
CHANGED
@@ -225,11 +225,10 @@ class UDFBase(AbstractUDF):
     def __call__(self, *rows, cache, download_cb):
         if self.is_input_grouped:
             objs = self._parse_grouped_rows(rows[0], cache, download_cb)
+        elif self.is_input_batched:
+            objs = zip(*self._parse_rows(rows[0], cache, download_cb))
         else:
-            objs = self._parse_rows(rows, cache, download_cb)
-
-            if not self.is_input_batched:
-                objs = objs[0]
+            objs = self._parse_rows([rows], cache, download_cb)[0]

         result_objs = self.process_safe(objs)

@@ -259,17 +258,24 @@ class UDFBase(AbstractUDF):

         if not self.is_output_batched:
             res = list(res)
-            assert
-
-            )
+            assert (
+                len(res) == 1
+            ), f"{self.name} returns {len(res)} rows while it's not batched"
             if isinstance(res[0], tuple):
                 res = res[0]
+        elif (
+            self.is_input_batched
+            and self.is_output_batched
+            and not self.is_input_grouped
+        ):
+            res = list(res)
+            assert len(res) == len(
+                rows[0]
+            ), f"{self.name} returns {len(res)} rows while len(rows[0]) expected"

         return res

     def _parse_rows(self, rows, cache, download_cb):
-        if not self.is_input_batched:
-            rows = [rows]
         objs = []
         for row in rows:
             obj_row = self.params.row_to_objs(row)
@@ -330,7 +336,9 @@ class Mapper(UDFBase):
     """Inherit from this class to pass to `DataChain.map()`."""


-class BatchMapper(
+class BatchMapper(UDFBase):
+    """Inherit from this class to pass to `DataChain.batch_map()`."""
+
     is_input_batched = True
     is_output_batched = True

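In udf.py, `__call__` now regroups a batched input with `zip(*...)` so a `BatchMapper` receives one sequence per signal rather than one row at a time. A small illustration of that regrouping, using made-up rows:

```py
# rows[0] is a batch of parsed rows; zip(*...) turns it into per-signal
# sequences, which is what a BatchMapper's function receives.
batch = [("cat.jpg", 1024), ("dog.jpg", 2048)]  # hypothetical (name, size) rows
names, sizes = zip(*batch)
assert names == ("cat.jpg", "dog.jpg")
assert sizes == (1024, 2048)
```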