datachain 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Potentially problematic release. This version of datachain might be problematic.
- datachain/catalog/catalog.py +8 -0
- datachain/data_storage/metastore.py +20 -1
- datachain/data_storage/sqlite.py +24 -32
- datachain/lib/arrow.py +64 -19
- datachain/lib/convert/values_to_tuples.py +2 -2
- datachain/lib/data_model.py +1 -1
- datachain/lib/dc.py +131 -12
- datachain/lib/signal_schema.py +6 -6
- datachain/lib/udf.py +208 -160
- datachain/lib/udf_signature.py +8 -6
- datachain/query/batch.py +0 -10
- datachain/query/dataset.py +7 -7
- datachain/query/dispatch.py +2 -14
- datachain/query/session.py +42 -0
- datachain/sql/functions/string.py +12 -0
- datachain/sql/sqlite/base.py +10 -5
- {datachain-0.5.0.dist-info → datachain-0.6.0.dist-info}/METADATA +1 -1
- {datachain-0.5.0.dist-info → datachain-0.6.0.dist-info}/RECORD +22 -23
- datachain/query/udf.py +0 -126
- {datachain-0.5.0.dist-info → datachain-0.6.0.dist-info}/LICENSE +0 -0
- {datachain-0.5.0.dist-info → datachain-0.6.0.dist-info}/WHEEL +0 -0
- {datachain-0.5.0.dist-info → datachain-0.6.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.5.0.dist-info → datachain-0.6.0.dist-info}/top_level.txt +0 -0
datachain/lib/udf.py
CHANGED
@@ -1,31 +1,32 @@
 import sys
 import traceback
-from …
+from collections.abc import Iterable, Iterator, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Callable, Optional

+import attrs
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from pydantic import BaseModel

 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
-from datachain.lib.…
+from datachain.lib.data_model import DataValue
 from datachain.lib.file import File
-from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
-from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query.batch import …
-…
-…
-…
+from datachain.query.batch import (
+    Batch,
+    BatchingStrategy,
+    NoBatching,
+    Partition,
+    RowsOutputBatch,
+)

 if TYPE_CHECKING:
-    from collections.abc import Iterable, Iterator, Sequence
-
     from typing_extensions import Self

     from datachain.catalog import Catalog
-    from datachain.…
-    from datachain.query.…
+    from datachain.lib.udf_signature import UdfSignature
+    from datachain.query.batch import RowsOutput


 class UdfError(DataChainParamsError):
@@ -33,14 +34,47 @@ class UdfError(DataChainParamsError):
         super().__init__(f"UDF error: {msg}")


-…
-…
-…
-…
-…
-…
-…
-…
+ColumnType = Any
+
+# Specification for the output of a UDF
+UDFOutputSpec = Mapping[str, ColumnType]
+
+# Result type when calling the UDF wrapper around the actual
+# Python function / class implementing it.
+UDFResult = dict[str, Any]
+
+
+@attrs.define
+class UDFProperties:
+    udf: "UDFAdapter"
+
+    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
+        return self.udf.get_batching(use_partitioning)
+
+    @property
+    def batch(self):
+        return self.udf.batch
+
+
+@attrs.define(slots=False)
+class UDFAdapter:
+    inner: "UDFBase"
+    output: UDFOutputSpec
+    batch: int = 1
+
+    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
+        if use_partitioning:
+            return Partition()
+        if self.batch == 1:
+            return NoBatching()
+        if self.batch > 1:
+            return Batch(self.batch)
+        raise ValueError(f"invalid batch size {self.batch}")
+
+    @property
+    def properties(self):
+        # For backwards compatibility.
+        return UDFProperties(self)

     def run(
         self,
@@ -51,48 +85,16 @@ class UDFAdapter(_UDFBase):
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
-    ) -> …
-        self.inner.…
-        if hasattr(self.inner, "setup") and callable(self.inner.setup):
-            self.inner.setup()
-
-        yield from super().run(
+    ) -> Iterator[Iterable[UDFResult]]:
+        yield from self.inner.run(
             udf_fields,
             udf_inputs,
             catalog,
-            is_generator,
             cache,
             download_cb,
             processed_cb,
         )

-        if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
-            self.inner.teardown()
-
-    def run_once(
-        self,
-        catalog: "Catalog",
-        arg: "UDFInput",
-        is_generator: bool = False,
-        cache: bool = False,
-        cb: Callback = DEFAULT_CALLBACK,
-    ) -> "Iterable[UDFResult]":
-        if isinstance(arg, UDFInputBatch):
-            udf_inputs = [
-                self.bind_parameters(catalog, row, cache=cache, cb=cb)
-                for row in arg.rows
-            ]
-            udf_outputs = self.inner(udf_inputs, cache=cache, download_cb=cb)
-            return self._process_results(arg.rows, udf_outputs, is_generator)
-        if isinstance(arg, RowDict):
-            udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
-            udf_outputs = self.inner(*udf_inputs, cache=cache, download_cb=cb)
-            if not is_generator:
-                # udf_outputs is generator already if is_generator=True
-                udf_outputs = [udf_outputs]
-            return self._process_results([arg], udf_outputs, is_generator)
-        raise ValueError(f"Unexpected UDF argument: {arg}")
-

 class UDFBase(AbstractUDF):
     """Base class for stateful user-defined functions.
@@ -142,18 +144,13 @@ class UDFBase(AbstractUDF):
     ```
     """

-    is_input_batched = False
     is_output_batched = False
-
-    params_spec: Optional[list[str]]
+    catalog: "Optional[Catalog]"

     def __init__(self):
-        self.params = None
+        self.params: Optional[SignalSchema] = None
         self.output = None
-        self.…
-        self.output_spec = None
-        self._contains_stream = None
-        self._catalog = None
+        self.catalog = None
         self._func = None

     def process(self, *args, **kwargs):
@@ -174,29 +171,24 @@ class UDFBase(AbstractUDF):

     def _init(
         self,
-        sign: UdfSignature,
+        sign: "UdfSignature",
         params: SignalSchema,
-        func: Callable,
+        func: Optional[Callable],
     ):
         self.params = params
         self.output = sign.output_schema
-
-        params_spec = self.params.to_udf_spec()
-        self.params_spec = list(params_spec.keys())
-        self.output_spec = self.output.to_udf_spec()
-
         self._func = func

     @classmethod
     def _create(
         cls,
-        sign: UdfSignature,
+        sign: "UdfSignature",
         params: SignalSchema,
     ) -> "Self":
         if isinstance(sign.func, AbstractUDF):
             if not isinstance(sign.func, cls):  # type: ignore[unreachable]
                 raise UdfError(
-                    f"cannot create UDF: provided UDF '{sign.func.__name__}'"
+                    f"cannot create UDF: provided UDF '{type(sign.func).__name__}'"
                     f" must be a child of target class '{cls.__name__}'",
                 )
             result = sign.func
@@ -212,57 +204,27 @@ class UDFBase(AbstractUDF):
     def name(self):
         return self.__class__.__name__

-    def set_catalog(self, catalog):
-        self._catalog = catalog.copy(db=False)
-
     @property
-    def …
-        return self.…
+    def signal_names(self) -> Iterable[str]:
+        return self.output.to_udf_spec().keys()

     def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
-        …
-        …
-        …
+        return UDFAdapter(
+            self,
+            self.output.to_udf_spec(),
+            batch,
         )
-        return UDFAdapter(self, properties)
-
-    def validate_results(self, results, *args, **kwargs):
-        return results
-
-    def __call__(self, *rows, cache, download_cb):
-        if self.is_input_grouped:
-            objs = self._parse_grouped_rows(rows[0], cache, download_cb)
-        elif self.is_input_batched:
-            objs = zip(*self._parse_rows(rows[0], cache, download_cb))
-        else:
-            objs = self._parse_rows([rows], cache, download_cb)[0]

-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-            assert (
-                len(res) == 1
-            ), f"{self.name} returns {len(res)} rows while it's not batched"
-            if isinstance(res[0], tuple):
-                res = res[0]
-        elif (
-            self.is_input_batched
-            and self.is_output_batched
-            and not self.is_input_grouped
-        ):
-            res = list(res)
-            assert len(res) == len(
-                rows[0]
-            ), f"{self.name} returns {len(res)} rows while len(rows[0]) expected"
-
-        return res
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Any]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        raise NotImplementedError

     def _flatten_row(self, row):
         if len(self.output.values) > 1 and not isinstance(row, BaseModel):
@@ -276,47 +238,28 @@ class UDFBase(AbstractUDF):
     def _obj_to_list(obj):
         return flatten(obj) if isinstance(obj, BaseModel) else [obj]

-    def …
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-                length = sum(1 for _ in self.params._get_flat_tree(subtree, [], 0))
-            else:
-                length = 1
-            spec_map[name] = anno, length
-            output_map[name] = []
-
-        for flat_obj in group:
-            position = 0
-            for signal, (cls, length) in spec_map.items():
-                slice = flat_obj[position : position + length]
-                position += length
-
-                if ModelStore.is_pydantic(cls):
-                    obj = cls(**unflatten_to_json(cls, slice))
-                else:
-                    obj = slice[0]
-
-                if isinstance(obj, File):
-                    obj._set_stream(
-                        self._catalog, caching_enabled=cache, download_cb=download_cb
-                    )
-                output_map[signal].append(obj)
+    def _parse_row(
+        self, row_dict: RowDict, cache: bool, download_cb: Callback
+    ) -> list[DataValue]:
+        assert self.params
+        row = [row_dict[p] for p in self.params.to_udf_spec()]
+        obj_row = self.params.row_to_objs(row)
+        for obj in obj_row:
+            if isinstance(obj, File):
+                assert self.catalog is not None
+                obj._set_stream(
+                    self.catalog, caching_enabled=cache, download_cb=download_cb
+                )
+        return obj_row
+
+    def _prepare_row(self, row, udf_fields, cache, download_cb):
+        row_dict = RowDict(zip(udf_fields, row))
+        return self._parse_row(row_dict, cache, download_cb)

-…
+    def _prepare_row_and_id(self, row, udf_fields, cache, download_cb):
+        row_dict = RowDict(zip(udf_fields, row))
+        udf_input = self._parse_row(row_dict, cache, download_cb)
+        return row_dict["sys__id"], *udf_input

     def process_safe(self, obj_rows):
         try:
@@ -336,23 +279,128 @@ class UDFBase(AbstractUDF):
 class Mapper(UDFBase):
     """Inherit from this class to pass to `DataChain.map()`."""

+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Sequence[Any]]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for row in udf_inputs:
+            id_, *udf_args = self._prepare_row_and_id(
+                row, udf_fields, cache, download_cb
+            )
+            result_objs = self.process_safe(udf_args)
+            udf_output = self._flatten_row(result_objs)
+            output = [{"sys__id": id_} | dict(zip(self.signal_names, udf_output))]
+            processed_cb.relative_update(1)
+            yield output
+
+        self.teardown()
+

 class BatchMapper(UDFBase):
     """Inherit from this class to pass to `DataChain.batch_map()`."""

-    is_input_batched = True
     is_output_batched = True

+    def run(
+        self,
+        udf_fields: Sequence[str],
+        udf_inputs: Iterable[RowsOutputBatch],
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for batch in udf_inputs:
+            n_rows = len(batch.rows)
+            row_ids, *udf_args = zip(
+                *[
+                    self._prepare_row_and_id(row, udf_fields, cache, download_cb)
+                    for row in batch.rows
+                ]
+            )
+            result_objs = list(self.process_safe(udf_args))
+            n_objs = len(result_objs)
+            assert (
+                n_objs == n_rows
+            ), f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = [
+                {"sys__id": row_id} | dict(zip(self.signal_names, signals))
+                for row_id, signals in zip(row_ids, udf_outputs)
+            ]
+            processed_cb.relative_update(n_rows)
+            yield output
+
+        self.teardown()
+

 class Generator(UDFBase):
     """Inherit from this class to pass to `DataChain.gen()`."""

     is_output_batched = True

+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Sequence[Any]]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for row in udf_inputs:
+            udf_args = self._prepare_row(row, udf_fields, cache, download_cb)
+            result_objs = self.process_safe(udf_args)
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            processed_cb.relative_update(1)
+            yield output
+
+        self.teardown()
+

 class Aggregator(UDFBase):
     """Inherit from this class to pass to `DataChain.agg()`."""

-    is_input_batched = True
     is_output_batched = True
-
+
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: Iterable[RowsOutputBatch],
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for batch in udf_inputs:
+            udf_args = zip(
+                *[
+                    self._prepare_row(row, udf_fields, cache, download_cb)
+                    for row in batch.rows
+                ]
+            )
+            result_objs = self.process_safe(udf_args)
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            processed_cb.relative_update(len(batch.rows))
+            yield output
+
+        self.teardown()
datachain/lib/udf_signature.py
CHANGED
@@ -1,10 +1,11 @@
 import inspect
 from collections.abc import Generator, Iterator, Sequence
 from dataclasses import dataclass
-from typing import Callable, …
+from typing import Callable, Union, get_args, get_origin

 from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
 from datachain.lib.signal_schema import SignalSchema
+from datachain.lib.udf import UDFBase
 from datachain.lib.utils import AbstractUDF, DataChainParamsError
@@ -16,7 +17,7 @@ class UdfSignatureError(DataChainParamsError):

 @dataclass
 class UdfSignature:
-    func: Callable
+    func: Union[Callable, UDFBase]
     params: Sequence[str]
     output_schema: SignalSchema
@@ -27,7 +28,7 @@ class UdfSignature:
         cls,
         chain: str,
         signal_map: dict[str, Callable],
-        func: …
+        func: Union[None, UDFBase, Callable] = None,
         params: Union[None, str, Sequence[str]] = None,
         output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
         is_generator: bool = True,
@@ -39,6 +40,7 @@ class UdfSignature:
                 f"multiple signals '{keys}' are not supported in processors."
                 " Chain multiple processors instead.",
             )
+        udf_func: Union[UDFBase, Callable]
         if len(signal_map) == 1:
             if func is not None:
                 raise UdfSignatureError(
@@ -53,7 +55,7 @@ class UdfSignature:
             udf_func = func
             signal_name = None

-        if not callable(udf_func):
+        if not isinstance(udf_func, UDFBase) and not callable(udf_func):
             raise UdfSignatureError(chain, f"UDF '{udf_func}' is not callable")

         func_params_map_sign, func_outs_sign, is_iterator = (
@@ -73,7 +75,7 @@ class UdfSignature:
         if not func_outs_sign:
             raise UdfSignatureError(
                 chain,
-                f"outputs are not defined in function '{udf_func…
+                f"outputs are not defined in function '{udf_func}'"
                 " hints or 'output'",
             )
@@ -154,7 +156,7 @@ class UdfSignature:

     @staticmethod
     def _func_signature(
-        chain: str, udf_func: Callable
+        chain: str, udf_func: Union[Callable, UDFBase]
     ) -> tuple[dict[str, type], Sequence[type], bool]:
         if isinstance(udf_func, AbstractUDF):
             func = udf_func.process  # type: ignore[unreachable]
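
The typing changes here track the udf.py refactor: a UDF can now be an instantiated UDFBase as well as a plain callable, and since UDFBase lost its old __call__, the callability check needs an explicit isinstance branch. A small illustration of the widened union; both toy UDFs are assumptions, not code from the release:

from typing import Callable, Union

from datachain.lib.udf import Mapper, UDFBase

def plain_udf(text: str) -> int:  # ordinary function UDF
    return len(text)

class ClassBased(Mapper):  # hypothetical UDFBase subclass
    def process(self, text: str) -> int:
        return len(text)

for udf_func in (plain_udf, ClassBased()):  # Union[Callable, UDFBase]
    # Mirrors the new guard: a UDFBase instance passes even though it is
    # no longer callable itself.
    assert isinstance(udf_func, UDFBase) or callable(udf_func)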
datachain/query/batch.py
CHANGED
@@ -11,8 +11,6 @@ from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 if TYPE_CHECKING:
     from sqlalchemy import Select

-    from datachain.dataset import RowDict
-

 @dataclass
 class RowsOutputBatch:
@@ -22,14 +20,6 @@ class RowsOutputBatch:
 RowsOutput = Union[Sequence, RowsOutputBatch]


-@dataclass
-class UDFInputBatch:
-    rows: Sequence["RowDict"]
-
-
-UDFInput = Union["RowDict", UDFInputBatch]
-
-
 class BatchingStrategy(ABC):
     """BatchingStrategy provides means of batching UDF executions."""
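
With UDFInputBatch and the UDFInput union deleted, RowsOutputBatch is the only batch container left at the UDF boundary; the new run() loops in udf.py consume it through batch.rows. A sketch, assuming the dataclass exposes a single rows field (the only attribute those loops touch); the tuples are placeholder warehouse rows:

from datachain.query.batch import RowsOutputBatch

batch = RowsOutputBatch(rows=[(1, "a.txt"), (2, "b.txt")])
assert len(batch.rows) == 2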
datachain/query/dataset.py
CHANGED
@@ -42,6 +42,7 @@ from datachain.data_storage.schema import (
 )
 from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
+from datachain.lib.udf import UDFAdapter
 from datachain.progress import CombinedDownloadCallback
 from datachain.sql.functions import rand
 from datachain.utils import (
@@ -53,7 +54,6 @@ from datachain.utils import (

 from .schema import C, UDFParamSpec, normalize_param
 from .session import Session
-from .udf import UDFBase

 if TYPE_CHECKING:
     from sqlalchemy.sql.elements import ClauseElement
@@ -299,7 +299,7 @@ def adjust_outputs(
     return row


-def get_udf_col_types(warehouse: "AbstractWarehouse", udf: …
+def get_udf_col_types(warehouse: "AbstractWarehouse", udf: UDFAdapter) -> list[tuple]:
     """Optimization: Precompute UDF column types so these don't have to be computed
     in the convert_type function for each row in a loop."""
     dialect = warehouse.db.dialect
@@ -320,7 +320,7 @@ def process_udf_outputs(
     warehouse: "AbstractWarehouse",
     udf_table: "Table",
     udf_results: Iterator[Iterable["UDFResult"]],
-    udf: …
+    udf: UDFAdapter,
     batch_size: int = INSERT_BATCH_SIZE,
     cb: Callback = DEFAULT_CALLBACK,
 ) -> None:
@@ -364,7 +364,7 @@ def get_generated_callback(is_generator: bool = False) -> Callback:

 @frozen
 class UDFStep(Step, ABC):
-    udf: …
+    udf: UDFAdapter
     catalog: "Catalog"
     partition_by: Optional[PartitionByType] = None
     parallel: Optional[int] = None
@@ -392,7 +392,7 @@ class UDFStep(Step, ABC):

     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
         use_partitioning = self.partition_by is not None
-        batching = self.udf.…
+        batching = self.udf.get_batching(use_partitioning)
         workers = self.workers
         if (
             not workers
@@ -1465,7 +1465,7 @@ class DatasetQuery:
     @detach
     def add_signals(
         self,
-        udf: …
+        udf: UDFAdapter,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
@@ -1509,7 +1509,7 @@ class DatasetQuery:
     @detach
     def generate(
         self,
-        udf: …
+        udf: UDFAdapter,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
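
Throughout this file the udf attributes and parameters are now typed as UDFAdapter, and populate_udf_table delegates strategy selection to the adapter instead of computing it query-side. A runnable sketch of that decision, using the rules shown in udf.py above; Echo and the direct adapter construction are illustrative:

from datachain.lib.udf import Mapper, UDFAdapter
from datachain.query.batch import Batch, NoBatching, Partition

class Echo(Mapper):  # hypothetical stand-in UDF
    def process(self, x):
        return x

# batching = self.udf.get_batching(use_partitioning): partitioned steps always
# get Partition(); otherwise the adapter's batch size decides.
assert isinstance(UDFAdapter(Echo(), {}, 1).get_batching(), NoBatching)
assert isinstance(UDFAdapter(Echo(), {}, 16).get_batching(), Batch)
assert isinstance(UDFAdapter(Echo(), {}, 1).get_batching(use_partitioning=True), Partition)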
datachain/query/dispatch.py
CHANGED
@@ -13,6 +13,7 @@ from multiprocess import get_context

 from datachain.catalog import Catalog
 from datachain.catalog.loader import get_distributed_class
+from datachain.lib.udf import UDFAdapter, UDFResult
 from datachain.query.dataset import (
     get_download_callback,
     get_generated_callback,
@@ -27,7 +28,6 @@ from datachain.query.queue import (
     put_into_queue,
     unmarshal,
 )
-from datachain.query.udf import UDFBase, UDFResult
 from datachain.utils import batched_it

 DEFAULT_BATCH_SIZE = 10000
@@ -114,7 +114,6 @@ class UDFDispatcher:
     catalog: Optional[Catalog] = None
     task_queue: Optional[multiprocess.Queue] = None
     done_queue: Optional[multiprocess.Queue] = None
-    _batch_size: Optional[int] = None

     def __init__(
         self,
@@ -154,17 +153,6 @@ class UDFDispatcher:
         self.done_queue = None
         self.ctx = get_context("spawn")

-    @property
-    def batch_size(self):
-        if self._batch_size is None:
-            if hasattr(self.udf, "properties") and hasattr(
-                self.udf.properties, "batch"
-            ):
-                self._batch_size = self.udf.properties.batch
-            else:
-                self._batch_size = 1
-        return self._batch_size
-
     def _create_worker(self) -> "UDFWorker":
         if not self.catalog:
             id_generator = self.id_generator_class(
@@ -336,7 +324,7 @@ class ProcessedCallback(Callback):
 @attrs.define
 class UDFWorker:
     catalog: Catalog
-    udf: …
+    udf: UDFAdapter
     task_queue: "multiprocess.Queue"
     done_queue: "multiprocess.Queue"
     is_generator: bool
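
The dispatcher loses its cached batch_size property because the value now lives as a plain field on the UDFAdapter it is handed. What the removed lookup reduces to (stand-in UDF and direct adapter construction, as in the sketches above):

from datachain.lib.udf import Mapper, UDFAdapter

class Echo(Mapper):  # hypothetical stand-in UDF
    def process(self, x):
        return x

adapter = UDFAdapter(inner=Echo(), output={}, batch=10)
batch_size = adapter.batch  # previously: self.udf.properties.batch, defaulting to 1
assert batch_size == adapter.properties.batch == 10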