datachain 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, exactly as they appear in their public registries, and is provided for informational purposes only.

Files changed (50)
  1. datachain/asyn.py +16 -6
  2. datachain/cache.py +32 -10
  3. datachain/catalog/catalog.py +17 -1
  4. datachain/cli/__init__.py +311 -0
  5. datachain/cli/commands/__init__.py +29 -0
  6. datachain/cli/commands/datasets.py +129 -0
  7. datachain/cli/commands/du.py +14 -0
  8. datachain/cli/commands/index.py +12 -0
  9. datachain/cli/commands/ls.py +169 -0
  10. datachain/cli/commands/misc.py +28 -0
  11. datachain/cli/commands/query.py +53 -0
  12. datachain/cli/commands/show.py +38 -0
  13. datachain/cli/parser/__init__.py +547 -0
  14. datachain/cli/parser/job.py +120 -0
  15. datachain/cli/parser/studio.py +126 -0
  16. datachain/cli/parser/utils.py +63 -0
  17. datachain/{cli_utils.py → cli/utils.py} +27 -1
  18. datachain/client/azure.py +6 -2
  19. datachain/client/fsspec.py +9 -3
  20. datachain/client/gcs.py +6 -2
  21. datachain/client/s3.py +16 -1
  22. datachain/data_storage/db_engine.py +9 -0
  23. datachain/data_storage/schema.py +4 -10
  24. datachain/data_storage/sqlite.py +7 -1
  25. datachain/data_storage/warehouse.py +6 -4
  26. datachain/{lib/diff.py → diff/__init__.py} +116 -12
  27. datachain/func/__init__.py +3 -2
  28. datachain/func/conditional.py +74 -0
  29. datachain/func/func.py +5 -1
  30. datachain/lib/arrow.py +7 -1
  31. datachain/lib/dc.py +8 -3
  32. datachain/lib/file.py +16 -5
  33. datachain/lib/hf.py +1 -1
  34. datachain/lib/listing.py +19 -1
  35. datachain/lib/pytorch.py +57 -13
  36. datachain/lib/signal_schema.py +89 -27
  37. datachain/lib/udf.py +82 -40
  38. datachain/listing.py +1 -0
  39. datachain/progress.py +20 -3
  40. datachain/query/dataset.py +122 -93
  41. datachain/query/dispatch.py +22 -16
  42. datachain/studio.py +58 -38
  43. datachain/utils.py +14 -3
  44. {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/METADATA +9 -9
  45. {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/RECORD +49 -37
  46. {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/WHEEL +1 -1
  47. datachain/cli.py +0 -1475
  48. {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/LICENSE +0 -0
  49. {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/entry_points.txt +0 -0
  50. {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/top_level.txt +0 -0
datachain/lib/udf.py CHANGED
@@ -1,14 +1,16 @@
-import contextlib
 import sys
 import traceback
-from collections.abc import Iterable, Iterator, Mapping, Sequence
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
+from contextlib import closing, nullcontext
+from functools import partial
+from typing import TYPE_CHECKING, Any, Optional, TypeVar

 import attrs
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from pydantic import BaseModel

 from datachain.asyn import AsyncMapper
+from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.data_model import DataValue
@@ -21,17 +23,22 @@ from datachain.query.batch import (
     Partition,
     RowsOutputBatch,
 )
+from datachain.utils import safe_closing

 if TYPE_CHECKING:
     from collections import abc
+    from contextlib import AbstractContextManager

     from typing_extensions import Self

+    from datachain.cache import DataChainCache as Cache
     from datachain.catalog import Catalog
     from datachain.lib.signal_schema import SignalSchema
     from datachain.lib.udf_signature import UdfSignature
     from datachain.query.batch import RowsOutput

+T = TypeVar("T", bound=Sequence[Any])
+

 class UdfError(DataChainParamsError):
     def __init__(self, msg):
@@ -98,6 +105,10 @@ class UDFAdapter:
             processed_cb,
         )

+    @property
+    def prefetch(self) -> int:
+        return self.inner.prefetch
+

 class UDFBase(AbstractUDF):
     """Base class for stateful user-defined functions.
@@ -148,12 +159,11 @@ class UDFBase(AbstractUDF):
     """

     is_output_batched = False
-    catalog: "Optional[Catalog]"
+    prefetch: int = 0

    def __init__(self):
        self.params: Optional[SignalSchema] = None
        self.output = None
-        self.catalog = None
        self._func = None

    def process(self, *args, **kwargs):
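UDFBase now carries a class-level prefetch knob instead of holding on to the catalog, and the UDFAdapter.prefetch property added above simply forwards to it. A hedged sketch of a subclass that opts in; how user-facing settings feed this attribute is not shown in this diff, so the class below is purely illustrative:

# Hypothetical Mapper subclass; only the `prefetch` attribute is taken from this diff.
from datachain.lib.file import File
from datachain.lib.udf import Mapper

class FileSize(Mapper):
    prefetch = 4  # keep up to 4 input files downloading ahead of process()

    def process(self, file: File) -> int:
        # with prefetch enabled, the file is typically already in the (temporary) cache here
        return len(file.read())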
@@ -242,26 +252,23 @@
         return flatten(obj) if isinstance(obj, BaseModel) else [obj]

     def _parse_row(
-        self, row_dict: RowDict, cache: bool, download_cb: Callback
+        self, row_dict: RowDict, catalog: "Catalog", cache: bool, download_cb: Callback
     ) -> list[DataValue]:
         assert self.params
         row = [row_dict[p] for p in self.params.to_udf_spec()]
         obj_row = self.params.row_to_objs(row)
         for obj in obj_row:
             if isinstance(obj, File):
-                assert self.catalog is not None
-                obj._set_stream(
-                    self.catalog, caching_enabled=cache, download_cb=download_cb
-                )
+                obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
         return obj_row

-    def _prepare_row(self, row, udf_fields, cache, download_cb):
+    def _prepare_row(self, row, udf_fields, catalog, cache, download_cb):
         row_dict = RowDict(zip(udf_fields, row))
-        return self._parse_row(row_dict, cache, download_cb)
+        return self._parse_row(row_dict, catalog, cache, download_cb)

-    def _prepare_row_and_id(self, row, udf_fields, cache, download_cb):
+    def _prepare_row_and_id(self, row, udf_fields, catalog, cache, download_cb):
         row_dict = RowDict(zip(udf_fields, row))
-        udf_input = self._parse_row(row_dict, cache, download_cb)
+        udf_input = self._parse_row(row_dict, catalog, cache, download_cb)
         return row_dict["sys__id"], *udf_input

     def process_safe(self, obj_rows):
@@ -279,13 +286,47 @@
         return result_objs


-async def _prefetch_input(row):
+def noop(*args, **kwargs):
+    pass
+
+
+async def _prefetch_input(
+    row: T,
+    download_cb: Optional["Callback"] = None,
+    after_prefetch: "Callable[[], None]" = noop,
+) -> T:
     for obj in row:
-        if isinstance(obj, File):
-            await obj._prefetch()
+        if isinstance(obj, File) and await obj._prefetch(download_cb):
+            after_prefetch()
     return row


+def _prefetch_inputs(
+    prepared_inputs: "Iterable[T]",
+    prefetch: int = 0,
+    download_cb: Optional["Callback"] = None,
+    after_prefetch: "Callable[[], None]" = noop,
+) -> "abc.Generator[T, None, None]":
+    if prefetch > 0:
+        f = partial(
+            _prefetch_input,
+            download_cb=download_cb,
+            after_prefetch=after_prefetch,
+        )
+        prepared_inputs = AsyncMapper(f, prepared_inputs, workers=prefetch).iterate()  # type: ignore[assignment]
+    yield from prepared_inputs
+
+
+def _get_cache(
+    cache: "Cache", prefetch: int = 0, use_cache: bool = False
+) -> "AbstractContextManager[Cache]":
+    tmp_dir = cache.tmp_dir
+    assert tmp_dir
+    if prefetch and not use_cache:
+        return temporary_cache(tmp_dir, prefix="prefetch-")
+    return nullcontext(cache)
+
+
 class Mapper(UDFBase):
     """Inherit from this class to pass to `DataChain.map()`."""
@@ -300,18 +341,18 @@ class Mapper(UDFBase):
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.catalog = catalog
         self.setup()
-        prepared_inputs: abc.Generator[Sequence[Any], None, None] = (
-            self._prepare_row_and_id(row, udf_fields, cache, download_cb)
-            for row in udf_inputs
-        )
-        if self.prefetch > 0:
-            prepared_inputs = AsyncMapper(
-                _prefetch_input, prepared_inputs, workers=self.prefetch
-            ).iterate()

-        with contextlib.closing(prepared_inputs):
+        def _prepare_rows(udf_inputs) -> "abc.Generator[Sequence[Any], None, None]":
+            with safe_closing(udf_inputs):
+                for row in udf_inputs:
+                    yield self._prepare_row_and_id(
+                        row, udf_fields, catalog, cache, download_cb
+                    )
+
+        prepared_inputs = _prepare_rows(udf_inputs)
+        prepared_inputs = _prefetch_inputs(prepared_inputs, self.prefetch)
+        with closing(prepared_inputs):
             for id_, *udf_args in prepared_inputs:
                 result_objs = self.process_safe(udf_args)
                 udf_output = self._flatten_row(result_objs)
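Both Mapper.run and Generator.run now wrap the upstream iterator in a local _prepare_rows generator guarded by safe_closing, so closing prepared_inputs (the closing(...) block above) propagates through the prefetcher down to the paginated warehouse iterator even when the consumer stops early. safe_closing itself lives in datachain/utils.py (changed in this release but not shown here); a presumed minimal equivalent, offered only as a sketch:

# Presumed behavior only; the real implementation in datachain/utils.py may differ.
from contextlib import contextmanager
from typing import Any, Iterator

@contextmanager
def safe_closing(thing: Any) -> Iterator[Any]:
    try:
        yield thing
    finally:
        # close the wrapped object on exit, but only if it exposes close()
        if hasattr(thing, "close"):
            thing.close()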
@@ -336,14 +377,15 @@ class BatchMapper(UDFBase):
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.catalog = catalog
         self.setup()

         for batch in udf_inputs:
             n_rows = len(batch.rows)
             row_ids, *udf_args = zip(
                 *[
-                    self._prepare_row_and_id(row, udf_fields, cache, download_cb)
+                    self._prepare_row_and_id(
+                        row, udf_fields, catalog, cache, download_cb
+                    )
                     for row in batch.rows
                 ]
             )
@@ -378,17 +420,18 @@ class Generator(UDFBase):
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.catalog = catalog
         self.setup()
-        prepared_inputs: abc.Generator[Sequence[Any], None, None] = (
-            self._prepare_row(row, udf_fields, cache, download_cb) for row in udf_inputs
-        )
-        if self.prefetch > 0:
-            prepared_inputs = AsyncMapper(
-                _prefetch_input, prepared_inputs, workers=self.prefetch
-            ).iterate()

-        with contextlib.closing(prepared_inputs):
+        def _prepare_rows(udf_inputs) -> "abc.Generator[Sequence[Any], None, None]":
+            with safe_closing(udf_inputs):
+                for row in udf_inputs:
+                    yield self._prepare_row(
+                        row, udf_fields, catalog, cache, download_cb
+                    )
+
+        prepared_inputs = _prepare_rows(udf_inputs)
+        prepared_inputs = _prefetch_inputs(prepared_inputs, self.prefetch)
+        with closing(prepared_inputs):
             for row in prepared_inputs:
                 result_objs = self.process_safe(row)
                 udf_outputs = (self._flatten_row(row) for row in result_objs)
@@ -413,13 +456,12 @@ class Aggregator(UDFBase):
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.catalog = catalog
         self.setup()

         for batch in udf_inputs:
             udf_args = zip(
                 *[
-                    self._prepare_row(row, udf_fields, cache, download_cb)
+                    self._prepare_row(row, udf_fields, catalog, cache, download_cb)
                     for row in batch.rows
                 ]
             )
datachain/listing.py CHANGED
@@ -153,6 +153,7 @@ class Listing:
             unit_scale=True,
             unit_divisor=1000,
             total=total_files,
+            leave=False,
         )

         counter = 0
datachain/progress.py CHANGED
@@ -5,6 +5,7 @@ import sys
 from threading import RLock
 from typing import Any, ClassVar

+from fsspec import Callback
 from fsspec.callbacks import TqdmCallback
 from tqdm import tqdm

@@ -61,7 +62,7 @@ class Tqdm(tqdm):
        disable : If (default: None) or False,
            will be determined by logging level.
            May be overridden to `True` due to non-TTY status.
-            Skip override by specifying env var `DVC_IGNORE_ISATTY`.
+            Skip override by specifying env var `DATACHAIN_IGNORE_ISATTY`.
        kwargs : anything accepted by `tqdm.tqdm()`
        """
        kwargs = kwargs.copy()
@@ -77,7 +78,7 @@ class Tqdm(tqdm):
         # auto-disable based on TTY
         if (
             not disable
-            and not env2bool("DVC_IGNORE_ISATTY")
+            and not env2bool("DATACHAIN_IGNORE_ISATTY")
             and hasattr(file, "isatty")
         ):
             disable = not file.isatty()
@@ -132,8 +133,24 @@ class Tqdm(tqdm):
         return d


-class CombinedDownloadCallback(TqdmCallback):
+class CombinedDownloadCallback(Callback):
     def set_size(self, size):
         # This is a no-op to prevent fsspec's .get_file() from setting the combined
         # download size to the size of the current file.
         pass
+
+    def increment_file_count(self, n: int = 1) -> None:
+        pass
+
+
+class TqdmCombinedDownloadCallback(CombinedDownloadCallback, TqdmCallback):
+    def __init__(self, tqdm_kwargs=None, *args, **kwargs):
+        self.files_count = 0
+        tqdm_kwargs = tqdm_kwargs or {}
+        tqdm_kwargs.setdefault("postfix", {}).setdefault("files", self.files_count)
+        super().__init__(tqdm_kwargs, *args, **kwargs)
+
+    def increment_file_count(self, n: int = 1) -> None:
+        self.files_count += n
+        if self.tqdm is not None:
+            self.tqdm.postfix = f"{self.files_count} files"
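CombinedDownloadCallback is now a plain fsspec Callback with a no-op increment_file_count() hook, and the tqdm behaviour moves into TqdmCombinedDownloadCallback, which renders one shared byte-progress bar plus an "N files" postfix. A hedged usage sketch (the byte counts are made up for illustration):

from datachain.progress import TqdmCombinedDownloadCallback

cb = TqdmCombinedDownloadCallback(
    {"desc": "Download", "unit": "B", "unit_scale": True, "leave": False}
)
for size in (1024, 4096, 512):
    cb.relative_update(size)   # bytes from different files accumulate in one bar
    cb.increment_file_count()  # postfix ticks "1 files", "2 files", "3 files"
cb.close()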
datachain/query/dataset.py CHANGED
@@ -35,6 +35,7 @@ from sqlalchemy.sql.schema import TableClause
 from sqlalchemy.sql.selectable import Select

 from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
+from datachain.catalog.catalog import clone_catalog_with_cache
 from datachain.data_storage.schema import (
     PARTITION_COLUMN_ID,
     partition_col_names,
@@ -43,7 +44,8 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
-from datachain.progress import CombinedDownloadCallback
+from datachain.lib.udf import UDFAdapter, _get_cache
+from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param
 from datachain.query.session import Session
 from datachain.sql.functions.random import rand
@@ -52,6 +54,7 @@ from datachain.utils import (
     determine_processes,
     filtered_cloudpickle_dumps,
     get_datachain_executable,
+    safe_closing,
 )

 if TYPE_CHECKING:
@@ -349,19 +352,26 @@ def process_udf_outputs(
     warehouse.insert_rows_done(udf_table)


-def get_download_callback() -> Callback:
-    return CombinedDownloadCallback(
-        {"desc": "Download", "unit": "B", "unit_scale": True, "unit_divisor": 1024}
+def get_download_callback(suffix: str = "", **kwargs) -> CombinedDownloadCallback:
+    return TqdmCombinedDownloadCallback(
+        {
+            "desc": "Download" + suffix,
+            "unit": "B",
+            "unit_scale": True,
+            "unit_divisor": 1024,
+            "leave": False,
+            **kwargs,
+        },
     )


 def get_processed_callback() -> Callback:
-    return TqdmCallback({"desc": "Processed", "unit": " rows"})
+    return TqdmCallback({"desc": "Processed", "unit": " rows", "leave": False})


 def get_generated_callback(is_generator: bool = False) -> Callback:
     if is_generator:
-        return TqdmCallback({"desc": "Generated", "unit": " rows"})
+        return TqdmCallback({"desc": "Generated", "unit": " rows", "leave": False})
     return DEFAULT_CALLBACK
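get_download_callback now returns the tqdm-backed combined callback and accepts a label suffix plus passthrough tqdm kwargs, which lets callers create separately labelled bars (for example for prefetch traffic). Hypothetical calls, not taken from this diff:

from datachain.query.dataset import get_download_callback

main_cb = get_download_callback()                               # bar labelled "Download"
prefetch_cb = get_download_callback(" (prefetch)", position=1)  # "Download (prefetch)" on a second line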
@@ -412,97 +422,109 @@ class UDFStep(Step, ABC):

         udf_fields = [str(c.name) for c in query.selected_columns]

-        try:
-            if workers:
-                if self.catalog.in_memory:
-                    raise RuntimeError(
-                        "In-memory databases cannot be used with "
-                        "distributed processing."
-                    )
+        prefetch = self.udf.prefetch
+        with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
+            catalog = clone_catalog_with_cache(self.catalog, _cache)
+            try:
+                if workers:
+                    if catalog.in_memory:
+                        raise RuntimeError(
+                            "In-memory databases cannot be used with "
+                            "distributed processing."
+                        )

-                from datachain.catalog.loader import get_distributed_class
-
-                distributor = get_distributed_class(min_task_size=self.min_task_size)
-                distributor(
-                    self.udf,
-                    self.catalog,
-                    udf_table,
-                    query,
-                    workers,
-                    processes,
-                    udf_fields=udf_fields,
-                    is_generator=self.is_generator,
-                    use_partitioning=use_partitioning,
-                    cache=self.cache,
-                )
-            elif processes:
-                # Parallel processing (faster for more CPU-heavy UDFs)
-                if self.catalog.in_memory:
-                    raise RuntimeError(
-                        "In-memory databases cannot be used with parallel processing."
-                    )
-                udf_info: UdfInfo = {
-                    "udf_data": filtered_cloudpickle_dumps(self.udf),
-                    "catalog_init": self.catalog.get_init_params(),
-                    "metastore_clone_params": self.catalog.metastore.clone_params(),
-                    "warehouse_clone_params": self.catalog.warehouse.clone_params(),
-                    "table": udf_table,
-                    "query": query,
-                    "udf_fields": udf_fields,
-                    "batching": batching,
-                    "processes": processes,
-                    "is_generator": self.is_generator,
-                    "cache": self.cache,
-                }
-
-                # Run the UDFDispatcher in another process to avoid needing
-                # if __name__ == '__main__': in user scripts
-                exec_cmd = get_datachain_executable()
-                cmd = [*exec_cmd, "internal-run-udf"]
-                envs = dict(os.environ)
-                envs.update({"PYTHONPATH": os.getcwd()})
-                process_data = filtered_cloudpickle_dumps(udf_info)
-
-                with subprocess.Popen(cmd, env=envs, stdin=subprocess.PIPE) as process:  # noqa: S603
-                    process.communicate(process_data)
-                    if retval := process.poll():
-                        raise RuntimeError(f"UDF Execution Failed! Exit code: {retval}")
-            else:
-                # Otherwise process single-threaded (faster for smaller UDFs)
-                warehouse = self.catalog.warehouse
-
-                udf_inputs = batching(warehouse.dataset_select_paginated, query)
-                download_cb = get_download_callback()
-                processed_cb = get_processed_callback()
-                generated_cb = get_generated_callback(self.is_generator)
-                try:
-                    udf_results = self.udf.run(
-                        udf_fields,
-                        udf_inputs,
-                        self.catalog,
-                        self.cache,
-                        download_cb,
-                        processed_cb,
-                    )
-                    process_udf_outputs(
-                        warehouse,
-                        udf_table,
-                        udf_results,
-                        self.udf,
-                        cb=generated_cb,
-                    )
-                finally:
-                    download_cb.close()
-                    processed_cb.close()
-                    generated_cb.close()
-
-        except QueryScriptCancelError:
-            self.catalog.warehouse.close()
-            sys.exit(QUERY_SCRIPT_CANCELED_EXIT_CODE)
-        except (Exception, KeyboardInterrupt):
-            # Close any open database connections if an error is encountered
-            self.catalog.warehouse.close()
-            raise
+                    from datachain.catalog.loader import get_distributed_class
+
+                    distributor = get_distributed_class(
+                        min_task_size=self.min_task_size
+                    )
+                    distributor(
+                        self.udf,
+                        catalog,
+                        udf_table,
+                        query,
+                        workers,
+                        processes,
+                        udf_fields=udf_fields,
+                        is_generator=self.is_generator,
+                        use_partitioning=use_partitioning,
+                        cache=self.cache,
+                    )
+                elif processes:
+                    # Parallel processing (faster for more CPU-heavy UDFs)
+                    if catalog.in_memory:
+                        raise RuntimeError(
+                            "In-memory databases cannot be used "
+                            "with parallel processing."
+                        )
+                    udf_info: UdfInfo = {
+                        "udf_data": filtered_cloudpickle_dumps(self.udf),
+                        "catalog_init": catalog.get_init_params(),
+                        "metastore_clone_params": catalog.metastore.clone_params(),
+                        "warehouse_clone_params": catalog.warehouse.clone_params(),
+                        "table": udf_table,
+                        "query": query,
+                        "udf_fields": udf_fields,
+                        "batching": batching,
+                        "processes": processes,
+                        "is_generator": self.is_generator,
+                        "cache": self.cache,
+                    }
+
+                    # Run the UDFDispatcher in another process to avoid needing
+                    # if __name__ == '__main__': in user scripts
+                    exec_cmd = get_datachain_executable()
+                    cmd = [*exec_cmd, "internal-run-udf"]
+                    envs = dict(os.environ)
+                    envs.update({"PYTHONPATH": os.getcwd()})
+                    process_data = filtered_cloudpickle_dumps(udf_info)
+
+                    with subprocess.Popen(  # noqa: S603
+                        cmd, env=envs, stdin=subprocess.PIPE
+                    ) as process:
+                        process.communicate(process_data)
+                        if retval := process.poll():
+                            raise RuntimeError(
+                                f"UDF Execution Failed! Exit code: {retval}"
+                            )
+                else:
+                    # Otherwise process single-threaded (faster for smaller UDFs)
+                    warehouse = catalog.warehouse
+
+                    udf_inputs = batching(warehouse.dataset_select_paginated, query)
+                    download_cb = get_download_callback()
+                    processed_cb = get_processed_callback()
+                    generated_cb = get_generated_callback(self.is_generator)
+
+                    try:
+                        udf_results = self.udf.run(
+                            udf_fields,
+                            udf_inputs,
+                            catalog,
+                            self.cache,
+                            download_cb,
+                            processed_cb,
+                        )
+                        with safe_closing(udf_results):
+                            process_udf_outputs(
+                                warehouse,
+                                udf_table,
+                                udf_results,
+                                self.udf,
+                                cb=generated_cb,
+                            )
+                    finally:
+                        download_cb.close()
+                        processed_cb.close()
+                        generated_cb.close()
+
+            except QueryScriptCancelError:
+                self.catalog.warehouse.close()
+                sys.exit(QUERY_SCRIPT_CANCELED_EXIT_CODE)
+            except (Exception, KeyboardInterrupt):
+                # Close any open database connections if an error is encountered
+                self.catalog.warehouse.close()
+                raise

     def create_partitions_table(self, query: Select) -> "Table":
         """
@@ -602,6 +624,13 @@ class UDFSignal(UDFStep):
         signal_name_cols = {c.name: c for c in signal_cols}
         cols = signal_cols

+        overlap = {c.name for c in original_cols} & {c.name for c in cols}
+        if overlap:
+            raise ValueError(
+                "Column already exists or added in the previous steps: "
+                + ", ".join(overlap)
+            )
+
         def q(*columns):
             cols1 = []
             cols2 = []
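UDFSignal now refuses to add output columns whose names collide with columns produced by earlier steps, raising immediately instead of building an ambiguous select. The check in isolation, with toy column objects standing in for the sqlalchemy columns used in the real code:

from types import SimpleNamespace as Col

original_cols = [Col(name="id"), Col(name="signal")]
signal_cols = [Col(name="signal"), Col(name="score")]  # "signal" collides

overlap = {c.name for c in original_cols} & {c.name for c in signal_cols}
if overlap:
    raise ValueError(
        "Column already exists or added in the previous steps: " + ", ".join(overlap)
    )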
datachain/query/dispatch.py CHANGED
@@ -14,7 +14,9 @@ from multiprocess import get_context
 from sqlalchemy.sql import func

 from datachain.catalog import Catalog
+from datachain.catalog.catalog import clone_catalog_with_cache
 from datachain.catalog.loader import get_distributed_class
+from datachain.lib.udf import _get_cache
 from datachain.query.batch import RowsOutput, RowsOutputBatch
 from datachain.query.dataset import (
     get_download_callback,
@@ -25,7 +27,7 @@ from datachain.query.dataset import (
 from datachain.query.queue import get_from_queue, put_into_queue
 from datachain.query.udf import UdfInfo
 from datachain.query.utils import get_query_id_column
-from datachain.utils import batched, flatten
+from datachain.utils import batched, flatten, safe_closing

 if TYPE_CHECKING:
     from sqlalchemy import Select, Table
@@ -304,21 +306,25 @@ class UDFWorker:
         processed_cb = ProcessedCallback()
         generated_cb = get_generated_callback(self.is_generator)

-        udf_results = self.udf.run(
-            self.udf_fields,
-            self.get_inputs(),
-            self.catalog,
-            self.cache,
-            download_cb=self.cb,
-            processed_cb=processed_cb,
-        )
-        process_udf_outputs(
-            self.catalog.warehouse,
-            self.table,
-            self.notify_and_process(udf_results, processed_cb),
-            self.udf,
-            cb=generated_cb,
-        )
+        prefetch = self.udf.prefetch
+        with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
+            catalog = clone_catalog_with_cache(self.catalog, _cache)
+            udf_results = self.udf.run(
+                self.udf_fields,
+                self.get_inputs(),
+                catalog,
+                self.cache,
+                download_cb=self.cb,
+                processed_cb=processed_cb,
+            )
+            with safe_closing(udf_results):
+                process_udf_outputs(
+                    catalog.warehouse,
+                    self.table,
+                    self.notify_and_process(udf_results, processed_cb),
+                    self.udf,
+                    cb=generated_cb,
+                )

         put_into_queue(
             self.done_queue,