datachain 0.8.8__py3-none-any.whl → 0.8.9__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.


@@ -1,10 +1,8 @@
  def add_studio_parser(subparsers, parent_parser) -> None:
- studio_help = "Commands to authenticate DataChain with Iterative Studio"
+ studio_help = "Manage Studio authentication"
  studio_description = (
- "Authenticate DataChain with Studio and set the token. "
- "Once this token has been properly configured,\n"
- "DataChain will utilize it for seamlessly sharing datasets\n"
- "and using Studio features from CLI"
+ "Manage authentication and settings for Studio. "
+ "Configure tokens for sharing datasets and using Studio features."
  )

  studio_parser = subparsers.add_parser(
@@ -15,14 +13,13 @@ def add_studio_parser(subparsers, parent_parser) -> None:
  )
  studio_subparser = studio_parser.add_subparsers(
  dest="cmd",
- help="Use `DataChain studio CMD --help` to display command-specific help.",
- required=True,
+ help="Use `datachain studio CMD --help` to display command-specific help",
  )

- studio_login_help = "Authenticate DataChain with Studio host"
+ studio_login_help = "Authenticate with Studio"
  studio_login_description = (
- "By default, this command authenticates the DataChain with Studio\n"
- "using default scopes and assigns a random name as the token name."
+ "Authenticate with Studio using default scopes. "
+ "A random name will be assigned as the token name if not specified."
  )
  login_parser = studio_subparser.add_parser(
  "login",
@@ -36,14 +33,14 @@ def add_studio_parser(subparsers, parent_parser) -> None:
  "--hostname",
  action="store",
  default=None,
- help="The hostname of the Studio instance to authenticate with.",
+ help="Hostname of the Studio instance",
  )
  login_parser.add_argument(
  "-s",
  "--scopes",
  action="store",
  default=None,
- help="The scopes for the authentication token. ",
+ help="Authentication token scopes",
  )

  login_parser.add_argument(
@@ -51,21 +48,20 @@ def add_studio_parser(subparsers, parent_parser) -> None:
  "--name",
  action="store",
  default=None,
- help="The name of the authentication token. It will be used to\n"
- "identify token shown in Studio profile.",
+ help="Authentication token name (shown in Studio profile)",
  )

  login_parser.add_argument(
  "--no-open",
  action="store_true",
  default=False,
- help="Use authentication flow based on user code.\n"
- "You will be presented with user code to enter in browser.\n"
- "DataChain will also use this if it cannot launch browser on your behalf.",
+ help="Use code-based authentication without browser",
  )

- studio_logout_help = "Logout user from Studio"
- studio_logout_description = "This removes the studio token from your global config."
+ studio_logout_help = "Log out from Studio"
+ studio_logout_description = (
+ "Remove the Studio authentication token from global config."
+ )

  studio_subparser.add_parser(
  "logout",
@@ -74,10 +70,8 @@ def add_studio_parser(subparsers, parent_parser) -> None:
  help=studio_logout_help,
  )

- studio_team_help = "Set the default team for DataChain"
- studio_team_description = (
- "Set the default team for DataChain to use when interacting with Studio."
- )
+ studio_team_help = "Set default team for Studio operations"
+ studio_team_description = "Set the default team for Studio operations."

  team_parser = studio_subparser.add_parser(
  "team",
@@ -88,39 +82,21 @@ def add_studio_parser(subparsers, parent_parser) -> None:
  team_parser.add_argument(
  "team_name",
  action="store",
- help="The name of the team to set as the default.",
+ help="Name of the team to set as default",
  )
  team_parser.add_argument(
  "--global",
  action="store_true",
  default=False,
- help="Set the team globally for all DataChain projects.",
+ help="Set team globally for all projects",
  )

- studio_token_help = "View the token datachain uses to contact Studio" # noqa: S105 # nosec B105
+ studio_token_help = "View Studio authentication token" # noqa: S105
+ studio_token_description = "Display the current authentication token for Studio." # noqa: S105

  studio_subparser.add_parser(
  "token",
  parents=[parent_parser],
- description=studio_token_help,
+ description=studio_token_description,
  help=studio_token_help,
  )
-
- studio_ls_dataset_help = "List the available datasets from Studio"
- studio_ls_dataset_description = (
- "This command lists all the datasets available in Studio.\n"
- "It will show the dataset name and the number of versions available."
- )
-
- ls_dataset_parser = studio_subparser.add_parser(
- "dataset",
- parents=[parent_parser],
- description=studio_ls_dataset_description,
- help=studio_ls_dataset_help,
- )
- ls_dataset_parser.add_argument(
- "--team",
- action="store",
- default=None,
- help="The team to list datasets for. By default, it will use team from config.",
- )
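The hunks above only reword help and description strings and drop the `dataset` subcommand from this parser. As a generic illustration of the argparse pattern they use (subparsers with a short `help` and a longer `description`), here is a standalone sketch; it is not datachain's actual CLI wiring, and `build_parser` plus the reduced flag set are assumptions.

import argparse

def build_parser() -> argparse.ArgumentParser:
    # Top-level parser with a "studio" command group, mirroring the pattern above.
    parser = argparse.ArgumentParser(prog="datachain")
    subparsers = parser.add_subparsers(dest="command")

    studio = subparsers.add_parser(
        "studio",
        help="Manage Studio authentication",
        description="Manage authentication and settings for Studio.",
    )
    studio_sub = studio.add_subparsers(
        dest="cmd",
        help="Use `datachain studio CMD --help` to display command-specific help",
    )

    login = studio_sub.add_parser("login", help="Authenticate with Studio")
    login.add_argument("--hostname", default=None, help="Hostname of the Studio instance")
    login.add_argument(
        "--no-open",
        action="store_true",
        help="Use code-based authentication without browser",
    )

    studio_sub.add_parser("logout", help="Log out from Studio")
    return parser

if __name__ == "__main__":
    args = build_parser().parse_args(["studio", "login", "--no-open"])
    print(args.command, args.cmd, args.no_open)  # studio login True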
@@ -30,7 +30,7 @@ def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Act
  "sources",
  type=str,
  nargs=nargs,
- help="Data sources - paths to cloud storage dirs",
+ help="Data sources - paths to cloud storage directories",
  )

datachain/client/local.py CHANGED
@@ -38,7 +38,7 @@ class FileClient(Client):
  def get_uri(cls, name: str) -> "StorageURI":
  from datachain.dataset import StorageURI

- return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')
+ return StorageURI(f"{cls.PREFIX}/{name.removeprefix('/')}")

  @classmethod
  def ls_buckets(cls, **kwargs):
datachain/lib/arrow.py CHANGED
@@ -33,7 +33,7 @@ class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
  # reads the whole file in-memory.
  (uri,) = self.references[path]
  protocol, _ = split_protocol(uri)
- return self.fss[protocol]._open(uri, mode, *args, **kwargs)
+ return self.fss[protocol].open(uri, mode, *args, **kwargs)


  class ArrowGenerator(Generator):
@@ -35,8 +35,7 @@ def unflatten_to_json_pos(
  def _normalize(name: str) -> str:
  if DEFAULT_DELIMITER in name:
  raise RuntimeError(
- f"variable '{name}' cannot be used "
- f"because it contains {DEFAULT_DELIMITER}"
+ f"variable '{name}' cannot be used because it contains {DEFAULT_DELIMITER}"
  )
  return _to_snake_case(name)
datachain/lib/dc.py CHANGED
@@ -11,6 +11,7 @@ from typing import (
  BinaryIO,
  Callable,
  ClassVar,
+ Literal,
  Optional,
  TypeVar,
  Union,
@@ -1276,7 +1277,12 @@ class DataChain:
  yield ret[0] if len(cols) == 1 else tuple(ret)

  def to_pytorch(
- self, transform=None, tokenizer=None, tokenizer_kwargs=None, num_samples=0
+ self,
+ transform=None,
+ tokenizer=None,
+ tokenizer_kwargs=None,
+ num_samples=0,
+ remove_prefetched: bool = False,
  ):
  """Convert to pytorch dataset format.

@@ -1286,6 +1292,7 @@ class DataChain:
  tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
  num_samples (int): Number of random samples to draw for each epoch.
  This argument is ignored if `num_samples=0` (the default).
+ remove_prefetched (bool): Whether to remove prefetched files after reading.

  Example:
  ```py
@@ -1312,6 +1319,7 @@ class DataChain:
  tokenizer_kwargs=tokenizer_kwargs,
  num_samples=num_samples,
  dc_settings=chain._settings,
+ remove_prefetched=remove_prefetched,
  )

  def remove_file_signals(self) -> "Self": # noqa: D102
@@ -2415,11 +2423,22 @@ class DataChain:
  def export_files(
  self,
  output: str,
- signal="file",
+ signal: str = "file",
  placement: FileExportPlacement = "fullpath",
  use_cache: bool = True,
+ link_type: Literal["copy", "symlink"] = "copy",
  ) -> None:
- """Method that exports all files from chain to some folder."""
+ """Export files from a specified signal to a directory.
+
+ Args:
+ output: Path to the target directory for exporting files.
+ signal: Name of the signal to export files from.
+ placement: The method to use for naming exported files.
+ The possible values are: "filename", "etag", "fullpath", and "checksum".
+ use_cache: If `True`, cache the files before exporting.
+ link_type: Method to use for exporting files.
+ Falls back to `'copy'` if symlinking fails.
+ """
  if placement == "filename" and (
  self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
  != self._query.count()
@@ -2427,7 +2446,7 @@ class DataChain:
  raise ValueError("Files with the same name found")

  for file in self.collect(signal):
- file.export(output, placement, use_cache) # type: ignore[union-attr]
+ file.export(output, placement, use_cache, link_type=link_type) # type: ignore[union-attr]

  def shuffle(self) -> "Self":
  """Shuffle the rows of the chain deterministically."""
datachain/lib/file.py CHANGED
@@ -1,3 +1,4 @@
+ import errno
  import hashlib
  import io
  import json
@@ -76,18 +77,18 @@ class TarVFile(VFile):
  def open(cls, file: "File", location: list[dict]):
  """Stream file from tar archive based on location in archive."""
  if len(location) > 1:
- VFileError(file, "multiple 'location's are not supported yet")
+ raise VFileError(file, "multiple 'location's are not supported yet")

  loc = location[0]

  if (offset := loc.get("offset", None)) is None:
- VFileError(file, "'offset' is not specified")
+ raise VFileError(file, "'offset' is not specified")

  if (size := loc.get("size", None)) is None:
- VFileError(file, "'size' is not specified")
+ raise VFileError(file, "'size' is not specified")

  if (parent := loc.get("parent", None)) is None:
- VFileError(file, "'parent' is not specified")
+ raise VFileError(file, "'parent' is not specified")

  tar_file = File(**parent)
  tar_file._set_stream(file._catalog)
@@ -236,11 +237,26 @@ class File(DataModel):
  with open(destination, mode="wb") as f:
  f.write(self.read())

+ def _symlink_to(self, destination: str):
+ if self.location:
+ raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")
+
+ if self._caching_enabled:
+ self.ensure_cached()
+ source = self.get_local_path()
+ assert source, "File was not cached"
+ elif self.source.startswith("file://"):
+ source = self.get_path()
+ else:
+ raise OSError(errno.EXDEV, "can't link across filesystems")
+ return os.symlink(source, destination)
+
  def export(
  self,
  output: str,
  placement: ExportPlacement = "fullpath",
  use_cache: bool = True,
+ link_type: Literal["copy", "symlink"] = "copy",
  ) -> None:
  """Export file to new location."""
  if use_cache:
@@ -249,6 +265,13 @@ class File(DataModel):
  dst_dir = os.path.dirname(dst)
  os.makedirs(dst_dir, exist_ok=True)

+ if link_type == "symlink":
+ try:
+ return self._symlink_to(dst)
+ except OSError as exc:
+ if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
+ raise
+
  self.save(dst)

  def _set_stream(
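The new `_symlink_to` helper and the `export` change implement a symlink-first strategy that falls back to copying when the OS reports the link cannot be made. A standalone sketch of that errno-based pattern (illustrative only; `place_file` is a made-up helper, not datachain code):

import errno
import os
import shutil

def place_file(source: str, destination: str, link_type: str = "copy") -> None:
    # Try to symlink first; fall back to copying when the OS or filesystem
    # cannot satisfy the request (unsupported op, cross-device link, etc.).
    if link_type == "symlink":
        try:
            os.symlink(source, destination)
            return
        except OSError as exc:
            if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
                raise  # unrelated failure: surface it
    shutil.copy2(source, destination)  # default path, or symlink fallback

Only `ENOTSUP`, `EXDEV`, and `ENOSYS` trigger the fallback; any other `OSError` is re-raised so genuine failures are not silently converted into copies.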
datachain/lib/listing.py CHANGED
@@ -113,14 +113,14 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], st
  telemetry.log_param("client", client.PREFIX)

  if not uri.endswith("/") and _isfile(client, uri):
- return None, f'{storage_uri}/{path.lstrip("/")}', path
+ return None, f"{storage_uri}/{path.lstrip('/')}", path
  if uses_glob(path):
  lst_uri_path = posixpath.dirname(path)
  else:
- storage_uri, path = Client.parse_url(f'{uri.rstrip("/")}/')
+ storage_uri, path = Client.parse_url(f"{uri.rstrip('/')}/")
  lst_uri_path = path

- lst_uri = f'{storage_uri}/{lst_uri_path.lstrip("/")}'
+ lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
  ds_name = (
  f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
  )
@@ -180,7 +180,7 @@ def get_listing(
  # for local file system we need to fix listing path / prefix
  # if we are reusing existing listing
  if isinstance(client, FileClient) and listing and listing.name != ds_name:
- list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
+ list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"

  ds_name = listing.name if listing else ds_name

datachain/lib/pytorch.py CHANGED
@@ -50,6 +50,7 @@ class PytorchDataset(IterableDataset):
  tokenizer_kwargs: Optional[dict[str, Any]] = None,
  num_samples: int = 0,
  dc_settings: Optional[Settings] = None,
+ remove_prefetched: bool = False,
  ):
  """
  Pytorch IterableDataset that streams DataChain datasets.
@@ -84,6 +85,7 @@ class PytorchDataset(IterableDataset):

  self._cache = catalog.cache
  self._prefetch_cache: Optional[Cache] = None
+ self._remove_prefetched = remove_prefetched
  if prefetch and not self.cache:
  tmp_dir = catalog.cache.tmp_dir
  assert tmp_dir
@@ -147,7 +149,7 @@ class PytorchDataset(IterableDataset):
  rows,
  self.prefetch,
  download_cb=download_cb,
- after_prefetch=download_cb.increment_file_count,
+ remove_prefetched=self._remove_prefetched,
  )

  with download_cb, closing(rows):
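Seen from the user side, the new constructor argument is exposed through `DataChain.to_pytorch` (see the dc.py hunk above). A minimal sketch, assuming a placeholder storage URI and a standard `torch` `DataLoader`:

from datachain import DataChain
from torch.utils.data import DataLoader

# With remove_prefetched=True, prefetched files are deleted from the temporary
# cache after each sample is read, keeping disk usage bounded during training.
chain = DataChain.from_storage("s3://example-bucket/images/")
dataset = chain.to_pytorch(remove_prefetched=True)

loader = DataLoader(dataset, batch_size=16)
for batch in loader:
    ...  # training step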
datachain/lib/udf.py CHANGED
@@ -16,6 +16,7 @@ from datachain.lib.convert.flatten import flatten
  from datachain.lib.data_model import DataValue
  from datachain.lib.file import File
  from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
+ from datachain.progress import CombinedDownloadCallback
  from datachain.query.batch import (
  Batch,
  BatchingStrategy,
@@ -301,20 +302,42 @@ async def _prefetch_input(
  return row


+ def _remove_prefetched(row: T) -> None:
+ for obj in row:
+ if isinstance(obj, File):
+ catalog = obj._catalog
+ assert catalog is not None
+ try:
+ catalog.cache.remove(obj)
+ except Exception as e: # noqa: BLE001
+ print(f"Failed to remove prefetched item {obj.name!r}: {e!s}")
+
+
  def _prefetch_inputs(
  prepared_inputs: "Iterable[T]",
  prefetch: int = 0,
  download_cb: Optional["Callback"] = None,
- after_prefetch: "Callable[[], None]" = noop,
+ after_prefetch: Optional[Callable[[], None]] = None,
+ remove_prefetched: bool = False,
  ) -> "abc.Generator[T, None, None]":
- if prefetch > 0:
- f = partial(
- _prefetch_input,
- download_cb=download_cb,
- after_prefetch=after_prefetch,
- )
- prepared_inputs = AsyncMapper(f, prepared_inputs, workers=prefetch).iterate() # type: ignore[assignment]
- yield from prepared_inputs
+ if not prefetch:
+ yield from prepared_inputs
+ return
+
+ if after_prefetch is None:
+ after_prefetch = noop
+ if isinstance(download_cb, CombinedDownloadCallback):
+ after_prefetch = download_cb.increment_file_count
+
+ f = partial(_prefetch_input, download_cb=download_cb, after_prefetch=after_prefetch)
+ mapper = AsyncMapper(f, prepared_inputs, workers=prefetch)
+ with closing(mapper.iterate()) as row_iter:
+ for row in row_iter:
+ try:
+ yield row # type: ignore[misc]
+ finally:
+ if remove_prefetched:
+ _remove_prefetched(row)


  def _get_cache(
@@ -351,7 +374,13 @@ class Mapper(UDFBase):
  )

  prepared_inputs = _prepare_rows(udf_inputs)
- prepared_inputs = _prefetch_inputs(prepared_inputs, self.prefetch)
+ prepared_inputs = _prefetch_inputs(
+ prepared_inputs,
+ self.prefetch,
+ download_cb=download_cb,
+ remove_prefetched=bool(self.prefetch) and not cache,
+ )
+
  with closing(prepared_inputs):
  for id_, *udf_args in prepared_inputs:
  result_objs = self.process_safe(udf_args)
@@ -391,9 +420,9 @@ class BatchMapper(UDFBase):
  )
  result_objs = list(self.process_safe(udf_args))
  n_objs = len(result_objs)
- assert (
- n_objs == n_rows
- ), f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
+ assert n_objs == n_rows, (
+ f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
+ )
  udf_outputs = (self._flatten_row(row) for row in result_objs)
  output = [
  {"sys__id": row_id} | dict(zip(self.signal_names, signals))
@@ -429,15 +458,22 @@ class Generator(UDFBase):
  row, udf_fields, catalog, cache, download_cb
  )

+ def _process_row(row):
+ with safe_closing(self.process_safe(row)) as result_objs:
+ for result_obj in result_objs:
+ udf_output = self._flatten_row(result_obj)
+ yield dict(zip(self.signal_names, udf_output))
+
  prepared_inputs = _prepare_rows(udf_inputs)
- prepared_inputs = _prefetch_inputs(prepared_inputs, self.prefetch)
+ prepared_inputs = _prefetch_inputs(
+ prepared_inputs,
+ self.prefetch,
+ download_cb=download_cb,
+ remove_prefetched=bool(self.prefetch) and not cache,
+ )
  with closing(prepared_inputs):
- for row in prepared_inputs:
- result_objs = self.process_safe(row)
- udf_outputs = (self._flatten_row(row) for row in result_objs)
- output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
- processed_cb.relative_update(1)
- yield output
+ for row in processed_cb.wrap(prepared_inputs):
+ yield _process_row(row)

  self.teardown()
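The reworked `_prefetch_inputs` wraps each yielded row in `try`/`finally` so prefetched files are removed only after the consumer is done with them, and closes the mapper's iterator even if iteration stops early. A standalone sketch of that generator pattern (illustrative only; `fetch` and `discard` are hypothetical callables, not datachain APIs):

from contextlib import closing
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar("T")

def stream_with_cleanup(
    items: Iterable[T],
    fetch: Callable[[T], T],
    discard: Callable[[T], None],
) -> Iterator[T]:
    # Yield each fetched item, then discard it once the consumer moves on,
    # mirroring the prefetch/remove flow above.
    it = (fetch(item) for item in items)
    with closing(it):
        for item in it:
            try:
                yield item
            finally:
                discard(item)

Because the cleanup runs in `finally`, it also fires when the consuming loop breaks or the generator is closed early.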
 
datachain/model/bbox.py CHANGED
@@ -22,9 +22,9 @@ class BBox(DataModel):
  @staticmethod
  def from_list(coords: list[float], title: str = "") -> "BBox":
  assert len(coords) == 4, "Bounding box must be a list of 4 coordinates."
- assert all(
- isinstance(value, (int, float)) for value in coords
- ), "Bounding box coordinates must be floats or integers."
+ assert all(isinstance(value, (int, float)) for value in coords), (
+ "Bounding box coordinates must be floats or integers."
+ )
  return BBox(
  title=title,
  coords=[round(c) for c in coords],
@@ -64,12 +64,12 @@ class OBBox(DataModel):

  @staticmethod
  def from_list(coords: list[float], title: str = "") -> "OBBox":
- assert (
- len(coords) == 8
- ), "Oriented bounding box must be a list of 8 coordinates."
- assert all(
- isinstance(value, (int, float)) for value in coords
- ), "Oriented bounding box coordinates must be floats or integers."
+ assert len(coords) == 8, (
+ "Oriented bounding box must be a list of 8 coordinates."
+ )
+ assert all(isinstance(value, (int, float)) for value in coords), (
+ "Oriented bounding box coordinates must be floats or integers."
+ )
  return OBBox(
  title=title,
  coords=[round(c) for c in coords],
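These model hunks only reformat the assert statements; `from_list` behaves as before. For reference, a small usage sketch of the validation they enforce (the coordinate values are arbitrary):

from datachain.model.bbox import BBox, OBBox

# Axis-aligned box: exactly 4 numeric coords, rounded to ints on construction.
box = BBox.from_list([10.2, 20.8, 110.0, 220.5], title="cat")

# Oriented box: exactly 8 numeric coords (4 corner points).
obox = OBBox.from_list([10, 20, 110, 20, 110, 220, 10, 220], title="cat")

# Anything else trips the asserts shown above, e.g.:
# BBox.from_list([10, 20, 110])  # AssertionError: must be a list of 4 coordinates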
datachain/model/pose.py CHANGED
@@ -22,9 +22,9 @@ class Pose(DataModel):
  def from_list(points: list[list[float]]) -> "Pose":
  assert len(points) == 2, "Pose must be a list of 2 lists: x and y coordinates."
  points_x, points_y = points
- assert (
- len(points_x) == len(points_y) == 17
- ), "Pose x and y coordinates must have the same length of 17."
+ assert len(points_x) == len(points_y) == 17, (
+ "Pose x and y coordinates must have the same length of 17."
+ )
  assert all(
  isinstance(value, (int, float)) for value in [*points_x, *points_y]
  ), "Pose coordinates must be floats or integers."
@@ -61,13 +61,13 @@ class Pose3D(DataModel):

  @staticmethod
  def from_list(points: list[list[float]]) -> "Pose3D":
- assert (
- len(points) == 3
- ), "Pose3D must be a list of 3 lists: x, y coordinates and visible."
+ assert len(points) == 3, (
+ "Pose3D must be a list of 3 lists: x, y coordinates and visible."
+ )
  points_x, points_y, points_v = points
- assert (
- len(points_x) == len(points_y) == len(points_v) == 17
- ), "Pose3D x, y coordinates and visible must have the same length of 17."
+ assert len(points_x) == len(points_y) == len(points_v) == 17, (
+ "Pose3D x, y coordinates and visible must have the same length of 17."
+ )
  assert all(
  isinstance(value, (int, float))
  for value in [*points_x, *points_y, *points_v]
@@ -22,13 +22,13 @@ class Segment(DataModel):

  @staticmethod
  def from_list(points: list[list[float]], title: str = "") -> "Segment":
- assert (
- len(points) == 2
- ), "Segment must be a list of 2 lists: x and y coordinates."
+ assert len(points) == 2, (
+ "Segment must be a list of 2 lists: x and y coordinates."
+ )
  points_x, points_y = points
- assert len(points_x) == len(
- points_y
- ), "Segment x and y coordinates must have the same length."
+ assert len(points_x) == len(points_y), (
+ "Segment x and y coordinates must have the same length."
+ )
  assert all(
  isinstance(value, (int, float)) for value in [*points_x, *points_y]
  ), "Segment coordinates must be floats or integers."
datachain/progress.py CHANGED
@@ -1,14 +1,5 @@
- """Manages progress bars."""
-
- import logging
- from threading import RLock
-
  from fsspec import Callback
  from fsspec.callbacks import TqdmCallback
- from tqdm.auto import tqdm
-
- logger = logging.getLogger(__name__)
- tqdm.set_lock(RLock())


  class CombinedDownloadCallback(Callback):
@@ -24,10 +15,6 @@ class CombinedDownloadCallback(Callback):
  class TqdmCombinedDownloadCallback(CombinedDownloadCallback, TqdmCallback):
  def __init__(self, tqdm_kwargs=None, *args, **kwargs):
  self.files_count = 0
- tqdm_kwargs = tqdm_kwargs or {}
- tqdm_kwargs.setdefault("postfix", {}).setdefault("files", self.files_count)
- kwargs = kwargs or {}
- kwargs["tqdm_cls"] = tqdm
  super().__init__(tqdm_kwargs, *args, **kwargs)

  def increment_file_count(self, n: int = 1) -> None:
@@ -336,15 +336,16 @@ def process_udf_outputs(
  for udf_output in udf_results:
  if not udf_output:
  continue
- for row in udf_output:
- cb.relative_update()
- rows.append(adjust_outputs(warehouse, row, udf_col_types))
- if len(rows) >= batch_size or (
- len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
- ):
- for row_chunk in batched(rows, batch_size):
- warehouse.insert_rows(udf_table, row_chunk)
- rows.clear()
+ with safe_closing(udf_output):
+ for row in udf_output:
+ cb.relative_update()
+ rows.append(adjust_outputs(warehouse, row, udf_col_types))
+ if len(rows) >= batch_size or (
+ len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
+ ):
+ for row_chunk in batched(rows, batch_size):
+ warehouse.insert_rows(udf_table, row_chunk)
+ rows.clear()

  if rows:
  for row_chunk in batched(rows, batch_size):
@@ -355,7 +356,7 @@ def process_udf_outputs(

  def get_download_callback(suffix: str = "", **kwargs) -> CombinedDownloadCallback:
  return TqdmCombinedDownloadCallback(
- {
+ tqdm_kwargs={
  "desc": "Download" + suffix,
  "unit": "B",
  "unit_scale": True,
@@ -363,6 +364,7 @@ def get_download_callback(suffix: str = "", **kwargs) -> CombinedDownloadCallbac
  "leave": False,
  **kwargs,
  },
+ tqdm_cls=tqdm,
  )

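The last two hunks pass `tqdm_kwargs` and `tqdm_cls` explicitly at the call site instead of patching them inside `__init__`. fsspec's `TqdmCallback` accepts both, as in this standalone sketch (the URL and local filename are placeholders):

import fsspec
from fsspec.callbacks import TqdmCallback
from tqdm.auto import tqdm

# Progress bar for a download, configured the same way get_download_callback
# now does it: tqdm options via tqdm_kwargs, tqdm implementation via tqdm_cls.
callback = TqdmCallback(
    tqdm_kwargs={"desc": "Download", "unit": "B", "unit_scale": True, "leave": False},
    tqdm_cls=tqdm,
)

fs = fsspec.filesystem("http")
fs.get("https://example.com/data.bin", "data.bin", callback=callback)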