datachain 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release.



@@ -8,6 +8,8 @@ from datachain.sql.functions import conditional
 
 from .func import ColT, Func
 
+CaseT = Union[int, float, complex, bool, str]
+
 
 def greatest(*args: Union[ColT, float]) -> Func:
     """
@@ -85,9 +87,7 @@ def least(*args: Union[ColT, float]) -> Func:
     )
 
 
-def case(
-    *args: tuple[BinaryExpression, Union[int, float, complex, bool, str]], else_=None
-) -> Func:
+def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func:
     """
     Returns the case function that produces case expression which has a list of
     conditions and corresponding results. Results can only be python primitives
@@ -108,26 +108,48 @@ def case(
             res=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
         )
         ```
-
-    Note:
-        - Result column will always be of the same type as the input columns.
     """
     supported_types = [int, float, complex, str, bool]
 
     type_ = type(else_) if else_ else None
 
     if not args:
-        raise DataChainParamsError("Missing case statements")
+        raise DataChainParamsError("Missing statements")
 
     for arg in args:
         if type_ and not isinstance(arg[1], type_):
-            raise DataChainParamsError("Case statement values must be of the same type")
+            raise DataChainParamsError("Statement values must be of the same type")
         type_ = type(arg[1])
 
         if type_ not in supported_types:
            raise DataChainParamsError(
-                f"Case supports only python literals ({supported_types}) for values"
+                f"Only python literals ({supported_types}) are supported for values"
            )
 
    kwargs = {"else_": else_}
    return Func("case", inner=sql_case, args=args, kwargs=kwargs, result_type=type_)
+
+
+def ifelse(condition: BinaryExpression, if_val: CaseT, else_val: CaseT) -> Func:
+    """
+    Returns the ifelse function that produces if expression which has a condition
+    and values for true and false outcome. Results can only be python primitives
+    like string, numbes or booleans. Result type is inferred from the values.
+
+    Args:
+        condition: BinaryExpression - condition which is evaluated
+        if_val: (str | int | float | complex | bool): value for true condition outcome
+        else_val: (str | int | float | complex | bool): value for false condition
+            outcome
+
+    Returns:
+        Func: A Func object that represents the ifelse function.
+
+    Example:
+        ```py
+        dc.mutate(
+            res=func.ifelse(C("num") > 0, "P", "N"),
+        )
+        ```
+    """
+    return case((condition, if_val), else_=else_val)
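
The hunks above add a `CaseT` alias, tighten the `case()` signature, and introduce an `ifelse()` helper built on top of `case()`. Below is a minimal usage sketch based on the docstring examples in the diff; the sample column `num` is made up and import paths may vary slightly between datachain versions.

```py
from datachain import C, DataChain, func

# hypothetical in-memory chain with a numeric column
chain = DataChain.from_values(num=[-2, 0, 3])

chain = chain.mutate(
    sign=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
    positive=func.ifelse(C("num") > 0, "yes", "no"),  # sugar for a single-branch case
)
```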
datachain/lib/arrow.py CHANGED
@@ -91,7 +91,9 @@ class ArrowGenerator(Generator):
             yield from record_batch.to_pylist()
 
         it = islice(iter_records(), self.nrows)
-        with tqdm(it, desc="Parsed by pyarrow", unit="rows", total=self.nrows) as pbar:
+        with tqdm(
+            it, desc="Parsed by pyarrow", unit="rows", total=self.nrows, leave=False
+        ) as pbar:
             for index, record in enumerate(pbar):
                 yield self._process_record(
                     record, file, index, hf_schema, use_datachain_schema
datachain/lib/dc.py CHANGED
@@ -451,6 +451,7 @@ class DataChain:
             return dc
 
         if update or not list_ds_exists:
+            # disable prefetch for listing, as it pre-downloads all files
             (
                 cls.from_records(
                     DataChain.DEFAULT_FILE_RECORD,
@@ -458,6 +459,7 @@
                     settings=settings,
                     in_memory=in_memory,
                 )
+                .settings(prefetch=0)
                 .gen(
                     list_bucket(list_uri, cache, client_config=client_config),
                     output={f"{object_name}": File},
@@ -1534,7 +1536,7 @@
 
         Example:
             ```py
-            diff = persons.diff(
+            res = persons.compare(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
@@ -1547,9 +1549,9 @@
             )
             ```
         """
-        from datachain.lib.diff import compare as chain_compare
+        from datachain.diff import _compare
 
-        return chain_compare(
+        return _compare(
             self,
             other,
             on,
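
Two behavioral notes from the `dc.py` hunks: listings now run with `.settings(prefetch=0)` so files are not pre-downloaded while a bucket is being listed, and the docstring example now calls `compare()`, which delegates to `datachain.diff._compare`. A sketch of the documented call, where `persons` and `new_persons` are hypothetical chains keyed by an id column:

```py
res = persons.compare(
    new_persons,
    on=["id"],             # key column on the left chain
    right_on=["other_id"], # matching key column on the right chain
)
```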
datachain/lib/file.py CHANGED
@@ -269,10 +269,21 @@ class File(DataModel):
         client = self._catalog.get_client(self.source)
         client.download(self, callback=self._download_cb)
 
-    async def _prefetch(self) -> None:
-        if self._caching_enabled:
-            client = self._catalog.get_client(self.source)
-            await client._download(self, callback=self._download_cb)
+    async def _prefetch(self, download_cb: Optional["Callback"] = None) -> bool:
+        from datachain.client.hf import HfClient
+
+        if self._catalog is None:
+            raise RuntimeError("cannot prefetch file because catalog is not setup")
+
+        client = self._catalog.get_client(self.source)
+        if client.protocol == HfClient.protocol:
+            return False
+
+        await client._download(self, callback=download_cb or self._download_cb)
+        self._set_stream(
+            self._catalog, caching_enabled=True, download_cb=DEFAULT_CALLBACK
+        )
+        return True
 
     def get_local_path(self) -> Optional[str]:
         """Return path to a file in a local cache.
datachain/lib/hf.py CHANGED
@@ -95,7 +95,7 @@ class HFGenerator(Generator):
         ds = self.ds_dict[split]
         if split:
             desc += f" split '{split}'"
-        with tqdm(desc=desc, unit=" rows") as pbar:
+        with tqdm(desc=desc, unit=" rows", leave=False) as pbar:
             for row in ds:
                 output_dict = {}
                 if split and "split" in self.output_schema.model_fields:
datachain/lib/pytorch.py CHANGED
@@ -1,5 +1,8 @@
 import logging
-from collections.abc import Iterator
+import os
+import weakref
+from collections.abc import Generator, Iterable, Iterator
+from contextlib import closing
 from typing import TYPE_CHECKING, Any, Callable, Optional
 
 from PIL import Image
@@ -9,15 +12,19 @@ from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
 
 from datachain import Session
-from datachain.asyn import AsyncMapper
+from datachain.cache import get_temp_cache
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
 from datachain.lib.settings import Settings
 from datachain.lib.text import convert_text
+from datachain.progress import CombinedDownloadCallback
+from datachain.query.dataset import get_download_callback
 
 if TYPE_CHECKING:
     from torchvision.transforms.v2 import Transform
 
+    from datachain.cache import DataChainCache as Cache
+
 
 
 logger = logging.getLogger("datachain")
@@ -75,6 +82,19 @@ class PytorchDataset(IterableDataset):
         if (prefetch := dc_settings.prefetch) is not None:
             self.prefetch = prefetch
 
+        self._cache = catalog.cache
+        self._prefetch_cache: Optional[Cache] = None
+        if prefetch and not self.cache:
+            tmp_dir = catalog.cache.tmp_dir
+            assert tmp_dir
+            self._prefetch_cache = get_temp_cache(tmp_dir, prefix="prefetch-")
+            self._cache = self._prefetch_cache
+            weakref.finalize(self, self._prefetch_cache.destroy)
+
+    def close(self) -> None:
+        if self._prefetch_cache:
+            self._prefetch_cache.destroy()
+
     def _init_catalog(self, catalog: "Catalog"):
         # For compatibility with multiprocessing,
         # we can only store params in __init__(), as Catalog isn't picklable
@@ -89,9 +109,15 @@
         ms = ms_cls(*ms_args, **ms_kwargs)
         wh_cls, wh_args, wh_kwargs = self._wh_params
         wh = wh_cls(*wh_args, **wh_kwargs)
-        return Catalog(ms, wh, **self._catalog_params)
+        catalog = Catalog(ms, wh, **self._catalog_params)
+        catalog.cache = self._cache
+        return catalog
 
-    def _rows_iter(self, total_rank: int, total_workers: int):
+    def _row_iter(
+        self,
+        total_rank: int,
+        total_workers: int,
+    ) -> Generator[tuple[Any, ...], None, None]:
         catalog = self._get_catalog()
         session = Session("PyTorch", catalog=catalog)
         ds = DataChain.from_dataset(
@@ -104,16 +130,34 @@
             ds = ds.chunk(total_rank, total_workers)
         yield from ds.collect()
 
-    def __iter__(self) -> Iterator[Any]:
-        total_rank, total_workers = self.get_rank_and_workers()
-        rows = self._rows_iter(total_rank, total_workers)
-        if self.prefetch > 0:
-            from datachain.lib.udf import _prefetch_input
-
-            rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
-        yield from map(self._process_row, rows)
+    def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
+        from datachain.lib.udf import _prefetch_inputs
 
-    def _process_row(self, row_features):
+        total_rank, total_workers = self.get_rank_and_workers()
+        download_cb = CombinedDownloadCallback()
+        if os.getenv("DATACHAIN_SHOW_PREFETCH_PROGRESS"):
+            download_cb = get_download_callback(
+                f"{total_rank}/{total_workers}",
+                position=total_rank,
+                leave=True,
+            )
+
+        rows = self._row_iter(total_rank, total_workers)
+        rows = _prefetch_inputs(
+            rows,
+            self.prefetch,
+            download_cb=download_cb,
+            after_prefetch=download_cb.increment_file_count,
+        )
+
+        with download_cb, closing(rows):
+            yield from rows
+
+    def __iter__(self) -> Iterator[list[Any]]:
+        with closing(self._iter_with_prefetch()) as rows:
+            yield from map(self._process_row, rows)
+
+    def _process_row(self, row_features: Iterable[Any]) -> list[Any]:
         row = []
         for fr in row_features:
             if hasattr(fr, "read"):
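
The `pytorch.py` changes move prefetching into a dedicated `_iter_with_prefetch()` step backed by a temporary cache that is destroyed when the dataset is garbage-collected or explicitly closed, with per-worker download progress available behind the `DATACHAIN_SHOW_PREFETCH_PROGRESS` environment variable. A hedged sketch of how this is typically driven from user code; the dataset name and loader parameters are invented, while `to_pytorch()` and `settings(prefetch=...)` are existing datachain APIs:

```py
import os

from torch.utils.data import DataLoader

from datachain import DataChain

os.environ["DATACHAIN_SHOW_PREFETCH_PROGRESS"] = "1"  # opt in to per-worker progress bars

ds = (
    DataChain.from_dataset("images")    # hypothetical dataset name
    .settings(prefetch=8, cache=False)  # prefetch into a temporary cache
    .to_pytorch()
)

loader = DataLoader(ds, batch_size=16, num_workers=2)
for batch in loader:
    ...
ds.close()  # new in this release: drops the temporary prefetch cache explicitly
```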
datachain/lib/udf.py CHANGED
@@ -1,14 +1,16 @@
-import contextlib
 import sys
 import traceback
-from collections.abc import Iterable, Iterator, Mapping, Sequence
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
+from contextlib import closing, nullcontext
+from functools import partial
+from typing import TYPE_CHECKING, Any, Optional, TypeVar
 
 import attrs
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from pydantic import BaseModel
 
 from datachain.asyn import AsyncMapper
+from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.data_model import DataValue
@@ -21,17 +23,22 @@ from datachain.query.batch import (
     Partition,
     RowsOutputBatch,
 )
+from datachain.utils import safe_closing
 
 if TYPE_CHECKING:
     from collections import abc
+    from contextlib import AbstractContextManager
 
     from typing_extensions import Self
 
+    from datachain.cache import DataChainCache as Cache
     from datachain.catalog import Catalog
     from datachain.lib.signal_schema import SignalSchema
     from datachain.lib.udf_signature import UdfSignature
     from datachain.query.batch import RowsOutput
 
+T = TypeVar("T", bound=Sequence[Any])
+
 
 class UdfError(DataChainParamsError):
     def __init__(self, msg):
@@ -98,6 +105,10 @@ class UDFAdapter:
             processed_cb,
         )
 
+    @property
+    def prefetch(self) -> int:
+        return self.inner.prefetch
+
 
 class UDFBase(AbstractUDF):
     """Base class for stateful user-defined functions.
@@ -148,12 +159,11 @@
     """
 
     is_output_batched = False
-    catalog: "Optional[Catalog]"
+    prefetch: int = 0
 
     def __init__(self):
         self.params: Optional[SignalSchema] = None
         self.output = None
-        self.catalog = None
         self._func = None
 
     def process(self, *args, **kwargs):
@@ -242,26 +252,23 @@
         return flatten(obj) if isinstance(obj, BaseModel) else [obj]
 
     def _parse_row(
-        self, row_dict: RowDict, cache: bool, download_cb: Callback
+        self, row_dict: RowDict, catalog: "Catalog", cache: bool, download_cb: Callback
     ) -> list[DataValue]:
         assert self.params
         row = [row_dict[p] for p in self.params.to_udf_spec()]
         obj_row = self.params.row_to_objs(row)
         for obj in obj_row:
             if isinstance(obj, File):
-                assert self.catalog is not None
-                obj._set_stream(
-                    self.catalog, caching_enabled=cache, download_cb=download_cb
-                )
+                obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
         return obj_row
 
-    def _prepare_row(self, row, udf_fields, cache, download_cb):
+    def _prepare_row(self, row, udf_fields, catalog, cache, download_cb):
         row_dict = RowDict(zip(udf_fields, row))
-        return self._parse_row(row_dict, cache, download_cb)
+        return self._parse_row(row_dict, catalog, cache, download_cb)
 
-    def _prepare_row_and_id(self, row, udf_fields, cache, download_cb):
+    def _prepare_row_and_id(self, row, udf_fields, catalog, cache, download_cb):
         row_dict = RowDict(zip(udf_fields, row))
-        udf_input = self._parse_row(row_dict, cache, download_cb)
+        udf_input = self._parse_row(row_dict, catalog, cache, download_cb)
         return row_dict["sys__id"], *udf_input
 
     def process_safe(self, obj_rows):
@@ -279,13 +286,47 @@
         return result_objs
 
 
-async def _prefetch_input(row):
+def noop(*args, **kwargs):
+    pass
+
+
+async def _prefetch_input(
+    row: T,
+    download_cb: Optional["Callback"] = None,
+    after_prefetch: "Callable[[], None]" = noop,
+) -> T:
     for obj in row:
-        if isinstance(obj, File):
-            await obj._prefetch()
+        if isinstance(obj, File) and await obj._prefetch(download_cb):
+            after_prefetch()
     return row
 
 
+def _prefetch_inputs(
+    prepared_inputs: "Iterable[T]",
+    prefetch: int = 0,
+    download_cb: Optional["Callback"] = None,
+    after_prefetch: "Callable[[], None]" = noop,
+) -> "abc.Generator[T, None, None]":
+    if prefetch > 0:
+        f = partial(
+            _prefetch_input,
+            download_cb=download_cb,
+            after_prefetch=after_prefetch,
+        )
+        prepared_inputs = AsyncMapper(f, prepared_inputs, workers=prefetch).iterate()  # type: ignore[assignment]
+    yield from prepared_inputs
+
+
+def _get_cache(
+    cache: "Cache", prefetch: int = 0, use_cache: bool = False
+) -> "AbstractContextManager[Cache]":
+    tmp_dir = cache.tmp_dir
+    assert tmp_dir
+    if prefetch and not use_cache:
+        return temporary_cache(tmp_dir, prefix="prefetch-")
+    return nullcontext(cache)
+
+
 class Mapper(UDFBase):
     """Inherit from this class to pass to `DataChain.map()`."""
 
@@ -300,18 +341,18 @@
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.catalog = catalog
         self.setup()
-        prepared_inputs: abc.Generator[Sequence[Any], None, None] = (
-            self._prepare_row_and_id(row, udf_fields, cache, download_cb)
-            for row in udf_inputs
-        )
-        if self.prefetch > 0:
-            prepared_inputs = AsyncMapper(
-                _prefetch_input, prepared_inputs, workers=self.prefetch
-            ).iterate()
 
-        with contextlib.closing(prepared_inputs):
+        def _prepare_rows(udf_inputs) -> "abc.Generator[Sequence[Any], None, None]":
+            with safe_closing(udf_inputs):
+                for row in udf_inputs:
+                    yield self._prepare_row_and_id(
+                        row, udf_fields, catalog, cache, download_cb
+                    )
+
+        prepared_inputs = _prepare_rows(udf_inputs)
+        prepared_inputs = _prefetch_inputs(prepared_inputs, self.prefetch)
+        with closing(prepared_inputs):
             for id_, *udf_args in prepared_inputs:
                 result_objs = self.process_safe(udf_args)
                 udf_output = self._flatten_row(result_objs)
@@ -336,14 +377,15 @@ class BatchMapper(UDFBase):
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.catalog = catalog
         self.setup()
 
         for batch in udf_inputs:
             n_rows = len(batch.rows)
             row_ids, *udf_args = zip(
                 *[
-                    self._prepare_row_and_id(row, udf_fields, cache, download_cb)
+                    self._prepare_row_and_id(
+                        row, udf_fields, catalog, cache, download_cb
+                    )
                     for row in batch.rows
                 ]
             )
@@ -378,17 +420,18 @@ class Generator(UDFBase):
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.catalog = catalog
         self.setup()
-        prepared_inputs: abc.Generator[Sequence[Any], None, None] = (
-            self._prepare_row(row, udf_fields, cache, download_cb) for row in udf_inputs
-        )
-        if self.prefetch > 0:
-            prepared_inputs = AsyncMapper(
-                _prefetch_input, prepared_inputs, workers=self.prefetch
-            ).iterate()
 
-        with contextlib.closing(prepared_inputs):
+        def _prepare_rows(udf_inputs) -> "abc.Generator[Sequence[Any], None, None]":
+            with safe_closing(udf_inputs):
+                for row in udf_inputs:
+                    yield self._prepare_row(
+                        row, udf_fields, catalog, cache, download_cb
+                    )
+
+        prepared_inputs = _prepare_rows(udf_inputs)
+        prepared_inputs = _prefetch_inputs(prepared_inputs, self.prefetch)
+        with closing(prepared_inputs):
             for row in prepared_inputs:
                 result_objs = self.process_safe(row)
                 udf_outputs = (self._flatten_row(row) for row in result_objs)
@@ -413,13 +456,12 @@ class Aggregator(UDFBase):
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.catalog = catalog
         self.setup()
 
         for batch in udf_inputs:
            udf_args = zip(
                *[
-                    self._prepare_row(row, udf_fields, cache, download_cb)
+                    self._prepare_row(row, udf_fields, catalog, cache, download_cb)
                    for row in batch.rows
                ]
            )
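
The UDF changes replace the per-class `AsyncMapper` wiring with module-level helpers: `_prefetch_inputs()` wraps an iterator of prepared rows and drives `File._prefetch()` concurrently when `prefetch > 0`, while `_get_cache()` swaps in a temporary cache when prefetching without caching. A hedged sketch of the control flow these internal helpers implement (not public API; `prepared_rows` stands in for the generator built inside `run()`):

```py
from contextlib import closing

from datachain.lib.udf import _prefetch_inputs

def consume(prepared_rows, prefetch=4):
    rows = _prefetch_inputs(prepared_rows, prefetch)
    with closing(rows):  # closing() shuts down the generator and its AsyncMapper
        for row in rows:
            ...          # any File objects in `row` have already been downloaded
```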
datachain/listing.py CHANGED
@@ -153,6 +153,7 @@ class Listing:
             unit_scale=True,
             unit_divisor=1000,
             total=total_files,
+            leave=False,
         )
 
         counter = 0
datachain/progress.py CHANGED
@@ -5,6 +5,7 @@ import sys
 from threading import RLock
 from typing import Any, ClassVar
 
+from fsspec import Callback
 from fsspec.callbacks import TqdmCallback
 from tqdm import tqdm
 
@@ -132,8 +133,24 @@ class Tqdm(tqdm):
         return d
 
 
-class CombinedDownloadCallback(TqdmCallback):
+class CombinedDownloadCallback(Callback):
     def set_size(self, size):
         # This is a no-op to prevent fsspec's .get_file() from setting the combined
         # download size to the size of the current file.
         pass
+
+    def increment_file_count(self, n: int = 1) -> None:
+        pass
+
+
+class TqdmCombinedDownloadCallback(CombinedDownloadCallback, TqdmCallback):
+    def __init__(self, tqdm_kwargs=None, *args, **kwargs):
+        self.files_count = 0
+        tqdm_kwargs = tqdm_kwargs or {}
+        tqdm_kwargs.setdefault("postfix", {}).setdefault("files", self.files_count)
+        super().__init__(tqdm_kwargs, *args, **kwargs)
+
+    def increment_file_count(self, n: int = 1) -> None:
+        self.files_count += n
+        if self.tqdm is not None:
+            self.tqdm.postfix = f"{self.files_count} files"
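
`CombinedDownloadCallback` is now a plain fsspec `Callback` subclass with a no-op file counter, and the tqdm-rendered variant lives in the new `TqdmCombinedDownloadCallback`. A hedged sketch of how the counter hook is meant to be used; the byte count is invented, and `relative_update` is the standard fsspec callback method:

```py
from datachain.progress import TqdmCombinedDownloadCallback

cb = TqdmCombinedDownloadCallback(
    tqdm_kwargs={"desc": "Download", "unit": "B", "unit_scale": True}
)
with cb:                      # fsspec callbacks support the context-manager protocol
    cb.relative_update(1024)  # add downloaded bytes to the combined counter
    cb.increment_file_count() # bump the "N files" postfix on the progress bar
```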