datachain 0.18.11__py3-none-any.whl → 0.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/client/fsspec.py +1 -0
- datachain/client/hf.py +19 -2
- datachain/delta.py +164 -39
- datachain/lib/arrow.py +2 -0
- datachain/lib/dc/datachain.py +19 -4
- datachain/lib/dc/datasets.py +12 -1
- datachain/lib/dc/storage.py +13 -1
- datachain/lib/file.py +40 -11
- datachain/semver.py +15 -5
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/METADATA +56 -2
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/RECORD +15 -15
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/WHEEL +0 -0
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/top_level.txt +0 -0
datachain/client/fsspec.py
CHANGED
datachain/client/hf.py
CHANGED
@@ -15,6 +15,24 @@ class classproperty: # noqa: N801
         return self.fget(owner)


+def _wrap_class(sync_fs_class):
+    """
+    Analog of `AsyncFileSystemWrapper.wrap_class` from fsspec, but sets
+    asynchronous to False by default. This is similar to other Async FS
+    we initialize. E.g. it means we don't break things in Jupyter where code
+    run in async.
+    """
+    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
+
+    class GeneratedAsyncFileSystemWrapper(AsyncFileSystemWrapper):
+        def __init__(self, *args, **kwargs):
+            sync_fs = sync_fs_class(*args, **kwargs)
+            super().__init__(sync_fs, asynchronous=False)
+
+    GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
+    return GeneratedAsyncFileSystemWrapper
+
+
 @functools.cache
 def get_hf_filesystem_cls():
     import fsspec
@@ -29,10 +47,9 @@ def get_hf_filesystem_cls():
         f"{fsspec_version} is installed."
     )

-    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
     from huggingface_hub import HfFileSystem

-    fs_cls =
+    fs_cls = _wrap_class(HfFileSystem)
     # AsyncFileSystemWrapper does not set class properties, so we need to set them back.
     fs_cls.protocol = HfFileSystem.protocol
     return fs_cls
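A minimal sketch of how the new factory behaves, assuming fsspec >= 2024.12.0 and using MemoryFileSystem as a stand-in for HfFileSystem (neither the stand-in nor this snippet is part of the diff):

    from fsspec.implementations.memory import MemoryFileSystem

    from datachain.client.hf import _wrap_class

    # Build an async wrapper class around a synchronous filesystem; unlike
    # AsyncFileSystemWrapper.wrap_class, asynchronous defaults to False so the
    # filesystem stays usable from environments that already run an event loop.
    AsyncMemoryFS = _wrap_class(MemoryFileSystem)
    fs = AsyncMemoryFS()
    print(type(fs).__name__)  # AsyncMemoryFileSystemWrapper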
datachain/delta.py
CHANGED
@@ -48,72 +48,197 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
     return dc


-def
+def _get_delta_chain(
+    source_ds_name: str,
+    source_ds_version: str,
+    source_ds_latest_version: str,
+    on: Union[str, Sequence[str]],
+    compare: Optional[Union[str, Sequence[str]]] = None,
+) -> "DataChain":
+    """Get delta chain for processing changes between versions."""
+    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+
+    # Calculate diff between source versions
+    return source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
+
+
+def _get_retry_chain(
+    name: str,
+    latest_version: str,
+    source_ds_name: str,
+    source_ds_latest_version: str,
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]],
+    delta_retry: Optional[Union[bool, str]],
+) -> Optional["DataChain"]:
+    """Get retry chain for processing error records and missing records."""
+    # Import here to avoid circular import
+    from datachain.lib.dc import C
+
+    retry_chain = None
+
+    # Read the latest version of the result dataset for retry logic
+    result_dataset = datachain.read_dataset(name, latest_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+
+    # Handle error records if delta_retry is a string (column name)
+    if isinstance(delta_retry, str):
+        error_records = result_dataset.filter(C(delta_retry) != "")
+        error_source_records = source_dc_latest.merge(
+            error_records, on=on, right_on=right_on, inner=True
+        ).select(*list(source_dc_latest.signals_schema.values))
+        retry_chain = error_source_records
+
+    # Handle missing records if delta_retry is True
+    elif delta_retry is True:
+        missing_records = source_dc_latest.subtract(
+            result_dataset, on=on, right_on=right_on
+        )
+        retry_chain = missing_records
+
+    return retry_chain
+
+
+def _get_source_info(
+    name: str,
+    latest_version: str,
+    catalog,
+) -> tuple[
+    Optional[str], Optional[str], Optional[str], Optional[list[DatasetDependency]]
+]:
+    """Get source dataset information and dependencies.
+
+    Returns:
+        Tuple of (source_name, source_version, source_latest_version, dependencies)
+        Returns (None, None, None, None) if source dataset was removed.
+    """
+    dependencies = catalog.get_dataset_dependencies(
+        name, latest_version, indirect=False
+    )
+
+    dep = dependencies[0]
+    if not dep:
+        # Starting dataset was removed, back off to normal dataset creation
+        return None, None, None, None
+
+    source_ds_name = dep.name
+    source_ds_version = dep.version
+    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
+
+    return source_ds_name, source_ds_version, source_ds_latest_version, dependencies
+
+
+def delta_retry_update(
     dc: "DataChain",
     name: str,
     on: Union[str, Sequence[str]],
     right_on: Optional[Union[str, Sequence[str]]] = None,
     compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
 ) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]:
     """
     Creates new chain that consists of the last version of current delta dataset
     plus diff from the source with all needed modifications.
-    This way we don't need to re-calculate the whole chain from the source again
-    apply all the DataChain methods like filters, mappers, generators etc.)
+    This way we don't need to re-calculate the whole chain from the source again
+    (apply all the DataChain methods like filters, mappers, generators etc.)
     but just the diff part which is very important for performance.

-    Note that currently delta update works only if there is only one direct
+    Note that currently delta update works only if there is only one direct
+    dependency.
+
+    Additionally supports retry functionality to filter records that either:
+    1. Have a non-None value in the field specified by delta_retry (when it's a string)
+    2. Exist in the source dataset but are missing in the result dataset
+       (when delta_retry=True)
+
+    Parameters:
+        dc: The DataChain to filter for records that need reprocessing
+        name: Name of the destination dataset
+        on: Field(s) in source dataset that uniquely identify records
+        right_on: Corresponding field(s) in result dataset if they differ from
+            source
+        compare: Field(s) used to check if the same row has been modified
+        delta_retry: If string, field in result dataset that indicates an error
+            when not None. If True, include records missing from result dataset.
+            If False/None, no retry functionality.
+
+    Returns:
+        A tuple containing (filtered chain for delta/retry processing,
+        dependencies, found records flag)
     """
+
     catalog = dc.session.catalog
     dc._query.apply_listing_pre_step()

+    # Check if dataset exists
     try:
-
+        dataset = catalog.get_dataset(name)
+        latest_version = dataset.latest_version
     except DatasetNotFoundError:
-        #
+        # First creation of result dataset
         return None, None, True

-
-
+    # Initialize variables
+    diff_chain = None
+    dependencies = None
+    retry_chain = None
+    processing_chain = None
+
+    source_ds_name, source_ds_version, source_ds_latest_version, dependencies = (
+        _get_source_info(name, latest_version, catalog)
     )

-
-    if
-        # starting dataset (e.g listing) was removed so we are backing off to normal
-        # dataset creation, as it was created first time
+    # If source_ds_name is None, starting dataset was removed
+    if source_ds_name is None:
         return None, None, True

-
-
-    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
-    dependencies = copy(dependencies)
-    dependencies = [d for d in dependencies if d is not None]  # filter out removed dep
-    dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+    assert source_ds_version
+    assert source_ds_latest_version

-
-
+    diff_chain = _get_delta_chain(
+        source_ds_name, source_ds_version, source_ds_latest_version, on, compare
+    )
+
+    # Filter out removed dep
+    if dependencies:
+        dependencies = copy(dependencies)
+        dependencies = [d for d in dependencies if d is not None]
+        # Update to latest version
+        dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+
+    # Handle retry functionality if enabled
+    if delta_retry:
+        retry_chain = _get_retry_chain(
+            name,
+            latest_version,
+            source_ds_name,
+            source_ds_latest_version,
+            on,
+            right_on,
+            delta_retry,
+        )

-
-
-
+    # Combine delta and retry chains
+    if retry_chain is not None:
+        processing_chain = diff_chain.union(retry_chain)
+    else:
+        processing_chain = diff_chain

-    #
-
+    # Apply all the steps from the original chain to processing_chain
+    processing_chain = _append_steps(processing_chain, dc).persist()

-    if
+    # Check if chain becomes empty after applying steps
+    if processing_chain is None or (processing_chain and processing_chain.empty):
         return None, None, False

-
-
-
-
-
-
-
-        modified=False,
-        deleted=False,
-    )
-    .union(diff)
+    latest_dataset = datachain.read_dataset(name, latest_version)
+    compared_chain = latest_dataset.compare(
+        processing_chain,
+        on=right_on or on,
+        added=True,
+        modified=False,
+        deleted=False,
     )
-
-    return
+    result_chain = compared_chain.union(processing_chain)
+    return result_chain, dependencies, True
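In user-level terms, the helpers above build two chains and union them: a delta chain (rows added or changed between two versions of the source) and a retry chain (rows whose previous result recorded an error, or rows missing from the result). A rough sketch using the public API, with hypothetical dataset names "source" and "result" and a hypothetical key field "id":

    import datachain as dc
    from datachain import C

    # Delta part: rows added or changed between two versions of the source dataset.
    old_src = dc.read_dataset("source", "1.0.0")
    new_src = dc.read_dataset("source", "1.0.1")
    delta = new_src.compare(old_src, on="id", deleted=False)

    # Retry part for delta_retry="error": source rows whose saved result has an error set.
    result = dc.read_dataset("result")
    errors = result.filter(C("error") != "")
    retry = (
        new_src.merge(errors, on="id", inner=True)
        .select(*list(new_src.signals_schema.values))
    )

    # Retry part for delta_retry=True would instead be: new_src.subtract(result, on="id")
    to_process = delta.union(retry)  # roughly what gets fed back through the original steps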
datachain/lib/arrow.py
CHANGED
@@ -241,6 +241,8 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+    if pa.types.is_null(col_type):
+        return str  # use strings for null columns
     raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")


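The new branch means an Arrow column whose type is null (for example a Parquet column that contains only nulls) now maps to str instead of raising. A small standalone check of the pyarrow predicate it relies on:

    import pyarrow as pa

    col = pa.array([None, None])       # all-null column; its Arrow type is null
    print(col.type)                    # null
    print(pa.types.is_null(col.type))  # True -> arrow_type_mapper now returns str for it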
datachain/lib/dc/datachain.py
CHANGED
@@ -25,7 +25,7 @@ from tqdm import tqdm

 from datachain import semver
 from datachain.dataset import DatasetRecord
-from datachain.delta import delta_disabled
+from datachain.delta import delta_disabled
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -169,6 +169,10 @@ class DataChain:
         self._setup: dict = setup or {}
         self._sys = _sys
         self._delta = False
+        self._delta_on: Optional[Union[str, Sequence[str]]] = None
+        self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
+        self._delta_compare: Optional[Union[str, Sequence[str]]] = None
+        self._delta_retry: Optional[Union[bool, str]] = None

     def __repr__(self) -> str:
         """Return a string representation of the chain."""
@@ -187,6 +191,7 @@ class DataChain:
         on: Optional[Union[str, Sequence[str]]] = None,
         right_on: Optional[Union[str, Sequence[str]]] = None,
         compare: Optional[Union[str, Sequence[str]]] = None,
+        delta_retry: Optional[Union[bool, str]] = None,
     ) -> "Self":
         """Marks this chain as delta, which means special delta process will be
         called on saving dataset for optimization"""
@@ -196,6 +201,7 @@ class DataChain:
         self._delta_on = on
         self._delta_result_on = right_on
         self._delta_compare = compare
+        self._delta_retry = delta_retry
         return self

     @property
@@ -293,6 +299,7 @@ class DataChain:
                 on=self._delta_on,
                 right_on=self._delta_result_on,
                 compare=self._delta_compare,
+                delta_retry=self._delta_retry,
             )

         return chain
@@ -529,18 +536,26 @@ class DataChain:
         )

         schema = self.signals_schema.clone_without_sys_signals().serialize()
+
+        # Handle retry and delta functionality
         if self.delta and name:
-
+            from datachain.delta import delta_retry_update
+
+            # Delta chains must have delta_on defined (ensured by _as_delta method)
+            assert self._delta_on is not None, "Delta chain must have delta_on defined"
+
+            result_ds, dependencies, has_changes = delta_retry_update(
                 self,
                 name,
                 on=self._delta_on,
                 right_on=self._delta_result_on,
                 compare=self._delta_compare,
+                delta_retry=self._delta_retry,
             )

-            if
+            if result_ds:
                 return self._evolve(
-                    query=
+                    query=result_ds._query.save(
                         name=name,
                         version=version,
                         feature_schema=schema,
datachain/lib/dc/datasets.py
CHANGED
@@ -32,6 +32,7 @@ def read_dataset(
     delta_on: Optional[Union[str, Sequence[str]]] = None,
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
     If dataset or version is not found locally, it will try to pull it from Studio.
@@ -73,6 +74,11 @@ def read_dataset(
         delta_compare: A list of fields used to check if the same row has been modified
             in the new version of the source.
             If not defined, all fields except those defined in delta_on will be used.
+        delta_retry: Specifies retry behavior for delta processing. If a string,
+            it's the name of a field in the result dataset that indicates an error
+            when not None - records with errors will be reprocessed. If True,
+            records that exist in the source dataset but not in the result dataset
+            will be reprocessed.

     Example:
         ```py
@@ -149,10 +155,15 @@ def read_dataset(
     else:
         signals_schema |= SignalSchema.from_column_types(query.column_types or {})
     chain = DataChain(query, _settings, signals_schema)
+
     if delta:
         chain = chain._as_delta(
-            on=delta_on,
+            on=delta_on,
+            right_on=delta_result_on,
+            compare=delta_compare,
+            delta_retry=delta_retry,
         )
+
     return chain


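A minimal usage sketch for the new parameter, assuming a previously saved dataset named "docs", a key field "id", and an "error" field in the derived result (all hypothetical):

    import datachain as dc

    chain = dc.read_dataset(
        "docs",
        delta=True,           # process only rows added or changed since the last run
        delta_on="id",        # hypothetical field that uniquely identifies a record
        delta_retry="error",  # also reprocess rows whose previous result has an error set
    )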
datachain/lib/dc/storage.py
CHANGED
@@ -38,6 +38,7 @@ def read_storage(
     delta_on: Optional[Union[str, Sequence[str]]] = None,
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
@@ -83,6 +84,13 @@ def read_storage(
         delta_compare: A list of fields used to check if the same row has been modified
             in the new version of the source.
             If not defined, all fields except those defined in `delta_on` will be used.
+        delta_retry: Controls which records to reprocess. Can be:
+            - A string specifying a field name: Records where this field is not None
+              will be reprocessed (error checking mode).
+            - True: Records that exist in the source dataset but not in the result
+              dataset (based on delta_on/delta_result_on fields) will be reprocessed
+              (missing records mode).
+            - False or None: No retry processing.

     Returns:
         DataChain: A DataChain object containing the file information.
@@ -208,6 +216,10 @@ def read_storage(

     if delta:
         storage_chain = storage_chain._as_delta(
-            on=delta_on,
+            on=delta_on,
+            right_on=delta_result_on,
+            compare=delta_compare,
+            delta_retry=delta_retry,
         )
+
     return storage_chain
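The storage variant looks the same; here is a sketch of the missing-records mode (delta_retry=True), with a placeholder bucket path and a trivial mapper:

    import datachain as dc
    from datachain import File

    def file_size(file: File) -> int:
        return file.size

    chain = (
        dc.read_storage(
            "s3://my-bucket/data/",  # placeholder path
            delta=True,              # only new or changed files since the last run
            delta_on="file.path",    # identify files by their path
            delta_retry=True,        # also reprocess files missing from the saved result
        )
        .map(size_bytes=file_size)
        .save(name="processed_data")
    )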
datachain/lib/file.py
CHANGED
@@ -127,10 +127,7 @@ class TarVFile(VFile):
     @classmethod
     def open(cls, file: "File", location: list[dict]):
         """Stream file from tar archive based on location in archive."""
-
-        raise VFileError(
-            "multiple 'location's are not supported yet", file.source, file.path
-        )
+        tar_file = cls.parent(file, location)

         loc = location[0]

@@ -140,15 +137,26 @@ class TarVFile(VFile):
         if (size := loc.get("size", None)) is None:
             raise VFileError("'size' is not specified", file.source, file.path)

+        client = file._catalog.get_client(tar_file.source)
+        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
+        return FileSlice(fd, offset, size, file.name)
+
+    @classmethod
+    def parent(cls, file: "File", location: list[dict]) -> "File":
+        if len(location) > 1:
+            raise VFileError(
+                "multiple 'location's are not supported yet", file.source, file.path
+            )
+
+        loc = location[0]
+
         if (parent := loc.get("parent", None)) is None:
             raise VFileError("'parent' is not specified", file.source, file.path)

         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)

-
-        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
-        return FileSlice(fd, offset, size, file.name)
+        return tar_file


 class VFileRegistry:
@@ -159,7 +167,7 @@ class VFileRegistry:
         cls._vtype_readers[reader.get_vtype()] = reader

     @classmethod
-    def
+    def _get_reader(cls, file: "File", location: list[dict]):
         if len(location) == 0:
             raise VFileError(
                 "'location' must not be list of JSONs", file.source, file.path
@@ -174,8 +182,18 @@ class VFileRegistry:
                 "reader not registered", file.source, file.path, vtype=vtype
             )

+        return reader
+
+    @classmethod
+    def open(cls, file: "File", location: list[dict]):
+        reader = cls._get_reader(file, location)
         return reader.open(file, location)

+    @classmethod
+    def parent(cls, file: "File", location: list[dict]) -> "File":
+        reader = cls._get_reader(file, location)
+        return reader.parent(file, location)
+

 class File(DataModel):
     """
@@ -330,7 +348,7 @@ class File(DataModel):
     def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
         """Open the file and return a file object."""
         if self.location:
-            with VFileRegistry.
+            with VFileRegistry.open(self, self.location) as f:  # type: ignore[arg-type]
                 yield f

         else:
@@ -349,6 +367,13 @@ class File(DataModel):

     def read_text(self):
         """Returns file contents as text."""
+        if self.location:
+            raise VFileError(
+                "Reading text from virtual file is not supported",
+                self.source,
+                self.path,
+            )
+
         with self.open(mode="r") as stream:
             return stream.read()

@@ -427,9 +452,13 @@ class File(DataModel):
         if self._catalog is None:
             raise RuntimeError("cannot prefetch file because catalog is not setup")

+        file = self
+        if self.location:
+            file = VFileRegistry.parent(self, self.location)  # type: ignore[arg-type]
+
         client = self._catalog.get_client(self.source)
-        await client._download(
-
+        await client._download(file, callback=download_cb or self._download_cb)
+        file._set_stream(
             self._catalog, caching_enabled=True, download_cb=DEFAULT_CALLBACK
         )
         return True
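For orientation: a File with a non-empty location is a virtual file addressed inside a container such as a tar archive, and the new parent()/VFileRegistry.parent() path lets prefetching download the enclosing archive rather than the member itself. The record below is a hypothetical illustration; the exact key set of a location entry is assumed from the fields read above (vtype, parent, offset, size):

    from datachain import File

    # Hypothetical virtual-file record describing one member of archive.tar.
    member = File(
        path="archive.tar/images/cat.jpg",
        location=[{
            "vtype": "tar",                     # selects the TarVFile reader in VFileRegistry
            "parent": {"path": "archive.tar"},  # the enclosing tar file
            "offset": 512,                      # assumed: byte offset of the member in the tar
            "size": 4096,                       # size of the member in bytes
        }],
    )

    # VFileRegistry.open(member, member.location)   -> streams the member (TarVFile.open)
    # VFileRegistry.parent(member, member.location) -> the enclosing tar as a File, which
    #                                                  _prefetch() now downloads instead.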
datachain/semver.py
CHANGED
@@ -1,8 +1,13 @@
+# Maximum version number for semver (major.minor.patch) is 999999.999999.999999
+# this number was chosen because value("999999.999999.999999") < 2**63 - 1
+MAX_VERSION_NUMBER = 999_999
+
+
 def parse(version: str) -> tuple[int, int, int]:
     """Parsing semver into 3 integers: major, minor, patch"""
     validate(version)
     parts = version.split(".")
-    return
+    return int(parts[0]), int(parts[1]), int(parts[2])


 def validate(version: str) -> None:
@@ -20,14 +25,18 @@ def validate(version: str) -> None:
     for part in parts:
         try:
             val = int(part)
-            assert val
+            assert 0 <= val <= MAX_VERSION_NUMBER
         except (ValueError, AssertionError):
             raise ValueError(error_message) from None


 def create(major: int = 0, minor: int = 0, patch: int = 0) -> str:
     """Creates new semver from 3 integers: major, minor and patch"""
-    if
+    if not (
+        0 <= major <= MAX_VERSION_NUMBER
+        and 0 <= minor <= MAX_VERSION_NUMBER
+        and 0 <= patch <= MAX_VERSION_NUMBER
+    ):
         raise ValueError("Major, minor and patch must be greater or equal to zero")

     return ".".join([str(major), str(minor), str(patch)])
@@ -35,10 +44,11 @@ def create(major: int = 0, minor: int = 0, patch: int = 0) -> str:

 def value(version: str) -> int:
     """
-    Calculate integer value of a version. This is useful when comparing two versions
+    Calculate integer value of a version. This is useful when comparing two versions.
     """
     major, minor, patch = parse(version)
-
+    limit = MAX_VERSION_NUMBER + 1
+    return major * (limit**2) + minor * limit + patch


 def compare(v1: str, v2: str) -> int:
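A quick arithmetic check of the packing used by value(): with MAX_VERSION_NUMBER = 999_999 the base is 1_000_000, and the largest encodable version still fits in a signed 64-bit integer, as the new comment states:

    MAX_VERSION_NUMBER = 999_999
    limit = MAX_VERSION_NUMBER + 1  # 1_000_000

    def packed(major: int, minor: int, patch: int) -> int:
        return major * (limit**2) + minor * limit + patch

    print(packed(1, 2, 3))                                 # 1000002000003
    print(packed(999_999, 999_999, 999_999))               # 999999999999999999
    print(packed(999_999, 999_999, 999_999) < 2**63 - 1)   # True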
{datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.18.11
+Version: 0.19.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -146,6 +146,12 @@ Use Cases
    on these tables at scale.
 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
    Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
+4. **Incremental Processing.** DataChain's delta and retry features allow for efficient
+   processing workflows:
+
+   - **Delta Processing**: Process only new or changed files/records
+   - **Retry Processing**: Automatically reprocess records with errors or missing results
+   - **Combined Approach**: Process new data and fix errors in a single pipeline

 Getting Started
 ===============
@@ -158,7 +164,7 @@ to get started with `DataChain` and learn more.
    pip install datachain


-Example:
+Example: Download Subset of Files Based on Metadata
 ---------------------------------------------------

 Sometimes users only need to download a specific subset of files from cloud storage,
@@ -182,6 +188,54 @@ high confidence scores.
    likely_cats.to_storage("high-confidence-cats/", signal="file")


+Example: Incremental Processing with Error Handling
+---------------------------------------------------
+
+This example shows how to use both delta and retry processing for efficient handling of large
+datasets that evolve over time and may occasionally have processing errors.
+
+.. code:: py
+
+    import datachain as dc
+    from datachain import C, File
+
+    def process_file(file: File):
+        """Process a file, which may occasionally fail."""
+        try:
+            # Your processing logic here
+            content = file.read_text()
+            result = analyze_content(content)
+            return {
+                "content": content,
+                "result": result,
+                "error": None  # No error
+            }
+        except Exception as e:
+            # Return an error that will trigger reprocessing next time
+            return {
+                "content": None,
+                "result": None,
+                "error": str(e)  # Error field will trigger retry
+            }
+
+    # Process files efficiently with delta and retry
+    chain = (
+        dc.read_storage(
+            "data/",
+            update=True,
+            delta=True,  # Process only new/changed files
+            delta_on="file.path",  # Identify files by path
+            retry_on="error"  # Field that indicates errors
+        )
+        .map(processed_result=process_file)
+        .mutate(
+            content=C("processed_result.content"),
+            result=C("processed_result.result"),
+            error=C("processed_result.error")
+        )
+        .save(name="processed_data")
+    )
+
 Example: LLM based text-file evaluation
 ---------------------------------------
{datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/RECORD
CHANGED
@@ -4,7 +4,7 @@ datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=XUZ-kSBL1y6juFqlSWXXbattGS1E53lXpyhc0Ip1_AA,20527
-datachain/delta.py,sha256=
+datachain/delta.py,sha256=fP1Yy_MfdnTZmIOe243SBiDWTzd6MqLw0tQxvZNxLcs,8384
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=JtExYIfKMFhEIIcSSWBmaxWpoS3ben7kb692cHHm4Lo,7079
@@ -14,7 +14,7 @@ datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2Sm
 datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
-datachain/semver.py,sha256=
+datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=1J2ANFVVA1ysPxBuLibQSnSXt0U9Vfgz9ZNGikYtWdk,11038
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
@@ -39,9 +39,9 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=huPHNDZRGz_rSN7XnS9hKmRoS2fsSz_y2-cxUSlvsOA,13938
 datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
-datachain/client/hf.py,sha256=
+datachain/client/hf.py,sha256=mRBqHeBT758TJicU-Fn2L3l5AbHWwMzycWwttNUACKk,2180
 datachain/client/local.py,sha256=cGoCYflribzexiOe-Y1qbaE2fJRh-_EgQrfCSa0yK_E,4568
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
@@ -68,11 +68,11 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
 datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=2IuNZ6tRFsxVNhWElqr0ptz28geSDzlDHUtzD4qeDNM,10339
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
 datachain/lib/dataset_info.py,sha256=d-jz6zeDU5DEgYtyeSF5nK0MU-40FV5km_iOCh4pXzo,3179
-datachain/lib/file.py,sha256=
+datachain/lib/file.py,sha256=PuTa6CEG9CaJXPhxrZFY-R9-DS7ynB9l7Y0bUbd_Qwg,31952
 datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
 datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=5_GoATtIwCtd1JMqlorPB_vQDxndOQZpiWjNOG3NMw4,7007
@@ -99,15 +99,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=
-datachain/lib/dc/datasets.py,sha256=
+datachain/lib/dc/datachain.py,sha256=cQjq6_OWQ_1JKvIqb8snl6mKfuBbpllPEao5ygVINog,81733
+datachain/lib/dc/datasets.py,sha256=g_bBGCUwAwNJypYSUQvrDDqnaw7nfXpvrEvUVPtWATY,11268
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=J1I69J2gFIBjRTGr2LG-5qn_rTVzRLcr2y3tVDrmHdg,3068
-datachain/lib/dc/storage.py,sha256=
+datachain/lib/dc/storage.py,sha256=u-QB_0sn1Wwc0-9phi1zT38UDe5uBIc25xbAhKMU2fA,8774
 datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.19.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.19.1.dist-info/METADATA,sha256=qg4KSU457ARE-A00yjNYNtFP3vhX0yqsxrCGKctXva4,13281
+datachain-0.19.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.19.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.19.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.19.1.dist-info/RECORD,,
File without changes: {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/WHEEL, entry_points.txt, licenses/LICENSE, and top_level.txt.