datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/utils.py
CHANGED
@@ -1,6 +1,5 @@
 import glob
 import io
-import json
 import logging
 import os
 import os.path as osp
@@ -10,10 +9,8 @@ import sys
 import time
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import contextmanager
-from datetime import date, datetime, timezone
-from …
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
-from uuid import UUID
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import cloudpickle
 import platformdirs
@@ -26,6 +23,8 @@ if TYPE_CHECKING:
     from typing_extensions import Self
 
 
+DEFAULT_BATCH_SIZE = 2000
+
 logger = logging.getLogger("datachain")
 
 NUL = b"\0"
@@ -52,11 +51,11 @@ class DataChainDir:
 
     def __init__(
        self,
-        root: Optional[str] = None,
-        cache: Optional[str] = None,
-        tmp: Optional[str] = None,
-        db: Optional[str] = None,
-        config: Optional[str] = None,
+        root: str | None = None,
+        cache: str | None = None,
+        tmp: str | None = None,
+        db: str | None = None,
+        config: str | None = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
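The constructor change above is a typing-only modernization: PEP 604 `str | None` unions replace `Optional[str]`, with unchanged fallback behavior. A minimal sketch of the new signature in use (the root path is illustrative):

.. code:: py

    from datachain.utils import DataChainDir

    # Arguments left as None fall back to platformdirs-derived defaults.
    d = DataChainDir(root="/tmp/datachain-demo")
    print(d.root)  # absolute path of the provided root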
@@ -121,7 +120,7 @@ def global_config_dir():
     )
 
 
-def human_time_to_int(time: str) -> Optional[int]:
+def human_time_to_int(time: str) -> int | None:
     if not time:
         return None
 
@@ -145,7 +144,7 @@ def time_to_str(dt):
     return dt.strftime("%Y-%m-%d %H:%M:%S")
 
 
-def time_to_local(dt: Union[datetime, str]) -> datetime:
+def time_to_local(dt: datetime | str) -> datetime:
     # TODO check usage
     if isinstance(dt, str):
         dt = isoparse(dt)
@@ -155,11 +154,11 @@ def time_to_local(dt: Union[datetime, str]) -> datetime:
     return dt
 
 
-def time_to_local_str(dt: Union[datetime, str]) -> str:
+def time_to_local_str(dt: datetime | str) -> str:
     return time_to_str(time_to_local(dt))
 
 
-def is_expired(expires: Optional[Union[datetime, str]]):
+def is_expired(expires: datetime | str | None):
     if expires:
         return time_to_local(expires) < time_to_local(datetime.now())  # noqa: DTZ005
 
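The time helpers get the same `Union`/`Optional` to `|` conversion with no behavioral change: `time_to_local` still accepts a `datetime` or an ISO-8601 string, and `is_expired` still compares against the current local time. A small behavioral sketch (the timestamp is illustrative):

.. code:: py

    from datachain.utils import is_expired, time_to_local

    # Strings are parsed with dateutil's isoparse before conversion.
    dt = time_to_local("2020-01-01T00:00:00")

    assert is_expired("2020-01-01T00:00:00")  # past timestamps read as expired
    assert not is_expired(None)               # empty/None is never expired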
@@ -225,30 +224,43 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
 _T_co = TypeVar("_T_co", covariant=True)
 
 
-def …
-…
-…
-…
-…
-…
-…
-…
+def _dynamic_batched_core(
+    iterable: Iterable[_T_co],
+    batch_size: int,
+) -> Iterator[list[_T_co]]:
+    """Core batching logic that yields lists."""
+
+    batch: list[_T_co] = []
+
+    for item in iterable:
+        # Check if adding this item would exceed limits
+        if len(batch) >= batch_size and batch:  # Yield current batch if we have one
+            yield batch
+            batch = []
+
+        batch.append(item)
+
+    # Yield any remaining items
+    if batch:
         yield batch
 
 
-def …
-    """…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
+    """
+    Batch data into tuples of length batch_size.
+    The last batch may be shorter.
+    """
+    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))
+
+
+def batched_it(
+    iterable: Iterable[_T_co],
+    batch_size: int = DEFAULT_BATCH_SIZE,
+) -> Iterator[Iterator[_T_co]]:
+    """
+    Batch data into iterators with dynamic sizing based on row count and memory usage.
+    """
+    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))
 
 
 def flatten(items):
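The batching rewrite funnels both public helpers through a single `_dynamic_batched_core` generator: `batched` materializes each chunk as a tuple, while `batched_it` (now with a `DEFAULT_BATCH_SIZE` default) wraps the same chunks in iterators. A quick sketch of the resulting behavior:

.. code:: py

    from datachain.utils import batched, batched_it

    rows = range(7)

    # batched: eager tuples; the last batch may be shorter
    assert list(batched(rows, 3)) == [(0, 1, 2), (3, 4, 5), (6,)]

    # batched_it: the same chunks, exposed as lazy per-batch iterators
    for chunk in batched_it(rows, batch_size=3):
        print(list(chunk))  # [0, 1, 2] / [3, 4, 5] / [6]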
@@ -286,23 +298,52 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):
     return retry
 
 
-def …
+def determine_workers(
+    workers: bool | int,
+    rows_total: int | None = None,
+) -> bool | int:
+    """Determine the number of workers to use for distributed processing."""
+    if rows_total is not None and rows_total <= 1:
+        # Disable distributed processing if there is no rows or only one row.
+        return False
+    if (
+        workers is False
+        and os.environ.get("DATACHAIN_DISTRIBUTED")
+        and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
+    ):
+        # Enable distributed processing by default if the module is available,
+        # and a default number of workers is provided.
+        workers = int(os.environ["DATACHAIN_SETTINGS_WORKERS"])
+    if not workers or workers <= 0:
+        return False
+    return workers
+
+
+def determine_processes(
+    parallel: bool | int | None = None,
+    rows_total: int | None = None,
+) -> bool | int:
+    """Determine the number of processes to use for parallel processing."""
+    if rows_total is not None and rows_total <= 1:
+        # Disable parallel processing if there is no rows or only one row.
+        return False
     if parallel is None and os.environ.get("DATACHAIN_SETTINGS_PARALLEL") is not None:
         parallel = int(os.environ["DATACHAIN_SETTINGS_PARALLEL"])
-    if parallel is None or parallel is False:
+    if parallel is None or parallel is False or parallel == 0:
         return False
     if parallel is True:
         return True
-    if parallel == 0:
-        return False
     if parallel < 0:
         return True
+    if parallel == 1:
+        # Disable parallel processing if only one process is requested.
+        return False
     return parallel
 
 
 def get_env_list(
-    key: str, default: Optional[Sequence] = None, sep: str = ","
-) -> Optional[Sequence[str]]:
+    key: str, default: Sequence | None = None, sep: str = ","
+) -> Sequence[str] | None:
     try:
         str_val = os.environ[key]
     except KeyError:
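`determine_workers` is new and mirrors `determine_processes` for distributed execution, while the `parallel` handling gains two cases: `0` now disables parallelism alongside `None`/`False`, and `1` short-circuits to serial execution. A sketch of the resulting decision table, assuming `DATACHAIN_SETTINGS_PARALLEL` is not set in the environment:

.. code:: py

    from datachain.utils import determine_processes

    assert determine_processes(None) is False   # nothing requested
    assert determine_processes(0) is False      # zero disables, like None/False
    assert determine_processes(1) is False      # new: one process means serial
    assert determine_processes(True) is True    # let the pool pick a size
    assert determine_processes(-1) is True      # negative means "all available"
    assert determine_processes(4) == 4          # explicit counts pass through
    assert determine_processes(4, rows_total=1) is False  # too few rows to fan out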
@@ -343,10 +384,10 @@ def show_df(
 
 
 def show_records(
-    records: Optional[list[dict]],
+    records: list[dict] | None,
     collapse_columns: bool = False,
     system_columns: bool = False,
-    hidden_fields: Optional[list[str]] = None,
+    hidden_fields: list[str] | None = None,
 ) -> None:
     import pandas as pd
 
@@ -359,21 +400,9 @@ def show_records(
     return show_df(df, collapse_columns=collapse_columns, system_columns=system_columns)
 
 
-class JSONSerialize(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, bytes):
-            return list(obj[:1024])
-        if isinstance(obj, (datetime, date)):
-            return obj.isoformat()
-        if isinstance(obj, UUID):
-            return str(obj)
-
-        return super().default(obj)
-
-
 def inside_colab() -> bool:
     try:
-        from google import colab  # noqa: F401
+        from google import colab  # type: ignore[attr-defined]  # noqa: F401
     except ImportError:
         return False
     return True
@@ -390,7 +419,7 @@ def inside_notebook() -> bool:
 
     if shell == "ZMQInteractiveShell":
         try:
-            import IPython
+            import IPython  # type: ignore[import-not-found]
 
             return IPython.__version__ >= "6.0.0"
         except ImportError:
@@ -475,7 +504,7 @@ def row_to_nested_dict(
 ) -> dict[str, Any]:
     """Converts a row to a nested dict based on the provided headers."""
     result: dict[str, Any] = {}
-    for h, v in zip(headers, row):
+    for h, v in zip(headers, row, strict=False):
         nested_dict_path_set(result, h, v)
     return result
 
|
|
@@ -486,4 +515,17 @@ def safe_closing(thing: T) -> Iterator[T]:
|
|
|
486
515
|
yield thing
|
|
487
516
|
finally:
|
|
488
517
|
if hasattr(thing, "close"):
|
|
489
|
-
thing.close()
|
|
518
|
+
thing.close() # type: ignore[attr-defined]
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def getenv_bool(name: str, default: bool = False) -> bool:
|
|
522
|
+
val = os.getenv(name)
|
|
523
|
+
if val is None:
|
|
524
|
+
return default
|
|
525
|
+
return val.lower() in ("1", "true", "yes", "on")
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def ensure_sequence(x) -> Sequence:
|
|
529
|
+
if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
|
|
530
|
+
return x
|
|
531
|
+
return [x]
|
|
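The two helpers appended to the module are small conveniences: `getenv_bool` accepts the usual truthy spellings, and `ensure_sequence` wraps scalars, including strings, in a list. A short sketch (the `MY_FLAG` name is made up for the example):

.. code:: py

    import os

    from datachain.utils import ensure_sequence, getenv_bool

    os.environ["MY_FLAG"] = "TRUE"
    assert getenv_bool("MY_FLAG") is True        # case-insensitive: 1/true/yes/on
    assert getenv_bool("NO_SUCH_FLAG") is False  # unset falls back to the default

    assert ensure_sequence([1, 2]) == [1, 2]     # real sequences pass through
    assert ensure_sequence("abc") == ["abc"]     # strings count as scalars
    assert ensure_sequence(5) == [5]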
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA
CHANGED
@@ -1,20 +1,19 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.2
+Version: 0.39.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
 Project-URL: Documentation, https://datachain.dvc.ai
-Project-URL: Issues, https://github.com/iterative/datachain/issues
-Project-URL: Source, https://github.com/iterative/datachain
+Project-URL: Issues, https://github.com/datachain-ai/datachain/issues
+Project-URL: Source, https://github.com/datachain-ai/datachain
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: pyyaml
@@ -22,10 +21,12 @@ Requires-Dist: tomlkit
 Requires-Dist: tqdm
 Requires-Dist: numpy<3,>=1
 Requires-Dist: pandas>=2.0.0
+Requires-Dist: ujson>=5.10.0
 Requires-Dist: packaging
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
 Requires-Dist: python-dateutil>=2
+Requires-Dist: dateparser>=1.0.0
 Requires-Dist: attrs>=21.3.0
 Requires-Dist: fsspec>=2024.2.0
 Requires-Dist: s3fs>=2024.2.0
@@ -37,11 +38,10 @@ Requires-Dist: shtab<2,>=1.3.4
 Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
-Requires-Dist: …
-Requires-Dist: pydantic<2.11,>=2
+Requires-Dist: pydantic
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
-Requires-Dist: Pillow<…
+Requires-Dist: Pillow<13,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
 Requires-Dist: huggingface_hub
@@ -55,14 +55,16 @@ Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
 Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
-Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
 Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
 Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
+Requires-Dist: mkdocs-section-index>=0.3.10; extra == "docs"
 Requires-Dist: eval-type-backport; extra == "docs"
 Provides-Extra: torch
 Requires-Dist: torch>=2.1.0; extra == "torch"
 Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
+Provides-Extra: audio
+Requires-Dist: soundfile; extra == "audio"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
@@ -70,21 +72,26 @@ Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
 Provides-Extra: hf
 Requires-Dist: numba>=0.60.0; extra == "hf"
-Requires-Dist: datasets[…
+Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
+Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
 Requires-Dist: fsspec>=2024.12.0; extra == "hf"
 Provides-Extra: video
 Requires-Dist: ffmpeg-python; extra == "video"
 Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
 Requires-Dist: opencv-python; extra == "video"
+Provides-Extra: postgres
+Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
 Provides-Extra: tests
-Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
-Requires-Dist: pytest<…
+Requires-Dist: datachain[audio,hf,postgres,remote,torch,vector,video]; extra == "tests"
+Requires-Dist: pytest<10,>=8; extra == "tests"
+Requires-Dist: pytest-asyncio; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
 Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
+Requires-Dist: pytest-dotenv; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
 Requires-Dist: dulwich; extra == "tests"
 Requires-Dist: hypothesis; extra == "tests"
@@ -94,8 +101,9 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.…
+Requires-Dist: mypy==1.19.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
+Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-requests; extra == "dev"
@@ -107,13 +115,15 @@ Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: ultralytics; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"
+Requires-Dist: openai; extra == "examples"
+Requires-Dist: torchaudio; extra == "examples"
 Dynamic: license-file
 
 ================
 |logo| DataChain
 ================
 
-|PyPI| |Python Version| |Codecov| |Tests|
+|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
 
 .. |logo| image:: docs/assets/datachain.svg
    :height: 24
@@ -123,12 +133,15 @@ Dynamic: license-file
 .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
    :target: https://pypi.org/project/datachain
    :alt: Python Version
-.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/…
-   :target: https://codecov.io/gh/iterative/datachain
+.. |Codecov| image:: https://codecov.io/gh/datachain-ai/datachain/graph/badge.svg?token=byliXGGyGB
+   :target: https://codecov.io/gh/datachain-ai/datachain
    :alt: Codecov
-.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
-   :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
+.. |Tests| image:: https://github.com/datachain-ai/datachain/actions/workflows/tests.yml/badge.svg
+   :target: https://github.com/datachain-ai/datachain/actions/workflows/tests.yml
    :alt: Tests
+.. |DeepWiki| image:: https://deepwiki.com/badge.svg
+   :target: https://deepwiki.com/datachain-ai/datachain
+   :alt: DeepWiki
 
 DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
 data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -146,6 +159,12 @@ Use Cases
    on these tables at scale.
 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
    Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
+4. **Incremental Processing.** DataChain's delta and retry features allow for efficient
+   processing workflows:
+
+   - **Delta Processing**: Process only new or changed files/records
+   - **Retry Processing**: Automatically reprocess records with errors or missing results
+   - **Combined Approach**: Process new data and fix errors in a single pipeline
 
 Getting Started
 ===============
@@ -158,7 +177,7 @@ to get started with `DataChain` and learn more.
    pip install datachain
 
 
-Example: …
+Example: Download Subset of Files Based on Metadata
 ---------------------------------------------------
 
 Sometimes users only need to download a specific subset of files from cloud storage,
@@ -171,7 +190,7 @@ high confidence scores.
 
     import datachain as dc
 
-    meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", …
+    meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
     images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
     images_id = images.map(id=lambda file: file.path.split('.')[-2])
@@ -182,6 +201,42 @@ high confidence scores.
     likely_cats.to_storage("high-confidence-cats/", signal="file")
 
 
+Example: Incremental Processing with Error Handling
+---------------------------------------------------
+
+This example shows how to use both delta and retry processing for efficient handling of large
+datasets that evolve over time and may occasionally have processing errors.
+
+.. code:: py
+
+    import datachain as dc
+
+    def process_file(file: dc.File) -> tuple[str, str, str]:
+        """Analyze a file, may occasionally fail."""
+        try:
+            # Your processing logic here
+            content = file.read_text()
+            result = content.upper()
+            return content, result, ""  # No error
+        except Exception as e:
+            # Return an error that will trigger reprocessing next time
+            return "", "", str(e)  # Error field will trigger retry
+
+    # Process files efficiently with delta and retry
+    # Run it many times, keep adding files, to see delta and retry in action
+    chain = (
+        dc.read_storage(
+            "data/",
+            update=True,
+            delta=True,  # Process only new/changed files
+            delta_on="file.path",  # Identify files by path
+            delta_retry="error",  # Process files with error again
+        )
+        .map(process_file, output=("content", "result", "error"))
+        .save("processed-data")
+    )
+
+
 Example: LLM based text-file evaluation
 ---------------------------------------
 
@@ -213,7 +268,7 @@ Python code:
         return result.lower().startswith("success")
 
     chain = (
-        dc.read_storage("gs://datachain-demo/chatbot-KiT/", …
+        dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
         .settings(parallel=4, cache=True)
         .map(is_success=eval_dialogue)
         .save("mistral_files")
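Both storage examples change in the same way: the keyword that names the reader's top-level column is now `column=` (the 0.14.2 spelling is truncated in this view and not reproduced here). A minimal sketch of the new call shape:

.. code:: py

    import datachain as dc

    # `column=` names the signal produced by the reader.
    files = dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)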
@@ -288,7 +343,7 @@ DataChain Studio Platform
 - **Access control** including SSO and team based collaboration.
 
 .. _PyPI: https://pypi.org/
-.. _file an issue: https://github.com/iterative/datachain/issues
+.. _file an issue: https://github.com/datachain-ai/datachain/issues
 .. github-only
 .. _Contributor Guide: https://docs.datachain.ai/contributing
 .. _Pydantic: https://github.com/pydantic/pydantic