datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registries and is provided for informational purposes only.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/utils.py CHANGED
@@ -1,6 +1,5 @@
 import glob
 import io
-import json
 import logging
 import os
 import os.path as osp
@@ -10,10 +9,8 @@ import sys
 import time
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import contextmanager
-from datetime import date, datetime, timezone
-from itertools import chain, islice
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
-from uuid import UUID
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, TypeVar

 import cloudpickle
 import platformdirs
@@ -26,6 +23,8 @@ if TYPE_CHECKING:
     from typing_extensions import Self


+DEFAULT_BATCH_SIZE = 2000
+
 logger = logging.getLogger("datachain")

 NUL = b"\0"
@@ -52,11 +51,11 @@ class DataChainDir:

     def __init__(
         self,
-        root: Optional[str] = None,
-        cache: Optional[str] = None,
-        tmp: Optional[str] = None,
-        db: Optional[str] = None,
-        config: Optional[str] = None,
+        root: str | None = None,
+        cache: str | None = None,
+        tmp: str | None = None,
+        db: str | None = None,
+        config: str | None = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
@@ -121,7 +120,7 @@ def global_config_dir():
     )


-def human_time_to_int(time: str) -> Optional[int]:
+def human_time_to_int(time: str) -> int | None:
     if not time:
         return None

@@ -145,7 +144,7 @@ def time_to_str(dt):
     return dt.strftime("%Y-%m-%d %H:%M:%S")


-def time_to_local(dt: Union[datetime, str]) -> datetime:
+def time_to_local(dt: datetime | str) -> datetime:
     # TODO check usage
     if isinstance(dt, str):
         dt = isoparse(dt)
@@ -155,11 +154,11 @@ def time_to_local(dt: Union[datetime, str]) -> datetime:
     return dt


-def time_to_local_str(dt: Union[datetime, str]) -> str:
+def time_to_local_str(dt: datetime | str) -> str:
     return time_to_str(time_to_local(dt))


-def is_expired(expires: Optional[Union[datetime, str]]):
+def is_expired(expires: datetime | str | None):
     if expires:
         return time_to_local(expires) < time_to_local(datetime.now())  # noqa: DTZ005

@@ -225,30 +224,43 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
 _T_co = TypeVar("_T_co", covariant=True)


-def batched(iterable: Iterable[_T_co], n: int) -> Iterator[tuple[_T_co, ...]]:
-    """Batch data into tuples of length n. The last batch may be shorter."""
-    # Based on: https://docs.python.org/3/library/itertools.html#itertools-recipes
-    # batched('ABCDEFG', 3) --> ABC DEF G
-    if n < 1:
-        raise ValueError("Batch size must be at least one")
-    it = iter(iterable)
-    while batch := tuple(islice(it, n)):
+def _dynamic_batched_core(
+    iterable: Iterable[_T_co],
+    batch_size: int,
+) -> Iterator[list[_T_co]]:
+    """Core batching logic that yields lists."""
+
+    batch: list[_T_co] = []
+
+    for item in iterable:
+        # Check if adding this item would exceed limits
+        if len(batch) >= batch_size and batch:  # Yield current batch if we have one
+            yield batch
+            batch = []
+
+        batch.append(item)
+
+    # Yield any remaining items
+    if batch:
         yield batch


-def batched_it(iterable: Iterable[_T_co], n: int) -> Iterator[Iterator[_T_co]]:
-    """Batch data into iterators of length n. The last batch may be shorter."""
-    # batched('ABCDEFG', 3) --> ABC DEF G
-    if n < 1:
-        raise ValueError("Batch size must be at least one")
-    it = iter(iterable)
-    while True:
-        chunk_it = islice(it, n)
-        try:
-            first_el = next(chunk_it)
-        except StopIteration:
-            return
-        yield chain((first_el,), chunk_it)
+def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
+    """
+    Batch data into tuples of length batch_size.
+    The last batch may be shorter.
+    """
+    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))
+
+
+def batched_it(
+    iterable: Iterable[_T_co],
+    batch_size: int = DEFAULT_BATCH_SIZE,
+) -> Iterator[Iterator[_T_co]]:
+    """
+    Batch data into iterators with dynamic sizing based on row count and memory usage.
+    """
+    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))


 def flatten(items):
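The hunk above replaces the itertools-recipe batching helpers with a shared list-accumulating core: `batched` still yields tuples, `batched_it` now yields iterators and gains a `DEFAULT_BATCH_SIZE` default, and the old `ValueError` for batch sizes below one is dropped. A minimal usage sketch of the expected behavior (illustration only, not part of the diff):

    from datachain.utils import batched, batched_it

    # Full batches of size 3, with a shorter final batch.
    assert list(batched(range(7), 3)) == [(0, 1, 2), (3, 4, 5), (6,)]

    # batched_it yields iterators over the same groups.
    assert [list(group) for group in batched_it(range(7), 3)] == [[0, 1, 2], [3, 4, 5], [6]]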
@@ -286,23 +298,52 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):
     return retry


-def determine_processes(parallel: Optional[Union[bool, int]]) -> Union[bool, int]:
+def determine_workers(
+    workers: bool | int,
+    rows_total: int | None = None,
+) -> bool | int:
+    """Determine the number of workers to use for distributed processing."""
+    if rows_total is not None and rows_total <= 1:
+        # Disable distributed processing if there is no rows or only one row.
+        return False
+    if (
+        workers is False
+        and os.environ.get("DATACHAIN_DISTRIBUTED")
+        and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
+    ):
+        # Enable distributed processing by default if the module is available,
+        # and a default number of workers is provided.
+        workers = int(os.environ["DATACHAIN_SETTINGS_WORKERS"])
+    if not workers or workers <= 0:
+        return False
+    return workers
+
+
+def determine_processes(
+    parallel: bool | int | None = None,
+    rows_total: int | None = None,
+) -> bool | int:
+    """Determine the number of processes to use for parallel processing."""
+    if rows_total is not None and rows_total <= 1:
+        # Disable parallel processing if there is no rows or only one row.
+        return False
     if parallel is None and os.environ.get("DATACHAIN_SETTINGS_PARALLEL") is not None:
         parallel = int(os.environ["DATACHAIN_SETTINGS_PARALLEL"])
-    if parallel is None or parallel is False:
+    if parallel is None or parallel is False or parallel == 0:
         return False
     if parallel is True:
         return True
-    if parallel == 0:
-        return False
     if parallel < 0:
         return True
+    if parallel == 1:
+        # Disable parallel processing if only one process is requested.
+        return False
     return parallel


 def get_env_list(
-    key: str, default: Optional[Sequence] = None, sep: str = ","
-) -> Optional[Sequence[str]]:
+    key: str, default: Sequence | None = None, sep: str = ","
+) -> Sequence[str] | None:
     try:
         str_val = os.environ[key]
     except KeyError:
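For the reworked parallelism helpers above, a hedged sketch of the resolution rules as they read from the hunk (the cases below avoid the DATACHAIN_SETTINGS_PARALLEL, DATACHAIN_SETTINGS_WORKERS, and DATACHAIN_DISTRIBUTED environment-variable branches):

    from datachain.utils import determine_processes, determine_workers

    assert determine_processes(parallel=8, rows_total=1) is False  # nothing to parallelize
    assert determine_processes(parallel=1) is False                # one process means "off"
    assert determine_processes(parallel=-1) is True                # negative just enables parallel mode
    assert determine_processes(parallel=4) == 4

    assert determine_workers(workers=False, rows_total=1) is False
    assert determine_workers(workers=3) == 3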
@@ -343,10 +384,10 @@ def show_df(


 def show_records(
-    records: Optional[list[dict]],
+    records: list[dict] | None,
     collapse_columns: bool = False,
     system_columns: bool = False,
-    hidden_fields: Optional[list[str]] = None,
+    hidden_fields: list[str] | None = None,
 ) -> None:
     import pandas as pd

@@ -359,21 +400,9 @@ def show_records(
     return show_df(df, collapse_columns=collapse_columns, system_columns=system_columns)


-class JSONSerialize(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, bytes):
-            return list(obj[:1024])
-        if isinstance(obj, (datetime, date)):
-            return obj.isoformat()
-        if isinstance(obj, UUID):
-            return str(obj)
-
-        return super().default(obj)
-
-
 def inside_colab() -> bool:
     try:
-        from google import colab  # noqa: F401
+        from google import colab  # type: ignore[attr-defined] # noqa: F401
     except ImportError:
         return False
     return True
@@ -390,7 +419,7 @@ def inside_notebook() -> bool:

     if shell == "ZMQInteractiveShell":
         try:
-            import IPython
+            import IPython  # type: ignore[import-not-found]

             return IPython.__version__ >= "6.0.0"
         except ImportError:
@@ -475,7 +504,7 @@ def row_to_nested_dict(
 ) -> dict[str, Any]:
     """Converts a row to a nested dict based on the provided headers."""
     result: dict[str, Any] = {}
-    for h, v in zip(headers, row):
+    for h, v in zip(headers, row, strict=False):
         nested_dict_path_set(result, h, v)
     return result

@@ -486,4 +515,17 @@ def safe_closing(thing: T) -> Iterator[T]:
         yield thing
     finally:
         if hasattr(thing, "close"):
-            thing.close()
+            thing.close()  # type: ignore[attr-defined]
+
+
+def getenv_bool(name: str, default: bool = False) -> bool:
+    val = os.getenv(name)
+    if val is None:
+        return default
+    return val.lower() in ("1", "true", "yes", "on")
+
+
+def ensure_sequence(x) -> Sequence:
+    if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
+        return x
+    return [x]
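The new `getenv_bool` and `ensure_sequence` helpers at the end of the hunk are small conveniences; a usage sketch with hypothetical environment-variable names:

    import os
    from datachain.utils import ensure_sequence, getenv_bool

    os.environ["EXAMPLE_FLAG"] = "TRUE"
    assert getenv_bool("EXAMPLE_FLAG") is True               # "1", "true", "yes", "on" (any case)
    assert getenv_bool("EXAMPLE_MISSING", default=True) is True

    assert ensure_sequence([1, 2]) == [1, 2]                 # non-string sequences pass through
    assert ensure_sequence("abc") == ["abc"]                 # strings/bytes get wrapped
    assert ensure_sequence(42) == [42]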
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA CHANGED
@@ -1,20 +1,19 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.2
+Version: 0.39.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
 Project-URL: Documentation, https://datachain.dvc.ai
-Project-URL: Issues, https://github.com/iterative/datachain/issues
-Project-URL: Source, https://github.com/iterative/datachain
+Project-URL: Issues, https://github.com/datachain-ai/datachain/issues
+Project-URL: Source, https://github.com/datachain-ai/datachain
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: pyyaml
@@ -22,10 +21,12 @@ Requires-Dist: tomlkit
 Requires-Dist: tqdm
 Requires-Dist: numpy<3,>=1
 Requires-Dist: pandas>=2.0.0
+Requires-Dist: ujson>=5.10.0
 Requires-Dist: packaging
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
 Requires-Dist: python-dateutil>=2
+Requires-Dist: dateparser>=1.0.0
 Requires-Dist: attrs>=21.3.0
 Requires-Dist: fsspec>=2024.2.0
 Requires-Dist: s3fs>=2024.2.0
@@ -37,11 +38,10 @@ Requires-Dist: shtab<2,>=1.3.4
 Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
-Requires-Dist: orjson>=3.10.5
-Requires-Dist: pydantic<2.11,>=2
+Requires-Dist: pydantic
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
-Requires-Dist: Pillow<12,>=10.0.0
+Requires-Dist: Pillow<13,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
 Requires-Dist: huggingface_hub
@@ -55,14 +55,16 @@ Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
 Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
-Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
 Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
 Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
+Requires-Dist: mkdocs-section-index>=0.3.10; extra == "docs"
 Requires-Dist: eval-type-backport; extra == "docs"
 Provides-Extra: torch
 Requires-Dist: torch>=2.1.0; extra == "torch"
 Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
+Provides-Extra: audio
+Requires-Dist: soundfile; extra == "audio"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
@@ -70,21 +72,26 @@ Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
 Provides-Extra: hf
 Requires-Dist: numba>=0.60.0; extra == "hf"
-Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
+Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
+Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
 Requires-Dist: fsspec>=2024.12.0; extra == "hf"
 Provides-Extra: video
 Requires-Dist: ffmpeg-python; extra == "video"
 Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
 Requires-Dist: opencv-python; extra == "video"
+Provides-Extra: postgres
+Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
 Provides-Extra: tests
-Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
-Requires-Dist: pytest<9,>=8; extra == "tests"
+Requires-Dist: datachain[audio,hf,postgres,remote,torch,vector,video]; extra == "tests"
+Requires-Dist: pytest<10,>=8; extra == "tests"
+Requires-Dist: pytest-asyncio; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
 Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
+Requires-Dist: pytest-dotenv; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
 Requires-Dist: dulwich; extra == "tests"
 Requires-Dist: hypothesis; extra == "tests"
@@ -94,8 +101,9 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.15.0; extra == "dev"
+Requires-Dist: mypy==1.19.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
+Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-requests; extra == "dev"
@@ -107,13 +115,15 @@ Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: ultralytics; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"
+Requires-Dist: openai; extra == "examples"
+Requires-Dist: torchaudio; extra == "examples"
 Dynamic: license-file

 ================
 |logo| DataChain
 ================

-|PyPI| |Python Version| |Codecov| |Tests|
+|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|

 .. |logo| image:: docs/assets/datachain.svg
    :height: 24
@@ -123,12 +133,15 @@ Dynamic: license-file
 .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
    :target: https://pypi.org/project/datachain
    :alt: Python Version
-.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/graph/badge.svg?token=byliXGGyGB
-   :target: https://codecov.io/gh/iterative/datachain
+.. |Codecov| image:: https://codecov.io/gh/datachain-ai/datachain/graph/badge.svg?token=byliXGGyGB
+   :target: https://codecov.io/gh/datachain-ai/datachain
    :alt: Codecov
-.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
-   :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
+.. |Tests| image:: https://github.com/datachain-ai/datachain/actions/workflows/tests.yml/badge.svg
+   :target: https://github.com/datachain-ai/datachain/actions/workflows/tests.yml
    :alt: Tests
+.. |DeepWiki| image:: https://deepwiki.com/badge.svg
+   :target: https://deepwiki.com/datachain-ai/datachain
+   :alt: DeepWiki

 DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
 data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -146,6 +159,12 @@ Use Cases
    on these tables at scale.
 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
    Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
+4. **Incremental Processing.** DataChain's delta and retry features allow for efficient
+   processing workflows:
+
+   - **Delta Processing**: Process only new or changed files/records
+   - **Retry Processing**: Automatically reprocess records with errors or missing results
+   - **Combined Approach**: Process new data and fix errors in a single pipeline

 Getting Started
 ===============
@@ -158,7 +177,7 @@ to get started with `DataChain` and learn more.
     pip install datachain


-Example: download subset of files based on metadata
+Example: Download Subset of Files Based on Metadata
 ---------------------------------------------------

 Sometimes users only need to download a specific subset of files from cloud storage,
@@ -171,7 +190,7 @@ high confidence scores.

     import datachain as dc

-    meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+    meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
     images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)

     images_id = images.map(id=lambda file: file.path.split('.')[-2])
@@ -182,6 +201,42 @@ high confidence scores.
     likely_cats.to_storage("high-confidence-cats/", signal="file")


+Example: Incremental Processing with Error Handling
+---------------------------------------------------
+
+This example shows how to use both delta and retry processing for efficient handling of large
+datasets that evolve over time and may occasionally have processing errors.
+
+.. code:: py
+
+    import datachain as dc
+
+    def process_file(file: dc.File) -> tuple[str, str, str]:
+        """Analyze a file, may occasionally fail."""
+        try:
+            # Your processing logic here
+            content = file.read_text()
+            result = content.upper()
+            return content, result, ""  # No error
+        except Exception as e:
+            # Return an error that will trigger reprocessing next time
+            return "", "", str(e)  # Error field will trigger retry
+
+    # Process files efficiently with delta and retry
+    # Run it many times, keep adding files, to see delta and retry in action
+    chain = (
+        dc.read_storage(
+            "data/",
+            update=True,
+            delta=True,  # Process only new/changed files
+            delta_on="file.path",  # Identify files by path
+            delta_retry="error",  # Process files with error again
+        )
+        .map(process_file, output=("content", "result", "error"))
+        .save("processed-data")
+    )
+
+
 Example: LLM based text-file evaluation
 ---------------------------------------
@@ -213,7 +268,7 @@ Python code:
         return result.lower().startswith("success")

     chain = (
-        dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+        dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
         .settings(parallel=4, cache=True)
         .map(is_success=eval_dialogue)
         .save("mistral_files")
@@ -288,7 +343,7 @@ DataChain Studio Platform
 - **Access control** including SSO and team based collaboration.

 .. _PyPI: https://pypi.org/
-.. _file an issue: https://github.com/iterative/datachain/issues
+.. _file an issue: https://github.com/datachain-ai/datachain/issues
 .. github-only
 .. _Contributor Guide: https://docs.datachain.ai/contributing
 .. _Pydantic: https://github.com/pydantic/pydantic