datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/utils.py
CHANGED
@@ -1,6 +1,5 @@
 import glob
 import io
-import json
 import logging
 import os
 import os.path as osp
@@ -10,10 +9,8 @@ import sys
 import time
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import contextmanager
-from datetime import date, datetime, timezone
-from …
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
-from uuid import UUID
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import cloudpickle
 import platformdirs
@@ -26,6 +23,8 @@ if TYPE_CHECKING:
     from typing_extensions import Self
 
 
+DEFAULT_BATCH_SIZE = 2000
+
 logger = logging.getLogger("datachain")
 
 NUL = b"\0"
@@ -52,11 +51,11 @@ class DataChainDir:
 
     def __init__(
        self,
-        root: Optional[str] = None,
-        cache: Optional[str] = None,
-        tmp: Optional[str] = None,
-        db: Optional[str] = None,
-        config: Optional[str] = None,
+        root: str | None = None,
+        cache: str | None = None,
+        tmp: str | None = None,
+        db: str | None = None,
+        config: str | None = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
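The constructor change above is a typing-only modernization: PEP 604 `str | None` unions replace `Optional[str]`, with unchanged fallback behavior. A minimal sketch of the new signature in use (the root path is illustrative):

.. code:: py

    from datachain.utils import DataChainDir

    # Arguments left as None fall back to platformdirs-derived defaults.
    d = DataChainDir(root="/tmp/datachain-demo")
    print(d.root)  # absolute path of the provided root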
@@ -121,7 +120,7 @@ def global_config_dir():
     )
 
 
-def human_time_to_int(time: str) -> Optional[int]:
+def human_time_to_int(time: str) -> int | None:
     if not time:
         return None
 
@@ -145,7 +144,7 @@ def time_to_str(dt):
     return dt.strftime("%Y-%m-%d %H:%M:%S")
 
 
-def time_to_local(dt: Union[datetime, str]) -> datetime:
+def time_to_local(dt: datetime | str) -> datetime:
     # TODO check usage
     if isinstance(dt, str):
         dt = isoparse(dt)
@@ -155,11 +154,11 @@ def time_to_local(dt: Union[datetime, str]) -> datetime:
     return dt
 
 
-def time_to_local_str(dt: Union[datetime, str]) -> str:
+def time_to_local_str(dt: datetime | str) -> str:
     return time_to_str(time_to_local(dt))
 
 
-def is_expired(expires: Optional[Union[datetime, str]]):
+def is_expired(expires: datetime | str | None):
     if expires:
         return time_to_local(expires) < time_to_local(datetime.now())  # noqa: DTZ005
 
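The time helpers get the same `Union`/`Optional` to `|` conversion with no behavioral change: `time_to_local` still accepts a `datetime` or an ISO-8601 string, and `is_expired` still compares against the current local time. A small behavioral sketch (the timestamp is illustrative):

.. code:: py

    from datachain.utils import is_expired, time_to_local

    # Strings are parsed with dateutil's isoparse before conversion.
    dt = time_to_local("2020-01-01T00:00:00")

    assert is_expired("2020-01-01T00:00:00")  # past timestamps read as expired
    assert not is_expired(None)               # empty/None is never expired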
@@ -225,30 +224,43 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
 _T_co = TypeVar("_T_co", covariant=True)
 
 
-def …
-…
-…
-…
-…
-…
-…
-…
+def _dynamic_batched_core(
+    iterable: Iterable[_T_co],
+    batch_size: int,
+) -> Iterator[list[_T_co]]:
+    """Core batching logic that yields lists."""
+
+    batch: list[_T_co] = []
+
+    for item in iterable:
+        # Check if adding this item would exceed limits
+        if len(batch) >= batch_size and batch:  # Yield current batch if we have one
+            yield batch
+            batch = []
+
+        batch.append(item)
+
+    # Yield any remaining items
+    if batch:
         yield batch
 
 
-def …
-    """…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
+    """
+    Batch data into tuples of length batch_size.
+    The last batch may be shorter.
+    """
+    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))
+
+
+def batched_it(
+    iterable: Iterable[_T_co],
+    batch_size: int = DEFAULT_BATCH_SIZE,
+) -> Iterator[Iterator[_T_co]]:
+    """
+    Batch data into iterators with dynamic sizing based on row count and memory usage.
+    """
+    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))
 
 
 def flatten(items):
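The batching rewrite funnels both public helpers through a single `_dynamic_batched_core` generator: `batched` materializes each chunk as a tuple, while `batched_it` (now with a `DEFAULT_BATCH_SIZE` default) wraps the same chunks in iterators. A quick sketch of the resulting behavior:

.. code:: py

    from datachain.utils import batched, batched_it

    rows = range(7)

    # batched: eager tuples; the last batch may be shorter
    assert list(batched(rows, 3)) == [(0, 1, 2), (3, 4, 5), (6,)]

    # batched_it: the same chunks, exposed as lazy per-batch iterators
    for chunk in batched_it(rows, batch_size=3):
        print(list(chunk))  # [0, 1, 2] / [3, 4, 5] / [6]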
@@ -286,23 +298,52 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):
     return retry
 
 
-def …
+def determine_workers(
+    workers: bool | int,
+    rows_total: int | None = None,
+) -> bool | int:
+    """Determine the number of workers to use for distributed processing."""
+    if rows_total is not None and rows_total <= 1:
+        # Disable distributed processing if there is no rows or only one row.
+        return False
+    if (
+        workers is False
+        and os.environ.get("DATACHAIN_DISTRIBUTED")
+        and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
+    ):
+        # Enable distributed processing by default if the module is available,
+        # and a default number of workers is provided.
+        workers = int(os.environ["DATACHAIN_SETTINGS_WORKERS"])
+    if not workers or workers <= 0:
+        return False
+    return workers
+
+
+def determine_processes(
+    parallel: bool | int | None = None,
+    rows_total: int | None = None,
+) -> bool | int:
+    """Determine the number of processes to use for parallel processing."""
+    if rows_total is not None and rows_total <= 1:
+        # Disable parallel processing if there is no rows or only one row.
+        return False
     if parallel is None and os.environ.get("DATACHAIN_SETTINGS_PARALLEL") is not None:
         parallel = int(os.environ["DATACHAIN_SETTINGS_PARALLEL"])
-    if parallel is None or parallel is False:
+    if parallel is None or parallel is False or parallel == 0:
         return False
     if parallel is True:
         return True
-    if parallel == 0:
-        return False
     if parallel < 0:
         return True
+    if parallel == 1:
+        # Disable parallel processing if only one process is requested.
+        return False
     return parallel
 
 
 def get_env_list(
-    key: str, default: Optional[Sequence] = None, sep: str = ","
-) -> Optional[Sequence[str]]:
+    key: str, default: Sequence | None = None, sep: str = ","
+) -> Sequence[str] | None:
     try:
         str_val = os.environ[key]
     except KeyError:
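`determine_workers` is new and mirrors `determine_processes` for distributed execution, while the `parallel` handling gains two cases: `0` now disables parallelism alongside `None`/`False`, and `1` short-circuits to serial execution. A sketch of the resulting decision table, assuming `DATACHAIN_SETTINGS_PARALLEL` is not set in the environment:

.. code:: py

    from datachain.utils import determine_processes

    assert determine_processes(None) is False   # nothing requested
    assert determine_processes(0) is False      # zero disables, like None/False
    assert determine_processes(1) is False      # new: one process means serial
    assert determine_processes(True) is True    # let the pool pick a size
    assert determine_processes(-1) is True      # negative means "all available"
    assert determine_processes(4) == 4          # explicit counts pass through
    assert determine_processes(4, rows_total=1) is False  # too few rows to fan out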
@@ -343,10 +384,10 @@ def show_df(
 
 
 def show_records(
-    records: Optional[list[dict]],
+    records: list[dict] | None,
     collapse_columns: bool = False,
     system_columns: bool = False,
-    hidden_fields: Optional[list[str]] = None,
+    hidden_fields: list[str] | None = None,
 ) -> None:
     import pandas as pd
 
@@ -359,21 +400,9 @@ def show_records(
     return show_df(df, collapse_columns=collapse_columns, system_columns=system_columns)
 
 
-class JSONSerialize(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, bytes):
-            return list(obj[:1024])
-        if isinstance(obj, (datetime, date)):
-            return obj.isoformat()
-        if isinstance(obj, UUID):
-            return str(obj)
-
-        return super().default(obj)
-
-
 def inside_colab() -> bool:
     try:
-        from google import colab  # noqa: F401
+        from google import colab  # type: ignore[attr-defined]  # noqa: F401
     except ImportError:
         return False
     return True
@@ -390,7 +419,7 @@ def inside_notebook() -> bool:
 
     if shell == "ZMQInteractiveShell":
         try:
-            import IPython
+            import IPython  # type: ignore[import-not-found]
 
             return IPython.__version__ >= "6.0.0"
         except ImportError:
@@ -475,7 +504,7 @@ def row_to_nested_dict(
 ) -> dict[str, Any]:
     """Converts a row to a nested dict based on the provided headers."""
     result: dict[str, Any] = {}
-    for h, v in zip(headers, row):
+    for h, v in zip(headers, row, strict=False):
         nested_dict_path_set(result, h, v)
     return result
 
|
|
@@ -486,4 +515,17 @@ def safe_closing(thing: T) -> Iterator[T]:
|
|
|
486
515
|
yield thing
|
|
487
516
|
finally:
|
|
488
517
|
if hasattr(thing, "close"):
|
|
489
|
-
thing.close()
|
|
518
|
+
thing.close() # type: ignore[attr-defined]
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def getenv_bool(name: str, default: bool = False) -> bool:
|
|
522
|
+
val = os.getenv(name)
|
|
523
|
+
if val is None:
|
|
524
|
+
return default
|
|
525
|
+
return val.lower() in ("1", "true", "yes", "on")
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def ensure_sequence(x) -> Sequence:
|
|
529
|
+
if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
|
|
530
|
+
return x
|
|
531
|
+
return [x]
|
|
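The two helpers appended to the module are small conveniences: `getenv_bool` accepts the usual truthy spellings, and `ensure_sequence` wraps scalars, including strings, in a list. A short sketch (the `MY_FLAG` name is made up for the example):

.. code:: py

    import os

    from datachain.utils import ensure_sequence, getenv_bool

    os.environ["MY_FLAG"] = "TRUE"
    assert getenv_bool("MY_FLAG") is True        # case-insensitive: 1/true/yes/on
    assert getenv_bool("NO_SUCH_FLAG") is False  # unset falls back to the default

    assert ensure_sequence([1, 2]) == [1, 2]     # real sequences pass through
    assert ensure_sequence("abc") == ["abc"]     # strings count as scalars
    assert ensure_sequence(5) == [5]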
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA
CHANGED
@@ -1,20 +1,19 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.2
+Version: 0.39.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
 Project-URL: Documentation, https://datachain.dvc.ai
-Project-URL: Issues, https://github.com/iterative/datachain/issues
-Project-URL: Source, https://github.com/iterative/datachain
+Project-URL: Issues, https://github.com/datachain-ai/datachain/issues
+Project-URL: Source, https://github.com/datachain-ai/datachain
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: pyyaml
@@ -22,10 +21,12 @@ Requires-Dist: tomlkit
 Requires-Dist: tqdm
 Requires-Dist: numpy<3,>=1
 Requires-Dist: pandas>=2.0.0
+Requires-Dist: ujson>=5.10.0
 Requires-Dist: packaging
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
 Requires-Dist: python-dateutil>=2
+Requires-Dist: dateparser>=1.0.0
 Requires-Dist: attrs>=21.3.0
 Requires-Dist: fsspec>=2024.2.0
 Requires-Dist: s3fs>=2024.2.0
@@ -37,11 +38,10 @@ Requires-Dist: shtab<2,>=1.3.4
 Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
-Requires-Dist: …
-Requires-Dist: pydantic<2.11,>=2
+Requires-Dist: pydantic
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
-Requires-Dist: Pillow<…
+Requires-Dist: Pillow<13,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
 Requires-Dist: huggingface_hub
@@ -55,14 +55,16 @@ Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
 Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
-Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
 Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
 Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
+Requires-Dist: mkdocs-section-index>=0.3.10; extra == "docs"
 Requires-Dist: eval-type-backport; extra == "docs"
 Provides-Extra: torch
 Requires-Dist: torch>=2.1.0; extra == "torch"
 Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
+Provides-Extra: audio
+Requires-Dist: soundfile; extra == "audio"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
@@ -70,21 +72,26 @@ Provides-Extra: vector
 Requires-Dist: usearch; extra == "vector"
 Provides-Extra: hf
 Requires-Dist: numba>=0.60.0; extra == "hf"
-Requires-Dist: datasets[…
+Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
+Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
 Requires-Dist: fsspec>=2024.12.0; extra == "hf"
 Provides-Extra: video
 Requires-Dist: ffmpeg-python; extra == "video"
 Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
 Requires-Dist: opencv-python; extra == "video"
+Provides-Extra: postgres
+Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
 Provides-Extra: tests
-Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
-Requires-Dist: pytest<…
+Requires-Dist: datachain[audio,hf,postgres,remote,torch,vector,video]; extra == "tests"
+Requires-Dist: pytest<10,>=8; extra == "tests"
+Requires-Dist: pytest-asyncio; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
 Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
+Requires-Dist: pytest-dotenv; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
 Requires-Dist: dulwich; extra == "tests"
 Requires-Dist: hypothesis; extra == "tests"
@@ -94,8 +101,9 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.…
+Requires-Dist: mypy==1.19.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
+Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-requests; extra == "dev"
@@ -107,13 +115,15 @@ Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: ultralytics; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"
+Requires-Dist: openai; extra == "examples"
+Requires-Dist: torchaudio; extra == "examples"
 Dynamic: license-file
 
 ================
 |logo| DataChain
 ================
 
-|PyPI| |Python Version| |Codecov| |Tests|
+|PyPI| |Python Version| |Codecov| |Tests| |DeepWiki|
 
 .. |logo| image:: docs/assets/datachain.svg
    :height: 24
@@ -123,12 +133,15 @@ Dynamic: license-file
 .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
    :target: https://pypi.org/project/datachain
    :alt: Python Version
-.. |Codecov| image:: https://codecov.io/gh/iterative/datachain/…
-   :target: https://codecov.io/gh/iterative/datachain
+.. |Codecov| image:: https://codecov.io/gh/datachain-ai/datachain/graph/badge.svg?token=byliXGGyGB
+   :target: https://codecov.io/gh/datachain-ai/datachain
    :alt: Codecov
-.. |Tests| image:: https://github.com/iterative/datachain/actions/workflows/tests.yml/badge.svg
-   :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
+.. |Tests| image:: https://github.com/datachain-ai/datachain/actions/workflows/tests.yml/badge.svg
+   :target: https://github.com/datachain-ai/datachain/actions/workflows/tests.yml
    :alt: Tests
+.. |DeepWiki| image:: https://deepwiki.com/badge.svg
+   :target: https://deepwiki.com/datachain-ai/datachain
+   :alt: DeepWiki
 
 DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
 data like images, audio, videos, text and PDFs. It integrates with external storage
@@ -146,6 +159,12 @@ Use Cases
    on these tables at scale.
 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
    Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
+4. **Incremental Processing.** DataChain's delta and retry features allow for efficient
+   processing workflows:
+
+   - **Delta Processing**: Process only new or changed files/records
+   - **Retry Processing**: Automatically reprocess records with errors or missing results
+   - **Combined Approach**: Process new data and fix errors in a single pipeline
 
 Getting Started
 ===============
@@ -158,7 +177,7 @@ to get started with `DataChain` and learn more.
    pip install datachain
 
 
-Example: …
+Example: Download Subset of Files Based on Metadata
 ---------------------------------------------------
 
 Sometimes users only need to download a specific subset of files from cloud storage,
@@ -171,7 +190,7 @@ high confidence scores.
 
     import datachain as dc
 
-    meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", …
+    meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
     images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
     images_id = images.map(id=lambda file: file.path.split('.')[-2])
@@ -182,6 +201,42 @@ high confidence scores.
     likely_cats.to_storage("high-confidence-cats/", signal="file")
 
 
+Example: Incremental Processing with Error Handling
+---------------------------------------------------
+
+This example shows how to use both delta and retry processing for efficient handling of large
+datasets that evolve over time and may occasionally have processing errors.
+
+.. code:: py
+
+    import datachain as dc
+
+    def process_file(file: dc.File) -> tuple[str, str, str]:
+        """Analyze a file, may occasionally fail."""
+        try:
+            # Your processing logic here
+            content = file.read_text()
+            result = content.upper()
+            return content, result, ""  # No error
+        except Exception as e:
+            # Return an error that will trigger reprocessing next time
+            return "", "", str(e)  # Error field will trigger retry
+
+    # Process files efficiently with delta and retry
+    # Run it many times, keep adding files, to see delta and retry in action
+    chain = (
+        dc.read_storage(
+            "data/",
+            update=True,
+            delta=True,  # Process only new/changed files
+            delta_on="file.path",  # Identify files by path
+            delta_retry="error",  # Process files with error again
+        )
+        .map(process_file, output=("content", "result", "error"))
+        .save("processed-data")
+    )
+
+
 Example: LLM based text-file evaluation
 ---------------------------------------
 
@@ -213,7 +268,7 @@ Python code:
         return result.lower().startswith("success")
 
     chain = (
-        dc.read_storage("gs://datachain-demo/chatbot-KiT/", …
+        dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
         .settings(parallel=4, cache=True)
         .map(is_success=eval_dialogue)
         .save("mistral_files")
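Both storage examples change in the same way: the keyword that names the reader's top-level column is now `column=` (the 0.14.2 spelling is truncated in this view and not reproduced here). A minimal sketch of the new call shape:

.. code:: py

    import datachain as dc

    # `column=` names the signal produced by the reader.
    files = dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)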
@@ -288,7 +343,7 @@ DataChain Studio Platform
 - **Access control** including SSO and team based collaboration.
 
 .. _PyPI: https://pypi.org/
-.. _file an issue: https://github.com/iterative/datachain/issues
+.. _file an issue: https://github.com/datachain-ai/datachain/issues
 .. github-only
 .. _Contributor Guide: https://docs.datachain.ai/contributing
 .. _Pydantic: https://github.com/pydantic/pydantic