datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/utils.py
CHANGED
@@ -1,6 +1,5 @@
 import glob
 import io
-import json
 import logging
 import os
 import os.path as osp
@@ -10,9 +9,8 @@ import sys
 import time
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import contextmanager
-from datetime import
-from typing import TYPE_CHECKING, Any,
-from uuid import UUID
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import cloudpickle
 import platformdirs
@@ -25,7 +23,7 @@ if TYPE_CHECKING:
     from typing_extensions import Self
 
 
-
+DEFAULT_BATCH_SIZE = 2000
 
 logger = logging.getLogger("datachain")
 
@@ -53,11 +51,11 @@ class DataChainDir:
 
     def __init__(
         self,
-        root:
-        cache:
-        tmp:
-        db:
-        config:
+        root: str | None = None,
+        cache: str | None = None,
+        tmp: str | None = None,
+        db: str | None = None,
+        config: str | None = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
@@ -122,7 +120,7 @@ def global_config_dir():
     )
 
 
-def human_time_to_int(time: str) ->
+def human_time_to_int(time: str) -> int | None:
     if not time:
         return None
 
@@ -146,7 +144,7 @@ def time_to_str(dt):
     return dt.strftime("%Y-%m-%d %H:%M:%S")
 
 
-def time_to_local(dt:
+def time_to_local(dt: datetime | str) -> datetime:
     # TODO check usage
     if isinstance(dt, str):
         dt = isoparse(dt)
@@ -156,11 +154,11 @@ def time_to_local(dt: Union[datetime, str]) -> datetime:
     return dt
 
 
-def time_to_local_str(dt:
+def time_to_local_str(dt: datetime | str) -> str:
     return time_to_str(time_to_local(dt))
 
 
-def is_expired(expires:
+def is_expired(expires: datetime | str | None):
     if expires:
         return time_to_local(expires) < time_to_local(datetime.now())  # noqa: DTZ005
 
@@ -228,7 +226,7 @@ _T_co = TypeVar("_T_co", covariant=True)
 
 def _dynamic_batched_core(
     iterable: Iterable[_T_co],
-
+    batch_size: int,
 ) -> Iterator[list[_T_co]]:
     """Core batching logic that yields lists."""
 
@@ -236,7 +234,7 @@ def _dynamic_batched_core(
 
     for item in iterable:
         # Check if adding this item would exceed limits
-        if len(batch) >=
+        if len(batch) >= batch_size and batch:  # Yield current batch if we have one
             yield batch
             batch = []
 
@@ -247,23 +245,22 @@ def _dynamic_batched_core(
         yield batch
 
 
-def batched(iterable: Iterable[_T_co],
+def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
     """
-    Batch data into tuples of length
+    Batch data into tuples of length batch_size.
     The last batch may be shorter.
     """
-    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable,
+    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))
 
 
 def batched_it(
     iterable: Iterable[_T_co],
-
+    batch_size: int = DEFAULT_BATCH_SIZE,
 ) -> Iterator[Iterator[_T_co]]:
     """
-    Batch data into iterators with dynamic sizing
-    based on row count and memory usage.
+    Batch data into iterators with dynamic sizing based on row count and memory usage.
     """
-    yield from (iter(batch) for batch in _dynamic_batched_core(iterable,
+    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))
 
 
 def flatten(items):
@@ -302,9 +299,9 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):
 
 
 def determine_workers(
-    workers:
-    rows_total:
-) ->
+    workers: bool | int,
+    rows_total: int | None = None,
+) -> bool | int:
     """Determine the number of workers to use for distributed processing."""
     if rows_total is not None and rows_total <= 1:
         # Disable distributed processing if there is no rows or only one row.
@@ -323,9 +320,9 @@ def determine_workers(
 
 
 def determine_processes(
-    parallel:
-    rows_total:
-) ->
+    parallel: bool | int | None = None,
+    rows_total: int | None = None,
+) -> bool | int:
     """Determine the number of processes to use for parallel processing."""
     if rows_total is not None and rows_total <= 1:
         # Disable parallel processing if there is no rows or only one row.
@@ -345,8 +342,8 @@ def determine_processes(
 
 
 def get_env_list(
-    key: str, default:
-) ->
+    key: str, default: Sequence | None = None, sep: str = ","
+) -> Sequence[str] | None:
     try:
         str_val = os.environ[key]
     except KeyError:
@@ -387,10 +384,10 @@ def show_df(
 
 
 def show_records(
-    records:
+    records: list[dict] | None,
     collapse_columns: bool = False,
     system_columns: bool = False,
-    hidden_fields:
+    hidden_fields: list[str] | None = None,
 ) -> None:
     import pandas as pd
 
@@ -403,18 +400,6 @@ def show_records(
     return show_df(df, collapse_columns=collapse_columns, system_columns=system_columns)
 
 
-class JSONSerialize(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, bytes):
-            return list(obj[:1024])
-        if isinstance(obj, (datetime, date)):
-            return obj.isoformat()
-        if isinstance(obj, UUID):
-            return str(obj)
-
-        return super().default(obj)
-
-
 def inside_colab() -> bool:
     try:
         from google import colab  # type: ignore[attr-defined]  # noqa: F401
@@ -434,7 +419,7 @@ def inside_notebook() -> bool:
 
     if shell == "ZMQInteractiveShell":
         try:
-            import IPython
+            import IPython  # type: ignore[import-not-found]
 
             return IPython.__version__ >= "6.0.0"
         except ImportError:
@@ -519,7 +504,7 @@ def row_to_nested_dict(
 ) -> dict[str, Any]:
     """Converts a row to a nested dict based on the provided headers."""
     result: dict[str, Any] = {}
-    for h, v in zip(headers, row):
+    for h, v in zip(headers, row, strict=False):
         nested_dict_path_set(result, h, v)
     return result
 
@@ -530,7 +515,7 @@ def safe_closing(thing: T) -> Iterator[T]:
         yield thing
     finally:
         if hasattr(thing, "close"):
-            thing.close()
+            thing.close()  # type: ignore[attr-defined]
 
 
 def getenv_bool(name: str, default: bool = False) -> bool:
@@ -538,3 +523,9 @@ def getenv_bool(name: str, default: bool = False) -> bool:
     if val is None:
        return default
     return val.lower() in ("1", "true", "yes", "on")
+
+
+def ensure_sequence(x) -> Sequence:
+    if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
+        return x
+    return [x]
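A note on the utils.py hunks above: batched() and batched_it() now take an explicit batch_size argument (required for batched(), defaulting to DEFAULT_BATCH_SIZE = 2000 for batched_it()), and a small ensure_sequence() helper is added. A minimal usage sketch of the new 0.39.0 signatures; the expected outputs assume the memory-based early flush in _dynamic_batched_core (not shown in the hunks) does not trigger for such tiny inputs:

    from datachain.utils import batched, batched_it, ensure_sequence

    # batched() now takes the batch size as an explicit second argument
    print(list(batched(range(5), 2)))   # expected: [(0, 1), (2, 3), (4,)]

    # batched_it() yields iterators and defaults to DEFAULT_BATCH_SIZE (2000)
    for chunk in batched_it(range(5), batch_size=2):
        print(list(chunk))              # expected: [0, 1], then [2, 3], then [4]

    # ensure_sequence() passes non-str/bytes sequences through, wraps everything else
    print(ensure_sequence("a.txt"))             # ['a.txt']
    print(ensure_sequence(["a.txt", "b.txt"]))  # ['a.txt', 'b.txt']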
{datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA
CHANGED
@@ -1,20 +1,19 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.30.5
+Version: 0.39.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
 Project-URL: Documentation, https://datachain.dvc.ai
-Project-URL: Issues, https://github.com/
-Project-URL: Source, https://github.com/
+Project-URL: Issues, https://github.com/datachain-ai/datachain/issues
+Project-URL: Source, https://github.com/datachain-ai/datachain
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: pyyaml
@@ -42,7 +41,7 @@ Requires-Dist: cloudpickle
 Requires-Dist: pydantic
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
-Requires-Dist: Pillow<
+Requires-Dist: Pillow<13,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
 Requires-Dist: huggingface_hub
@@ -56,16 +55,15 @@ Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
 Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
-Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
 Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
 Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
+Requires-Dist: mkdocs-section-index>=0.3.10; extra == "docs"
 Requires-Dist: eval-type-backport; extra == "docs"
 Provides-Extra: torch
 Requires-Dist: torch>=2.1.0; extra == "torch"
 Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
 Provides-Extra: audio
-Requires-Dist: torchaudio; extra == "audio"
 Requires-Dist: soundfile; extra == "audio"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"
@@ -85,7 +83,8 @@ Provides-Extra: postgres
 Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
 Provides-Extra: tests
 Requires-Dist: datachain[audio,hf,postgres,remote,torch,vector,video]; extra == "tests"
-Requires-Dist: pytest<
+Requires-Dist: pytest<10,>=8; extra == "tests"
+Requires-Dist: pytest-asyncio; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
@@ -102,7 +101,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.
+Requires-Dist: mypy==1.19.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
@@ -117,6 +116,7 @@ Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: ultralytics; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"
 Requires-Dist: openai; extra == "examples"
+Requires-Dist: torchaudio; extra == "examples"
 Dynamic: license-file
 
 ================
@@ -133,14 +133,14 @@ Dynamic: license-file
 .. |Python Version| image:: https://img.shields.io/pypi/pyversions/datachain
    :target: https://pypi.org/project/datachain
    :alt: Python Version
-.. |Codecov| image:: https://codecov.io/gh/
-   :target: https://codecov.io/gh/
+.. |Codecov| image:: https://codecov.io/gh/datachain-ai/datachain/graph/badge.svg?token=byliXGGyGB
+   :target: https://codecov.io/gh/datachain-ai/datachain
    :alt: Codecov
-.. |Tests| image:: https://github.com/
-   :target: https://github.com/
+.. |Tests| image:: https://github.com/datachain-ai/datachain/actions/workflows/tests.yml/badge.svg
+   :target: https://github.com/datachain-ai/datachain/actions/workflows/tests.yml
    :alt: Tests
 .. |DeepWiki| image:: https://deepwiki.com/badge.svg
-   :target: https://deepwiki.com/
+   :target: https://deepwiki.com/datachain-ai/datachain
    :alt: DeepWiki
 
 DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
@@ -210,45 +210,33 @@ datasets that evolve over time and may occasionally have processing errors.
 .. code:: py
 
     import datachain as dc
-    from datachain import C, File
 
-    def process_file(file: File):
-        """
+    def process_file(file: dc.File) -> tuple[str, str, str]:
+        """Analyze a file, may occasionally fail."""
         try:
             # Your processing logic here
             content = file.read_text()
-            result =
-            return
-                "content": content,
-                "result": result,
-                "error": None  # No error
-            }
+            result = content.upper()
+            return content, result, ""  # No error
         except Exception as e:
             # Return an error that will trigger reprocessing next time
-            return
-                "content": None,
-                "result": None,
-                "error": str(e)  # Error field will trigger retry
-            }
+            return "", "", str(e)  # Error field will trigger retry
 
     # Process files efficiently with delta and retry
+    # Run it many times, keep adding files, to see delta and retry in action
     chain = (
         dc.read_storage(
             "data/",
             update=True,
            delta=True,  # Process only new/changed files
            delta_on="file.path",  # Identify files by path
-
+            delta_retry="error",  # Process files with error again
        )
-        .map(
-        .
-            content=C("processed_result.content"),
-            result=C("processed_result.result"),
-            error=C("processed_result.error")
-        )
-        .save(name="processed_data")
+        .map(process_file, output=("content", "result", "error"))
+        .save("processed-data")
     )
 
+
 Example: LLM based text-file evaluation
 ---------------------------------------
 
@@ -355,7 +343,7 @@ DataChain Studio Platform
 - **Access control** including SSO and team based collaboration.
 
 .. _PyPI: https://pypi.org/
-.. _file an issue: https://github.com/
+.. _file an issue: https://github.com/datachain-ai/datachain/issues
 .. github-only
 .. _Contributor Guide: https://docs.datachain.ai/contributing
 .. _Pydantic: https://github.com/pydantic/pydantic
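The rewritten README example in the METADATA diff above returns a plain (content, result, error) tuple from process_file() and keys retries off the error column via delta_retry="error". A hedged follow-up sketch, assuming the processed-data dataset saved by that example and the top-level read_dataset/C helpers exported by the datachain package:

    import datachain as dc

    # Rows with a non-empty "error" column came from failed process_file() calls;
    # with delta_retry="error" they are re-processed on the next run of the chain.
    pending = dc.read_dataset("processed-data").filter(dc.C("error") != "")
    print(pending.count(), "rows pending retry")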
datachain-0.39.0.dist-info/RECORD
ADDED
@@ -0,0 +1,173 @@
+datachain/__init__.py,sha256=KVwlU1kC6qWTFQD1DxQs1M_4rh0GC9ZUG3FXJZmQjh4,1852
+datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
+datachain/asyn.py,sha256=3xSQkc2AMABDZcbDdrbJq6QxVWH3yIaY0pzCiXRqR0U,9634
+datachain/cache.py,sha256=Klkc7iL_KvryeZk-UNjtByTFk7URbpb60XblalqHoYI,3604
+datachain/checkpoint.py,sha256=AOMqN_2fNuEBJDAsmc-P4L7FU444eQxTU4MCgr-XEH8,1121
+datachain/config.py,sha256=KPXef6P4NAZiEbSDMUcFwuNVTul2fZBs5xrCbyRl6Tg,4193
+datachain/dataset.py,sha256=U3UAbElJEWND0_-aO8JPdNlAL-BDyOa6D_fVbvs5c1U,25978
+datachain/delta.py,sha256=2c4XO-nKKYD1wMOW_zhEfUceAhH_8nISfnKVpZbScDg,12194
+datachain/error.py,sha256=5Qkd55kl4NIG33iwOeY1Qa6SM8FLftIZ8vcf663r-9Y,1961
+datachain/hash_utils.py,sha256=uU7xa-XQRC_8zmUxHg9h__Kts4TxW566311d1LRTz-Q,4510
+datachain/job.py,sha256=mx4Cs-qX8DLyaie5JbZMSYYQSRspbQruM7L1s5kg1ws,1374
+datachain/json.py,sha256=yK6ID17dP0vd0NIhSHAa3u2ym4N0jjtDEpBZ5PUuRlE,3673
+datachain/listing.py,sha256=yIZYiCLVHQirGsXs52Ssph2j-woCIRnagqLJIRT1l7U,7406
+datachain/namespace.py,sha256=YhxHdmCekWH4l-ZayNHGiPy5KAz_5LGqviivFYu0u9U,2337
+datachain/node.py,sha256=gBLCoh-3nyaCDnMPt3gLS_t3m7qL0_JDiN02a9DH_kY,5552
+datachain/nodes_fetcher.py,sha256=_wgaKyqEjkqdwJ_Hj6D8vUYz7hnU7g6xhm0H6ZnYxmE,1095
+datachain/nodes_thread_pool.py,sha256=Fh4YZlwNzZMJxoUpuLGPKAnsvTGn8zy6xfrxBOeySfA,3971
+datachain/plugins.py,sha256=QRpM-xcDAyGGOGAO6K3Hk4rGb78plgQ-k0iVWU8tn-I,693
+datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
+datachain/project.py,sha256=2_RAUmkDpbk5q5Iab9H6_k3S8Cju7dFRofk3OaaaQYo,2261
+datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datachain/script_meta.py,sha256=vQOnPnqsNEDGtCHfy2DozcOdPwjGQ-dwDrrgHGpw90s,4935
+datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
+datachain/studio.py,sha256=seWeSACPVgUI23Z8UbgjeJQIZWVLu45aVG4p7kMDj5E,15611
+datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
+datachain/utils.py,sha256=qizzUaT3SOoO7cWeZImWAMG00sMx69_j6EmcaiqO_1s,15414
+datachain/catalog/__init__.py,sha256=r5aZompzY1jKJGt9PmvMTrObx1rQDiG5PsRp_Da3yLs,299
+datachain/catalog/catalog.py,sha256=pM1lXssrI4PdnGms6QJDVMGjBJrwZyKd5djskbndG54,64739
+datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
+datachain/catalog/dependency.py,sha256=EHuu_Ox76sEhy71NXjFJiHxQVTz19KecqBcrjwFCa7M,5280
+datachain/catalog/loader.py,sha256=VTaGPc4ASNdUdr7Elobp8qcXUOHwd0oqQcnk3LUwtF0,6244
+datachain/cli/__init__.py,sha256=DrJ9hABuo7ujKwoqt66wiH5vLNYFw_3yz6KJ9khXYdk,8133
+datachain/cli/utils.py,sha256=v4__FknyILQejFE-GdqvBamOaXhfdeHxmJ9hBmEbwFE,2471
+datachain/cli/commands/__init__.py,sha256=vH52z3H9wmfMJRiaFY-_Gk8tIzK3x6A_26oE79RROY8,434
+datachain/cli/commands/datasets.py,sha256=S6sMuG9MvC2svQzfVZ9DKkLqMONWWJ1P5k1fywkrBr8,6915
+datachain/cli/commands/du.py,sha256=9edEzDEs98K2VYk8Wf-ZMpUzALcgm9uD6YtoqbvtUGU,391
+datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR88,198
+datachain/cli/commands/ls.py,sha256=0QyE1dSUES4EmhG_qcisfdDdr8m2G6i5nmQS7sjGeVU,5293
+datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibVE,600
+datachain/cli/commands/show.py,sha256=zkgBEVW04j41ewNnyD0nUY-NcCBANoEpQ8dpIrKof5U,1593
+datachain/cli/parser/__init__.py,sha256=ub3Y7AM0stEKmW3QZPcXhO_Pefy98GEMAgKjuAcwmt0,14019
+datachain/cli/parser/job.py,sha256=f6IQrFUyXCSseOYxlDFPs414MsJszA8YWxEYAv530r0,6105
+datachain/cli/parser/studio.py,sha256=Bo__LKM7qhJGgkyX8M_bCvgZ2Gvqq6r_X4t1NdtaBIY,3881
+datachain/cli/parser/utils.py,sha256=WbuKIDTTAPItgbMoxyz-DBInHKxOkNfgW7L2V0hcBxs,2857
+datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
+datachain/client/azure.py,sha256=cknEJaVrSGQNApdcDKrq9HykJJWyzEPetfs_sYZuahE,3257
+datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
+datachain/client/fsspec.py,sha256=MZ1vmCIUIIxFCI9mlp_Fbd0KuqTL49vf-qQ5OMj5mFg,14603
+datachain/client/gcs.py,sha256=Qy2mqYJnZCbTR80D2ZLygLjaVkmzjkYJ7L3LeV3449M,5188
+datachain/client/hf.py,sha256=n5xJZdvNLS-SqokxuBCIPfGbhIeC_XfLm_BNYtEVvg4,2677
+datachain/client/http.py,sha256=ebeXD0xy73VFmxuAr9Dk79KpyvCk2TL8Qx-3JTK02rY,5152
+datachain/client/local.py,sha256=slWi7dB77A8XsNcwDFMyKzaDyUONwaCVwRAiMkLpBV8,4739
+datachain/client/s3.py,sha256=KS9o0jxXJRFp7Isdibz366VaWrULmpegzfYdurJpAl0,7499
+datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
+datachain/data_storage/db_engine.py,sha256=-XHNnmLtg8_oqmlYB1aQ2swVB7G5P3vllNPmH0mWQCQ,3831
+datachain/data_storage/job.py,sha256=KTNKarEq1p7sXEsfV3oeQHXWVy6stv4wmD3qv4nF4F4,465
+datachain/data_storage/metastore.py,sha256=kYSqEx1ZvcnqyGqLOsykD0owjiCsvCuGyD14dkYwz10,74006
+datachain/data_storage/schema.py,sha256=3fAgiE11TIDYCW7EbTdiOm61SErRitvsLr7YPnUlVm0,9801
+datachain/data_storage/serializer.py,sha256=lhWm_N1F97S8wJW647sJcdOZQSX25XVERL5xcw-XrWI,3877
+datachain/data_storage/sqlite.py,sha256=qFmf0YQKNsPmlkD1wiVYIo5n5QodksjzQZsx0gJgw28,30107
+datachain/data_storage/warehouse.py,sha256=AI281_7_VEkP7lgh5_vplow8ue_cB6XquoaSOuj8Rtc,35972
+datachain/diff/__init__.py,sha256=lGrygGzdWSSYJ1DgX4h2q_ko5QINEW8PKfxOwE9ZFnI,9394
+datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
+datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825
+datachain/func/__init__.py,sha256=9K2MEC1NclY_zWuqevfEUOcrSE26cXDVnGqhNTj4lF8,1288
+datachain/func/aggregate.py,sha256=jtyzePQ_vRDZuEeLM7YZw5PzekOU86c4HDR5qeIWLwY,12363
+datachain/func/array.py,sha256=EpGSeBnnwmX16XqL8v7aqkJ8PAqkMZKLcnA0Auke0VI,13565
+datachain/func/base.py,sha256=ykhGUBlMGWSNN1fRDlPSyDJVf8DV7k_r_tyNVm_o7ZY,590
+datachain/func/conditional.py,sha256=vpBLKqwIA0OKrAClvQhnN6NiplCitXMOAim-enyrjUk,10076
+datachain/func/func.py,sha256=Au3B51AiJSh7OC2LANNBv220L6vhYu1CTgrEz0Xxkqg,18092
+datachain/func/numeric.py,sha256=L2fq6DgZLTEsqVPrPIf4b84WRxidZvQcBZLMd-kOBPo,6932
+datachain/func/path.py,sha256=9Jas35QhEtRai4l54hMqVvuJsqxHvOx88oo4vym1H_I,4077
+datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
+datachain/func/string.py,sha256=kXkPHimtA__EVg_Th1yldGaLJpw4HYVhIeYtKy3DuyQ,7406
+datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
+datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datachain/lib/arrow.py,sha256=EoGpA9Lqv5xUmc5P0SQYfjBQ-wSNHN16PvkZ6exMcsI,10821
+datachain/lib/audio.py,sha256=hHG29vqrV389im152wCjh80d0xqXGGvFnUpUwkzZejQ,7385
+datachain/lib/clip.py,sha256=nF8-N6Uz0MbAsPJBY2iXEYa3DPLo80OOer5SRNAtcGM,6149
+datachain/lib/data_model.py,sha256=srz0pfFohSXwFnt5OMi1fNjSbKkFq8vzkcO0n4PHxlQ,3904
+datachain/lib/dataset_info.py,sha256=Yl11SqW47Uf-WkuB9zTVUS17ALGTQiWkvxzKLN2GWQ4,3288
+datachain/lib/file.py,sha256=-iLKN2lnn4fHxDGk91b4FDjTgQFUvk3KCX-ZgxvKh-k,49277
+datachain/lib/hf.py,sha256=jmyqRDXdksojUJCiU_2XFSIoMzzDJAZQs9xr-sEwEJc,7281
+datachain/lib/image.py,sha256=xKyVsFKi1Shji7oluvd4Ibr3Atiz-Q0MNJhIsXeGcMI,3197
+datachain/lib/listing.py,sha256=pXRzHCUxX0b1sZrFWPN77bHY69Hrn6rFwr5IzSxuhvI,7060
+datachain/lib/listing_info.py,sha256=lnl5oQpkt7kVHyb06TdWhy5Fi2A0gBVIbnECgDtvLec,1079
+datachain/lib/meta_formats.py,sha256=HO8WxPeQLdVSNh2sOy0XSc66yRhBjFOjNgkzIVFIFBk,6345
+datachain/lib/model_store.py,sha256=GQoqNTPNwizX1hB4peKWs-h0VovJxX8dt1MdnTPI7bQ,3713
+datachain/lib/namespaces.py,sha256=d4Zt2mYdGFctkA20SkB1woUxrNI4JwSxruxUGKwfauc,3731
+datachain/lib/projects.py,sha256=FfBfGoWvy1SccCQW2ITKdDA6V03FbnRCusOeHdPHr6Y,4059
+datachain/lib/pytorch.py,sha256=Ux_Gpl0dUJuME7d8qJSpgVQL0kQv6-lnq22KXQCy1NE,8069
+datachain/lib/settings.py,sha256=maMtywOUetJvEApDiMVfTTq-oaRNvUIfDCrqZwFL2GE,7559
+datachain/lib/signal_schema.py,sha256=Pi5HMYisCio4dq3zmKn1ijDf6-9vHvZONVfa9kVWSu0,47623
+datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
+datachain/lib/text.py,sha256=uZom8qXfrv9QYvuDrvd0PuvPmj6qCsjVUwZSNr60BI4,1242
+datachain/lib/udf.py,sha256=CUodkCS3GCndDSjFwRh4m9zB9SsaPpOHk_zVaxJdm28,19356
+datachain/lib/udf_signature.py,sha256=C3J0E_01elO81swP7Sy6Ne5P__BbHTfWc_ena-CCcmQ,8563
+datachain/lib/utils.py,sha256=506ULwIQYwT1DWPyI3WwWS10qQ-qgUf39GDDoVcT2G0,5220
+datachain/lib/video.py,sha256=7Q4oWvf4_HMX2QrkjZwTE0e8mZpoQlBxOxV9k2ubRag,6804
+datachain/lib/webdataset.py,sha256=GKq_R9wDIH8Ckn9Hh-ipDu9X6eIsCPvcR95nCqSZ9yA,7315
+datachain/lib/webdataset_laion.py,sha256=3vMWW_MrMvTA4FvwJVRHHL-Hxf9bq0wmLzefXyNW2bM,2486
+datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datachain/lib/convert/flatten.py,sha256=_5rjGFnN6t1KCX5ftL5rG7tiiNat7j0SdNqajO15KUY,1539
+datachain/lib/convert/python_to_sql.py,sha256=wfnqJ2vRL5UydNPQHshd82hUONsDBa4XyobCSTGqcEo,3187
+datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8pyeN1MxRrU,670
+datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
+datachain/lib/convert/values_to_tuples.py,sha256=25VmP7b9Fa3BPAlSFFo54O9FAgEAqicmE44R4pPsLcM,7313
+datachain/lib/dc/__init__.py,sha256=xguTjbh3iMDTUjfggCDGYZNA0_1iOVvkc-53f6cUpXc,926
+datachain/lib/dc/csv.py,sha256=fIfj5-2Ix4z5D5yZueagd5WUWw86pusJ9JJKD-U3KGg,4407
+datachain/lib/dc/database.py,sha256=RNXpTUdEyBAtlMJIesVfEzZee5peTDPszZPwFQtqEYA,14817
+datachain/lib/dc/datachain.py,sha256=iYScoUXH1QeqffcoLBtj_RnZwcw9MSH-qF7IogK0b3g,107239
+datachain/lib/dc/datasets.py,sha256=oY1t8QBAaZdhjwR439zZT74hMOspewVCrgdwy6juXng,15321
+datachain/lib/dc/hf.py,sha256=FeruEO176L2qQ1Mnx0QmK4kV0GuQ4xtj717N8fGJrBI,2849
+datachain/lib/dc/json.py,sha256=iJ6G0jwTKz8xtfh1eICShnWk_bAMWjF5bFnOXLHaTlw,2683
+datachain/lib/dc/listings.py,sha256=0XTZERQZ2ErP3LSVg9lF9i3alKebqA1Kip2Zf15unUM,4507
+datachain/lib/dc/pandas.py,sha256=o9rTcZf27-3mCEaDdX1ZzM0I4bSOsu-4mA2zK6rWoS4,1460
+datachain/lib/dc/parquet.py,sha256=wa_VazXotY5RZ8ypC0_M9Qo30tamzXmYeVE6P-NcQ1Y,2375
+datachain/lib/dc/records.py,sha256=WvbaLhMqM9e54gJLLeG54QX5ZXkkBIK3FokojLTSbZc,2974
+datachain/lib/dc/storage.py,sha256=zfVMkYqwmhI4bnOqyO6bW5gg_DfdYPM7ltWLTHDjGZo,9737
+datachain/lib/dc/storage_pattern.py,sha256=TqaDb5yq050W9IxpESz9iotjs0R__i5ngRtVo5BmJ-8,7645
+datachain/lib/dc/utils.py,sha256=Ya2rqVj0UHzY_gxdZrIKnmoPZaKgLs397Y1o6M6kGQE,4273
+datachain/lib/dc/values.py,sha256=-EI3xYUNzfwzogbW8WdHX0XbWev-je6_5-CnDsLRcF4,1399
+datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
+datachain/model/bbox.py,sha256=7RT-xsKL8Rywy7l_R9DhzsTF4eQx35VCNdbaQEFDcVc,9362
+datachain/model/pose.py,sha256=rjquA6M-I-Y30Xm6YSkGv1OY52hJZmR2AuxbIpE5uD0,3865
+datachain/model/segment.py,sha256=NhcEYB_KVa0aLQYiZ4jEwkylH9QBLd8fZhmg6PVnx1Y,1967
+datachain/model/utils.py,sha256=5elwCKleOO6CZM0IuWjFykPekrhc5m7V4jSIOcgGMms,6733
+datachain/model/ultralytics/__init__.py,sha256=EvcNX9qUyxKXXlKCPpsXeRrabyXk5E9EkN-tyiYkfS4,750
+datachain/model/ultralytics/bbox.py,sha256=C-aDiBhVa_ML2oERWvksRkyMU1XuYSpb6eItHB5q0qc,4764
+datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigFYNZWUA,3392
+datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
+datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
+datachain/query/batch.py,sha256=ugTlSFqh_kxMcG6vJ5XrEzG9jBXRdb7KRAEEsFWiPew,4190
+datachain/query/dataset.py,sha256=CxHOFJwMhlmZUHhMKVw22HAuMOgORiQJ-25qJWCjk7g,69548
+datachain/query/dispatch.py,sha256=VOq0Lzp-Dmy4dkVIeqL5ekYIe5QqcN4o0_xtPJGI-X4,16907
+datachain/query/metrics.py,sha256=qOMHiYPTMtVs2zI-mUSy8OPAVwrg4oJtVF85B9tdQyM,810
+datachain/query/params.py,sha256=JkVz6IKUIpF58JZRkUXFT8DAHX2yfaULbhVaGmHKFLc,826
+datachain/query/queue.py,sha256=kCetMG6y7_ynV_jJDAXkLsf8WsVZCEk1fAuQGd7yTOo,3543
+datachain/query/schema.py,sha256=Cn1keXjktptAbEDbHlxSzdoCu5H6h_Vzp_DtNpMSr5w,6697
+datachain/query/session.py,sha256=IN0ruNMxEU5K5xp_cL4-zMOxeYD0EnCAUunHa0zNdJ4,12793
+datachain/query/udf.py,sha256=SsVHb_TksVygWsqKX7SX8SOnjOyyd8n5NPF7i-EKb2Q,1364
+datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datachain/remote/studio.py,sha256=4voPFVDXAU6BSBHDAvB_LTYiCACA6Zr0IfYnDjrnN6s,16737
+datachain/sql/__init__.py,sha256=8D2omsBiATt8bjLjGo6jBEtaKEkOlnlNFWhVryHMDv0,388
+datachain/sql/postgresql_dialect.py,sha256=pDTfH8xaXz5xZsq8O1aQUvWLRIv_ogYeAqtmKlPp3Rw,280
+datachain/sql/postgresql_types.py,sha256=ryb_0lzuA9UOJ_B6nW9Yb8nJjzeSmEItAL_Ceue65lc,627
+datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
+datachain/sql/types.py,sha256=u0WNBjZafuG54lHQUtSSUOd14vHASQ1Nk3bKYjgPpJs,16152
+datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
+datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
+datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
+datachain/sql/functions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datachain/sql/functions/aggregate.py,sha256=3AQdA8YHPFdtCEfwZKQXTT8SlQWdG9gD5PBtGN3Odqs,944
+datachain/sql/functions/array.py,sha256=eRWpDRItwIG87-AU7jb8WuiR-MGuhklVxWwR7t97GvY,2050
+datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
+datachain/sql/functions/numeric.py,sha256=BK2KCiPSgM2IveCq-9M_PG3CtPBlztaS9TTn1LGzyLs,1250
+datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
+datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
+datachain/sql/functions/string.py,sha256=E-T9OIzUR-GKaLgjZsEtg5CJrY_sLf1lt1awTvY7w2w,1426
+datachain/sql/sqlite/__init__.py,sha256=PsLaDSij9a03VxGSpagpNl7NQsGtgm72ArUeALZONoc,183
+datachain/sql/sqlite/base.py,sha256=llTx_wzuXo7pCM95eq8xvZ1iFOeKlcN7WVyJyHlBrW8,21840
+datachain/sql/sqlite/types.py,sha256=EDmvEJz8oBQnv0r_ST-M1jQVPC39ceAC5pgzSB9vnkg,1821
+datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
+datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
+datachain/toolkit/split.py,sha256=9HHZl0fGs5Zj8b9l2L3IKf0AiiVNL9SnWbc2rfDiXRA,3710
+datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
+datachain-0.39.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.39.0.dist-info/METADATA,sha256=0rSXP0DTd51AK74TiFsQL1ftNgY6huoeHUu8IvqWEHI,13635
+datachain-0.39.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.39.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.39.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.39.0.dist-info/RECORD,,
datachain/cli/commands/query.py
DELETED
@@ -1,54 +0,0 @@
-import os
-import sys
-import traceback
-from typing import TYPE_CHECKING, Optional
-
-if TYPE_CHECKING:
-    from datachain.catalog import Catalog
-
-
-def query(
-    catalog: "Catalog",
-    script: str,
-    parallel: Optional[int] = None,
-    params: Optional[dict[str, str]] = None,
-) -> None:
-    from datachain.data_storage import JobQueryType, JobStatus
-
-    with open(script, encoding="utf-8") as f:
-        script_content = f.read()
-
-    if parallel is not None:
-        # This also sets this environment variable for any subprocesses
-        os.environ["DATACHAIN_SETTINGS_PARALLEL"] = str(parallel)
-
-    python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
-    python_executable = sys.executable
-
-    job_id = catalog.metastore.create_job(
-        name=os.path.basename(script),
-        query=script_content,
-        query_type=JobQueryType.PYTHON,
-        status=JobStatus.RUNNING,
-        python_version=python_version,
-        params=params,
-    )
-
-    try:
-        catalog.query(
-            script_content,
-            python_executable=python_executable,
-            params=params,
-            job_id=job_id,
-        )
-    except Exception as e:
-        error_message = str(e)
-        error_stack = traceback.format_exc()
-        catalog.metastore.set_job_status(
-            job_id,
-            JobStatus.FAILED,
-            error_message=error_message,
-            error_stack=error_stack,
-        )
-        raise
-    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)
datachain/query/utils.py
DELETED
@@ -1,36 +0,0 @@
-from typing import Optional, Union
-
-import sqlalchemy as sa
-
-ColT = Union[sa.Column, sa.ColumnElement, sa.TextClause, sa.Label]
-
-
-def column_name(col: ColT) -> str:
-    """Returns column name from column element."""
-    return col.name if isinstance(col, (sa.Column, sa.Label)) else str(col)
-
-
-def get_query_column(query: sa.Select, name: str) -> Optional[ColT]:
-    """Returns column element from query by name or None if column not found."""
-    return next((col for col in query.inner_columns if column_name(col) == name), None)
-
-
-def get_query_id_column(query: sa.Select) -> Optional[sa.ColumnElement]:
-    """Returns ID column element from query or None if column not found."""
-    col = get_query_column(query, "sys__id")
-    return col if col is not None and isinstance(col, sa.ColumnElement) else None
-
-
-def select_only_columns(query: sa.Select, *names: str) -> sa.Select:
-    """Returns query selecting defined columns only."""
-    if not names:
-        return query
-
-    cols: list[ColT] = []
-    for name in names:
-        col = get_query_column(query, name)
-        if col is None:
-            raise ValueError(f"Column '{name}' not found in query")
-        cols.append(col)
-
-    return query.with_only_columns(*cols)
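The helpers deleted from datachain/query/utils.py above were thin wrappers over SQLAlchemy's Select API (inner_columns, with_only_columns). A standalone SQLAlchemy sketch of the same column selection, not a datachain API:

    import sqlalchemy as sa

    # Hypothetical table for illustration only
    items = sa.table("items", sa.column("sys__id"), sa.column("name"), sa.column("size"))
    query = sa.select(items)

    # Equivalent of the removed select_only_columns(query, "sys__id", "name")
    cols = [c for c in query.inner_columns if c.name in ("sys__id", "name")]
    print(query.with_only_columns(*cols))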