datachain-0.14.2-py3-none-any.whl → datachain-0.39.0-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
|
@@ -5,8 +5,13 @@ from datachain.lib.dc import (
|
|
|
5
5
|
DataChain,
|
|
6
6
|
Sys,
|
|
7
7
|
datasets,
|
|
8
|
+
delete_dataset,
|
|
9
|
+
is_local,
|
|
10
|
+
is_studio,
|
|
8
11
|
listings,
|
|
12
|
+
move_dataset,
|
|
9
13
|
read_csv,
|
|
14
|
+
read_database,
|
|
10
15
|
read_dataset,
|
|
11
16
|
read_hf,
|
|
12
17
|
read_json,
|
|
@@ -18,6 +23,9 @@ from datachain.lib.dc import (
|
|
|
18
23
|
)
|
|
19
24
|
from datachain.lib.file import (
|
|
20
25
|
ArrowRow,
|
|
26
|
+
Audio,
|
|
27
|
+
AudioFile,
|
|
28
|
+
AudioFragment,
|
|
21
29
|
File,
|
|
22
30
|
FileError,
|
|
23
31
|
Image,
|
|
@@ -30,6 +38,8 @@ from datachain.lib.file import (
|
|
|
30
38
|
VideoFrame,
|
|
31
39
|
)
|
|
32
40
|
from datachain.lib.model_store import ModelStore
|
|
41
|
+
from datachain.lib.namespaces import delete_namespace
|
|
42
|
+
from datachain.lib.projects import create as create_project
|
|
33
43
|
from datachain.lib.udf import Aggregator, Generator, Mapper
|
|
34
44
|
from datachain.lib.utils import AbstractUDF, DataChainError
|
|
35
45
|
from datachain.query import metrics, param
|
|
@@ -39,6 +49,9 @@ __all__ = [
|
|
|
39
49
|
"AbstractUDF",
|
|
40
50
|
"Aggregator",
|
|
41
51
|
"ArrowRow",
|
|
52
|
+
"Audio",
|
|
53
|
+
"AudioFile",
|
|
54
|
+
"AudioFragment",
|
|
42
55
|
"C",
|
|
43
56
|
"Column",
|
|
44
57
|
"DataChain",
|
|
@@ -60,12 +73,19 @@ __all__ = [
|
|
|
60
73
|
"VideoFile",
|
|
61
74
|
"VideoFragment",
|
|
62
75
|
"VideoFrame",
|
|
76
|
+
"create_project",
|
|
63
77
|
"datasets",
|
|
78
|
+
"delete_dataset",
|
|
79
|
+
"delete_namespace",
|
|
64
80
|
"is_chain_type",
|
|
81
|
+
"is_local",
|
|
82
|
+
"is_studio",
|
|
65
83
|
"listings",
|
|
66
84
|
"metrics",
|
|
85
|
+
"move_dataset",
|
|
67
86
|
"param",
|
|
68
87
|
"read_csv",
|
|
88
|
+
"read_database",
|
|
69
89
|
"read_dataset",
|
|
70
90
|
"read_hf",
|
|
71
91
|
"read_json",
|
datachain/asyn.py
CHANGED
|
@@ -3,6 +3,7 @@ import threading
|
|
|
3
3
|
from collections.abc import (
|
|
4
4
|
AsyncIterable,
|
|
5
5
|
Awaitable,
|
|
6
|
+
Callable,
|
|
6
7
|
Coroutine,
|
|
7
8
|
Generator,
|
|
8
9
|
Iterable,
|
|
@@ -10,7 +11,7 @@ from collections.abc import (
|
|
|
10
11
|
)
|
|
11
12
|
from concurrent.futures import ThreadPoolExecutor, wait
|
|
12
13
|
from heapq import heappop, heappush
|
|
13
|
-
from typing import Any,
|
|
14
|
+
from typing import Any, Generic, TypeVar
|
|
14
15
|
|
|
15
16
|
from fsspec.asyn import get_loop
|
|
16
17
|
|
|
@@ -49,7 +50,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
|
|
|
49
50
|
iterable: Iterable[InputT],
|
|
50
51
|
*,
|
|
51
52
|
workers: int = ASYNC_WORKERS,
|
|
52
|
-
loop:
|
|
53
|
+
loop: asyncio.AbstractEventLoop | None = None,
|
|
53
54
|
):
|
|
54
55
|
self.func = func
|
|
55
56
|
self.iterable = iterable
|
|
@@ -107,9 +108,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
|
|
|
107
108
|
|
|
108
109
|
async def init(self) -> None:
|
|
109
110
|
self.work_queue = asyncio.Queue(2 * self.workers)
|
|
110
|
-
self.result_queue: asyncio.Queue[
|
|
111
|
-
self.workers
|
|
112
|
-
)
|
|
111
|
+
self.result_queue: asyncio.Queue[ResultT | None] = asyncio.Queue(self.workers)
|
|
113
112
|
|
|
114
113
|
async def run(self) -> None:
|
|
115
114
|
producer = self.start_task(self.produce())
|
|
@@ -149,10 +148,10 @@ class AsyncMapper(Generic[InputT, ResultT]):
|
|
|
149
148
|
if exc:
|
|
150
149
|
raise exc
|
|
151
150
|
|
|
152
|
-
async def _pop_result(self) ->
|
|
151
|
+
async def _pop_result(self) -> ResultT | None:
|
|
153
152
|
return await self.result_queue.get()
|
|
154
153
|
|
|
155
|
-
def next_result(self, timeout=None) ->
|
|
154
|
+
def next_result(self, timeout=None) -> ResultT | None:
|
|
156
155
|
"""
|
|
157
156
|
Return the next available result.
|
|
158
157
|
|
|
@@ -212,17 +211,17 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
|
|
|
212
211
|
iterable: Iterable[InputT],
|
|
213
212
|
*,
|
|
214
213
|
workers: int = ASYNC_WORKERS,
|
|
215
|
-
loop:
|
|
214
|
+
loop: asyncio.AbstractEventLoop | None = None,
|
|
216
215
|
):
|
|
217
216
|
super().__init__(func, iterable, workers=workers, loop=loop)
|
|
218
217
|
self._waiters: dict[int, Any] = {}
|
|
219
|
-
self._getters: dict[int, asyncio.Future[
|
|
220
|
-
self.heap: list[tuple[int,
|
|
218
|
+
self._getters: dict[int, asyncio.Future[ResultT | None]] = {}
|
|
219
|
+
self.heap: list[tuple[int, ResultT | None]] = []
|
|
221
220
|
self._next_yield = 0
|
|
222
221
|
self._items_seen = 0
|
|
223
222
|
self._window = 2 * workers
|
|
224
223
|
|
|
225
|
-
def _push_result(self, i: int, result:
|
|
224
|
+
def _push_result(self, i: int, result: ResultT | None) -> None:
|
|
226
225
|
if i in self._getters:
|
|
227
226
|
future = self._getters.pop(i)
|
|
228
227
|
future.set_result(result)
|
|
@@ -243,7 +242,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
|
|
|
243
242
|
async def init(self) -> None:
|
|
244
243
|
self.work_queue = asyncio.Queue(2 * self.workers)
|
|
245
244
|
|
|
246
|
-
async def _pop_result(self) ->
|
|
245
|
+
async def _pop_result(self) -> ResultT | None:
|
|
247
246
|
if self.heap and self.heap[0][0] == self._next_yield:
|
|
248
247
|
_i, out = heappop(self.heap)
|
|
249
248
|
else:
|
datachain/cache.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
from collections.abc import Iterator
|
|
3
3
|
from contextlib import contextmanager
|
|
4
4
|
from tempfile import mkdtemp
|
|
5
|
-
from typing import TYPE_CHECKING
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
6
|
|
|
7
7
|
from dvc_data.hashfile.db.local import LocalHashFileDB
|
|
8
8
|
from dvc_objects.fs.local import LocalFileSystem
|
|
@@ -22,14 +22,14 @@ def try_scandir(path):
|
|
|
22
22
|
pass
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
def get_temp_cache(tmp_dir: str, prefix:
|
|
25
|
+
def get_temp_cache(tmp_dir: str, prefix: str | None = None) -> "Cache":
|
|
26
26
|
cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
|
|
27
27
|
return Cache(cache_dir, tmp_dir=tmp_dir)
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
@contextmanager
|
|
31
31
|
def temporary_cache(
|
|
32
|
-
tmp_dir: str, prefix:
|
|
32
|
+
tmp_dir: str, prefix: str | None = None, delete: bool = True
|
|
33
33
|
) -> Iterator["Cache"]:
|
|
34
34
|
cache = get_temp_cache(tmp_dir, prefix=prefix)
|
|
35
35
|
try:
|
|
@@ -39,7 +39,7 @@ def temporary_cache(
|
|
|
39
39
|
cache.destroy()
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
class Cache:
|
|
42
|
+
class Cache: # noqa: PLW1641
|
|
43
43
|
def __init__(self, cache_dir: str, tmp_dir: str):
|
|
44
44
|
self.odb = LocalHashFileDB(
|
|
45
45
|
LocalFileSystem(),
|
|
@@ -58,7 +58,7 @@ class Cache:
|
|
|
58
58
|
def tmp_dir(self):
|
|
59
59
|
return self.odb.tmp_dir
|
|
60
60
|
|
|
61
|
-
def get_path(self, file: "File") ->
|
|
61
|
+
def get_path(self, file: "File") -> str | None:
|
|
62
62
|
if self.contains(file):
|
|
63
63
|
return self.path_from_checksum(file.get_hash())
|
|
64
64
|
return None
|
|
@@ -74,11 +74,11 @@ class Cache:
|
|
|
74
74
|
self.odb.delete(file.get_hash())
|
|
75
75
|
|
|
76
76
|
async def download(
|
|
77
|
-
self, file: "File", client: "Client", callback:
|
|
77
|
+
self, file: "File", client: "Client", callback: Callback | None = None
|
|
78
78
|
) -> None:
|
|
79
|
-
from_path = f"{file.source}/{file.path}"
|
|
80
79
|
from dvc_objects.fs.utils import tmp_fname
|
|
81
80
|
|
|
81
|
+
from_path = file.get_uri()
|
|
82
82
|
odb_fs = self.odb.fs
|
|
83
83
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
84
84
|
size = file.size
|
datachain/catalog/__init__.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
from .catalog import (
|
|
2
2
|
QUERY_DATASET_PREFIX,
|
|
3
3
|
QUERY_SCRIPT_CANCELED_EXIT_CODE,
|
|
4
|
-
QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
|
|
5
4
|
Catalog,
|
|
5
|
+
is_namespace_local,
|
|
6
6
|
)
|
|
7
7
|
from .loader import get_catalog
|
|
8
8
|
|
|
9
9
|
__all__ = [
|
|
10
10
|
"QUERY_DATASET_PREFIX",
|
|
11
11
|
"QUERY_SCRIPT_CANCELED_EXIT_CODE",
|
|
12
|
-
"QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
|
|
13
12
|
"Catalog",
|
|
14
13
|
"get_catalog",
|
|
14
|
+
"is_namespace_local",
|
|
15
15
|
]
|