datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/client/gcs.py
CHANGED

@@ -3,7 +3,7 @@ import json
 import os
 from collections.abc import Iterable
 from datetime import datetime
-from typing import Any, Optional, cast
+from typing import Any, cast

 from dateutil.parser import isoparse
 from gcsfs import GCSFileSystem
@@ -15,7 +15,7 @@ from .fsspec import DELIMITER, Client, ResultQueue

 # Patch gcsfs for consistency with s3fs
 GCSFileSystem.set_session = GCSFileSystem._set_session
-PageQueue = asyncio.Queue[Optional[Iterable[dict[str, Any]]]]
+PageQueue = asyncio.Queue[Iterable[dict[str, Any]] | None]


 class GCSClient(Client):
@@ -74,7 +74,7 @@ class GCSClient(Client):
         try:
             await self._get_pages(prefix, page_queue)
             found = await consumer
-            if not found:
+            if not found and prefix:
                 raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")
         finally:
             consumer.cancel()  # In case _get_pages() raised
@@ -115,7 +115,7 @@ class GCSClient(Client):
                 maxResults=page_size,
                 pageToken=next_page_token,
                 json_out=True,
-                versions="true",
+                versions="true" if self._is_version_aware() else "false",
             )
             assert page["kind"] == "storage#objects"
             await page_queue.put(page.get("items", []))
@@ -134,12 +134,12 @@ class GCSClient(Client):
             source=self.uri,
             path=path,
             etag=v.get("etag", ""),
-            version=v.get("generation", ""),
+            version=v.get("generation", "") if self._is_version_aware() else "",
             is_latest=not v.get("timeDeleted"),
             last_modified=self.parse_timestamp(v["updated"]),
             size=v.get("size", ""),
         )

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         return f"{path}#{version_id}" if version_id else path
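
For readers skimming the hunks above: `version_path` encodes a GCS object generation by appending it after a `#`, the same convention gcsfs uses. A standalone sketch of that contract (function body copied from the diff; the example values are hypothetical):

    def version_path(path: str, version_id: str | None) -> str:
        return f"{path}#{version_id}" if version_id else path

    assert version_path("bucket/data.csv", "1712345678901234") == "bucket/data.csv#1712345678901234"
    assert version_path("bucket/data.csv", None) == "bucket/data.csv"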
datachain/client/hf.py
CHANGED

@@ -15,6 +15,34 @@ class classproperty:  # noqa: N801
         return self.fget(owner)


+def _wrap_class(sync_fs_class):
+    """
+    Analog of `AsyncFileSystemWrapper.wrap_class` from fsspec, but sets
+    `asynchronous` to False by default. This is similar to the other async
+    filesystems we initialize; e.g. it means we don't break things in
+    Jupyter, where code runs in an async loop.
+
+    This also fixes write operations by ensuring they are properly forwarded
+    to the underlying filesystem without async buffering issues.
+    """
+    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
+
+    class GeneratedAsyncFileSystemWrapper(AsyncFileSystemWrapper):
+        def __init__(self, *args, **kwargs):
+            sync_fs = sync_fs_class(*args, **kwargs)
+            super().__init__(sync_fs, asynchronous=False)
+
+        def open(self, path, mode="rb", **kwargs):
+            # Override open to ensure write operations work correctly.
+            # It seems to be a bug in the fsspec wrapper: it avoids
+            # wrapping open() explicitly but also doesn't redirect it to
+            # the sync filesystem.
+            return self.sync_fs.open(path, mode, **kwargs)
+
+    GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
+    return GeneratedAsyncFileSystemWrapper
+
+
 @functools.cache
 def get_hf_filesystem_cls():
     import fsspec
@@ -29,10 +57,9 @@ def get_hf_filesystem_cls():
         f"{fsspec_version} is installed."
     )

-    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
     from huggingface_hub import HfFileSystem

-    fs_cls = AsyncFileSystemWrapper.wrap_class(HfFileSystem)
+    fs_cls = _wrap_class(HfFileSystem)
     # AsyncFileSystemWrapper does not set class properties, so we need to set them back.
     fs_cls.protocol = HfFileSystem.protocol
     return fs_cls
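
The wrapper above is generic: any synchronous fsspec filesystem class can be passed through `_wrap_class`. A hedged usage sketch, assuming the `_wrap_class` helper above is in scope and fsspec is recent enough to ship `AsyncFileSystemWrapper` (`MemoryFileSystem` stands in for `HfFileSystem` here):

    from fsspec.implementations.memory import MemoryFileSystem

    AsyncMemoryFS = _wrap_class(MemoryFileSystem)
    fs = AsyncMemoryFS()  # asynchronous=False by default, so it is safe in Jupyter
    with fs.open("/demo.txt", "wb") as f:  # open() is forwarded to the sync FS
        f.write(b"hello")
    assert fs.cat_file("/demo.txt") == b"hello"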
datachain/client/http.py
ADDED

@@ -0,0 +1,157 @@
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, ClassVar, cast
+from urllib.parse import urlparse
+
+from fsspec.implementations.http import HTTPFileSystem
+
+from datachain.dataset import StorageURI
+from datachain.lib.file import File
+
+from .fsspec import Client
+
+if TYPE_CHECKING:
+    from datachain.cache import Cache
+
+
+class HTTPClient(Client):
+    FS_CLASS = HTTPFileSystem
+    PREFIX: ClassVar[str] = "http://"
+    protocol: ClassVar[str] = "http"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HTTPFileSystem:
+        # Configure HTTPFileSystem options
+        kwargs.setdefault("simple_links", True)
+        kwargs.setdefault("same_scheme", True)
+        kwargs.setdefault("cache_type", "bytes")
+
+        kwargs.pop("version_aware", None)
+
+        fs = cls.FS_CLASS(**kwargs)
+        fs.invalidate_cache()
+        return cast("HTTPFileSystem", fs)
+
+    @classmethod
+    def from_name(
+        cls,
+        name: str,
+        cache: "Cache",
+        kwargs: dict[str, Any],
+    ) -> "HTTPClient":
+        parsed = urlparse(name)
+
+        if parsed.scheme:
+            name = parsed.netloc + parsed.path
+
+        return cls(name, kwargs, cache)
+
+    @classmethod
+    def split_url(cls, url: str) -> tuple[str, str]:
+        """Split HTTP/HTTPS URL into domain (bucket equivalent) and path."""
+        parsed = urlparse(url)
+        domain = parsed.netloc
+        path = parsed.path.lstrip("/")
+
+        if parsed.query:
+            path += f"?{parsed.query}"
+        if parsed.fragment:
+            path += f"#{parsed.fragment}"
+
+        return domain, path
+
+    @classmethod
+    def get_uri(cls, name: str) -> "StorageURI":
+        if not name.startswith(("http://", "https://")):
+            return StorageURI(f"{cls.PREFIX}{name}")
+        return StorageURI(name)
+
+    @classmethod
+    def is_root_url(cls, url: str) -> bool:
+        parsed = urlparse(url)
+        return parsed.path in ("", "/") and not parsed.query and not parsed.fragment
+
+    def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
+        if self.name.startswith(("http://", "https://")):
+            base_url = self.name
+        else:
+            if rel_path and "/" in rel_path:
+                first_part = rel_path.split("/")[0]
+                if "." in first_part and not first_part.startswith("."):
+                    return f"{self.protocol}://{rel_path}"
+
+            base_url = f"{self.protocol}://{self.name}"
+
+        if rel_path:
+            if not base_url.endswith("/") and not rel_path.startswith("/"):
+                base_url += "/"
+            full_url = base_url + rel_path
+        else:
+            full_url = base_url
+
+        return full_url
+
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate URL for the given path.
+        Note: HTTP URLs don't support signed/expiring URLs.
+        """
+        return self.get_full_path(path, kwargs.pop("version_id", None))
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        etag = v.get("ETag", "").strip('"')
+        last_modified = v.get("last_modified")
+        if last_modified:
+            if isinstance(last_modified, str):
+                try:
+                    from email.utils import parsedate_to_datetime
+
+                    last_modified = parsedate_to_datetime(last_modified)
+                except (ValueError, TypeError):
+                    last_modified = datetime.now(timezone.utc)
+            elif isinstance(last_modified, (int, float)):
+                last_modified = datetime.fromtimestamp(last_modified, timezone.utc)
+        else:
+            last_modified = datetime.now(timezone.utc)
+
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", 0),
+            etag=etag,
+            version="",
+            is_latest=True,
+            last_modified=last_modified,
+        )
+
+    def upload(self, data: bytes, path: str) -> "File":
+        raise NotImplementedError(
+            "HTTP/HTTPS client is read-only. Upload operations are not supported."
+        )
+
+    def get_file_info(self, path: str, version_id: str | None = None) -> "File":
+        info = self.fs.info(self.get_full_path(path))
+        return self.info_to_file(info, path)
+
+    def open_object(self, file: "File", use_cache: bool = True, cb=None):
+        from datachain.client.fileslice import FileWrapper
+
+        if use_cache and (cache_path := self.cache.get_path(file)):
+            return open(cache_path, mode="rb")
+
+        assert not file.location
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.get_path_normalized())),
+            cb or (lambda x: None),
+        )
+
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
+        return await self.fs._get_file(lpath, rpath, callback=callback)
+
+    async def _fetch_dir(self, prefix: str, pbar, result_queue) -> set[str]:
+        full_url = self.get_full_path(prefix)
+        raise NotImplementedError(f"Cannot download file from {full_url}")
+
+
+class HTTPSClient(HTTPClient):
+    protocol = "https"
+    PREFIX = "https://"
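
Because the new client treats the URL's host as the bucket analogue, `split_url` folds query and fragment back into the path. A self-contained sketch of that behavior (the example URL is hypothetical):

    from urllib.parse import urlparse

    def split_url(url: str) -> tuple[str, str]:
        parsed = urlparse(url)
        path = parsed.path.lstrip("/")
        if parsed.query:
            path += f"?{parsed.query}"
        if parsed.fragment:
            path += f"#{parsed.fragment}"
        return parsed.netloc, path

    assert split_url("https://example.com/data/train.csv?v=2") == ("example.com", "data/train.csv?v=2")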
datachain/client/local.py
CHANGED

@@ -2,14 +2,14 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

 from datachain.lib.file import File

-from .fsspec import Client
+from .fsspec import Client, is_win_local_path

 if TYPE_CHECKING:
     from datachain.cache import Cache
@@ -57,9 +57,13 @@ class FileClient(Client):
         /home/user/animals/ -> file:///home/user/animals/
         C:\\windows\animals -> file:///C:/windows/animals
         """
+        parsed = urlparse(path)
+        if parsed.scheme and not is_win_local_path(path):
+            return path
+
         uri = Path(path).expanduser().absolute().resolve().as_uri()
-        if path[-1] == os.sep:
-            #
+        if path and path[-1] in (os.sep, "/"):
+            # keep trailing separator so directory URIs stay rooted
             uri += "/"  # in uri (file:///...) all separators are / regardless of os

         return uri
@@ -99,13 +103,13 @@ class FileClient(Client):
         )

     async def get_current_etag(self, file: "File") -> str:
-        info = self.fs.info(self.get_full_path(file.path))
+        info = self.fs.info(self.get_full_path(file.get_path_normalized()))
         return self.info_to_file(info, "").etag

-    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+    async def get_size(self, path: str, version_id: str | None = None) -> int:
         return self.fs.size(path)

-    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
         return self.fs.get_file(lpath, rpath, callback=callback)

     async def ls_dir(self, path):
@@ -114,7 +118,7 @@ class FileClient(Client):
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)

-    def get_full_path(self, rel_path, version_id: Optional[str] = None):
+    def get_full_path(self, rel_path, version_id: str | None = None):
         full_path = Path(self.name, rel_path).as_posix()
         if rel_path.endswith("/") or not rel_path:
             full_path += "/"
@@ -138,8 +142,8 @@ class FileClient(Client):
         if not self.use_symlinks:
             super().fetch_nodes(nodes, shared_progress_bar)

-    def do_instantiate_object(self, uid, dst: str) -> None:
+    def do_instantiate_object(self, file: File, dst: str) -> None:
         if self.use_symlinks:
-            os.symlink(Path(self.name, uid.path), dst)
+            os.symlink(Path(self.name, file.path), dst)
         else:
-            super().do_instantiate_object(uid, dst)
+            super().do_instantiate_object(file, dst)
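
The new `is_win_local_path` guard exists because `urlparse` happily reads a Windows drive letter as a URL scheme, so a plain "has a scheme, must already be a URI" check would pass `C:\...` paths through unconverted. A quick illustration:

    from urllib.parse import urlparse

    print(urlparse("file:///home/user/animals").scheme)  # "file" -- a real URI
    print(urlparse(r"C:\windows\animals").scheme)        # "c"    -- just a drive letter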
datachain/client/s3.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import Any, Optional, cast
+from typing import Any, cast
 from urllib.parse import parse_qs, urlsplit, urlunsplit

 from botocore.exceptions import NoCredentialsError
@@ -80,7 +80,7 @@ class ClientS3(Client):
             finally:
                 await page_queue.put(None)

-        async def process_pages(page_queue, result_queue):
+        async def process_pages(page_queue, result_queue, prefix):
            found = False
            with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
                while (res := await page_queue.get()) is not None:
@@ -94,14 +94,14 @@ class ClientS3(Client):
                     if entries:
                         await result_queue.put(entries)
                         pbar.update(len(entries))
-            if not found:
+            if not found and prefix:
                 raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")

         try:
             prefix = start_prefix
             if prefix:
                 prefix = prefix.lstrip(DELIMITER) + DELIMITER
-            versions =
+            versions = self._is_version_aware()
             fs = self.fs
             await fs.set_session()
             s3 = await fs.get_s3(self.name)
@@ -118,7 +118,9 @@ class ClientS3(Client):
                 Delimiter="",
             )
             page_queue: asyncio.Queue[list] = asyncio.Queue(2)
-            consumer = asyncio.create_task(process_pages(page_queue, result_queue))
+            consumer = asyncio.create_task(
+                process_pages(page_queue, result_queue, prefix)
+            )
             try:
                 await get_pages(it, page_queue)
                 await consumer
@@ -137,14 +139,16 @@ class ClientS3(Client):
             source=self.uri,
             path=v["Key"],
             etag=v.get("ETag", "").strip('"'),
-            version=ClientS3.clean_s3_version(v.get("VersionId", "")),
+            version=(
+                ClientS3.clean_s3_version(v.get("VersionId", "")) if versions else ""
+            ),
             is_latest=v.get("IsLatest", True),
             last_modified=v.get("LastModified", ""),
             size=v["Size"],
         )

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         parts = list(urlsplit(path))
         query = parse_qs(parts[3])
         if "versionId" in query:
@@ -183,7 +187,7 @@ class ClientS3(Client):
         return subdirs

     @staticmethod
-    def clean_s3_version(ver: Optional[str]) -> str:
+    def clean_s3_version(ver: str | None) -> str:
         return ver if (ver is not None and ver != "null") else ""

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
@@ -191,7 +195,11 @@ class ClientS3(Client):
             source=self.uri,
             path=path,
             size=v["size"],
-            version=ClientS3.clean_s3_version(v.get("VersionId", "")),
+            version=(
+                ClientS3.clean_s3_version(v.get("VersionId", ""))
+                if self._is_version_aware()
+                else ""
+            ),
             etag=v.get("ETag", "").strip('"'),
             is_latest=v.get("IsLatest", True),
             last_modified=v.get("LastModified", ""),
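
`version_path` for S3 carries the object version in a `versionId` query parameter, which is what the `urlsplit`/`parse_qs` machinery in the hunk above manipulates. A minimal sketch of that convention (the real method's handling of an existing `versionId` is cut off in this view, so the completion below is an assumption):

    from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

    def version_path(path: str, version_id: str | None) -> str:
        parts = list(urlsplit(path))
        query = parse_qs(parts[3])
        if version_id:
            query["versionId"] = [version_id]  # assumed: replace any existing value
        else:
            query.pop("versionId", None)
        parts[3] = urlencode(query, doseq=True)
        return urlunsplit(parts)

    assert version_path("bucket/key.csv", "abc123") == "bucket/key.csv?versionId=abc123"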
datachain/config.py
CHANGED

@@ -1,7 +1,6 @@
 from collections.abc import Mapping
 from contextlib import contextmanager
 from enum import Enum
-from typing import Optional, Union

 from tomlkit import TOMLDocument, dump, load

@@ -22,16 +21,13 @@ class Config:
     # In the order of precedence
     LEVELS = SYSTEM_LEVELS + LOCAL_LEVELS

-    def __init__(
-        self,
-        level: Optional[ConfigLevel] = None,
-    ):
+    def __init__(self, level: ConfigLevel | None = None):
         self.level = level

         self.init()

     @classmethod
-    def get_dir(cls, level: Optional[ConfigLevel]) -> str:
+    def get_dir(cls, level: ConfigLevel | None) -> str:
         if level == ConfigLevel.SYSTEM:
             return system_config_dir()
         if level == ConfigLevel.GLOBAL:
@@ -43,7 +39,7 @@ class Config:
         d = DataChainDir(self.get_dir(self.level))
         d.init()

-    def load_one(self, level: Optional[ConfigLevel] = None) -> TOMLDocument:
+    def load_one(self, level: ConfigLevel | None = None) -> TOMLDocument:
         config_path = DataChainDir(self.get_dir(level)).config

         try:
@@ -128,7 +124,7 @@ class Config:
         return remote_conf


-def merge(into: Union[TOMLDocument, dict], update: Union[TOMLDocument, dict]):
+def merge(into: TOMLDocument | dict, update: TOMLDocument | dict):
     """Merges second dict into first recursively"""
     for key, val in update.items():
         if isinstance(into.get(key), dict) and isinstance(val, dict):
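
`merge` recurses only when both sides hold a dict for the same key; everything else is overwritten by `update`. A runnable sketch (the recursive call is inferred, since the hunk cuts off after the isinstance check; the config values are made up):

    def merge(into: dict, update: dict) -> None:
        for key, val in update.items():
            if isinstance(into.get(key), dict) and isinstance(val, dict):
                merge(into[key], val)  # assumed recursion on nested dicts
            else:
                into[key] = val

    conf = {"studio": {"url": "https://studio.example.com", "token": "old"}}
    merge(conf, {"studio": {"token": "new"}})
    assert conf == {"studio": {"url": "https://studio.example.com", "token": "new"}}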
datachain/data_storage/db_engine.py
CHANGED

@@ -1,7 +1,7 @@
 import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar

 import sqlalchemy as sa
 from sqlalchemy.sql import FROM_LINTING
@@ -58,7 +58,7 @@ class DatabaseEngine(ABC, Serializable):
     @classmethod
     def compile_to_args(
         cls, statement: "ClauseElement", **kwargs
-    ) -> Union[tuple[str], tuple[str, dict[str, Any]]]:
+    ) -> tuple[str] | tuple[str, dict[str, Any]]:
         """
         Compile a sqlalchemy query or ddl object to an args tuple.

@@ -75,8 +75,8 @@ class DatabaseEngine(ABC, Serializable):
     def execute(
         self,
         query,
-        cursor: Optional[Any] = None,
-        conn: Optional[Any] = None,
+        cursor: Any | None = None,
+        conn: Any | None = None,
     ) -> Iterator[tuple[Any, ...]]: ...

     def get_table(self, name: str) -> "Table":
@@ -90,7 +90,7 @@ class DatabaseEngine(ABC, Serializable):

     @abstractmethod
     def executemany(
-        self, query, params, cursor: Optional[Any] = None
+        self, query, params, cursor: Any | None = None
     ) -> Iterator[tuple[Any, ...]]: ...

     @abstractmethod
@@ -112,7 +112,13 @@ class DatabaseEngine(ABC, Serializable):
         return sa.inspect(self.engine).has_table(name)

     @abstractmethod
-    def create_table(self, table: "Table", if_not_exists: bool = True) -> None: ...
+    def create_table(
+        self,
+        table: "Table",
+        if_not_exists: bool = True,
+        *,
+        kind: str | None = None,
+    ) -> None: ...

     @abstractmethod
     def drop_table(self, table: "Table", if_exists: bool = False) -> None: ...
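
For context on `compile_to_args`: compiling a SQLAlchemy statement yields a SQL string plus, when the statement carries bound values, a params mapping, which is exactly the tuple shape in the new return annotation. An illustrative sketch (the table and dialect are arbitrary):

    import sqlalchemy as sa
    from sqlalchemy.dialects import sqlite

    users = sa.table("users", sa.column("id"), sa.column("name"))
    stmt = sa.select(users).where(users.c.id == 42)
    compiled = stmt.compile(dialect=sqlite.dialect())
    print(str(compiled))    # SELECT users.id, users.name FROM users WHERE users.id = ?
    print(compiled.params)  # {'id_1': 42}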
datachain/data_storage/job.py
CHANGED

@@ -3,6 +3,8 @@ from enum import Enum

 class JobStatus(int, Enum):
     CREATED = 1
+    SCHEDULED = 10
+    PROVISIONING = 12
     QUEUED = 2
     INIT = 3
     RUNNING = 4
@@ -11,10 +13,12 @@ class JobStatus(int, Enum):
     CANCELING = 7
     CANCELED = 8
     CANCELING_SCHEDULED = 9
+    TASK = 11
+    PENDING = 13

     @classmethod
     def finished(cls) -> tuple[int, ...]:
-        return cls.COMPLETE, cls.FAILED, cls.CANCELED
+        return cls.COMPLETE, cls.FAILED, cls.CANCELED, cls.TASK


 class JobQueryType(int, Enum):
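
One consequence of the `finished()` change: the new TASK status is treated as terminal, so any polling loop built on `finished()` stops for TASK jobs just as it does for COMPLETE, FAILED, or CANCELED. A small sketch, assuming the enum above:

    def is_finished(status: JobStatus) -> bool:
        return status in JobStatus.finished()

    assert is_finished(JobStatus.TASK)
    assert not is_finished(JobStatus.RUNNING)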