datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/cli/parser/utils.py
CHANGED
@@ -1,5 +1,4 @@
 from argparse import Action, ArgumentParser, ArgumentTypeError, HelpFormatter
-from typing import Union

 from datachain.cli.utils import CommaSeparatedArgs

@@ -44,7 +43,7 @@ def parse_find_column(column: str) -> str:
     )


-def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Action:
+def add_sources_arg(parser: ArgumentParser, nargs: str | int = "+") -> Action:
     return parser.add_argument(
         "sources",
         type=str,
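Most of the per-file changes below follow this same mechanical pattern: `Optional[X]` and `Union[A, B]` annotations are rewritten to PEP 604 `X | None` / `A | B` syntax, and the now-unused `typing` imports are dropped. A minimal sketch of the equivalence (illustrative, not from the package; requires Python 3.10+):

```python
from typing import Optional, Union

# PEP 604 unions compare equal to the typing constructs they replace,
# so annotations like `nargs: str | int` are drop-in replacements.
assert (str | None) == Optional[str]
assert (str | int) == Union[str, int]
```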
datachain/cli/utils.py
CHANGED
@@ -1,6 +1,5 @@
 import logging
-from argparse import SUPPRESS, Action, ArgumentError, Namespace, _AppendAction
-from typing import Optional
+from argparse import SUPPRESS, Action, Namespace, _AppendAction

 from datachain.error import DataChainError

@@ -64,18 +63,6 @@ class CommaSeparatedArgs(_AppendAction):  # pylint: disable=protected-access
         setattr(namespace, self.dest, list(dict.fromkeys(items)))


-class KeyValueArgs(_AppendAction):  # pylint: disable=protected-access
-    def __call__(self, parser, namespace, values, option_string=None):
-        items = getattr(namespace, self.dest) or {}
-        for raw_value in filter(bool, values):
-            key, sep, value = raw_value.partition("=")
-            if not key or not sep or value == "":
-                raise ArgumentError(self, f"expected 'key=value', got {raw_value!r}")
-            items[key.strip()] = value
-
-        setattr(namespace, self.dest, items)
-
-
 def get_logging_level(args: Namespace) -> int:
     if args.quiet:
         return logging.CRITICAL
@@ -84,7 +71,7 @@ def get_logging_level(args: Namespace) -> int:
     return logging.INFO


-def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
+def determine_flavors(studio: bool, local: bool, all: bool, token: str | None):
    if studio and not token:
        raise DataChainError(
            "Not logged in to Studio. Log in with 'datachain auth login'."
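The removed `KeyValueArgs` action parsed repeated `key=value` options with `str.partition`. For reference, a standalone sketch of the same pattern (the `--param` option name is hypothetical):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--param", nargs="*", default=[])
args = parser.parse_args(["--param", "epochs=5", "lr=0.1"])

params: dict[str, str] = {}
for raw in args.param:
    key, sep, value = raw.partition("=")  # "epochs=5" -> ("epochs", "=", "5")
    if not key or not sep or value == "":
        parser.error(f"expected 'key=value', got {raw!r}")
    params[key.strip()] = value

print(params)  # {'epochs': '5', 'lr': '0.1'}
```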
datachain/client/azure.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Optional
+from typing import Any
 from urllib.parse import parse_qs, urlsplit, urlunsplit

 from adlfs import AzureBlobFileSystem
@@ -73,7 +73,7 @@ class AzureClient(Client):
         result_queue.put_nowait(None)

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         parts = list(urlsplit(path))
         query = parse_qs(parts[3])
         if "versionid" in query:
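`version_path` pins an Azure blob version by rewriting the `versionid` query parameter. A sketch of that URL manipulation with the same `urllib.parse` helpers (the `with_versionid` function is illustrative, not the method body):

```python
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

def with_versionid(path: str, version_id: str | None) -> str:
    parts = list(urlsplit(path))  # [scheme, netloc, path, query, fragment]
    query = parse_qs(parts[3])
    if version_id:
        query["versionid"] = [version_id]
    parts[3] = urlencode(query, doseq=True)
    return urlunsplit(parts)

print(with_versionid("az://container/blob.bin", "2024-01-01T00:00:00.0000000Z"))
# az://container/blob.bin?versionid=2024-01-01T00%3A00%3A00.0000000Z
```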
datachain/client/fsspec.py
CHANGED
@@ -10,15 +10,7 @@ from abc import ABC, abstractmethod
 from collections.abc import AsyncIterator, Iterator, Sequence
 from datetime import datetime
 from shutil import copy2
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    BinaryIO,
-    ClassVar,
-    NamedTuple,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, NamedTuple
 from urllib.parse import urlparse

 from dvc_objects.fs.system import reflink
@@ -44,11 +36,12 @@ FETCH_WORKERS = 100
 DELIMITER = "/"  # Path delimiter.

 DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
+CLOUD_STORAGE_PROTOCOLS = {"s3", "gs", "az", "hf"}

-ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
+ResultQueue = asyncio.Queue[Sequence["File"] | None]


-def _is_win_local_path(uri: str) -> bool:
+def is_win_local_path(uri: str) -> bool:
     if sys.platform == "win32":
         if len(uri) >= 1 and uri[0] == "\\":
             return True
@@ -62,10 +55,20 @@ def _is_win_local_path(uri: str) -> bool:
     return False


+def is_cloud_uri(uri: str) -> bool:
+    protocol = urlparse(uri).scheme
+    return protocol in CLOUD_STORAGE_PROTOCOLS
+
+
+def get_cloud_schemes() -> list[str]:
+    """Get list of cloud storage scheme prefixes."""
+    return [f"{p}://" for p in CLOUD_STORAGE_PROTOCOLS]
+
+
 class Bucket(NamedTuple):
     name: str
     uri: "StorageURI"
-    created: Optional[datetime]
+    created: datetime | None


 class Client(ABC):
@@ -77,21 +80,22 @@ class Client(ABC):
     def __init__(self, name: str, fs_kwargs: dict[str, Any], cache: Cache) -> None:
         self.name = name
         self.fs_kwargs = fs_kwargs
-        self._fs: Optional[AbstractFileSystem] = None
+        self._fs: AbstractFileSystem | None = None
         self.cache = cache
         self.uri = self.get_uri(self.name)

     @staticmethod
-    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
+    def get_implementation(url: str | os.PathLike[str]) -> type["Client"]:  # noqa: PLR0911
         from .azure import AzureClient
         from .gcs import GCSClient
         from .hf import HfClient
+        from .http import HTTPClient, HTTPSClient
         from .local import FileClient
         from .s3 import ClientS3

         protocol = urlparse(os.fspath(url)).scheme

-        if not protocol or _is_win_local_path(os.fspath(url)):
+        if not protocol or is_win_local_path(os.fspath(url)):
             return FileClient
         if protocol == ClientS3.protocol:
             return ClientS3
@@ -103,9 +107,18 @@ class Client(ABC):
             return FileClient
         if protocol == HfClient.protocol:
             return HfClient
+        if protocol == HTTPClient.protocol:
+            return HTTPClient
+        if protocol == HTTPSClient.protocol:
+            return HTTPSClient

         raise NotImplementedError(f"Unsupported protocol: {protocol}")

+    @classmethod
+    def path_to_uri(cls, path: str) -> str:
+        """Convert a path-like object to a URI. Default: identity."""
+        return path
+
     @staticmethod
     def is_data_source_uri(name: str) -> bool:
         # Returns True if name is one of supported data sources URIs, e.g s3 bucket
@@ -118,9 +131,7 @@ class Client(ABC):
         return cls.get_uri(storage_name), rel_path

     @staticmethod
-    def get_client(
-        source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
-    ) -> "Client":
+    def get_client(source: str | os.PathLike[str], cache: Cache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
         storage_url, _ = cls.split_url(os.fspath(source))
         if os.name == "nt":
@@ -136,7 +147,7 @@ class Client(ABC):
         return fs

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         return path

     @classmethod
@@ -216,16 +227,16 @@ class Client(ABC):
         )
         return self.info_to_file(info, file_path).etag

-    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+    def get_file_info(self, path: str, version_id: str | None = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
         return self.info_to_file(info, path)

-    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+    async def get_size(self, path: str, version_id: str | None = None) -> int:
         return await self.fs._size(
             self.version_path(path, version_id), version_id=version_id
         )

-    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
         return await self.fs._get_file(
             self.version_path(lpath, version_id),
             rpath,
@@ -339,7 +350,7 @@ class Client(ABC):
     def rel_path(self, path: str) -> str:
         return self.fs.split_path(path)[1]

-    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+    def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
         return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)

     @abstractmethod
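Based on the hunks above, the new module-level helpers classify URIs by scheme; note that http(s) is deliberately not in `CLOUD_STORAGE_PROTOCOLS`, since it is routed to its own client (next section). A usage sketch:

```python
from datachain.client.fsspec import get_cloud_schemes, is_cloud_uri

print(is_cloud_uri("s3://bucket/key"))        # True
print(is_cloud_uri("https://example.com/x"))  # False: handled by the new HTTP client
print(sorted(get_cloud_schemes()))            # ['az://', 'gs://', 'hf://', 's3://']
```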
datachain/client/gcs.py
CHANGED
@@ -3,7 +3,7 @@ import json
 import os
 from collections.abc import Iterable
 from datetime import datetime
-from typing import Any, Optional, cast
+from typing import Any, cast

 from dateutil.parser import isoparse
 from gcsfs import GCSFileSystem
@@ -15,7 +15,7 @@ from .fsspec import DELIMITER, Client, ResultQueue

 # Patch gcsfs for consistency with s3fs
 GCSFileSystem.set_session = GCSFileSystem._set_session
-PageQueue = asyncio.Queue[Optional[Iterable[dict[str, Any]]]]
+PageQueue = asyncio.Queue[Iterable[dict[str, Any]] | None]


 class GCSClient(Client):
@@ -141,5 +141,5 @@ class GCSClient(Client):
         )

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         return f"{path}#{version_id}" if version_id else path
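Unlike S3 and Azure, which carry the version in a query parameter, GCS pins an object generation with a URL fragment:

```python
from datachain.client.gcs import GCSClient

print(GCSClient.version_path("gs://bucket/obj.csv", "1712345678901234"))
# gs://bucket/obj.csv#1712345678901234
print(GCSClient.version_path("gs://bucket/obj.csv", None))
# gs://bucket/obj.csv
```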
datachain/client/http.py
ADDED
@@ -0,0 +1,157 @@
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, ClassVar, cast
+from urllib.parse import urlparse
+
+from fsspec.implementations.http import HTTPFileSystem
+
+from datachain.dataset import StorageURI
+from datachain.lib.file import File
+
+from .fsspec import Client
+
+if TYPE_CHECKING:
+    from datachain.cache import Cache
+
+
+class HTTPClient(Client):
+    FS_CLASS = HTTPFileSystem
+    PREFIX: ClassVar[str] = "http://"
+    protocol: ClassVar[str] = "http"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HTTPFileSystem:
+        # Configure HTTPFileSystem options
+        kwargs.setdefault("simple_links", True)
+        kwargs.setdefault("same_scheme", True)
+        kwargs.setdefault("cache_type", "bytes")
+
+        kwargs.pop("version_aware", None)
+
+        fs = cls.FS_CLASS(**kwargs)
+        fs.invalidate_cache()
+        return cast("HTTPFileSystem", fs)
+
+    @classmethod
+    def from_name(
+        cls,
+        name: str,
+        cache: "Cache",
+        kwargs: dict[str, Any],
+    ) -> "HTTPClient":
+        parsed = urlparse(name)
+
+        if parsed.scheme:
+            name = parsed.netloc + parsed.path
+
+        return cls(name, kwargs, cache)
+
+    @classmethod
+    def split_url(cls, url: str) -> tuple[str, str]:
+        """Split HTTP/HTTPS URL into domain (bucket equivalent) and path."""
+        parsed = urlparse(url)
+        domain = parsed.netloc
+        path = parsed.path.lstrip("/")
+
+        if parsed.query:
+            path += f"?{parsed.query}"
+        if parsed.fragment:
+            path += f"#{parsed.fragment}"
+
+        return domain, path
+
+    @classmethod
+    def get_uri(cls, name: str) -> "StorageURI":
+        if not name.startswith(("http://", "https://")):
+            return StorageURI(f"{cls.PREFIX}{name}")
+        return StorageURI(name)
+
+    @classmethod
+    def is_root_url(cls, url: str) -> bool:
+        parsed = urlparse(url)
+        return parsed.path in ("", "/") and not parsed.query and not parsed.fragment
+
+    def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
+        if self.name.startswith(("http://", "https://")):
+            base_url = self.name
+        else:
+            if rel_path and "/" in rel_path:
+                first_part = rel_path.split("/")[0]
+                if "." in first_part and not first_part.startswith("."):
+                    return f"{self.protocol}://{rel_path}"
+
+            base_url = f"{self.protocol}://{self.name}"
+
+        if rel_path:
+            if not base_url.endswith("/") and not rel_path.startswith("/"):
+                base_url += "/"
+            full_url = base_url + rel_path
+        else:
+            full_url = base_url
+
+        return full_url
+
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate URL for the given path.
+        Note: HTTP URLs don't support signed/expiring URLs.
+        """
+        return self.get_full_path(path, kwargs.pop("version_id", None))
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        etag = v.get("ETag", "").strip('"')
+        last_modified = v.get("last_modified")
+        if last_modified:
+            if isinstance(last_modified, str):
+                try:
+                    from email.utils import parsedate_to_datetime
+
+                    last_modified = parsedate_to_datetime(last_modified)
+                except (ValueError, TypeError):
+                    last_modified = datetime.now(timezone.utc)
+            elif isinstance(last_modified, (int, float)):
+                last_modified = datetime.fromtimestamp(last_modified, timezone.utc)
+        else:
+            last_modified = datetime.now(timezone.utc)
+
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", 0),
+            etag=etag,
+            version="",
+            is_latest=True,
+            last_modified=last_modified,
+        )
+
+    def upload(self, data: bytes, path: str) -> "File":
+        raise NotImplementedError(
+            "HTTP/HTTPS client is read-only. Upload operations are not supported."
+        )
+
+    def get_file_info(self, path: str, version_id: str | None = None) -> "File":
+        info = self.fs.info(self.get_full_path(path))
+        return self.info_to_file(info, path)
+
+    def open_object(self, file: "File", use_cache: bool = True, cb=None):
+        from datachain.client.fileslice import FileWrapper
+
+        if use_cache and (cache_path := self.cache.get_path(file)):
+            return open(cache_path, mode="rb")
+
+        assert not file.location
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.get_path_normalized())),
+            cb or (lambda x: None),
+        )
+
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
+        return await self.fs._get_file(lpath, rpath, callback=callback)
+
+    async def _fetch_dir(self, prefix: str, pbar, result_queue) -> set[str]:
+        full_url = self.get_full_path(prefix)
+        raise NotImplementedError(f"Cannot download file from {full_url}")
+
+
+class HTTPSClient(HTTPClient):
+    protocol = "https"
+    PREFIX = "https://"
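With this file in place, `Client.get_implementation` (see the fsspec.py hunks above) routes `http://` and `https://` URLs to these read-only clients. A quick sketch of the routing and of the host-as-bucket split:

```python
from datachain.client.fsspec import Client
from datachain.client.http import HTTPSClient

assert Client.get_implementation("https://example.com/data/file.csv") is HTTPSClient

# The host plays the role of the bucket; query and fragment stay in the path part.
print(HTTPSClient.split_url("https://example.com/data/file.csv?v=2"))
# ('example.com', 'data/file.csv?v=2')
```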
datachain/client/local.py
CHANGED
@@ -2,14 +2,14 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem

 from datachain.lib.file import File

-from .fsspec import Client
+from .fsspec import Client, is_win_local_path

 if TYPE_CHECKING:
     from datachain.cache import Cache
@@ -57,9 +57,13 @@ class FileClient(Client):
         /home/user/animals/ -> file:///home/user/animals/
         C:\\windows\animals -> file:///C:/windows/animals
         """
+        parsed = urlparse(path)
+        if parsed.scheme and not is_win_local_path(path):
+            return path
+
         uri = Path(path).expanduser().absolute().resolve().as_uri()
-        if path[-1] == os.sep:
-            # Keep trailing slash
+        if path and path[-1] in (os.sep, "/"):
+            # keep trailing separator so directory URIs stay rooted
             uri += "/"  # in uri (file:///...) all separators are / regardless of os

         return uri
@@ -102,10 +106,10 @@ class FileClient(Client):
         info = self.fs.info(self.get_full_path(file.get_path_normalized()))
         return self.info_to_file(info, "").etag

-    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+    async def get_size(self, path: str, version_id: str | None = None) -> int:
         return self.fs.size(path)

-    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
         return self.fs.get_file(lpath, rpath, callback=callback)

     async def ls_dir(self, path):
@@ -114,7 +118,7 @@ class FileClient(Client):
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)

-    def get_full_path(self, rel_path, version_id: Optional[str] = None):
+    def get_full_path(self, rel_path, version_id: str | None = None):
         full_path = Path(self.name, rel_path).as_posix()
         if rel_path.endswith("/") or not rel_path:
             full_path += "/"
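The new early return makes `path_to_uri` idempotent: already-qualified URIs pass through, while plain paths are resolved and converted (assuming the method is exposed as a classmethod, as on the base `Client`):

```python
from datachain.client.local import FileClient

print(FileClient.path_to_uri("file:///home/user/animals/"))  # unchanged
print(FileClient.path_to_uri("/home/user/animals/"))         # file:///home/user/animals/
```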
datachain/client/s3.py
CHANGED
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import Any, Optional, cast
+from typing import Any, cast
 from urllib.parse import parse_qs, urlsplit, urlunsplit

 from botocore.exceptions import NoCredentialsError
@@ -148,7 +148,7 @@ class ClientS3(Client):
         )

     @classmethod
-    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+    def version_path(cls, path: str, version_id: str | None) -> str:
         parts = list(urlsplit(path))
         query = parse_qs(parts[3])
         if "versionId" in query:
@@ -187,7 +187,7 @@ class ClientS3(Client):
         return subdirs

     @staticmethod
-    def clean_s3_version(ver: Optional[str]) -> str:
+    def clean_s3_version(ver: str | None) -> str:
         return ver if (ver is not None and ver != "null") else ""

     def info_to_file(self, v: dict[str, Any], path: str) -> File:
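`clean_s3_version` normalizes the `"null"` version ID that S3 reports for objects written before bucket versioning was enabled:

```python
from datachain.client.s3 import ClientS3

assert ClientS3.clean_s3_version("abc123") == "abc123"
assert ClientS3.clean_s3_version("null") == ""   # unversioned-era objects
assert ClientS3.clean_s3_version(None) == ""     # no version reported
```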
datachain/config.py
CHANGED
@@ -1,7 +1,6 @@
 from collections.abc import Mapping
 from contextlib import contextmanager
 from enum import Enum
-from typing import Optional, Union

 from tomlkit import TOMLDocument, dump, load

@@ -22,16 +21,13 @@ class Config:
     # In the order of precedence
     LEVELS = SYSTEM_LEVELS + LOCAL_LEVELS

-    def __init__(
-        self,
-        level: Optional[ConfigLevel] = None,
-    ):
+    def __init__(self, level: ConfigLevel | None = None):
         self.level = level

         self.init()

     @classmethod
-    def get_dir(cls, level: Optional[ConfigLevel]) -> str:
+    def get_dir(cls, level: ConfigLevel | None) -> str:
         if level == ConfigLevel.SYSTEM:
             return system_config_dir()
         if level == ConfigLevel.GLOBAL:
@@ -43,7 +39,7 @@ class Config:
         d = DataChainDir(self.get_dir(self.level))
         d.init()

-    def load_one(self, level: Optional[ConfigLevel] = None) -> TOMLDocument:
+    def load_one(self, level: ConfigLevel | None = None) -> TOMLDocument:
         config_path = DataChainDir(self.get_dir(level)).config

         try:
@@ -128,7 +124,7 @@ class Config:
         return remote_conf


-def merge(into: Union[TOMLDocument, dict], update: Union[TOMLDocument, dict]):
+def merge(into: TOMLDocument | dict, update: TOMLDocument | dict):
     """Merges second dict into first recursively"""
     for key, val in update.items():
         if isinstance(into.get(key), dict) and isinstance(val, dict):
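`merge` combines two config mappings: nested tables merge, scalar values are overwritten. A usage sketch (the keys are hypothetical, and this assumes the usual recursive behavior implied by the `isinstance` check above):

```python
from datachain.config import merge

into = {"studio": {"url": "https://studio.example.com", "team": "a"}}
update = {"studio": {"team": "b"}, "cache": {"dir": "/tmp/dc"}}
merge(into, update)
print(into)
# {'studio': {'url': 'https://studio.example.com', 'team': 'b'}, 'cache': {'dir': '/tmp/dc'}}
```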
datachain/data_storage/db_engine.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar

 import sqlalchemy as sa
 from sqlalchemy.sql import FROM_LINTING
@@ -58,7 +58,7 @@ class DatabaseEngine(ABC, Serializable):
     @classmethod
     def compile_to_args(
         cls, statement: "ClauseElement", **kwargs
-    ) -> Union[tuple[str], tuple[str, dict[str, Any]]]:
+    ) -> tuple[str] | tuple[str, dict[str, Any]]:
         """
         Compile a sqlalchemy query or ddl object to an args tuple.

@@ -75,8 +75,8 @@ class DatabaseEngine(ABC, Serializable):
     def execute(
         self,
         query,
-        cursor: Optional[Any] = None,
-        conn: Optional[Any] = None,
+        cursor: Any | None = None,
+        conn: Any | None = None,
     ) -> Iterator[tuple[Any, ...]]: ...

     def get_table(self, name: str) -> "Table":
@@ -90,7 +90,7 @@ class DatabaseEngine(ABC, Serializable):

     @abstractmethod
     def executemany(
-        self, query, params, cursor: Optional[Any] = None
+        self, query, params, cursor: Any | None = None
     ) -> Iterator[tuple[Any, ...]]: ...

     @abstractmethod
@@ -112,7 +112,13 @@ class DatabaseEngine(ABC, Serializable):
         return sa.inspect(self.engine).has_table(name)

     @abstractmethod
-    def create_table(self, table: "Table", if_not_exists: bool = True) -> None: ...
+    def create_table(
+        self,
+        table: "Table",
+        if_not_exists: bool = True,
+        *,
+        kind: str | None = None,
+    ) -> None: ...

     @abstractmethod
     def drop_table(self, table: "Table", if_exists: bool = False) -> None: ...
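Implementations must now accept a keyword-only `kind` hint on `create_table`; its accepted values are backend-specific and not visible in this diff. A minimal conforming sketch (the `MyEngine` class is illustrative and ignores `kind`; assumes SQLAlchemy 2.x):

```python
import sqlalchemy as sa

class MyEngine:
    def __init__(self, engine: sa.Engine) -> None:
        self.engine = engine

    def create_table(
        self,
        table: sa.Table,
        if_not_exists: bool = True,
        *,
        kind: str | None = None,  # new keyword-only hint; ignored in this sketch
    ) -> None:
        table.create(self.engine, checkfirst=if_not_exists)

engine = sa.create_engine("sqlite:///:memory:")
meta = sa.MetaData()
t = sa.Table("example", meta, sa.Column("id", sa.Integer, primary_key=True))
MyEngine(engine).create_table(t, kind=None)
```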
datachain/data_storage/job.py
CHANGED
@@ -4,6 +4,7 @@ from enum import Enum
 class JobStatus(int, Enum):
     CREATED = 1
     SCHEDULED = 10
+    PROVISIONING = 12
     QUEUED = 2
     INIT = 3
     RUNNING = 4
@@ -13,6 +14,7 @@ class JobStatus(int, Enum):
     CANCELED = 8
     CANCELING_SCHEDULED = 9
     TASK = 11
+    PENDING = 13

     @classmethod
     def finished(cls) -> tuple[int, ...]: