thds.adls 3.2.20250630174944-py3-none-any.whl → 4.1.20250701190349-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/adls/__init__.py +10 -5
- thds/adls/_upload.py +54 -41
- thds/adls/azcopy/__init__.py +1 -1
- thds/adls/azcopy/download.py +66 -100
- thds/adls/azcopy/login.py +39 -0
- thds/adls/azcopy/progress.py +49 -0
- thds/adls/azcopy/system_resources.py +26 -0
- thds/adls/azcopy/upload.py +95 -0
- thds/adls/{cached_up_down.py → cached.py} +21 -16
- thds/adls/conf.py +1 -0
- thds/adls/download.py +129 -152
- thds/adls/download_lock.py +9 -2
- thds/adls/errors.py +10 -2
- thds/adls/file_properties.py +8 -0
- thds/adls/hashes.py +147 -0
- thds/adls/impl.py +3 -4
- thds/adls/md5.py +5 -52
- thds/adls/ro_cache.py +1 -2
- thds/adls/source.py +37 -34
- thds/adls/tools/download.py +3 -3
- thds/adls/tools/upload.py +3 -4
- thds/adls/upload.py +162 -0
- thds/adls/uri.py +6 -0
- {thds_adls-3.2.20250630174944.dist-info → thds_adls-4.1.20250701190349.dist-info}/METADATA +3 -1
- thds_adls-4.1.20250701190349.dist-info/RECORD +42 -0
- thds/adls/resource/__init__.py +0 -36
- thds/adls/resource/core.py +0 -77
- thds/adls/resource/file_pointers.py +0 -54
- thds/adls/resource/up_down.py +0 -242
- thds_adls-3.2.20250630174944.dist-info/RECORD +0 -40
- {thds_adls-3.2.20250630174944.dist-info → thds_adls-4.1.20250701190349.dist-info}/WHEEL +0 -0
- {thds_adls-3.2.20250630174944.dist-info → thds_adls-4.1.20250701190349.dist-info}/entry_points.txt +0 -0
- {thds_adls-3.2.20250630174944.dist-info → thds_adls-4.1.20250701190349.dist-info}/top_level.txt +0 -0
thds/adls/__init__.py CHANGED

@@ -1,45 +1,53 @@
 import typing as ty
 from pathlib import Path
 
-from thds
-from thds.core import parallel
+from thds import core
 from thds.core.source.tree import SourceTree
 from thds.core.thunks import thunking
 
+from . import upload
 from .download import download_or_use_verified
 from .fqn import AdlsFqn
 from .global_client import get_global_fs_client
 from .impl import ADLSFileSystem
-from .resource.up_down import AdlsHashedResource, upload
 from .ro_cache import global_cache
 from .uri import UriIsh, parse_any
 
 
-def download_to_cache(
+def download_to_cache(
+    fqn_or_uri: UriIsh,
+    *,
+    expected_hash: ty.Optional[core.hashing.Hash] = None,
+) -> Path:
     """Downloads directly to the cache and returns a Path to the read-only file.
 
     This will allow you to download a file 'into' the cache even if
-    you provide no
+    you provide no expected hash and the remote file properties does not have
     one. However, future attempts to reuse the cache will force a
-    re-download if no
+    re-download if no remote hash is available at that time.
     """
     fqn = parse_any(fqn_or_uri)
     cache_path = global_cache().path(fqn)
     download_or_use_verified(
-        get_global_fs_client(fqn.sa, fqn.container),
+        get_global_fs_client(fqn.sa, fqn.container),
+        fqn.path,
+        cache_path,
+        expected_hash=expected_hash,
+        cache=global_cache(),
     )
+    assert cache_path.is_file(), "File should have been downloaded to the cache."
     return cache_path
 
 
-def upload_through_cache(dest: UriIsh, src_path: Path) ->
-    """Return
+def upload_through_cache(dest: UriIsh, src_path: Path) -> core.source.Source:
+    """Return a Source with a Hash, since by definition an upload through the cache must have a known checksum.
 
     Uses global client, which is pretty much always what you want.
     """
     assert src_path.is_file(), "src_path must be a file."
-
-    assert
-    return
+    new_src = upload.upload(dest, src_path, write_through_cache=global_cache())
+    assert new_src.hash, "hash should always be calculable for a local path."
+    return new_src
 
 
 def download_directory(fqn: AdlsFqn) -> Path:

@@ -75,9 +83,6 @@ def upload_directory_through_cache(dest: UriIsh, src_path: Path) -> SourceTree:
 
     return SourceTree(
         sources=list(
-            map(
-                source.from_adls,
-                parallel.yield_results(upload_thunks, named="upload_directory_through_cache"),
-            )
+            core.parallel.yield_results(upload_thunks, named="upload_directory_through_cache"),
         )
     )
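Note: in 4.x, download_to_cache takes a keyword-only expected_hash (any thds.core.hashing.Hash, not only MD5), and upload_through_cache returns a thds.core.source.Source in place of the removed AdlsHashedResource. A minimal sketch of the new call shapes; the account, container, path, and "adls://" URI scheme here are illustrative assumptions, not values from this diff:

from pathlib import Path

from thds import adls

# Download straight into the read-only cache; the returned Path is the cached file.
local: Path = adls.download_to_cache("adls://mysa/mycontainer/some/file.parquet")

# Upload a local file, writing it through the cache; the Source carries a Hash.
src = adls.upload_through_cache("adls://mysa/mycontainer/some/file.parquet", Path("file.parquet"))
assert src.hash is not None  # guaranteed by the assert inside upload_through_cache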
thds/adls/conf.py CHANGED
thds/adls/download.py CHANGED

@@ -2,30 +2,21 @@ import contextlib
 import enum
 import os
 import shutil
+import threading
 import typing as ty
-from base64 import b64decode
+from pathlib import Path
 
 import aiohttp.http_exceptions
-from azure.core.exceptions import AzureError, HttpResponseError
-from azure.storage.filedatalake import (
-    ContentSettings,
-    DataLakeFileClient,
-    FileProperties,
-    FileSystemClient,
-    aio,
-)
+from azure.core.exceptions import AzureError, HttpResponseError, ResourceModifiedError
+from azure.storage.filedatalake import DataLakeFileClient, FileProperties, FileSystemClient, aio
 
-from thds.core import fretry, log, scope, tmp
-from thds.core.hashing import b64
+from thds.core import fretry, hash_cache, hashing, log, scope, tmp
 from thds.core.types import StrOrPath
 
-from . import azcopy
+from . import azcopy, errors, etag, hashes
 from ._progress import report_download_progress
 from .download_lock import download_lock
-from .errors import MD5MismatchError, translate_azure_error
-from .etag import match_etag
 from .fqn import AdlsFqn
-from .md5 import check_reasonable_md5b64, md5_file
 from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache
 
 logger = log.getLogger(__name__)

@@ -39,13 +30,16 @@ def _atomic_download_and_move(
 ) -> ty.Iterator[azcopy.download.DownloadRequest]:
     known_size = (properties.size or 0) if properties else 0
     with tmp.temppath_same_fs(dest) as dpath:
-
-
-        yield azcopy.download.DownloadRequest(
-
-        )
+        logger.debug("Downloading %s", fqn)
+        if azcopy.download.should_use_azcopy(known_size):
+            yield azcopy.download.DownloadRequest(dpath, known_size)
+        else:
+            with open(dpath, "wb") as down_f:
+                yield azcopy.download.SdkDownloadRequest(
+                    dpath, known_size, report_download_progress(down_f, str(fqn), known_size)
+                )
         if known_size and os.path.getsize(dpath) != known_size:
-            raise
+            raise errors.ContentLengthMismatchError(
                 f"Downloaded file {dpath} has size {os.path.getsize(dpath)}"
                 f" but expected {known_size}."
             )

@@ -54,58 +48,14 @@ def _atomic_download_and_move(
     except OSError as oserr:
         if "Invalid cross-device link" in str(oserr):
             # this shouldn't ever happen because of temppath_same_fs, but just in case...
+            logger.warning('Failed to move "%s" to "%s" - copying instead', dpath, dest)
             shutil.copyfile(dpath, dest)
+            logger.info('Copied "%s" to "%s"', dpath, dest)
         else:
+            logger.error('Failed to move "%s" to "%s" - raising', dpath, dest)
             raise
 
 
-@contextlib.contextmanager
-def _verify_md5s_before_and_after_download(
-    remote_md5b64: str, expected_md5b64: str, fqn: AdlsFqn, local_dest: StrOrPath
-) -> ty.Iterator[None]:
-    if expected_md5b64:
-        check_reasonable_md5b64(expected_md5b64)
-    if remote_md5b64:
-        check_reasonable_md5b64(remote_md5b64)
-    if remote_md5b64 and expected_md5b64 and remote_md5b64 != expected_md5b64:
-        raise MD5MismatchError(
-            f"ADLS thinks the MD5 of {fqn} is {remote_md5b64}, but we expected {expected_md5b64}."
-            " This may indicate that we need to update a hash in the codebase."
-        )
-
-    yield  # perform download
-
-    with log.logger_context(hash_for="after-download"):
-        local_md5b64 = b64(md5_file(local_dest))
-    check_reasonable_md5b64(local_md5b64)  # must always exist
-    if remote_md5b64 and remote_md5b64 != local_md5b64:
-        raise MD5MismatchError(
-            f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
-            f" but the remote ({fqn}) says it should be {remote_md5b64}."
-            f" This may indicate that ADLS has an erroneous MD5 for {fqn}."
-        )
-    if expected_md5b64 and local_md5b64 != expected_md5b64:
-        raise MD5MismatchError(
-            f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
-            f" but we expected it to be {expected_md5b64}."
-            f" This probably indicates a corrupted download of {fqn}"
-        )
-    all_hashes = dict(local=local_md5b64, remote=remote_md5b64, expected=expected_md5b64)
-    assert 1 == len(set(filter(None, all_hashes.values()))), all_hashes
-
-
-def _md5b64_path_if_exists(path: StrOrPath) -> ty.Optional[str]:
-    if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
-        return None
-    return b64(md5_file(path))
-
-
-def _remote_md5b64(file_properties: FileProperties) -> str:
-    if file_properties.content_settings.content_md5:
-        return b64(file_properties.content_settings.content_md5)
-    return ""
-
-
 # Async is weird.
 #
 # You cannot easily call an async function from within a standard/non-async function.
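Note: the deleted MD5-only helpers above are superseded by the algorithm-agnostic hashes module (the new thds/adls/hashes.py, +147 lines, is not expanded in this diff). A rough sketch of the shape of a verify-before-and-after-download context manager, generalizing the deleted MD5 logic; the Hash type and hash_file helper below are illustrative stand-ins for whatever thds.core.hashing actually provides, and the real code raises errors.HashMismatchError rather than ValueError:

import contextlib
import hashlib
import typing as ty
from pathlib import Path


class Hash(ty.NamedTuple):  # stand-in for thds.core.hashing.Hash
    algo: str  # e.g. "md5", "sha256"
    bytes: bytes  # the raw digest


def hash_file(algo: str, path: Path) -> Hash:
    h = hashlib.new(algo)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return Hash(algo, h.digest())


@contextlib.contextmanager
def verify_hashes_before_and_after_download(
    remote_hash: ty.Optional[Hash],
    expected_hash: ty.Optional[Hash],
    fqn: str,
    local_dest: Path,
) -> ty.Iterator[None]:
    # before the download: the remote and expected hashes must agree if both are known
    if remote_hash and expected_hash and remote_hash != expected_hash:
        raise ValueError(f"{fqn}: remote {remote_hash} != expected {expected_hash}")

    yield  # perform the download

    # after the download: the local file must match every hash we know about
    for known in filter(None, (remote_hash, expected_hash)):
        local = hash_file(known.algo, local_dest)
        if local != known:
            raise ValueError(f"{fqn}: downloaded file hash {local} != {known}")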
@@ -144,6 +94,57 @@ def _remote_md5b64(file_properties: FileProperties) -> str:
 # again and rely on the controller to re-send the previously fetched result.
 
 
+class _FileResult(ty.NamedTuple):
+    hash: hashing.Hash
+    hit: ty.Optional[Path]
+
+
+def _attempt_cache_hit(
+    expected_hash: ty.Optional[hashing.Hash],
+    fqn: AdlsFqn,
+    local_path: StrOrPath,
+    cache: ty.Optional[Cache],
+) -> ty.Optional[_FileResult]:
+    if not expected_hash:
+        return None
+
+    hash_path_if_exists = hashes.hash_path_for_algo(expected_hash.algo)
+
+    with log.logger_context(hash_for="before-download-dest"):
+        local_hash = hash_path_if_exists(local_path)
+    if local_hash == expected_hash:
+        logger.debug("Local path matches %s - no need to look further", expected_hash.algo)
+        if cache:
+            cache_path = cache.path(fqn)
+            with log.logger_context(hash_for="before-download-cache"):
+                if local_hash != hash_path_if_exists(cache_path):
+                    # only copy if the cache is out of date
+                    from_local_path_to_cache(local_path, cache_path, cache.link)
+            return _FileResult(local_hash, hit=cache_path)
+        return _FileResult(local_hash, hit=Path(local_path))
+
+    if local_hash:
+        logger.debug(
+            "Local path exists but does not match expected %s %s",
+            expected_hash.algo,
+            expected_hash.bytes,
+        )
+    if cache:
+        cache_path = cache.path(fqn)
+        cache_hash = hash_path_if_exists(cache_path)
+        if cache_hash == expected_hash:  # file in cache matches!
+            from_cache_path_to_local(cache_path, local_path, cache.link)
+            return _FileResult(cache_hash, hit=cache_path)
+
+        if cache_hash:
+            logger.debug(
+                "Cache path exists but does not match expected %s %s",
+                expected_hash.algo,
+                expected_hash.bytes,
+            )
+    return None
+
+
 class _IoRequest(enum.Enum):
     FILE_PROPERTIES = "file_properties"
 

@@ -152,18 +153,13 @@ IoRequest = ty.Union[_IoRequest, azcopy.download.DownloadRequest]
 IoResponse = ty.Union[FileProperties, None]
 
 
-class _FileResult(ty.NamedTuple):
-    md5b64: str
-    hit: bool
-
-
 _dl_scope = scope.Scope("adls.download")
 
 
 def _download_or_use_verified_cached_coroutine(  # noqa: C901
     fqn: AdlsFqn,
     local_path: StrOrPath,
-
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
 ) -> ty.Generator[IoRequest, IoResponse, _FileResult]:
     """Make a file on ADLS available at the local path provided.

@@ -202,82 +198,72 @@ def _download_or_use_verified_cached_coroutine(  # noqa: C901
     writing in a standard fashion.
 
     Raises StopIteration when complete. StopIteration.value.hit will
-    be
-    required. `.value` will also contain the
-    file, which may be used as desired.
+    be the Path to the cached file if there was a cache hit, and None
+    if a download was required. `.value` will also contain the Hash of
+    the downloaded file, which may be used as desired.
     """
     if not local_path:
         raise ValueError("Must provide a destination path.")
 
-    _dl_scope.enter(log.logger_context(dl=fqn))
+    _dl_scope.enter(log.logger_context(dl=fqn, pid=os.getpid(), tid=threading.get_ident()))
     file_properties = None
-
-
-    #
+
+    if not expected_hash:
+        # we don't know what we expect, so attempt to retrieve
+        # expectations from ADLS itself.
         file_properties = yield _IoRequest.FILE_PROPERTIES
-
-
+        if file_properties:
+            # critically, we expect the _first_ one in this list to be the fastest to verify.
+            expected_hash = next(iter(hashes.extract_hashes_from_props(file_properties).values()), None)
 
     def attempt_cache_hit() -> ty.Optional[_FileResult]:
-
-
-
-        check_reasonable_md5b64(md5b64)
-        with log.logger_context(hash_for="before-download-dest"):
-            local_md5b64 = _md5b64_path_if_exists(local_path)
-        if local_md5b64 == md5b64:
-            logger.debug("Local path matches MD5 - no need to look further")
-            if cache:
-                cache_path = cache.path(fqn)
-                with log.logger_context(hash_for="before-download-cache"):
-                    if local_md5b64 != _md5b64_path_if_exists(cache_path):
-                        # only copy if the cache is out of date
-                        from_local_path_to_cache(local_path, cache_path, cache.link)
-            return _FileResult(local_md5b64, hit=True)
-
-        if local_md5b64:
-            logger.debug("Local path exists but does not match expected md5 %s", md5b64)
-        if cache:
-            cache_path = cache.path(fqn)
-            cache_md5b64 = _md5b64_path_if_exists(cache_path)
-            if cache_md5b64 == md5b64:  # file in cache matches!
-                from_cache_path_to_local(cache_path, local_path, cache.link)
-                return _FileResult(cache_md5b64, hit=True)
-
-            if cache_md5b64:
-                logger.debug("Cache path exists but does not match expected md5 %s", md5b64)
-        return None
+        return _attempt_cache_hit(
+            expected_hash=expected_hash, cache=cache, fqn=fqn, local_path=local_path
+        )
 
-    # attempt cache
+    # attempt cache hits before taking a lock, to avoid contention for existing files.
     if file_result := attempt_cache_hit():
         return file_result  # noqa: B901
 
-
+    # No cache hit, so its time to prepare to download. if a cache was provided, we will
+    # _put_ the resulting file in it.
+
+    file_lock = str(cache.path(fqn) if cache else local_path)
     # create lockfile name from the (shared) cache path if present, otherwise the final
     # destination. Non-cache users may then still incur multiple downloads in parallel,
     # but if you wanted to coordinate then you should probably have been using the global
     # cache in the first place.
+    _dl_scope.enter(download_lock(file_lock))
 
     # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
    if file_result := attempt_cache_hit():
-        logger.
+        logger.info("Got cache hit on the second attempt, after acquiring lock for %s", fqn)
         return file_result  # noqa: B901
 
     logger.debug("Unable to find a cached version anywhere that we looked...")
     file_properties = yield _IoRequest.FILE_PROPERTIES
-
-
-
-
-
+
+    # if any of the remote hashes match the expected hash, verify that one.
+    # otherwise, verify the first remote hash in the list, since that's the fastest one.
+    all_remote_hashes = hashes.extract_hashes_from_props(file_properties)
+    remote_hash_to_match = all_remote_hashes.get(expected_hash.algo) if expected_hash else None
+    with hashes.verify_hashes_before_and_after_download(
+        remote_hash_to_match,
+        expected_hash,
         fqn,
         local_path,
     ):  # download new data directly to local path
         with _atomic_download_and_move(fqn, local_path, file_properties) as tmpwriter:
             yield tmpwriter
+
         if cache:
             from_local_path_to_cache(local_path, cache.path(fqn), cache.link)
-
+
+    hash_to_set_if_missing = expected_hash or remote_hash_to_match
+    if not hash_to_set_if_missing or hash_to_set_if_missing.algo not in hashes.PREFERRED_ALGOS:
+        hash_to_set_if_missing = hash_cache.filehash(hashes.PREFERRED_ALGOS[0], local_path)
+    assert hash_to_set_if_missing, "We should have a preferred hash to set at this point."
+    return _FileResult(hash_to_set_if_missing, hit=None)
 
 
 # So ends the crazy download caching coroutine.
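Note: the coroutine above never performs IO itself. It yields _IoRequest values (or azcopy download requests), expects the driver to send back the corresponding responses, and delivers its _FileResult via StopIteration; download_or_use_verified below is the synchronous driver, with an async twin further down. A generic sketch of that driving pattern, with the actual IO stubbed out and all names here purely illustrative:

import typing as ty


def drive(
    co: ty.Generator,  # a request/response coroutine like the one above
    fetch_properties: ty.Callable[[], ty.Any],  # stub for the file-properties IO
    do_download: ty.Callable[[ty.Any], None],  # stub for the download IO
) -> ty.Any:
    response: ty.Any = None
    try:
        while True:
            request = co.send(response)  # the first send must be None, to prime the coroutine
            if request == "file_properties":  # stands in for _IoRequest.FILE_PROPERTIES
                response = fetch_properties()
            else:  # a download request: perform it; there is nothing to send back
                do_download(request)
                response = None
    except StopIteration as si:
        return si.value  # the coroutine's return value (here, a _FileResult)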
@@ -293,7 +279,7 @@ def _prep_download_coroutine(
     fs_client: FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
 ) -> ty.Tuple[
     ty.Generator[IoRequest, IoResponse, _FileResult],

@@ -304,22 +290,12 @@ def _prep_download_coroutine(
     co = _download_or_use_verified_cached_coroutine(
         AdlsFqn(ty.cast(str, fs_client.account_name), fs_client.file_system_name, remote_key),
         local_path,
-
+        expected_hash=expected_hash,
         cache=cache,
     )
     return co, co.send(None), None, fs_client.get_file_client(remote_key)
 
 
-def _set_md5_if_missing(
-    file_properties: ty.Optional[FileProperties], md5b64: str
-) -> ty.Optional[ContentSettings]:
-    if not file_properties or file_properties.content_settings.content_md5:
-        return None
-    file_properties.content_settings.content_md5 = b64decode(md5b64)  # type: ignore[assignment]
-    # TODO - check above type ignore
-    return file_properties.content_settings
-
-
 def _excs_to_retry() -> ty.Callable[[Exception], bool]:
     """These are exceptions that we observe to be spurious failures worth retrying."""
     return fretry.is_exc(

@@ -327,6 +303,7 @@ def _excs_to_retry() -> ty.Callable[[Exception], bool]:
         filter(
             None,
             (
+                errors.ContentLengthMismatchError,
                 aiohttp.http_exceptions.ContentLengthError,
                 aiohttp.client_exceptions.ClientPayloadError,
                 getattr(

@@ -343,9 +320,10 @@ def download_or_use_verified(
     fs_client: FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-
+    *,
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
-) ->
+) -> ty.Optional[Path]:
     """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.
 
     Note that you will get a logged warning if `local_path` already exists when you call

@@ -354,7 +332,7 @@ def download_or_use_verified(
     file_properties = None
     try:
         co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
-            fs_client, remote_key, local_path,
+            fs_client, remote_key, local_path, expected_hash, cache
         )
         _dl_scope.enter(dl_file_client)  # on __exit__, will release the connection to the pool
         while True:

@@ -373,16 +351,16 @@ def download_or_use_verified(
             else:
                 raise ValueError(f"Unexpected coroutine request: {co_request}")
     except StopIteration as si:
-        if
+        if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
             try:
-                logger.info(f"Setting missing
+                logger.info(f"Setting missing hash for {remote_key}")
                 assert file_properties
-                dl_file_client.
-            except HttpResponseError as
-                logger.info(f"Unable to set
+                dl_file_client.set_metadata(meta, **etag.match_etag(file_properties))
+            except (HttpResponseError, ResourceModifiedError) as ex:
+                logger.info(f"Unable to set Hash for {remote_key}: {ex}")
         return si.value.hit
     except AzureError as err:
-        translate_azure_error(fs_client, remote_key, err)
+        errors.translate_azure_error(fs_client, remote_key, err)
 
 
 _async_dl_scope = scope.AsyncScope("adls.download.async")

@@ -394,13 +372,14 @@ async def async_download_or_use_verified(
     fs_client: aio.FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-
+    *,
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
-) ->
+) -> ty.Optional[Path]:
     file_properties = None
     try:
         co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
-            fs_client, remote_key, local_path,
+            fs_client, remote_key, local_path, expected_hash, cache
         )
         await _async_dl_scope.async_enter(
             dl_file_client  # type: ignore[arg-type]

@@ -428,16 +407,14 @@ async def async_download_or_use_verified(
                 raise ValueError(f"Unexpected coroutine request: {co_request}")
 
     except StopIteration as si:
-        if
+        if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
             try:
-                logger.info(f"Setting missing
+                logger.info(f"Setting missing Hash for {remote_key}")
                 assert file_properties
-                await dl_file_client.
-                    cs, **match_etag(file_properties)
-                )
+                await dl_file_client.set_metadata(meta, **etag.match_etag(file_properties))  # type: ignore[misc]
                 # TODO - check above type ignore
-            except HttpResponseError as
-                logger.info(f"Unable to set
+            except (HttpResponseError, ResourceModifiedError) as ex:
+                logger.info(f"Unable to set Hash for {remote_key}: {ex}")
         return si.value.hit
     except AzureError as err:
-        translate_azure_error(fs_client, remote_key, err)
+        errors.translate_azure_error(fs_client, remote_key, err)
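Note: the driver's public signature is now keyword-only past local_path and returns the cache-hit Path (or None if a fresh download happened). A sketch of a call under the new API; the account, container, and key names are illustrative, and get_global_fs_client / global_cache are used as imported in thds/adls/__init__.py above:

from pathlib import Path

from thds.adls.download import download_or_use_verified
from thds.adls.global_client import get_global_fs_client
from thds.adls.ro_cache import global_cache

fs_client = get_global_fs_client("mysa", "mycontainer")  # storage account + container
hit = download_or_use_verified(
    fs_client,
    "some/remote/key.parquet",
    Path("local/key.parquet"),
    expected_hash=None,  # no expectation: remote-stored hashes are used if present
    cache=global_cache(),
)
if hit:
    print(f"verified existing copy at {hit}")  # no bytes were transferred
else:
    print("downloaded and verified a fresh copy")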
thds/adls/download_lock.py CHANGED

@@ -9,7 +9,7 @@ from thds.core import config, home, log
 
 from .md5 import hex_md5_str
 
-DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".adls
+DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/download-locks", parse=Path)
 _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
 _CLEAN_UP_LOCKFILES_EVERY = timedelta(hours=1).total_seconds()
 _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic() - _CLEAN_UP_LOCKFILES_EVERY

@@ -60,4 +60,11 @@ def download_lock(download_unique_str: str) -> FileLock:
     """
     DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
     _occasionally_clean_download_locks()
-    return FileLock(
+    return FileLock(
+        DOWNLOAD_LOCKS_DIR()
+        / (download_unique_str.split("/")[-1][:50] + hex_md5_str(download_unique_str)),
+        # is_singleton=True,
+        # critical for keeping this reentrant without passing the lock around.
+        # see https://github.com/tox-dev/filelock/issues/315#issuecomment-2016797681
+        # however, this is not compatible with the version of Databricks we use, so.....
+    )
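Note: the lockfile name keeps a human-readable prefix (the last path segment, capped at 50 characters) and appends a hex MD5 digest of the full string for uniqueness. A quick illustration of that naming scheme, using hashlib directly in place of the package's hex_md5_str helper:

import hashlib


def lock_name(download_unique_str: str) -> str:
    # readable prefix + stable unique suffix, mirroring download_lock's naming
    prefix = download_unique_str.split("/")[-1][:50]
    return prefix + hashlib.md5(download_unique_str.encode()).hexdigest()


print(lock_name("/home/user/.thds/adls/ro-cache/sa/container/some/file.parquet"))
# -> "file.parquet" followed by 32 hex characters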
thds/adls/errors.py CHANGED

@@ -15,8 +15,16 @@ class BlobNotFoundError(HttpResponseError):
         super().__init__(f"{type_hint} not found: {fqn}")
 
 
-class
-    """
+class BlobPropertiesValidationError(ValueError):
+    """Raised when the properties of a blob do not match the expected values."""
+
+
+class HashMismatchError(BlobPropertiesValidationError):
+    """Raised when the hash of a file does not match the expected value."""
+
+
+class ContentLengthMismatchError(BlobPropertiesValidationError):
+    """Raised when the content length of a file does not match the expected value as retrieved from the server."""
 
 
 def is_blob_not_found(exc: Exception) -> bool:
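Note: the two mismatch errors now share the BlobPropertiesValidationError base, so callers can catch validation failures generically (download.py also retries ContentLengthMismatchError via fretry, per the hunk above). A small sketch of catching against the hierarchy; the fetch_with_report wrapper is illustrative, not part of the package:

import typing as ty

from thds.adls import errors


def fetch_with_report(download: ty.Callable[[], None]) -> None:
    try:
        download()
    except errors.HashMismatchError as e:
        # the more specific subclass: a checksum disagreed somewhere
        print(f"hash mismatch: {e}")
    except errors.BlobPropertiesValidationError as e:
        # the shared base also catches ContentLengthMismatchError
        print(f"blob properties failed validation: {e}")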
thds/adls/file_properties.py CHANGED

@@ -1,3 +1,5 @@
+import typing as ty
+
 from azure.core.exceptions import AzureError, ResourceNotFoundError
 from azure.storage.blob import BlobProperties
 from azure.storage.filedatalake import FileProperties

@@ -27,6 +29,12 @@ def get_blob_properties(fqn: AdlsFqn) -> BlobProperties:
     )
 
 
+class PropertiesP(ty.Protocol):
+    name: ty.Any
+    metadata: ty.Any
+    content_settings: ty.Any
+
+
 # At some point it may make sense to separate file and blob property modules,
 # but they also are very closely tied together. AFAIK all files are blobs, and given our usage of ADLS,
 # I don't know if we ever deal with things that are blobs but not files.
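Note: PropertiesP is a structural type that lets helpers accept either BlobProperties or FileProperties without spelling out a Union. A sketch of how such a protocol gets used; the describe function is illustrative, not from the package (the protocol is restated locally so the snippet runs on its own):

import typing as ty


class PropertiesP(ty.Protocol):
    name: ty.Any
    metadata: ty.Any
    content_settings: ty.Any


def describe(props: PropertiesP) -> str:
    # Accepts any object with these attributes - e.g. azure.storage.blob.BlobProperties
    # or azure.storage.filedatalake.FileProperties - with no isinstance checks.
    md5 = getattr(props.content_settings, "content_md5", None)
    return f"{props.name}: metadata={props.metadata}, content_md5={md5!r}"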
|