thds.adls 4.1.20250701001205__py3-none-any.whl → 4.1.20250702194306__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/adls/__init__.py +10 -5
- thds/adls/_upload.py +54 -41
- thds/adls/azcopy/__init__.py +1 -1
- thds/adls/azcopy/download.py +68 -102
- thds/adls/azcopy/login.py +39 -0
- thds/adls/azcopy/progress.py +49 -0
- thds/adls/azcopy/system_resources.py +26 -0
- thds/adls/azcopy/upload.py +95 -0
- thds/adls/{cached_up_down.py → cached.py} +21 -16
- thds/adls/conf.py +1 -0
- thds/adls/download.py +142 -161
- thds/adls/download_lock.py +9 -2
- thds/adls/errors.py +10 -2
- thds/adls/file_properties.py +8 -0
- thds/adls/hashes.py +147 -0
- thds/adls/impl.py +3 -4
- thds/adls/md5.py +5 -52
- thds/adls/ro_cache.py +1 -2
- thds/adls/source.py +37 -34
- thds/adls/tools/download.py +6 -5
- thds/adls/tools/upload.py +3 -4
- thds/adls/upload.py +168 -0
- thds/adls/uri.py +6 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250702194306.dist-info}/METADATA +1 -1
- thds_adls-4.1.20250702194306.dist-info/RECORD +42 -0
- thds/adls/resource/__init__.py +0 -36
- thds/adls/resource/core.py +0 -77
- thds/adls/resource/file_pointers.py +0 -54
- thds/adls/resource/up_down.py +0 -242
- thds_adls-4.1.20250701001205.dist-info/RECORD +0 -40
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250702194306.dist-info}/WHEEL +0 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250702194306.dist-info}/entry_points.txt +0 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250702194306.dist-info}/top_level.txt +0 -0
thds/adls/__init__.py
CHANGED

@@ -1,45 +1,53 @@
 import typing as ty
 from pathlib import Path
 
-from thds
-from thds.core import parallel
+from thds import core
 from thds.core.source.tree import SourceTree
 from thds.core.thunks import thunking
 
+from . import upload
 from .download import download_or_use_verified
 from .fqn import AdlsFqn
 from .global_client import get_global_fs_client
 from .impl import ADLSFileSystem
-from .resource.up_down import AdlsHashedResource, upload
 from .ro_cache import global_cache
 from .uri import UriIsh, parse_any
 
 
-def download_to_cache(
+def download_to_cache(
+    fqn_or_uri: UriIsh,
+    *,
+    expected_hash: ty.Optional[core.hashing.Hash] = None,
+) -> Path:
     """Downloads directly to the cache and returns a Path to the read-only file.
 
     This will allow you to download a file 'into' the cache even if
-    you provide no
+    you provide no expected hash and the remote file properties does not have
     one. However, future attempts to reuse the cache will force a
-    re-download if no
+    re-download if no remote hash is available at that time.
     """
     fqn = parse_any(fqn_or_uri)
     cache_path = global_cache().path(fqn)
     download_or_use_verified(
-        get_global_fs_client(fqn.sa, fqn.container),
+        get_global_fs_client(fqn.sa, fqn.container),
+        fqn.path,
+        cache_path,
+        expected_hash=expected_hash,
+        cache=global_cache(),
     )
+    assert cache_path.is_file(), "File should have been downloaded to the cache."
     return cache_path
 
 
-def upload_through_cache(dest: UriIsh, src_path: Path) ->
-    """Return
+def upload_through_cache(dest: UriIsh, src_path: Path) -> core.source.Source:
+    """Return a Source with a Hash, since by definition an upload through the cache must have a known checksum.
 
     Uses global client, which is pretty much always what you want.
     """
     assert src_path.is_file(), "src_path must be a file."
-
-    assert
-    return
+    new_src = upload.upload(dest, src_path, write_through_cache=global_cache())
+    assert new_src.hash, "hash should always be calculable for a local path."
+    return new_src
 
 
 def download_directory(fqn: AdlsFqn) -> Path:

@@ -75,9 +83,6 @@ def upload_directory_through_cache(dest: UriIsh, src_path: Path) -> SourceTree:
 
     return SourceTree(
         sources=list(
-
-            source.from_adls,
-            parallel.yield_results(upload_thunks, named="upload_directory_through_cache"),
-        )
+            core.parallel.yield_results(upload_thunks, named="upload_directory_through_cache"),
         )
     )
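The net effect of this hunk: download_to_cache grows a keyword-only expected_hash parameter, and upload_through_cache now returns a core.source.Source whose .hash is guaranteed to be set. A minimal usage sketch follows; the URI form and the core.hashing.Hash constructor arguments are assumptions for illustration, not documented by this diff:

    from pathlib import Path

    from thds import core
    from thds.adls import download_to_cache, upload_through_cache

    # hypothetical digest; the Hash fields (algo, bytes) are inferred from
    # logging calls elsewhere in this release, not from a documented API
    expected = core.hashing.Hash("sha256", b"\x00" * 32)

    local: Path = download_to_cache(
        "adls://myaccount/mycontainer/data/file.parquet",  # placeholder URI
        expected_hash=expected,  # keyword-only in the new signature
    )

    src = upload_through_cache("adls://myaccount/mycontainer/copy.parquet", local)
    assert src.hash, "guaranteed non-None by the new implementation"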
thds/adls/conf.py
CHANGED
thds/adls/download.py
CHANGED
@@ -2,110 +2,64 @@ import contextlib
 import enum
 import os
 import shutil
+import threading
 import typing as ty
-from
+from pathlib import Path
 
 import aiohttp.http_exceptions
-from azure.core.exceptions import AzureError, HttpResponseError
-from azure.storage.filedatalake import (
-    ContentSettings,
-    DataLakeFileClient,
-    FileProperties,
-    FileSystemClient,
-    aio,
-)
+from azure.core.exceptions import AzureError, HttpResponseError, ResourceModifiedError
+from azure.storage.filedatalake import DataLakeFileClient, FileProperties, FileSystemClient, aio
 
-from thds.core import fretry, log, scope, tmp
-from thds.core.hashing import b64
+from thds.core import fretry, hash_cache, hashing, log, scope, tmp
 from thds.core.types import StrOrPath
 
-from . import azcopy
+from . import azcopy, errors, etag, hashes
 from ._progress import report_download_progress
 from .download_lock import download_lock
-from .errors import MD5MismatchError, translate_azure_error
-from .etag import match_etag
 from .fqn import AdlsFqn
-from .md5 import check_reasonable_md5b64, md5_file
 from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache
 
 logger = log.getLogger(__name__)
 
 
+def _check_size(dpath: Path, expected_size: ty.Optional[int]) -> None:
+    actual_size = os.path.getsize(dpath)
+    if expected_size is not None and actual_size != expected_size:
+        raise errors.ContentLengthMismatchError(
+            f"Downloaded file {dpath} has size {actual_size} but expected {expected_size}"
+        )
+
+
 @contextlib.contextmanager
 def _atomic_download_and_move(
     fqn: AdlsFqn,
     dest: StrOrPath,
     properties: ty.Optional[FileProperties] = None,
 ) -> ty.Iterator[azcopy.download.DownloadRequest]:
-    known_size =
+    known_size = properties.size if properties else None
     with tmp.temppath_same_fs(dest) as dpath:
-
-
-        yield azcopy.download.DownloadRequest(
-
-        )
-
-
-
-
-        )
+        logger.debug("Downloading %s", fqn)
+        if azcopy.download.should_use_azcopy(known_size or -1):
+            yield azcopy.download.DownloadRequest(dpath, known_size)
+        else:
+            with open(dpath, "wb") as down_f:
+                yield azcopy.download.SdkDownloadRequest(
+                    dpath, known_size, report_download_progress(down_f, str(fqn), known_size or 0)
+                )
+        _check_size(dpath, known_size)
         try:
             os.rename(dpath, dest)  # will succeed even if dest is read-only
         except OSError as oserr:
            if "Invalid cross-device link" in str(oserr):
                 # this shouldn't ever happen because of temppath_same_fs, but just in case...
+                logger.warning('Failed to move "%s" to "%s" - copying instead', dpath, dest)
                 shutil.copyfile(dpath, dest)
+                logger.info('Copied "%s" to "%s"', dpath, dest)
             else:
+                logger.error('Failed to move "%s" to "%s" - raising', dpath, dest)
                 raise
 
 
-@contextlib.contextmanager
-def _verify_md5s_before_and_after_download(
-    remote_md5b64: str, expected_md5b64: str, fqn: AdlsFqn, local_dest: StrOrPath
-) -> ty.Iterator[None]:
-    if expected_md5b64:
-        check_reasonable_md5b64(expected_md5b64)
-    if remote_md5b64:
-        check_reasonable_md5b64(remote_md5b64)
-    if remote_md5b64 and expected_md5b64 and remote_md5b64 != expected_md5b64:
-        raise MD5MismatchError(
-            f"ADLS thinks the MD5 of {fqn} is {remote_md5b64}, but we expected {expected_md5b64}."
-            " This may indicate that we need to update a hash in the codebase."
-        )
-
-    yield  # perform download
-
-    with log.logger_context(hash_for="after-download"):
-        local_md5b64 = b64(md5_file(local_dest))
-    check_reasonable_md5b64(local_md5b64)  # must always exist
-    if remote_md5b64 and remote_md5b64 != local_md5b64:
-        raise MD5MismatchError(
-            f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
-            f" but the remote ({fqn}) says it should be {remote_md5b64}."
-            f" This may indicate that ADLS has an erroneous MD5 for {fqn}."
-        )
-    if expected_md5b64 and local_md5b64 != expected_md5b64:
-        raise MD5MismatchError(
-            f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
-            f" but we expected it to be {expected_md5b64}."
-            f" This probably indicates a corrupted download of {fqn}"
-        )
-    all_hashes = dict(local=local_md5b64, remote=remote_md5b64, expected=expected_md5b64)
-    assert 1 == len(set(filter(None, all_hashes.values()))), all_hashes
-
-
-def _md5b64_path_if_exists(path: StrOrPath) -> ty.Optional[str]:
-    if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
-        return None
-    return b64(md5_file(path))
-
-
-def _remote_md5b64(file_properties: FileProperties) -> str:
-    if file_properties.content_settings.content_md5:
-        return b64(file_properties.content_settings.content_md5)
-    return ""
-
-
 # Async is weird.
 #
 # You cannot easily call an async function from within a standard/non-async function.

@@ -144,6 +98,57 @@ def _remote_md5b64(file_properties: FileProperties) -> str:
 # again and rely on the controller to re-send the previously fetched result.
 
 
+class _FileResult(ty.NamedTuple):
+    hash: hashing.Hash
+    hit: ty.Optional[Path]
+
+
+def _attempt_cache_hit(
+    expected_hash: ty.Optional[hashing.Hash],
+    fqn: AdlsFqn,
+    local_path: StrOrPath,
+    cache: ty.Optional[Cache],
+) -> ty.Optional[_FileResult]:
+    if not expected_hash:
+        return None
+
+    hash_path_if_exists = hashes.hash_path_for_algo(expected_hash.algo)
+
+    with log.logger_context(hash_for="before-download-dest"):
+        local_hash = hash_path_if_exists(local_path)
+    if local_hash == expected_hash:
+        logger.debug("Local path matches %s - no need to look further", expected_hash.algo)
+        if cache:
+            cache_path = cache.path(fqn)
+            with log.logger_context(hash_for="before-download-cache"):
+                if local_hash != hash_path_if_exists(cache_path):
+                    # only copy if the cache is out of date
+                    from_local_path_to_cache(local_path, cache_path, cache.link)
+            return _FileResult(local_hash, hit=cache_path)
+        return _FileResult(local_hash, hit=Path(local_path))
+
+    if local_hash:
+        logger.debug(
+            "Local path exists but does not match expected %s %s",
+            expected_hash.algo,
+            expected_hash.bytes,
+        )
+    if cache:
+        cache_path = cache.path(fqn)
+        cache_hash = hash_path_if_exists(cache_path)
+        if cache_hash == expected_hash:  # file in cache matches!
+            from_cache_path_to_local(cache_path, local_path, cache.link)
+            return _FileResult(cache_hash, hit=cache_path)
+
+        if cache_hash:
+            logger.debug(
+                "Cache path exists but does not match expected %s %s",
+                expected_hash.algo,
+                expected_hash.bytes,
+            )
+    return None
+
+
 class _IoRequest(enum.Enum):
     FILE_PROPERTIES = "file_properties"
 

@@ -152,18 +157,13 @@ IoRequest = ty.Union[_IoRequest, azcopy.download.DownloadRequest]
 IoResponse = ty.Union[FileProperties, None]
 
 
-class _FileResult(ty.NamedTuple):
-    md5b64: str
-    hit: bool
-
-
 _dl_scope = scope.Scope("adls.download")
 
 
 def _download_or_use_verified_cached_coroutine(  # noqa: C901
     fqn: AdlsFqn,
     local_path: StrOrPath,
-
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
 ) -> ty.Generator[IoRequest, IoResponse, _FileResult]:
     """Make a file on ADLS available at the local path provided.

@@ -202,82 +202,72 @@ def _download_or_use_verified_cached_coroutine(  # noqa: C901
     writing in a standard fashion.
 
     Raises StopIteration when complete. StopIteration.value.hit will
-    be
-    required. `.value` will also contain the
-    file, which may be used as desired.
+    be the Path to the cached file if there was a cache hit, and None
+    if a download was required. `.value` will also contain the Hash of
+    the downloaded file, which may be used as desired.
     """
     if not local_path:
         raise ValueError("Must provide a destination path.")
 
-    _dl_scope.enter(log.logger_context(dl=fqn))
+    _dl_scope.enter(log.logger_context(dl=fqn, pid=os.getpid(), tid=threading.get_ident()))
     file_properties = None
-
-
-    #
+
+    if not expected_hash:
+        # we don't know what we expect, so attempt to retrieve
+        # expectations from ADLS itself.
         file_properties = yield _IoRequest.FILE_PROPERTIES
-
-
+        if file_properties:
+            # critically, we expect the _first_ one in this list to be the fastest to verify.
+            expected_hash = next(iter(hashes.extract_hashes_from_props(file_properties).values()), None)
 
     def attempt_cache_hit() -> ty.Optional[_FileResult]:
-
-
-
-        check_reasonable_md5b64(md5b64)
-        with log.logger_context(hash_for="before-download-dest"):
-            local_md5b64 = _md5b64_path_if_exists(local_path)
-        if local_md5b64 == md5b64:
-            logger.debug("Local path matches MD5 - no need to look further")
-            if cache:
-                cache_path = cache.path(fqn)
-                with log.logger_context(hash_for="before-download-cache"):
-                    if local_md5b64 != _md5b64_path_if_exists(cache_path):
-                        # only copy if the cache is out of date
-                        from_local_path_to_cache(local_path, cache_path, cache.link)
-            return _FileResult(local_md5b64, hit=True)
-
-        if local_md5b64:
-            logger.debug("Local path exists but does not match expected md5 %s", md5b64)
-        if cache:
-            cache_path = cache.path(fqn)
-            cache_md5b64 = _md5b64_path_if_exists(cache_path)
-            if cache_md5b64 == md5b64:  # file in cache matches!
-                from_cache_path_to_local(cache_path, local_path, cache.link)
-                return _FileResult(cache_md5b64, hit=True)
-
-            if cache_md5b64:
-                logger.debug("Cache path exists but does not match expected md5 %s", md5b64)
-        return None
+        return _attempt_cache_hit(
+            expected_hash=expected_hash, cache=cache, fqn=fqn, local_path=local_path
+        )
 
-    # attempt cache
+    # attempt cache hits before taking a lock, to avoid contention for existing files.
     if file_result := attempt_cache_hit():
         return file_result  # noqa: B901
 
-
+    # No cache hit, so its time to prepare to download. if a cache was provided, we will
+    # _put_ the resulting file in it.
+
+    file_lock = str(cache.path(fqn) if cache else local_path)
     # create lockfile name from the (shared) cache path if present, otherwise the final
     # destination. Non-cache users may then still incur multiple downloads in parallel,
     # but if you wanted to coordinate then you should probably have been using the global
     # cache in the first place.
+    _dl_scope.enter(download_lock(file_lock))
 
     # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
     if file_result := attempt_cache_hit():
-        logger.
+        logger.info("Got cache hit on the second attempt, after acquiring lock for %s", fqn)
         return file_result  # noqa: B901
 
     logger.debug("Unable to find a cached version anywhere that we looked...")
     file_properties = yield _IoRequest.FILE_PROPERTIES
-
-
-
-
-
+
+    # if any of the remote hashes match the expected hash, verify that one.
+    # otherwise, verify the first remote hash in the list, since that's the fastest one.
+    all_remote_hashes = hashes.extract_hashes_from_props(file_properties)
+    remote_hash_to_match = all_remote_hashes.get(expected_hash.algo) if expected_hash else None
+    with hashes.verify_hashes_before_and_after_download(
+        remote_hash_to_match,
+        expected_hash,
         fqn,
         local_path,
     ):  # download new data directly to local path
         with _atomic_download_and_move(fqn, local_path, file_properties) as tmpwriter:
             yield tmpwriter
+
         if cache:
             from_local_path_to_cache(local_path, cache.path(fqn), cache.link)
-
+
+    hash_to_set_if_missing = expected_hash or remote_hash_to_match
+    if not hash_to_set_if_missing or hash_to_set_if_missing.algo not in hashes.PREFERRED_ALGOS:
+        hash_to_set_if_missing = hash_cache.filehash(hashes.PREFERRED_ALGOS[0], local_path)
+    assert hash_to_set_if_missing, "We should have a preferred hash to set at this point."
+    return _FileResult(hash_to_set_if_missing, hit=None)
 
 
 # So ends the crazy download caching coroutine.

@@ -293,7 +283,7 @@ def _prep_download_coroutine(
     fs_client: FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
 ) -> ty.Tuple[
     ty.Generator[IoRequest, IoResponse, _FileResult],

@@ -304,22 +294,12 @@ def _prep_download_coroutine(
     co = _download_or_use_verified_cached_coroutine(
         AdlsFqn(ty.cast(str, fs_client.account_name), fs_client.file_system_name, remote_key),
         local_path,
-
+        expected_hash=expected_hash,
         cache=cache,
     )
     return co, co.send(None), None, fs_client.get_file_client(remote_key)
 
 
-def _set_md5_if_missing(
-    file_properties: ty.Optional[FileProperties], md5b64: str
-) -> ty.Optional[ContentSettings]:
-    if not file_properties or file_properties.content_settings.content_md5:
-        return None
-    file_properties.content_settings.content_md5 = b64decode(md5b64)  # type: ignore[assignment]
-    # TODO - check above type ignore
-    return file_properties.content_settings
-
-
 def _excs_to_retry() -> ty.Callable[[Exception], bool]:
     """These are exceptions that we observe to be spurious failures worth retrying."""
     return fretry.is_exc(

@@ -343,9 +323,10 @@ def download_or_use_verified(
     fs_client: FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-
+    *,
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
-) ->
+) -> ty.Optional[Path]:
     """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.
 
     Note that you will get a logged warning if `local_path` already exists when you call

@@ -354,7 +335,7 @@ def download_or_use_verified(
     file_properties = None
     try:
         co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
-            fs_client, remote_key, local_path,
+            fs_client, remote_key, local_path, expected_hash, cache
         )
         _dl_scope.enter(dl_file_client)  # on __exit__, will release the connection to the pool
         while True:

@@ -373,16 +354,16 @@
             else:
                 raise ValueError(f"Unexpected coroutine request: {co_request}")
     except StopIteration as si:
-        if
+        if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
             try:
-                logger.info(f"Setting missing
+                logger.info(f"Setting missing hash for {remote_key}")
                 assert file_properties
-                dl_file_client.
-            except HttpResponseError as
-                logger.info(f"Unable to set
+                dl_file_client.set_metadata(meta, **etag.match_etag(file_properties))
+            except (HttpResponseError, ResourceModifiedError) as ex:
+                logger.info(f"Unable to set Hash for {remote_key}: {ex}")
         return si.value.hit
     except AzureError as err:
-        translate_azure_error(fs_client, remote_key, err)
+        errors.translate_azure_error(fs_client, remote_key, err)
 
 
 _async_dl_scope = scope.AsyncScope("adls.download.async")

@@ -390,21 +371,24 @@ _async_dl_scope = scope.AsyncScope("adls.download.async")
 
 @_dl_scope.bound
 @_async_dl_scope.async_bound
+@fretry.retry_regular_async(
+    fretry.is_exc(errors.ContentLengthMismatchError), fretry.iter_to_async(fretry.n_times(2))
+)
 async def async_download_or_use_verified(
     fs_client: aio.FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-
+    *,
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
-) ->
+) -> ty.Optional[Path]:
     file_properties = None
     try:
         co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
-            fs_client, remote_key, local_path,
+            fs_client, remote_key, local_path, expected_hash, cache
         )
-        await _async_dl_scope.async_enter(
-
-        )  # on __aexit__, will release the connection to the pool
+        await _async_dl_scope.async_enter(dl_file_client)  # type: ignore[arg-type]
+        # on __aexit__, will release the connection to the pool
         while True:
             if co_request == _IoRequest.FILE_PROPERTIES:
                 if not file_properties:

@@ -414,7 +398,6 @@ async def async_download_or_use_verified(
                 co_request = co.send(file_properties)
             elif isinstance(co_request, azcopy.download.DownloadRequest):
                 # coroutine is requesting download
-
                 await fretry.retry_regular_async(
                     _excs_to_retry(), fretry.iter_to_async(fretry.n_times(2))
                 )(

@@ -428,16 +411,14 @@ async def async_download_or_use_verified(
                 raise ValueError(f"Unexpected coroutine request: {co_request}")
 
     except StopIteration as si:
-        if
+        if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
             try:
-                logger.info(f"Setting missing
+                logger.info(f"Setting missing Hash for {remote_key}")
                 assert file_properties
-                await dl_file_client.
-                    cs, **match_etag(file_properties)
-                )
+                await dl_file_client.set_metadata(meta, **etag.match_etag(file_properties))  # type: ignore[misc]
                 # TODO - check above type ignore
-            except HttpResponseError as
-                logger.info(f"Unable to set
+            except (HttpResponseError, ResourceModifiedError) as ex:
+                logger.info(f"Unable to set Hash for {remote_key}: {ex}")
         return si.value.hit
     except AzureError as err:
-        translate_azure_error(fs_client, remote_key, err)
+        errors.translate_azure_error(fs_client, remote_key, err)
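Taken together, these hunks replace the MD5-only verification pipeline with algorithm-agnostic hashing.Hash objects sourced from the new hashes module, and the public entry points now take a keyword-only expected_hash and return ty.Optional[Path]: the cache-hit path, or None when a fresh download happened. A hedged sketch of the synchronous entry point, with placeholder account, container, and key names:

    import typing as ty
    from pathlib import Path

    from thds.adls.download import download_or_use_verified
    from thds.adls.global_client import get_global_fs_client
    from thds.adls.ro_cache import global_cache

    fs_client = get_global_fs_client("myaccount", "mycontainer")  # placeholder names
    hit: ty.Optional[Path] = download_or_use_verified(
        fs_client,
        "data/file.parquet",
        "/tmp/file.parquet",
        expected_hash=None,  # when omitted, hashes are extracted from remote file properties
        cache=global_cache(),
    )
    if hit is None:
        print("no verified copy was found locally; a fresh download occurred")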
thds/adls/download_lock.py
CHANGED
@@ -9,7 +9,7 @@ from thds.core import config, home, log
 
 from .md5 import hex_md5_str
 
-DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".adls
+DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/download-locks", parse=Path)
 _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
 _CLEAN_UP_LOCKFILES_EVERY = timedelta(hours=1).total_seconds()
 _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic() - _CLEAN_UP_LOCKFILES_EVERY

@@ -60,4 +60,11 @@ def download_lock(download_unique_str: str) -> FileLock:
     """
     DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
     _occasionally_clean_download_locks()
-    return FileLock(
+    return FileLock(
+        DOWNLOAD_LOCKS_DIR()
+        / (download_unique_str.split("/")[-1][:50] + hex_md5_str(download_unique_str)),
+        # is_singleton=True,
+        # critical for keeping this reentrant without passing the lock around.
+        # see https://github.com/tox-dev/filelock/issues/315#issuecomment-2016797681
+        # however, this is not compatible with the version of Databricks we use, so.....
+    )
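The lock file name is now the last path segment (truncated to 50 characters) concatenated with an MD5 hex digest of the whole string, stored under ~/.thds/adls/download-locks. A small sketch of the equivalent computation, assuming hex_md5_str is a plain hex MD5 of its input (its definition is not part of this diff):

    import hashlib

    def lock_name(download_unique_str: str) -> str:
        # mirrors the FileLock path built in download_lock(): a readable prefix
        # plus a digest keeps names unique without exceeding filename limits
        digest = hashlib.md5(download_unique_str.encode()).hexdigest()
        return download_unique_str.split("/")[-1][:50] + digest

    print(lock_name("myaccount/mycontainer/data/file.parquet"))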
thds/adls/errors.py
CHANGED
@@ -15,8 +15,16 @@ class BlobNotFoundError(HttpResponseError):
         super().__init__(f"{type_hint} not found: {fqn}")
 
 
-class
-    """
+class BlobPropertiesValidationError(ValueError):
+    """Raised when the properties of a blob do not match the expected values."""
+
+
+class HashMismatchError(BlobPropertiesValidationError):
+    """Raised when the hash of a file does not match the expected value."""
+
+
+class ContentLengthMismatchError(BlobPropertiesValidationError):
+    """Raised when the content length of a file does not match the expected value as retrieved from the server."""
 
 
 def is_blob_not_found(exc: Exception) -> bool:
thds/adls/file_properties.py
CHANGED
@@ -1,3 +1,5 @@
+import typing as ty
+
 from azure.core.exceptions import AzureError, ResourceNotFoundError
 from azure.storage.blob import BlobProperties
 from azure.storage.filedatalake import FileProperties

@@ -27,6 +29,12 @@ def get_blob_properties(fqn: AdlsFqn) -> BlobProperties:
     )
 
 
+class PropertiesP(ty.Protocol):
+    name: ty.Any
+    metadata: ty.Any
+    content_settings: ty.Any
+
+
 # At some point it may make sense to separate file and blob property modules,
 # but they also are very closely tied together. AFAIK all files are blobs, and given our usage of ADLS,
 # I don't know if we ever deal with things that are blobs but not files.