thds.adls 3.0.20250116223841__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.adls might be problematic.
- thds/adls/__init__.py +15 -0
- thds/adls/_progress.py +193 -0
- thds/adls/_upload.py +127 -0
- thds/adls/abfss.py +24 -0
- thds/adls/cached_up_down.py +48 -0
- thds/adls/conf.py +33 -0
- thds/adls/dbfs.py +60 -0
- thds/adls/defaults.py +26 -0
- thds/adls/download.py +394 -0
- thds/adls/download_lock.py +57 -0
- thds/adls/errors.py +44 -0
- thds/adls/etag.py +6 -0
- thds/adls/file_properties.py +13 -0
- thds/adls/fqn.py +169 -0
- thds/adls/global_client.py +78 -0
- thds/adls/impl.py +1111 -0
- thds/adls/md5.py +60 -0
- thds/adls/meta.json +8 -0
- thds/adls/named_roots.py +26 -0
- thds/adls/py.typed +0 -0
- thds/adls/resource/__init__.py +36 -0
- thds/adls/resource/core.py +79 -0
- thds/adls/resource/file_pointers.py +54 -0
- thds/adls/resource/up_down.py +245 -0
- thds/adls/ro_cache.py +126 -0
- thds/adls/shared_credential.py +107 -0
- thds/adls/source.py +66 -0
- thds/adls/tools/download.py +35 -0
- thds/adls/tools/ls.py +38 -0
- thds/adls/uri.py +38 -0
- thds.adls-3.0.20250116223841.dist-info/METADATA +16 -0
- thds.adls-3.0.20250116223841.dist-info/RECORD +35 -0
- thds.adls-3.0.20250116223841.dist-info/WHEEL +5 -0
- thds.adls-3.0.20250116223841.dist-info/entry_points.txt +3 -0
- thds.adls-3.0.20250116223841.dist-info/top_level.txt +1 -0
thds/adls/download.py
ADDED
@@ -0,0 +1,394 @@
import contextlib
import enum
import os
import shutil
import typing as ty
from base64 import b64decode

from azure.core.exceptions import AzureError, HttpResponseError
from azure.storage.filedatalake import (
    ContentSettings,
    DataLakeFileClient,
    FileProperties,
    FileSystemClient,
)

from thds.core import log, scope, tmp
from thds.core.hashing import b64
from thds.core.types import StrOrPath

from ._progress import report_download_progress
from .conf import CONNECTION_TIMEOUT, DOWNLOAD_FILE_MAX_CONCURRENCY
from .download_lock import download_lock
from .errors import translate_azure_error
from .etag import match_etag
from .fqn import AdlsFqn
from .md5 import check_reasonable_md5b64, md5_file
from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache

logger = log.getLogger(__name__)


class MD5MismatchError(Exception):
    """Indicates that something needs to be done by the developer to correct a hash mismatch."""


@contextlib.contextmanager
def _atomic_download_and_move(
    fqn: AdlsFqn,
    dest: StrOrPath,
    properties: ty.Optional[FileProperties] = None,
) -> ty.Iterator[ty.IO[bytes]]:
    with tmp.temppath_same_fs(dest) as dpath:
        with open(dpath, "wb") as f:
            known_size = (properties.size or 0) if properties else 0
            logger.debug("Downloading %s", fqn)
            yield report_download_progress(f, str(dest), known_size)
        try:
            os.rename(dpath, dest)  # will succeed even if dest is read-only
        except OSError as oserr:
            if "Invalid cross-device link" in str(oserr):
                # this shouldn't ever happen because of temppath_same_fs, but just in case...
                shutil.copyfile(dpath, dest)
            else:
                raise


@contextlib.contextmanager
def _verify_md5s_before_and_after_download(
    remote_md5b64: str, expected_md5b64: str, fqn: AdlsFqn, local_dest: StrOrPath
) -> ty.Iterator[None]:
    if expected_md5b64:
        check_reasonable_md5b64(expected_md5b64)
    if remote_md5b64:
        check_reasonable_md5b64(remote_md5b64)
    if remote_md5b64 and expected_md5b64 and remote_md5b64 != expected_md5b64:
        raise MD5MismatchError(
            f"ADLS thinks the MD5 of {fqn} is {remote_md5b64}, but we expected {expected_md5b64}."
            " This may indicate that we need to update a hash in the codebase."
        )

    yield  # perform download

    with log.logger_context(hash_for="after-download"):
        local_md5b64 = b64(md5_file(local_dest))
    check_reasonable_md5b64(local_md5b64)  # must always exist
    if remote_md5b64 and remote_md5b64 != local_md5b64:
        raise MD5MismatchError(
            f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
            f" but the remote ({fqn}) says it should be {remote_md5b64}."
            f" This may indicate that ADLS has an erroneous MD5 for {fqn}."
        )
    if expected_md5b64 and local_md5b64 != expected_md5b64:
        raise MD5MismatchError(
            f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
            f" but we expected it to be {expected_md5b64}."
            f" This probably indicates a corrupted download of {fqn}"
        )
    all_hashes = dict(local=local_md5b64, remote=remote_md5b64, expected=expected_md5b64)
    assert 1 == len(set(filter(None, all_hashes.values()))), all_hashes


def _md5b64_path_if_exists(path: StrOrPath) -> ty.Optional[str]:
    if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
        return None
    return b64(md5_file(path))


def _remote_md5b64(file_properties: FileProperties) -> str:
    if file_properties.content_settings.content_md5:
        return b64(file_properties.content_settings.content_md5)
    return ""


# Async is weird.
#
# You cannot easily call an async function from within a standard/non-async function.
# And while you _can_ call a synchronous function from within an async one,
# it's highly discouraged if that function is doing network I/O,
# because your sync network IO will block the entire async green thread,
# grinding all 'async' work to a halt. It's very unneighborly.
#
# Unfortunately, this means that it's quite difficult to share the implementation
# of complex logic between async and non-async users. You can't use callbacks to abstract
# I/O, because those would themselves need to be either async or not.
#
# What you can do is a sort of 'functional core, imperative shell' approach,
# where the I/O parts are performed in a top level imperative shell, and the functional
# (logic) parts are performed in the shared core. As with many such things,
# the trick then is how to structure the functional core such that it 'makes sense' to a reader.
#
# One traditional means of doing this is breaking the functional core up into
# several functions to be called before and after the network calls.
# However, that does tend to impair readability, as those core functions
# each only do part of the work, and sometimes the 'part' doesn't make as much
# sense on its own as you might like.
#
# This is (to me) an entirely novel approach, and as such is an experiment.
# By writing a coroutine (the logic directly below) as the functional core,
# we can make the main 'logic' of the system readable in one go.
# What is required is a willingness to read the `yield` statements as
# essentially the 'inverse' of async/await - yield means "send this value to
# my controller and wait for their response'. Once the response is sent, we can
# resume where we left off, and the logic flows reasonably nicely.
#
# One _additional_ advantage of this approach is that certain bits of IO
# can actually be re-requested at any time, and the coroutine's controller
# can in a sense 'cache' those responses. So instead of the core logic
# having to keep track of whether it has performed the IO, it can request the result
# again and rely on the controller to re-send the previously fetched result.


class _IoRequest(enum.Enum):
    FILE_PROPERTIES = "file_properties"


IoRequest = ty.Union[_IoRequest, ty.IO[bytes]]
IoResponse = ty.Union[FileProperties, None]


class _FileResult(ty.NamedTuple):
    md5b64: str
    hit: bool


_dl_scope = scope.Scope("adls.download")


def _download_or_use_verified_cached_coroutine(  # noqa: C901
    fqn: AdlsFqn,
    local_path: StrOrPath,
    md5b64: str = "",
    cache: ty.Optional[Cache] = None,
) -> ty.Generator[IoRequest, IoResponse, _FileResult]:
    """Make a file on ADLS available at the local path provided.

    When we download from ADLS we want to know for sure that we have
    the bytes we expected. Sometimes we have a hash upfront that we
    want to verify. Other times, we simply want to rely on the hash
    ADLS has. If we have both, we should check anything we have the
    opportunity to check.

    Because we're verifying everything, we can optionally offer two
    sorts of verified caching.

    1. With no local cache, we can at least verify whether the file is
    present at the local path and contains the expected bytes. If it
    does, there's no need to re-download.

    2. With a cache provided, we can also check that cache, and if the
    file is present in the local cache, we can either hard/soft link or copy
    it into the expected location, depending on the Cache configuration.

    If the file is not present in either location, we finally download
    the file from ADLS and then verify that the local hash matches
    both what we expected and what ADLS told us it would be, if any.

    The downloaded file will always placed into the cache if a cache
    is provided. The local path, if different from the cache path,
    will either be linked or copied to, as selected by
    `cache.link`. `link=True` will save storage space but is not what
    you want if you intend to modify the file.

    Files placed in the cache will be marked as read-only to prevent
    _some_ types of accidents. This will not prevent you from
    accidentally or maliciously moving files on top of existing cached
    files, but it will prevent you from opening those files for
    writing in a standard fashion.

    Raises StopIteration when complete. StopIteration.value.hit will
    be True if there was a cache hit, and False if a download was
    required. `.value` will also contain the md5b64 of the downloaded
    file, which may be used as desired.
    """
    if not local_path:
        raise ValueError("Must provide a destination path.")

    _dl_scope.enter(log.logger_context(dl=fqn))
    file_properties = None
    if not md5b64:
        # we don't know what we expect, so attempt to retrieve an
        # expectation from ADLS itself.
        file_properties = yield _IoRequest.FILE_PROPERTIES
        md5b64 = _remote_md5b64(file_properties)

    def attempt_cache_hit() -> ty.Optional[_FileResult]:
        if not md5b64:
            return None

        check_reasonable_md5b64(md5b64)
        with log.logger_context(hash_for="before-download-dest"):
            local_md5b64 = _md5b64_path_if_exists(local_path)
        if local_md5b64 == md5b64:
            logger.debug("Local path matches MD5 - no need to look further")
            if cache:
                cache_path = cache.path(fqn)
                with log.logger_context(hash_for="before-download-cache"):
                    if local_md5b64 != _md5b64_path_if_exists(cache_path):
                        # only copy if the cache is out of date
                        from_local_path_to_cache(local_path, cache_path, cache.link)
            return _FileResult(local_md5b64, hit=True)

        if local_md5b64:
            logger.debug("Local path exists but does not match expected md5 %s", md5b64)
        if cache:
            cache_path = cache.path(fqn)
            cache_md5b64 = _md5b64_path_if_exists(cache_path)
            if cache_md5b64 == md5b64:  # file in cache matches!
                from_cache_path_to_local(cache_path, local_path, cache.link)
                return _FileResult(cache_md5b64, hit=True)

            if cache_md5b64:
                logger.debug("Cache path exists but does not match expected md5 %s", md5b64)
        return None

    # attempt cache hit before taking a lock, to avoid contention for existing files.
    if file_result := attempt_cache_hit():
        return file_result  # noqa: B901

    _dl_scope.enter(download_lock(str(cache.path(fqn) if cache else local_path)))
    # create lockfile name from the (shared) cache path if present, otherwise the final
    # destination. Non-cache users may then still incur multiple downloads in parallel,
    # but if you wanted to coordinate then you should probably have been using the global
    # cache in the first place.

    # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
    if file_result := attempt_cache_hit():
        logger.debug("Got cache hit on the second attempt, after acquiring lock for %s", fqn)
        return file_result  # noqa: B901

    logger.debug("Unable to find a cached version anywhere that we looked...")
    file_properties = yield _IoRequest.FILE_PROPERTIES
    # no point in downloading if we've asked for hash X but ADLS only has hash Y.
    with _verify_md5s_before_and_after_download(
        _remote_md5b64(file_properties),
        md5b64,
        fqn,
        local_path,
    ):  # download new data directly to local path
        with _atomic_download_and_move(fqn, local_path, file_properties) as tmpwriter:
            yield tmpwriter
        if cache:
            from_local_path_to_cache(local_path, cache.path(fqn), cache.link)
    return _FileResult(md5b64 or b64(md5_file(local_path)), hit=False)


# So ends the crazy download caching coroutine.
#
# Below this point are several helper functions, and after that are the two
# (async and non-async) coroutine controllers. While you can still see duplication
# between the two controllers, it is clearly much less code than would otherwise
# have to be duplicated in order to maintain an async and non-async
# implementation in parallel.


def _prep_download_coroutine(
    fs_client: FileSystemClient,
    remote_key: str,
    local_path: StrOrPath,
    md5b64: str = "",
    cache: ty.Optional[Cache] = None,
) -> ty.Tuple[
    ty.Generator[IoRequest, IoResponse, _FileResult],
    IoRequest,
    ty.Optional[FileProperties],
    DataLakeFileClient,
]:
    co = _download_or_use_verified_cached_coroutine(
        AdlsFqn(fs_client.account_name, fs_client.file_system_name, remote_key),
        local_path,
        md5b64=md5b64,
        cache=cache,
    )
    return co, co.send(None), None, fs_client.get_file_client(remote_key)


def _set_md5_if_missing(
    file_properties: ty.Optional[FileProperties], md5b64: str
) -> ty.Optional[ContentSettings]:
    if not file_properties or file_properties.content_settings.content_md5:
        return None
    file_properties.content_settings.content_md5 = b64decode(md5b64)
    return file_properties.content_settings


@_dl_scope.bound
def download_or_use_verified(
    fs_client: FileSystemClient,
    remote_key: str,
    local_path: StrOrPath,
    md5b64: str = "",
    cache: ty.Optional[Cache] = None,
) -> bool:
    """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.

    Note that you will get a logged warning if `local_path` already exists when you call
    this function.
    """
    file_properties = None
    try:
        co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
            fs_client, remote_key, local_path, md5b64, cache
        )
        while True:
            if co_request == _IoRequest.FILE_PROPERTIES:
                if not file_properties:
                    # only fetch these if they haven't already been requested
                    file_properties = dl_file_client.get_file_properties()
                co_request = co.send(file_properties)
            else:  # needs file object
                dl_file_client.download_file(
                    max_concurrency=DOWNLOAD_FILE_MAX_CONCURRENCY(),
                    connection_timeout=CONNECTION_TIMEOUT(),
                ).readinto(co_request)
                co_request = co.send(None)
    except StopIteration as si:
        if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
            try:
                logger.info(f"Setting missing MD5 for {remote_key}")
                assert file_properties
                dl_file_client.set_http_headers(cs, **match_etag(file_properties))
            except HttpResponseError as hre:
                logger.info(f"Unable to set MD5 for {remote_key}: {hre}")
        return si.value.hit
    except AzureError as err:
        translate_azure_error(fs_client, remote_key, err)


@_dl_scope.bound
async def async_download_or_use_verified(
    fs_client: FileSystemClient,
    remote_key: str,
    local_path: StrOrPath,
    md5b64: str = "",
    cache: ty.Optional[Cache] = None,
) -> bool:
    file_properties = None
    try:
        co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
            fs_client, remote_key, local_path, md5b64, cache
        )
        while True:
            if co_request == _IoRequest.FILE_PROPERTIES:
                if not file_properties:
                    # only fetch these if they haven't already been requested
                    file_properties = await dl_file_client.get_file_properties()
                co_request = co.send(file_properties)
            else:  # needs file object
                reader = await dl_file_client.download_file(
                    max_concurrency=DOWNLOAD_FILE_MAX_CONCURRENCY(),
                    connection_timeout=CONNECTION_TIMEOUT(),
                )
                await reader.readinto(co_request)
                co_request = co.send(None)
    except StopIteration as si:
        if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
            try:
                logger.info(f"Setting missing MD5 for {remote_key}")
                assert file_properties
                await dl_file_client.set_http_headers(cs, **match_etag(file_properties))
            except HttpResponseError as hre:
                logger.info(f"Unable to set MD5 for {remote_key}: {hre}")
        return si.value.hit
    except AzureError as err:
        translate_azure_error(fs_client, remote_key, err)
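For orientation, a minimal usage sketch of the synchronous controller above (not part of the package): the storage account, container, and blob key are placeholders, and `get_global_fs_client` is assumed to accept an (account, container) pair, as it is called elsewhere in this diff (`thds/adls/global_client.py`).

from thds.adls.download import download_or_use_verified
from thds.adls.global_client import get_global_fs_client  # assumed helper from this package

fs_client = get_global_fs_client("mystorageaccount", "mycontainer")  # placeholder names
was_cache_hit = download_or_use_verified(
    fs_client,
    "raw/2025/data.parquet",  # key of the blob within the container (placeholder)
    "/tmp/data.parquet",      # local destination path
    md5b64="",                # empty: fall back to whatever MD5 ADLS reports
)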
thds/adls/download_lock.py
ADDED
@@ -0,0 +1,57 @@
import random
import time
from datetime import timedelta
from pathlib import Path

from filelock import FileLock

from thds.core import config, home, log

from .md5 import hex_md5_str

DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".adls-md5-download-locks", parse=Path)
_CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
logger = log.getLogger(__name__)


def _clean_download_locks() -> int:
    deleted = 0
    deletion_threshold = time.time() - _CLEAN_UP_LOCKFILES_AFTER_TIME.total_seconds()
    try:
        for f in DOWNLOAD_LOCKS_DIR().iterdir():
            if f.is_file() and f.stat().st_mtime < deletion_threshold:
                f.unlink()
                deleted += 1
    except Exception:
        # this should be, hopefully, both very rare and completely inconsequential as to
        # program correctness. if you see this happen multiple times, you may have some
        # read-only files or something and want to manually clean up this directory.
        logger.exception("Failed to clean download locks directory.")
    return deleted


def _occasionally_clean_download_locks():
    if random.random() < 0.005:  # do this about every 200 downloads
        # random.random is considered to be very fast, and we have no need of cryptographic quality.
        _clean_download_locks()


def download_lock(download_unique_str: str) -> FileLock:
    """Note that the lockfiles will never be deleted automatically.
    https://py-filelock.readthedocs.io/en/latest/api.html#filelock.BaseFileLock.release

    also see:
    https://stackoverflow.com/questions/58098634/why-does-the-python-filelock-library-delete-lockfiles-on-windows-but-not-unix

    This means local developers would have a whole bunch of zero-byte files in their
    download locks directory. So, we take a slightly idiosyncratic approach to cleaning
    this up: not wanting to run this code on every download, but also not wanting
    developers to see an infinitely-growing mess. Since parallel downloads will
    (generally) not constitute a correctness issue, the 'safest' time to clean it up will
    be when you don't have any downloads in progress, but in practice it seems likely that
    we can get rid of old lockfiles after they've existed for more than 24 hours, since
    it's quite rare that a download would last that long.
    """
    DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
    _occasionally_clean_download_locks()
    return FileLock(DOWNLOAD_LOCKS_DIR() / hex_md5_str(download_unique_str))
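A minimal usage sketch of the lock above (not part of the package); the adls:// URI is a placeholder, and the returned FileLock is used as a context manager per the filelock library's API.

from thds.adls.download_lock import download_lock

# Serialize downloads of the same remote object across processes on this machine
# by locking on a string that uniquely identifies the destination.
with download_lock("adls://mystorageaccount/mycontainer/raw/2025/data.parquet"):
    ...  # perform the download; other processes locking the same string will wait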
thds/adls/errors.py
ADDED
@@ -0,0 +1,44 @@
import typing as ty
from contextlib import contextmanager

from azure.core.exceptions import AzureError, HttpResponseError

from thds.core.log import getLogger

from .fqn import AdlsFqn

logger = getLogger(__name__)


class BlobNotFoundError(HttpResponseError):
    def __init__(self, fqn: AdlsFqn, type_hint: str = "Blob"):
        super().__init__(f"{type_hint} not found: {fqn}")


def is_blob_not_found(exc: Exception) -> bool:
    return (isinstance(exc, HttpResponseError) and exc.status_code == 404) or isinstance(
        exc, BlobNotFoundError
    )


def translate_blob_not_found(hre: HttpResponseError, sa: str, container: str, path: str) -> ty.NoReturn:
    if is_blob_not_found(hre):
        raise BlobNotFoundError(AdlsFqn.of(sa, container, path)) from hre
    raise hre


@contextmanager
def blob_not_found_translation(fqn: AdlsFqn) -> ty.Iterator[None]:
    try:
        yield
    except HttpResponseError as hre:
        translate_blob_not_found(hre, *fqn)


def translate_azure_error(client, key: str, err: AzureError) -> ty.NoReturn:
    """We reserve the right to translate others in the future."""
    fqn = AdlsFqn.of(client.account_name, client.file_system_name, key)
    if is_blob_not_found(err):
        raise BlobNotFoundError(fqn) from err
    logger.error("Failed when operating on %s", fqn)
    raise err
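A minimal usage sketch of the error-translation helpers above (not part of the package); the adls:// URI is a placeholder.

from thds.adls.errors import BlobNotFoundError, blob_not_found_translation
from thds.adls.fqn import AdlsFqn

fqn = AdlsFqn.parse("adls://mystorageaccount/mycontainer/raw/2025/data.parquet")
try:
    with blob_not_found_translation(fqn):
        ...  # any ADLS call here that raises a 404 HttpResponseError
except BlobNotFoundError as err:
    print(err)  # the message includes the adls:// URI of the missing blob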
thds/adls/file_properties.py
ADDED
@@ -0,0 +1,13 @@
from azure.storage.filedatalake import FileProperties

from .fqn import AdlsFqn
from .global_client import get_global_fs_client


def is_directory(info: FileProperties) -> bool:
    # from https://github.com/Azure/azure-sdk-for-python/issues/24814#issuecomment-1159280840
    return str(info.metadata.get("hdi_isfolder", "")).lower() == "true"


def get_file_properties(fqn: AdlsFqn) -> FileProperties:
    return get_global_fs_client(fqn.sa, fqn.container).get_file_client(fqn.path).get_file_properties()
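A minimal usage sketch of the two helpers above (not part of the package), assuming the module path `thds.adls.file_properties` listed in the file summary at the top of this diff; the adls:// URI is a placeholder.

from thds.adls.file_properties import get_file_properties, is_directory
from thds.adls.fqn import AdlsFqn

props = get_file_properties(AdlsFqn.parse("adls://mystorageaccount/mycontainer/raw/2025"))
if is_directory(props):  # checks the hdi_isfolder metadata flag
    print("path refers to a folder marker, not a blob")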
thds/adls/fqn.py
ADDED
@@ -0,0 +1,169 @@
import re
from functools import reduce
from typing import NamedTuple, TypeVar, Union

ADLS_SCHEME = (
    "adls://"  # this is our invention, but ADLS does not appear to define one suitable for general use.
)


def join(*parts: str) -> str:
    """For joining ADLS paths together."""

    def join_(prefix: str, suffix: str) -> str:
        prefix = prefix.rstrip("/")
        suffix = suffix.lstrip("/")
        return f"{prefix}/{suffix}".rstrip("/")

    return reduce(join_, parts)


class AdlsRoot(NamedTuple):
    sa: str
    container: str

    def __str__(self) -> str:
        return format_fqn(*self)

    @staticmethod
    def of(storage_account: str, container: str) -> "AdlsRoot":
        return parse_fqn(format_fqn(storage_account, container, "/")).root()

    @staticmethod
    def parse(root_uri: str) -> "AdlsRoot":
        if not root_uri.endswith("/"):
            root_uri = root_uri + "/"
        fqn = AdlsFqn.parse(root_uri)
        assert not fqn.path, f"URI '{root_uri}' does not represent an ADLS root!"
        return AdlsRoot(fqn.sa, fqn.container)

    def join(self, *path: str) -> "AdlsFqn":
        return AdlsFqn(self.sa, self.container, join("", *path))

    def __truediv__(self, path: str) -> "AdlsFqn":
        return self.join(path)

    @property
    def parent(self) -> "AdlsRoot":
        return self


class AdlsFqn(NamedTuple):
    """A fully-qualified ADLS path.

    Represents a (Storage Account, Container) root, if path is empty.

    Should usually be constructed via `parse`, `parse_fqn`, or `of`,
    which will perform validation.
    """

    sa: str
    container: str
    path: str

    def __str__(self) -> str:
        return format_fqn(*self)

    @staticmethod
    def of(storage_account: str, container: str, path: str = "") -> "AdlsFqn":
        """Expensive but includes validation."""
        return of(storage_account, container, path)

    @staticmethod
    def parse(fully_qualified_name: str) -> "AdlsFqn":
        return parse_fqn(fully_qualified_name)

    def join(self, *path_suffix: str) -> "AdlsFqn":
        return AdlsFqn(self.sa, self.container, join(self.path, *path_suffix))

    def __truediv__(self, path: str) -> "AdlsFqn":
        return self.join(path)

    # Should be a property?
    def root(self) -> AdlsRoot:
        return AdlsRoot(self.sa, self.container)

    @property
    def parent(self) -> "AdlsFqn":
        return parent(self)


FR = TypeVar("FR", bound=Union[AdlsFqn, AdlsRoot])


def of(storage_account: str, container: str, path: str = "") -> AdlsFqn:
    return AdlsFqn.parse(format_fqn(storage_account, container, path))


def parent(fqn: FR) -> FR:
    if isinstance(fqn, AdlsRoot):
        return fqn  # type: ignore
    assert isinstance(fqn, AdlsFqn)
    if "/" not in fqn.path.strip("/"):
        return AdlsFqn(fqn.sa, fqn.container, "")  # type: ignore
    return AdlsFqn(fqn.sa, fqn.container, join(*fqn.path.split("/")[:-1]))  # type: ignore


SA_REGEX = re.compile(r"^[\w]{3,24}$")
# https://github.com/MicrosoftDocs/azure-docs/issues/64497#issuecomment-714380739
CONT_REGEX = re.compile(r"^\w[\w\-]{2,63}$")
# https://learn.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata#container-names


class NotAdlsUri(ValueError):
    """This string does not represent an adls:// uri"""


def parse_fqn(fully_qualified_uri: str) -> AdlsFqn:
    """There are many ways to represent a fully qualified ADLS path, and most of them are cumbersome.

    This is an attempt to provide a standard way across our codebases
    that keeps all parts together, but allows separating them for
    passing into libraries.

    Because Storage Account names can only include alphanumeric
    characters, and Container names may only include alphanumerics
    plus the dash character, this simple format turns out to be
    unambiguous and easy for humans to read.

    We accept formatted strings with or without the leading forward
    slash in front of the path even though the formatter below
    guarantees the leading forward slash, but we do require there to
    be two spaces. If you wish to represent a Storage Account and
    Container with no path, simply append a forward slash to the end
    of your string, which represents the root of that SA and
    container, because a single forward slash is not valid as a path
    name for a blob in ADLS.
    """
    # an older, scheme-less version of format_fqn used spaces to separate sa and container.
    if fully_qualified_uri.startswith(ADLS_SCHEME):
        fully_qualified_uri = fully_qualified_uri[len(ADLS_SCHEME) :]
        sep = "/"
    else:
        sep = None
    try:
        sa, container, path = fully_qualified_uri.split(sep, 2)
    except ValueError as ve:
        raise NotAdlsUri(
            f"Cannot split {fully_qualified_uri} into at least three '/'-separated pieces."
        ) from ve
    assert SA_REGEX.match(sa), sa
    assert CONT_REGEX.match(container), container
    return AdlsFqn(sa, container, path.lstrip("/"))


parse = parse_fqn


def format_fqn(storage_account: str, container: str, path: str = "") -> str:
    """Returns a fully-qualifed ADLS name in URI format, with adls:// as a prefix.

    When formatting, we will prefix your path with a forward-slash (/)

    if it does not already have one, in order to allow empty paths to
    be formatted and parsed simply.
    """

    assert SA_REGEX.match(storage_account), storage_account
    assert CONT_REGEX.match(container), container
    return f"{ADLS_SCHEME}{storage_account}/{container}/{path.lstrip('/')}"
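A minimal usage sketch of the naming helpers above (not part of the package); the storage account and container names are placeholders.

from thds.adls.fqn import AdlsFqn, AdlsRoot

fqn = AdlsFqn.of("mystorageaccount", "mycontainer", "raw/2025/data.parquet")
assert str(fqn) == "adls://mystorageaccount/mycontainer/raw/2025/data.parquet"
assert AdlsFqn.parse(str(fqn)) == fqn  # parse/format round-trip
assert str(fqn.parent) == "adls://mystorageaccount/mycontainer/raw/2025"
assert fqn.root() == AdlsRoot.of("mystorageaccount", "mycontainer")
assert str(fqn.root() / "other/key") == "adls://mystorageaccount/mycontainer/other/key"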