thds.adls 4.1.20250701001205__py3-none-any.whl → 4.1.20250702194306__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -1,45 +1,53 @@
 import typing as ty
 from pathlib import Path
 
-from thds.adls import source
-from thds.core import parallel
+from thds import core
 from thds.core.source.tree import SourceTree
 from thds.core.thunks import thunking
 
+from . import upload
 from .download import download_or_use_verified
 from .fqn import AdlsFqn
 from .global_client import get_global_fs_client
 from .impl import ADLSFileSystem
-from .resource.up_down import AdlsHashedResource, upload
 from .ro_cache import global_cache
 from .uri import UriIsh, parse_any
 
 
-def download_to_cache(fqn_or_uri: UriIsh, md5b64: str = "") -> Path:
+def download_to_cache(
+    fqn_or_uri: UriIsh,
+    *,
+    expected_hash: ty.Optional[core.hashing.Hash] = None,
+) -> Path:
     """Downloads directly to the cache and returns a Path to the read-only file.
 
     This will allow you to download a file 'into' the cache even if
-    you provide no MD5 and the remote file properties does not have
+    you provide no expected hash and the remote file properties does not have
     one. However, future attempts to reuse the cache will force a
-    re-download if no MD5 is available at that time.
+    re-download if no remote hash is available at that time.
     """
     fqn = parse_any(fqn_or_uri)
     cache_path = global_cache().path(fqn)
     download_or_use_verified(
-        get_global_fs_client(fqn.sa, fqn.container), fqn.path, cache_path, md5b64, cache=global_cache()
+        get_global_fs_client(fqn.sa, fqn.container),
+        fqn.path,
+        cache_path,
+        expected_hash=expected_hash,
+        cache=global_cache(),
     )
+    assert cache_path.is_file(), "File should have been downloaded to the cache."
     return cache_path
 
 
-def upload_through_cache(dest: UriIsh, src_path: Path) -> AdlsHashedResource:
-    """Return an AdlsHashedResource, since by definition an upload through the cache must have a known checksum.
+def upload_through_cache(dest: UriIsh, src_path: Path) -> core.source.Source:
+    """Return a Source with a Hash, since by definition an upload through the cache must have a known checksum.
 
     Uses global client, which is pretty much always what you want.
     """
     assert src_path.is_file(), "src_path must be a file."
-    resource = upload(dest, src_path, write_through_cache=global_cache())
-    assert resource, "MD5 should always be calculable for a local path."
-    return resource
+    new_src = upload.upload(dest, src_path, write_through_cache=global_cache())
+    assert new_src.hash, "hash should always be calculable for a local path."
+    return new_src
 
 
 def download_directory(fqn: AdlsFqn) -> Path:
@@ -75,9 +83,6 @@ def upload_directory_through_cache(dest: UriIsh, src_path: Path) -> SourceTree:
 
     return SourceTree(
         sources=list(
-            map(
-                source.from_adls,
-                parallel.yield_results(upload_thunks, named="upload_directory_through_cache"),
-            )
+            core.parallel.yield_results(upload_thunks, named="upload_directory_through_cache"),
         )
     )
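
Note: the public helpers above replace the positional `md5b64: str` parameter with a keyword-only `expected_hash`, and `upload_through_cache` now returns a `core.source.Source` rather than an `AdlsHashedResource`. A minimal migration sketch, assuming `core.hashing.Hash` is constructed from an algorithm name plus the raw digest bytes (this diff only shows its `.algo` and `.bytes` fields, so the constructor call is an assumption) and assuming both helpers remain importable from the package root:

    # hedged migration sketch - names marked "assumed" are not confirmed by this diff
    from pathlib import Path

    from thds import core
    from thds.adls import download_to_cache, upload_through_cache  # import location assumed

    # before: download_to_cache("adls://sa/container/key", "some-md5-b64")
    expected = core.hashing.Hash(algo="md5", bytes=b"\x01" * 16)  # constructor shape assumed
    local: Path = download_to_cache("adls://sa/container/key", expected_hash=expected)

    # uploads now hand back a Source whose hash is guaranteed to be set:
    src = upload_through_cache("adls://sa/container/key", local)
    assert src.hash is not None
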
thds/adls/conf.py CHANGED
@@ -1,4 +1,5 @@
 """This is where fine-tuning environment variables are defined."""
+
 from thds.core import config
 
 # These defaults were tested to perform well (~200 MB/sec) on a 2 core
thds/adls/download.py CHANGED
@@ -2,110 +2,64 @@ import contextlib
 import enum
 import os
 import shutil
+import threading
 import typing as ty
-from base64 import b64decode
+from pathlib import Path
 
 import aiohttp.http_exceptions
-from azure.core.exceptions import AzureError, HttpResponseError
-from azure.storage.filedatalake import (
-    ContentSettings,
-    DataLakeFileClient,
-    FileProperties,
-    FileSystemClient,
-    aio,
-)
+from azure.core.exceptions import AzureError, HttpResponseError, ResourceModifiedError
+from azure.storage.filedatalake import DataLakeFileClient, FileProperties, FileSystemClient, aio
 
-from thds.core import fretry, log, scope, tmp
-from thds.core.hashing import b64
+from thds.core import fretry, hash_cache, hashing, log, scope, tmp
 from thds.core.types import StrOrPath
 
-from . import azcopy
+from . import azcopy, errors, etag, hashes
 from ._progress import report_download_progress
 from .download_lock import download_lock
-from .errors import MD5MismatchError, translate_azure_error
-from .etag import match_etag
 from .fqn import AdlsFqn
-from .md5 import check_reasonable_md5b64, md5_file
 from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache
 
 logger = log.getLogger(__name__)
 
 
+def _check_size(dpath: Path, expected_size: ty.Optional[int]) -> None:
+    actual_size = os.path.getsize(dpath)
+    if expected_size is not None and actual_size != expected_size:
+        raise errors.ContentLengthMismatchError(
+            f"Downloaded file {dpath} has size {actual_size} but expected {expected_size}"
+        )
+
+
 @contextlib.contextmanager
 def _atomic_download_and_move(
     fqn: AdlsFqn,
     dest: StrOrPath,
     properties: ty.Optional[FileProperties] = None,
 ) -> ty.Iterator[azcopy.download.DownloadRequest]:
-    known_size = (properties.size or 0) if properties else 0
+    known_size = properties.size if properties else None
     with tmp.temppath_same_fs(dest) as dpath:
-        with open(dpath, "wb") as f:
-            logger.debug("Downloading %s", fqn)
-            yield azcopy.download.DownloadRequest(
-                report_download_progress(f, str(fqn), known_size), dpath
-            )
-        if known_size and os.path.getsize(dpath) != known_size:
-            raise ValueError(
-                f"Downloaded file {dpath} has size {os.path.getsize(dpath)}"
-                f" but expected {known_size}."
-            )
+        logger.debug("Downloading %s", fqn)
+        if azcopy.download.should_use_azcopy(known_size or -1):
+            yield azcopy.download.DownloadRequest(dpath, known_size)
+        else:
+            with open(dpath, "wb") as down_f:
+                yield azcopy.download.SdkDownloadRequest(
+                    dpath, known_size, report_download_progress(down_f, str(fqn), known_size or 0)
+                )
+        _check_size(dpath, known_size)
         try:
             os.rename(dpath, dest)  # will succeed even if dest is read-only
         except OSError as oserr:
             if "Invalid cross-device link" in str(oserr):
                 # this shouldn't ever happen because of temppath_same_fs, but just in case...
+                logger.warning('Failed to move "%s" to "%s" - copying instead', dpath, dest)
                 shutil.copyfile(dpath, dest)
+                logger.info('Copied "%s" to "%s"', dpath, dest)
             else:
+                logger.error('Failed to move "%s" to "%s" - raising', dpath, dest)
                 raise
 
 
-@contextlib.contextmanager
-def _verify_md5s_before_and_after_download(
-    remote_md5b64: str, expected_md5b64: str, fqn: AdlsFqn, local_dest: StrOrPath
-) -> ty.Iterator[None]:
-    if expected_md5b64:
-        check_reasonable_md5b64(expected_md5b64)
-    if remote_md5b64:
-        check_reasonable_md5b64(remote_md5b64)
-    if remote_md5b64 and expected_md5b64 and remote_md5b64 != expected_md5b64:
-        raise MD5MismatchError(
-            f"ADLS thinks the MD5 of {fqn} is {remote_md5b64}, but we expected {expected_md5b64}."
-            " This may indicate that we need to update a hash in the codebase."
-        )
-
-    yield  # perform download
-
-    with log.logger_context(hash_for="after-download"):
-        local_md5b64 = b64(md5_file(local_dest))
-        check_reasonable_md5b64(local_md5b64)  # must always exist
-        if remote_md5b64 and remote_md5b64 != local_md5b64:
-            raise MD5MismatchError(
-                f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
-                f" but the remote ({fqn}) says it should be {remote_md5b64}."
-                f" This may indicate that ADLS has an erroneous MD5 for {fqn}."
-            )
-        if expected_md5b64 and local_md5b64 != expected_md5b64:
-            raise MD5MismatchError(
-                f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
-                f" but we expected it to be {expected_md5b64}."
-                f" This probably indicates a corrupted download of {fqn}"
-            )
-        all_hashes = dict(local=local_md5b64, remote=remote_md5b64, expected=expected_md5b64)
-        assert 1 == len(set(filter(None, all_hashes.values()))), all_hashes
-
-
-def _md5b64_path_if_exists(path: StrOrPath) -> ty.Optional[str]:
-    if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
-        return None
-    return b64(md5_file(path))
-
-
-def _remote_md5b64(file_properties: FileProperties) -> str:
-    if file_properties.content_settings.content_md5:
-        return b64(file_properties.content_settings.content_md5)
-    return ""
-
-
 # Async is weird.
 #
 # You cannot easily call an async function from within a standard/non-async function.
@@ -144,6 +98,57 @@ def _remote_md5b64(file_properties: FileProperties) -> str:
 # again and rely on the controller to re-send the previously fetched result.
 
 
+class _FileResult(ty.NamedTuple):
+    hash: hashing.Hash
+    hit: ty.Optional[Path]
+
+
+def _attempt_cache_hit(
+    expected_hash: ty.Optional[hashing.Hash],
+    fqn: AdlsFqn,
+    local_path: StrOrPath,
+    cache: ty.Optional[Cache],
+) -> ty.Optional[_FileResult]:
+    if not expected_hash:
+        return None
+
+    hash_path_if_exists = hashes.hash_path_for_algo(expected_hash.algo)
+
+    with log.logger_context(hash_for="before-download-dest"):
+        local_hash = hash_path_if_exists(local_path)
+    if local_hash == expected_hash:
+        logger.debug("Local path matches %s - no need to look further", expected_hash.algo)
+        if cache:
+            cache_path = cache.path(fqn)
+            with log.logger_context(hash_for="before-download-cache"):
+                if local_hash != hash_path_if_exists(cache_path):
+                    # only copy if the cache is out of date
+                    from_local_path_to_cache(local_path, cache_path, cache.link)
+            return _FileResult(local_hash, hit=cache_path)
+        return _FileResult(local_hash, hit=Path(local_path))
+
+    if local_hash:
+        logger.debug(
+            "Local path exists but does not match expected %s %s",
+            expected_hash.algo,
+            expected_hash.bytes,
+        )
+    if cache:
+        cache_path = cache.path(fqn)
+        cache_hash = hash_path_if_exists(cache_path)
+        if cache_hash == expected_hash:  # file in cache matches!
+            from_cache_path_to_local(cache_path, local_path, cache.link)
+            return _FileResult(cache_hash, hit=cache_path)
+
+        if cache_hash:
+            logger.debug(
+                "Cache path exists but does not match expected %s %s",
+                expected_hash.algo,
+                expected_hash.bytes,
+            )
+    return None
+
+
 class _IoRequest(enum.Enum):
     FILE_PROPERTIES = "file_properties"
 
@@ -152,18 +157,13 @@ IoRequest = ty.Union[_IoRequest, azcopy.download.DownloadRequest]
 IoResponse = ty.Union[FileProperties, None]
 
 
-class _FileResult(ty.NamedTuple):
-    md5b64: str
-    hit: bool
-
-
 _dl_scope = scope.Scope("adls.download")
 
 
 def _download_or_use_verified_cached_coroutine(  # noqa: C901
     fqn: AdlsFqn,
     local_path: StrOrPath,
-    md5b64: str = "",
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
 ) -> ty.Generator[IoRequest, IoResponse, _FileResult]:
     """Make a file on ADLS available at the local path provided.
@@ -202,82 +202,72 @@ def _download_or_use_verified_cached_coroutine(  # noqa: C901
     writing in a standard fashion.
 
     Raises StopIteration when complete. StopIteration.value.hit will
-    be True if there was a cache hit, and False if a download was
-    required. `.value` will also contain the md5b64 of the downloaded
-    file, which may be used as desired.
+    be the Path to the cached file if there was a cache hit, and None
+    if a download was required. `.value` will also contain the Hash of
+    the downloaded file, which may be used as desired.
     """
     if not local_path:
         raise ValueError("Must provide a destination path.")
 
-    _dl_scope.enter(log.logger_context(dl=fqn))
+    _dl_scope.enter(log.logger_context(dl=fqn, pid=os.getpid(), tid=threading.get_ident()))
     file_properties = None
-    if not md5b64:
-        # we don't know what we expect, so attempt to retrieve an
-        # expectation from ADLS itself.
+
+    if not expected_hash:
+        # we don't know what we expect, so attempt to retrieve
+        # expectations from ADLS itself.
         file_properties = yield _IoRequest.FILE_PROPERTIES
-        md5b64 = _remote_md5b64(file_properties)  # type: ignore[arg-type]
-        # TODO - check above type ignore
+        if file_properties:
+            # critically, we expect the _first_ one in this list to be the fastest to verify.
+            expected_hash = next(iter(hashes.extract_hashes_from_props(file_properties).values()), None)
 
     def attempt_cache_hit() -> ty.Optional[_FileResult]:
-        if not md5b64:
-            return None
-
-        check_reasonable_md5b64(md5b64)
-        with log.logger_context(hash_for="before-download-dest"):
-            local_md5b64 = _md5b64_path_if_exists(local_path)
-        if local_md5b64 == md5b64:
-            logger.debug("Local path matches MD5 - no need to look further")
-            if cache:
-                cache_path = cache.path(fqn)
-                with log.logger_context(hash_for="before-download-cache"):
-                    if local_md5b64 != _md5b64_path_if_exists(cache_path):
-                        # only copy if the cache is out of date
-                        from_local_path_to_cache(local_path, cache_path, cache.link)
-            return _FileResult(local_md5b64, hit=True)
-
-        if local_md5b64:
-            logger.debug("Local path exists but does not match expected md5 %s", md5b64)
-        if cache:
-            cache_path = cache.path(fqn)
-            cache_md5b64 = _md5b64_path_if_exists(cache_path)
-            if cache_md5b64 == md5b64:  # file in cache matches!
-                from_cache_path_to_local(cache_path, local_path, cache.link)
-                return _FileResult(cache_md5b64, hit=True)
-
-            if cache_md5b64:
-                logger.debug("Cache path exists but does not match expected md5 %s", md5b64)
-        return None
+        return _attempt_cache_hit(
+            expected_hash=expected_hash, cache=cache, fqn=fqn, local_path=local_path
+        )
 
-    # attempt cache hit before taking a lock, to avoid contention for existing files.
+    # attempt cache hits before taking a lock, to avoid contention for existing files.
     if file_result := attempt_cache_hit():
         return file_result  # noqa: B901
 
-    _dl_scope.enter(download_lock(str(cache.path(fqn) if cache else local_path)))
+    # No cache hit, so its time to prepare to download. if a cache was provided, we will
+    # _put_ the resulting file in it.
+
+    file_lock = str(cache.path(fqn) if cache else local_path)
     # create lockfile name from the (shared) cache path if present, otherwise the final
     # destination. Non-cache users may then still incur multiple downloads in parallel,
     # but if you wanted to coordinate then you should probably have been using the global
     # cache in the first place.
+    _dl_scope.enter(download_lock(file_lock))
 
     # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
     if file_result := attempt_cache_hit():
-        logger.debug("Got cache hit on the second attempt, after acquiring lock for %s", fqn)
+        logger.info("Got cache hit on the second attempt, after acquiring lock for %s", fqn)
        return file_result  # noqa: B901
 
     logger.debug("Unable to find a cached version anywhere that we looked...")
     file_properties = yield _IoRequest.FILE_PROPERTIES
-    # no point in downloading if we've asked for hash X but ADLS only has hash Y.
-    with _verify_md5s_before_and_after_download(
-        _remote_md5b64(file_properties),  # type: ignore[arg-type]
-        # TODO - check above type ignore
-        md5b64,
+
+    # if any of the remote hashes match the expected hash, verify that one.
+    # otherwise, verify the first remote hash in the list, since that's the fastest one.
+    all_remote_hashes = hashes.extract_hashes_from_props(file_properties)
+    remote_hash_to_match = all_remote_hashes.get(expected_hash.algo) if expected_hash else None
+    with hashes.verify_hashes_before_and_after_download(
+        remote_hash_to_match,
+        expected_hash,
         fqn,
         local_path,
     ):  # download new data directly to local path
         with _atomic_download_and_move(fqn, local_path, file_properties) as tmpwriter:
             yield tmpwriter
+
     if cache:
         from_local_path_to_cache(local_path, cache.path(fqn), cache.link)
-    return _FileResult(md5b64 or b64(md5_file(local_path)), hit=False)
+
+    hash_to_set_if_missing = expected_hash or remote_hash_to_match
+    if not hash_to_set_if_missing or hash_to_set_if_missing.algo not in hashes.PREFERRED_ALGOS:
+        hash_to_set_if_missing = hash_cache.filehash(hashes.PREFERRED_ALGOS[0], local_path)
+    assert hash_to_set_if_missing, "We should have a preferred hash to set at this point."
+    return _FileResult(hash_to_set_if_missing, hit=None)
 
 
 # So ends the crazy download caching coroutine.
@@ -293,7 +283,7 @@ def _prep_download_coroutine(
     fs_client: FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-    md5b64: str = "",
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
 ) -> ty.Tuple[
     ty.Generator[IoRequest, IoResponse, _FileResult],
@@ -304,22 +294,12 @@ def _prep_download_coroutine(
     co = _download_or_use_verified_cached_coroutine(
         AdlsFqn(ty.cast(str, fs_client.account_name), fs_client.file_system_name, remote_key),
         local_path,
-        md5b64=md5b64,
+        expected_hash=expected_hash,
         cache=cache,
     )
     return co, co.send(None), None, fs_client.get_file_client(remote_key)
 
 
-def _set_md5_if_missing(
-    file_properties: ty.Optional[FileProperties], md5b64: str
-) -> ty.Optional[ContentSettings]:
-    if not file_properties or file_properties.content_settings.content_md5:
-        return None
-    file_properties.content_settings.content_md5 = b64decode(md5b64)  # type: ignore[assignment]
-    # TODO - check above type ignore
-    return file_properties.content_settings
-
-
 def _excs_to_retry() -> ty.Callable[[Exception], bool]:
     """These are exceptions that we observe to be spurious failures worth retrying."""
     return fretry.is_exc(
@@ -343,9 +323,10 @@ def download_or_use_verified(
     fs_client: FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-    md5b64: str = "",
+    *,
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
-) -> bool:
+) -> ty.Optional[Path]:
     """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.
 
     Note that you will get a logged warning if `local_path` already exists when you call
@@ -354,7 +335,7 @@ def download_or_use_verified(
     file_properties = None
     try:
         co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
-            fs_client, remote_key, local_path, md5b64, cache
+            fs_client, remote_key, local_path, expected_hash, cache
         )
         _dl_scope.enter(dl_file_client)  # on __exit__, will release the connection to the pool
         while True:
@@ -373,16 +354,16 @@
         else:
             raise ValueError(f"Unexpected coroutine request: {co_request}")
     except StopIteration as si:
-        if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
+        if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
             try:
-                logger.info(f"Setting missing MD5 for {remote_key}")
+                logger.info(f"Setting missing hash for {remote_key}")
                 assert file_properties
-                dl_file_client.set_http_headers(cs, **match_etag(file_properties))
-            except HttpResponseError as hre:
-                logger.info(f"Unable to set MD5 for {remote_key}: {hre}")
+                dl_file_client.set_metadata(meta, **etag.match_etag(file_properties))
+            except (HttpResponseError, ResourceModifiedError) as ex:
+                logger.info(f"Unable to set Hash for {remote_key}: {ex}")
         return si.value.hit
     except AzureError as err:
-        translate_azure_error(fs_client, remote_key, err)
+        errors.translate_azure_error(fs_client, remote_key, err)
 
 
 _async_dl_scope = scope.AsyncScope("adls.download.async")
@@ -390,21 +371,24 @@ _async_dl_scope = scope.AsyncScope("adls.download.async")
 
 @_dl_scope.bound
 @_async_dl_scope.async_bound
+@fretry.retry_regular_async(
+    fretry.is_exc(errors.ContentLengthMismatchError), fretry.iter_to_async(fretry.n_times(2))
+)
 async def async_download_or_use_verified(
     fs_client: aio.FileSystemClient,
     remote_key: str,
     local_path: StrOrPath,
-    md5b64: str = "",
+    *,
+    expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
-) -> bool:
+) -> ty.Optional[Path]:
     file_properties = None
     try:
         co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
-            fs_client, remote_key, local_path, md5b64, cache
+            fs_client, remote_key, local_path, expected_hash, cache
         )
-        await _async_dl_scope.async_enter(
-            dl_file_client  # type: ignore[arg-type]
-        )  # on __aexit__, will release the connection to the pool
+        await _async_dl_scope.async_enter(dl_file_client)  # type: ignore[arg-type]
+        # on __aexit__, will release the connection to the pool
         while True:
             if co_request == _IoRequest.FILE_PROPERTIES:
                 if not file_properties:
@@ -414,7 +398,6 @@ async def async_download_or_use_verified(
                 co_request = co.send(file_properties)
             elif isinstance(co_request, azcopy.download.DownloadRequest):
                 # coroutine is requesting download
-
                 await fretry.retry_regular_async(
                     _excs_to_retry(), fretry.iter_to_async(fretry.n_times(2))
                 )(
@@ -428,16 +411,14 @@
             raise ValueError(f"Unexpected coroutine request: {co_request}")
 
     except StopIteration as si:
-        if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
+        if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
             try:
-                logger.info(f"Setting missing MD5 for {remote_key}")
+                logger.info(f"Setting missing Hash for {remote_key}")
                 assert file_properties
-                await dl_file_client.set_http_headers(  # type: ignore[misc]
-                    cs, **match_etag(file_properties)
-                )
+                await dl_file_client.set_metadata(meta, **etag.match_etag(file_properties))  # type: ignore[misc]
                 # TODO - check above type ignore
-            except HttpResponseError as hre:
-                logger.info(f"Unable to set MD5 for {remote_key}: {hre}")
+            except (HttpResponseError, ResourceModifiedError) as ex:
+                logger.info(f"Unable to set Hash for {remote_key}: {ex}")
         return si.value.hit
     except AzureError as err:
-        translate_azure_error(fs_client, remote_key, err)
+        errors.translate_azure_error(fs_client, remote_key, err)
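
Note: two behavioral changes in this module are easy to miss. `download_or_use_verified` and `async_download_or_use_verified` now return `ty.Optional[Path]` instead of `bool`: a `Path` (the verified cached copy) means a cache hit, while `None` means a fresh download was performed, so plain truthiness checks keep working but comparisons against `True`/`False` will not. The async variant also now retries up to two times on the new `ContentLengthMismatchError` raised by the `_check_size` guard. A hedged caller sketch (client construction elided):

    # adapting a caller to the Optional[Path] return value; fs_client setup not shown
    hit = download_or_use_verified(
        fs_client, "some/remote/key", "/tmp/local.bin", cache=global_cache()
    )
    if hit is None:
        print("freshly downloaded to /tmp/local.bin")
    else:
        print(f"reused verified copy at {hit}")  # hit is the path of the cached copy
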
thds/adls/download_lock.py CHANGED
@@ -9,7 +9,7 @@ from thds.core import config, home, log
 
 from .md5 import hex_md5_str
 
-DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".adls-md5-download-locks", parse=Path)
+DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/download-locks", parse=Path)
 _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
 _CLEAN_UP_LOCKFILES_EVERY = timedelta(hours=1).total_seconds()
 _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic() - _CLEAN_UP_LOCKFILES_EVERY
@@ -60,4 +60,11 @@ def download_lock(download_unique_str: str) -> FileLock:
     """
     DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
     _occasionally_clean_download_locks()
-    return FileLock(DOWNLOAD_LOCKS_DIR() / hex_md5_str(download_unique_str))
+    return FileLock(
+        DOWNLOAD_LOCKS_DIR()
+        / (download_unique_str.split("/")[-1][:50] + hex_md5_str(download_unique_str)),
+        # is_singleton=True,
+        # critical for keeping this reentrant without passing the lock around.
+        # see https://github.com/tox-dev/filelock/issues/315#issuecomment-2016797681
+        # however, this is not compatible with the version of Databricks we use, so.....
+    )
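
Note: lock files were previously named by the bare MD5 hex of the lock string; they now also carry a human-readable prefix (the last path segment, truncated to 50 characters), which makes stale locks in the directory attributable at a glance. Illustratively (the path and the digest shown are made up):

    # hypothetical input string; the digest below is illustrative, not a real MD5 of it
    s = "/home/user/.thds/adls/ro-cache/sa/container/model.bin"
    lock_name = s.split("/")[-1][:50] + hex_md5_str(s)
    # -> something like "model.bin3f8e2c...41d6" inside DOWNLOAD_LOCKS_DIR()
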
thds/adls/errors.py CHANGED
@@ -15,8 +15,16 @@ class BlobNotFoundError(HttpResponseError):
         super().__init__(f"{type_hint} not found: {fqn}")
 
 
-class MD5MismatchError(Exception):
-    """Indicates that something needs to be done by the developer to correct a hash mismatch."""
+class BlobPropertiesValidationError(ValueError):
+    """Raised when the properties of a blob do not match the expected values."""
+
+
+class HashMismatchError(BlobPropertiesValidationError):
+    """Raised when the hash of a file does not match the expected value."""
+
+
+class ContentLengthMismatchError(BlobPropertiesValidationError):
+    """Raised when the content length of a file does not match the expected value as retrieved from the server."""
 
 
 def is_blob_not_found(exc: Exception) -> bool:
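
Note: `MD5MismatchError` is removed rather than renamed; the nearest replacement is `HashMismatchError`, and both new exceptions share the `BlobPropertiesValidationError` base (itself a `ValueError`), so downstream handlers can catch the base class once. A hedged sketch (the guarded call is hypothetical):

    from thds.adls import errors

    try:
        fetch_verified_file()  # hypothetical caller that downloads and verifies
    except errors.HashMismatchError:
        ...  # a recorded hash is wrong somewhere - needs developer attention
    except errors.BlobPropertiesValidationError:
        ...  # e.g. ContentLengthMismatchError: likely a truncated transfer, retryable
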
@@ -1,3 +1,5 @@
+import typing as ty
+
 from azure.core.exceptions import AzureError, ResourceNotFoundError
 from azure.storage.blob import BlobProperties
 from azure.storage.filedatalake import FileProperties
@@ -27,6 +29,12 @@ def get_blob_properties(fqn: AdlsFqn) -> BlobProperties:
     )
 
 
+class PropertiesP(ty.Protocol):
+    name: ty.Any
+    metadata: ty.Any
+    content_settings: ty.Any
+
+
 # At some point it may make sense to separate file and blob property modules,
 # but they also are very closely tied together. AFAIK all files are blobs, and given our usage of ADLS,
 # I don't know if we ever deal with things that are blobs but not files.
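
Note: `PropertiesP` is a structural type, so helpers can now be written once against the shared surface of `FileProperties` and `BlobProperties` instead of being duplicated per client. A hedged sketch (the helper itself is hypothetical):

    def describe(props: PropertiesP) -> str:
        # accepts FileProperties or BlobProperties alike, since both expose
        # name, metadata, and content_settings attributes
        return f"{props.name}: metadata={props.metadata!r}"
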