thds.adls 3.2.20250630174944-py3-none-any.whl → 4.1.20250701190349-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of thds.adls has been flagged as potentially problematic.

@@ -1,45 +1,53 @@
  import typing as ty
  from pathlib import Path

- from thds.adls import source
- from thds.core import parallel
+ from thds import core
  from thds.core.source.tree import SourceTree
  from thds.core.thunks import thunking

+ from . import upload
  from .download import download_or_use_verified
  from .fqn import AdlsFqn
  from .global_client import get_global_fs_client
  from .impl import ADLSFileSystem
- from .resource.up_down import AdlsHashedResource, upload
  from .ro_cache import global_cache
  from .uri import UriIsh, parse_any


- def download_to_cache(fqn_or_uri: UriIsh, md5b64: str = "") -> Path:
+ def download_to_cache(
+     fqn_or_uri: UriIsh,
+     *,
+     expected_hash: ty.Optional[core.hashing.Hash] = None,
+ ) -> Path:
      """Downloads directly to the cache and returns a Path to the read-only file.

      This will allow you to download a file 'into' the cache even if
-     you provide no MD5 and the remote file properties does not have
+     you provide no expected hash and the remote file properties does not have
      one. However, future attempts to reuse the cache will force a
-     re-download if no MD5 is available at that time.
+     re-download if no remote hash is available at that time.
      """
      fqn = parse_any(fqn_or_uri)
      cache_path = global_cache().path(fqn)
      download_or_use_verified(
-         get_global_fs_client(fqn.sa, fqn.container), fqn.path, cache_path, md5b64, cache=global_cache()
+         get_global_fs_client(fqn.sa, fqn.container),
+         fqn.path,
+         cache_path,
+         expected_hash=expected_hash,
+         cache=global_cache(),
      )
+     assert cache_path.is_file(), "File should have been downloaded to the cache."
      return cache_path


- def upload_through_cache(dest: UriIsh, src_path: Path) -> AdlsHashedResource:
-     """Return an AdlsHashedResource, since by definition an upload through the cache must have a known checksum.
+ def upload_through_cache(dest: UriIsh, src_path: Path) -> core.source.Source:
+     """Return a Source with a Hash, since by definition an upload through the cache must have a known checksum.

      Uses global client, which is pretty much always what you want.
      """
      assert src_path.is_file(), "src_path must be a file."
-     resource = upload(dest, src_path, write_through_cache=global_cache())
-     assert resource, "MD5 should always be calculable for a local path."
-     return resource
+     new_src = upload.upload(dest, src_path, write_through_cache=global_cache())
+     assert new_src.hash, "hash should always be calculable for a local path."
+     return new_src


  def download_directory(fqn: AdlsFqn) -> Path:
@@ -75,9 +83,6 @@ def upload_directory_through_cache(dest: UriIsh, src_path: Path) -> SourceTree:

      return SourceTree(
          sources=list(
-             map(
-                 source.from_adls,
-                 parallel.yield_results(upload_thunks, named="upload_directory_through_cache"),
-             )
+             core.parallel.yield_results(upload_thunks, named="upload_directory_through_cache"),
          )
      )
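For reference, a hedged sketch of calling the reworked 4.x API above. The Hash constructor keywords (algo/bytes) are assumed from the attribute names the diff reads (`expected_hash.algo`, `expected_hash.bytes`), the package-root re-exports are assumed, and the URI and digest are hypothetical placeholders.

```python
from pathlib import Path

from thds import core
from thds.adls import download_to_cache, upload_through_cache  # assumed re-exports

# hypothetical digest; the real Hash constructor signature may differ
expected = core.hashing.Hash(algo="md5", bytes=b"\x9e" * 16)

local: Path = download_to_cache(
    "adls://myaccount/mycontainer/data/model.bin",  # hypothetical URI
    expected_hash=expected,  # keyword-only in 4.x; was a positional md5b64 str in 3.x
)

# upload_through_cache now returns a core.source.Source instead of an
# AdlsHashedResource; its .hash is always set for a local file.
new_src = upload_through_cache("adls://myaccount/mycontainer/data/model.bin", local)
assert new_src.hash is not None
```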
thds/adls/conf.py CHANGED
@@ -1,4 +1,5 @@
  """This is where fine-tuning environment variables are defined."""
+
  from thds.core import config

  # These defaults were tested to perform well (~200 MB/sec) on a 2 core
thds/adls/download.py CHANGED
@@ -2,30 +2,21 @@ import contextlib
  import enum
  import os
  import shutil
+ import threading
  import typing as ty
- from base64 import b64decode
+ from pathlib import Path

  import aiohttp.http_exceptions
- from azure.core.exceptions import AzureError, HttpResponseError
- from azure.storage.filedatalake import (
-     ContentSettings,
-     DataLakeFileClient,
-     FileProperties,
-     FileSystemClient,
-     aio,
- )
+ from azure.core.exceptions import AzureError, HttpResponseError, ResourceModifiedError
+ from azure.storage.filedatalake import DataLakeFileClient, FileProperties, FileSystemClient, aio

- from thds.core import fretry, log, scope, tmp
- from thds.core.hashing import b64
+ from thds.core import fretry, hash_cache, hashing, log, scope, tmp
  from thds.core.types import StrOrPath

- from . import azcopy
+ from . import azcopy, errors, etag, hashes
  from ._progress import report_download_progress
  from .download_lock import download_lock
- from .errors import MD5MismatchError, translate_azure_error
- from .etag import match_etag
  from .fqn import AdlsFqn
- from .md5 import check_reasonable_md5b64, md5_file
  from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache

  logger = log.getLogger(__name__)
@@ -39,13 +30,16 @@ def _atomic_download_and_move(
  ) -> ty.Iterator[azcopy.download.DownloadRequest]:
      known_size = (properties.size or 0) if properties else 0
      with tmp.temppath_same_fs(dest) as dpath:
-         with open(dpath, "wb") as f:
-             logger.debug("Downloading %s", fqn)
-             yield azcopy.download.DownloadRequest(
-                 report_download_progress(f, str(fqn), known_size), dpath
-             )
+         logger.debug("Downloading %s", fqn)
+         if azcopy.download.should_use_azcopy(known_size):
+             yield azcopy.download.DownloadRequest(dpath, known_size)
+         else:
+             with open(dpath, "wb") as down_f:
+                 yield azcopy.download.SdkDownloadRequest(
+                     dpath, known_size, report_download_progress(down_f, str(fqn), known_size)
+                 )
          if known_size and os.path.getsize(dpath) != known_size:
-             raise ValueError(
+             raise errors.ContentLengthMismatchError(
                  f"Downloaded file {dpath} has size {os.path.getsize(dpath)}"
                  f" but expected {known_size}."
              )
@@ -54,58 +48,14 @@ def _atomic_download_and_move(
      except OSError as oserr:
          if "Invalid cross-device link" in str(oserr):
              # this shouldn't ever happen because of temppath_same_fs, but just in case...
+             logger.warning('Failed to move "%s" to "%s" - copying instead', dpath, dest)
              shutil.copyfile(dpath, dest)
+             logger.info('Copied "%s" to "%s"', dpath, dest)
          else:
+             logger.error('Failed to move "%s" to "%s" - raising', dpath, dest)
              raise


- @contextlib.contextmanager
- def _verify_md5s_before_and_after_download(
-     remote_md5b64: str, expected_md5b64: str, fqn: AdlsFqn, local_dest: StrOrPath
- ) -> ty.Iterator[None]:
-     if expected_md5b64:
-         check_reasonable_md5b64(expected_md5b64)
-     if remote_md5b64:
-         check_reasonable_md5b64(remote_md5b64)
-     if remote_md5b64 and expected_md5b64 and remote_md5b64 != expected_md5b64:
-         raise MD5MismatchError(
-             f"ADLS thinks the MD5 of {fqn} is {remote_md5b64}, but we expected {expected_md5b64}."
-             " This may indicate that we need to update a hash in the codebase."
-         )
-
-     yield  # perform download
-
-     with log.logger_context(hash_for="after-download"):
-         local_md5b64 = b64(md5_file(local_dest))
-         check_reasonable_md5b64(local_md5b64)  # must always exist
-         if remote_md5b64 and remote_md5b64 != local_md5b64:
-             raise MD5MismatchError(
-                 f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
-                 f" but the remote ({fqn}) says it should be {remote_md5b64}."
-                 f" This may indicate that ADLS has an erroneous MD5 for {fqn}."
-             )
-         if expected_md5b64 and local_md5b64 != expected_md5b64:
-             raise MD5MismatchError(
-                 f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
-                 f" but we expected it to be {expected_md5b64}."
-                 f" This probably indicates a corrupted download of {fqn}"
-             )
-         all_hashes = dict(local=local_md5b64, remote=remote_md5b64, expected=expected_md5b64)
-         assert 1 == len(set(filter(None, all_hashes.values()))), all_hashes
-
-
- def _md5b64_path_if_exists(path: StrOrPath) -> ty.Optional[str]:
-     if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
-         return None
-     return b64(md5_file(path))
-
-
- def _remote_md5b64(file_properties: FileProperties) -> str:
-     if file_properties.content_settings.content_md5:
-         return b64(file_properties.content_settings.content_md5)
-     return ""
-
-
  # Async is weird.
  #
  # You cannot easily call an async function from within a standard/non-async function.
@@ -144,6 +94,57 @@ def _remote_md5b64(file_properties: FileProperties) -> str:
  # again and rely on the controller to re-send the previously fetched result.


+ class _FileResult(ty.NamedTuple):
+     hash: hashing.Hash
+     hit: ty.Optional[Path]
+
+
+ def _attempt_cache_hit(
+     expected_hash: ty.Optional[hashing.Hash],
+     fqn: AdlsFqn,
+     local_path: StrOrPath,
+     cache: ty.Optional[Cache],
+ ) -> ty.Optional[_FileResult]:
+     if not expected_hash:
+         return None
+
+     hash_path_if_exists = hashes.hash_path_for_algo(expected_hash.algo)
+
+     with log.logger_context(hash_for="before-download-dest"):
+         local_hash = hash_path_if_exists(local_path)
+     if local_hash == expected_hash:
+         logger.debug("Local path matches %s - no need to look further", expected_hash.algo)
+         if cache:
+             cache_path = cache.path(fqn)
+             with log.logger_context(hash_for="before-download-cache"):
+                 if local_hash != hash_path_if_exists(cache_path):
+                     # only copy if the cache is out of date
+                     from_local_path_to_cache(local_path, cache_path, cache.link)
+             return _FileResult(local_hash, hit=cache_path)
+         return _FileResult(local_hash, hit=Path(local_path))
+
+     if local_hash:
+         logger.debug(
+             "Local path exists but does not match expected %s %s",
+             expected_hash.algo,
+             expected_hash.bytes,
+         )
+     if cache:
+         cache_path = cache.path(fqn)
+         cache_hash = hash_path_if_exists(cache_path)
+         if cache_hash == expected_hash:  # file in cache matches!
+             from_cache_path_to_local(cache_path, local_path, cache.link)
+             return _FileResult(cache_hash, hit=cache_path)
+
+         if cache_hash:
+             logger.debug(
+                 "Cache path exists but does not match expected %s %s",
+                 expected_hash.algo,
+                 expected_hash.bytes,
+             )
+     return None
+
+
  class _IoRequest(enum.Enum):
      FILE_PROPERTIES = "file_properties"

@@ -152,18 +153,13 @@ IoRequest = ty.Union[_IoRequest, azcopy.download.DownloadRequest]
  IoResponse = ty.Union[FileProperties, None]


- class _FileResult(ty.NamedTuple):
-     md5b64: str
-     hit: bool
-
-
  _dl_scope = scope.Scope("adls.download")


  def _download_or_use_verified_cached_coroutine(  # noqa: C901
      fqn: AdlsFqn,
      local_path: StrOrPath,
-     md5b64: str = "",
+     expected_hash: ty.Optional[hashing.Hash] = None,
      cache: ty.Optional[Cache] = None,
  ) -> ty.Generator[IoRequest, IoResponse, _FileResult]:
      """Make a file on ADLS available at the local path provided.
@@ -202,82 +198,72 @@ def _download_or_use_verified_cached_coroutine(  # noqa: C901
      writing in a standard fashion.

      Raises StopIteration when complete. StopIteration.value.hit will
-     be True if there was a cache hit, and False if a download was
-     required. `.value` will also contain the md5b64 of the downloaded
-     file, which may be used as desired.
+     be the Path to the cached file if there was a cache hit, and None
+     if a download was required. `.value` will also contain the Hash of
+     the downloaded file, which may be used as desired.
      """
      if not local_path:
          raise ValueError("Must provide a destination path.")

-     _dl_scope.enter(log.logger_context(dl=fqn))
+     _dl_scope.enter(log.logger_context(dl=fqn, pid=os.getpid(), tid=threading.get_ident()))
      file_properties = None
-     if not md5b64:
-         # we don't know what we expect, so attempt to retrieve an
-         # expectation from ADLS itself.
+
+     if not expected_hash:
+         # we don't know what we expect, so attempt to retrieve
+         # expectations from ADLS itself.
          file_properties = yield _IoRequest.FILE_PROPERTIES
-         md5b64 = _remote_md5b64(file_properties)  # type: ignore[arg-type]
-         # TODO - check above type ignore
+         if file_properties:
+             # critically, we expect the _first_ one in this list to be the fastest to verify.
+             expected_hash = next(iter(hashes.extract_hashes_from_props(file_properties).values()), None)

      def attempt_cache_hit() -> ty.Optional[_FileResult]:
-         if not md5b64:
-             return None
-
-         check_reasonable_md5b64(md5b64)
-         with log.logger_context(hash_for="before-download-dest"):
-             local_md5b64 = _md5b64_path_if_exists(local_path)
-         if local_md5b64 == md5b64:
-             logger.debug("Local path matches MD5 - no need to look further")
-             if cache:
-                 cache_path = cache.path(fqn)
-                 with log.logger_context(hash_for="before-download-cache"):
-                     if local_md5b64 != _md5b64_path_if_exists(cache_path):
-                         # only copy if the cache is out of date
-                         from_local_path_to_cache(local_path, cache_path, cache.link)
-             return _FileResult(local_md5b64, hit=True)
-
-         if local_md5b64:
-             logger.debug("Local path exists but does not match expected md5 %s", md5b64)
-         if cache:
-             cache_path = cache.path(fqn)
-             cache_md5b64 = _md5b64_path_if_exists(cache_path)
-             if cache_md5b64 == md5b64:  # file in cache matches!
-                 from_cache_path_to_local(cache_path, local_path, cache.link)
-                 return _FileResult(cache_md5b64, hit=True)
-
-             if cache_md5b64:
-                 logger.debug("Cache path exists but does not match expected md5 %s", md5b64)
-         return None
+         return _attempt_cache_hit(
+             expected_hash=expected_hash, cache=cache, fqn=fqn, local_path=local_path
+         )

-     # attempt cache hit before taking a lock, to avoid contention for existing files.
+     # attempt cache hits before taking a lock, to avoid contention for existing files.
      if file_result := attempt_cache_hit():
          return file_result  # noqa: B901

-     _dl_scope.enter(download_lock(str(cache.path(fqn) if cache else local_path)))
+     # No cache hit, so its time to prepare to download. if a cache was provided, we will
+     # _put_ the resulting file in it.
+
+     file_lock = str(cache.path(fqn) if cache else local_path)
      # create lockfile name from the (shared) cache path if present, otherwise the final
      # destination. Non-cache users may then still incur multiple downloads in parallel,
      # but if you wanted to coordinate then you should probably have been using the global
      # cache in the first place.
+     _dl_scope.enter(download_lock(file_lock))

      # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
      if file_result := attempt_cache_hit():
-         logger.debug("Got cache hit on the second attempt, after acquiring lock for %s", fqn)
+         logger.info("Got cache hit on the second attempt, after acquiring lock for %s", fqn)
          return file_result  # noqa: B901

      logger.debug("Unable to find a cached version anywhere that we looked...")
      file_properties = yield _IoRequest.FILE_PROPERTIES
-     # no point in downloading if we've asked for hash X but ADLS only has hash Y.
-     with _verify_md5s_before_and_after_download(
-         _remote_md5b64(file_properties),  # type: ignore[arg-type]
-         # TODO - check above type ignore
-         md5b64,
+
+     # if any of the remote hashes match the expected hash, verify that one.
+     # otherwise, verify the first remote hash in the list, since that's the fastest one.
+     all_remote_hashes = hashes.extract_hashes_from_props(file_properties)
+     remote_hash_to_match = all_remote_hashes.get(expected_hash.algo) if expected_hash else None
+     with hashes.verify_hashes_before_and_after_download(
+         remote_hash_to_match,
+         expected_hash,
          fqn,
          local_path,
      ):  # download new data directly to local path
          with _atomic_download_and_move(fqn, local_path, file_properties) as tmpwriter:
              yield tmpwriter
+
      if cache:
          from_local_path_to_cache(local_path, cache.path(fqn), cache.link)
-     return _FileResult(md5b64 or b64(md5_file(local_path)), hit=False)
+
+     hash_to_set_if_missing = expected_hash or remote_hash_to_match
+     if not hash_to_set_if_missing or hash_to_set_if_missing.algo not in hashes.PREFERRED_ALGOS:
+         hash_to_set_if_missing = hash_cache.filehash(hashes.PREFERRED_ALGOS[0], local_path)
+     assert hash_to_set_if_missing, "We should have a preferred hash to set at this point."
+     return _FileResult(hash_to_set_if_missing, hit=None)


  # So ends the crazy download caching coroutine.
@@ -293,7 +279,7 @@ def _prep_download_coroutine(
      fs_client: FileSystemClient,
      remote_key: str,
      local_path: StrOrPath,
-     md5b64: str = "",
+     expected_hash: ty.Optional[hashing.Hash] = None,
      cache: ty.Optional[Cache] = None,
  ) -> ty.Tuple[
      ty.Generator[IoRequest, IoResponse, _FileResult],
@@ -304,22 +290,12 @@ def _prep_download_coroutine(
      co = _download_or_use_verified_cached_coroutine(
          AdlsFqn(ty.cast(str, fs_client.account_name), fs_client.file_system_name, remote_key),
          local_path,
-         md5b64=md5b64,
+         expected_hash=expected_hash,
          cache=cache,
      )
      return co, co.send(None), None, fs_client.get_file_client(remote_key)


- def _set_md5_if_missing(
-     file_properties: ty.Optional[FileProperties], md5b64: str
- ) -> ty.Optional[ContentSettings]:
-     if not file_properties or file_properties.content_settings.content_md5:
-         return None
-     file_properties.content_settings.content_md5 = b64decode(md5b64)  # type: ignore[assignment]
-     # TODO - check above type ignore
-     return file_properties.content_settings
-
-
  def _excs_to_retry() -> ty.Callable[[Exception], bool]:
      """These are exceptions that we observe to be spurious failures worth retrying."""
      return fretry.is_exc(
@@ -327,6 +303,7 @@ def _excs_to_retry() -> ty.Callable[[Exception], bool]:
          filter(
              None,
              (
+                 errors.ContentLengthMismatchError,
                  aiohttp.http_exceptions.ContentLengthError,
                  aiohttp.client_exceptions.ClientPayloadError,
                  getattr(
@@ -343,9 +320,10 @@ def download_or_use_verified(
      fs_client: FileSystemClient,
      remote_key: str,
      local_path: StrOrPath,
-     md5b64: str = "",
+     *,
+     expected_hash: ty.Optional[hashing.Hash] = None,
      cache: ty.Optional[Cache] = None,
- ) -> bool:
+ ) -> ty.Optional[Path]:
      """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.

      Note that you will get a logged warning if `local_path` already exists when you call
@@ -354,7 +332,7 @@ def download_or_use_verified(
      file_properties = None
      try:
          co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
-             fs_client, remote_key, local_path, md5b64, cache
+             fs_client, remote_key, local_path, expected_hash, cache
          )
          _dl_scope.enter(dl_file_client)  # on __exit__, will release the connection to the pool
          while True:
@@ -373,16 +351,16 @@ def download_or_use_verified(
          else:
              raise ValueError(f"Unexpected coroutine request: {co_request}")
      except StopIteration as si:
-         if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
+         if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
              try:
-                 logger.info(f"Setting missing MD5 for {remote_key}")
+                 logger.info(f"Setting missing hash for {remote_key}")
                  assert file_properties
-                 dl_file_client.set_http_headers(cs, **match_etag(file_properties))
-             except HttpResponseError as hre:
-                 logger.info(f"Unable to set MD5 for {remote_key}: {hre}")
+                 dl_file_client.set_metadata(meta, **etag.match_etag(file_properties))
+             except (HttpResponseError, ResourceModifiedError) as ex:
+                 logger.info(f"Unable to set Hash for {remote_key}: {ex}")
          return si.value.hit
      except AzureError as err:
-         translate_azure_error(fs_client, remote_key, err)
+         errors.translate_azure_error(fs_client, remote_key, err)


  _async_dl_scope = scope.AsyncScope("adls.download.async")
@@ -394,13 +372,14 @@ async def async_download_or_use_verified(
      fs_client: aio.FileSystemClient,
      remote_key: str,
      local_path: StrOrPath,
-     md5b64: str = "",
+     *,
+     expected_hash: ty.Optional[hashing.Hash] = None,
      cache: ty.Optional[Cache] = None,
- ) -> bool:
+ ) -> ty.Optional[Path]:
      file_properties = None
      try:
          co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
-             fs_client, remote_key, local_path, md5b64, cache
+             fs_client, remote_key, local_path, expected_hash, cache
          )
          await _async_dl_scope.async_enter(
              dl_file_client  # type: ignore[arg-type]
@@ -428,16 +407,14 @@ async def async_download_or_use_verified(
                  raise ValueError(f"Unexpected coroutine request: {co_request}")

      except StopIteration as si:
-         if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
+         if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
              try:
-                 logger.info(f"Setting missing MD5 for {remote_key}")
+                 logger.info(f"Setting missing Hash for {remote_key}")
                  assert file_properties
-                 await dl_file_client.set_http_headers(  # type: ignore[misc]
-                     cs, **match_etag(file_properties)
-                 )
+                 await dl_file_client.set_metadata(meta, **etag.match_etag(file_properties))  # type: ignore[misc]
                  # TODO - check above type ignore
-             except HttpResponseError as hre:
-                 logger.info(f"Unable to set MD5 for {remote_key}: {hre}")
+             except (HttpResponseError, ResourceModifiedError) as ex:
+                 logger.info(f"Unable to set Hash for {remote_key}: {ex}")
          return si.value.hit
      except AzureError as err:
-         translate_azure_error(fs_client, remote_key, err)
+         errors.translate_azure_error(fs_client, remote_key, err)
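A hedged migration sketch for callers of download_or_use_verified: the positional md5b64 string is gone, expected_hash is keyword-only, and the return type changed from bool to Optional[Path] (the cache path on a hit, None when a fresh download was required). The account, container, and key names below are hypothetical. The next hunks below touch the download-lock module.

```python
import typing as ty
from pathlib import Path

from thds.core import hashing
from thds.adls.download import download_or_use_verified
from thds.adls.global_client import get_global_fs_client

fs_client = get_global_fs_client("myaccount", "mycontainer")  # hypothetical names

# 3.x: hit: bool = download_or_use_verified(fs_client, "key", "out.bin", "b64md5==")
expected: ty.Optional[hashing.Hash] = None  # None -> fall back to remote-side hashes
cache_hit: ty.Optional[Path] = download_or_use_verified(
    fs_client, "data/model.bin", "out.bin", expected_hash=expected
)
if cache_hit is None:
    print("file was freshly downloaded")
```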
@@ -9,7 +9,7 @@ from thds.core import config, home, log

  from .md5 import hex_md5_str

- DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".adls-md5-download-locks", parse=Path)
+ DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/download-locks", parse=Path)
  _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
  _CLEAN_UP_LOCKFILES_EVERY = timedelta(hours=1).total_seconds()
  _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic() - _CLEAN_UP_LOCKFILES_EVERY
@@ -60,4 +60,11 @@ def download_lock(download_unique_str: str) -> FileLock:
      """
      DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
      _occasionally_clean_download_locks()
-     return FileLock(DOWNLOAD_LOCKS_DIR() / hex_md5_str(download_unique_str))
+     return FileLock(
+         DOWNLOAD_LOCKS_DIR()
+         / (download_unique_str.split("/")[-1][:50] + hex_md5_str(download_unique_str)),
+         # is_singleton=True,
+         # critical for keeping this reentrant without passing the lock around.
+         # see https://github.com/tox-dev/filelock/issues/315#issuecomment-2016797681
+         # however, this is not compatible with the version of Databricks we use, so.....
+     )
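An illustrative sketch of the new lock-file naming: a readable prefix (the last path segment, capped at 50 characters) concatenated with the hex MD5 of the whole string, so lock files under the new download-locks directory stay unique while remaining recognizable. hashlib stands in for hex_md5_str here, assuming it is a hex MD5 of the string.

```python
import hashlib

def _lock_filename(download_unique_str: str) -> str:
    readable = download_unique_str.split("/")[-1][:50]  # human-readable prefix
    return readable + hashlib.md5(download_unique_str.encode()).hexdigest()

print(_lock_filename("myaccount/mycontainer/very/long/path/model.bin"))
```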
thds/adls/errors.py CHANGED
@@ -15,8 +15,16 @@ class BlobNotFoundError(HttpResponseError):
          super().__init__(f"{type_hint} not found: {fqn}")


- class MD5MismatchError(Exception):
-     """Indicates that something needs to be done by the developer to correct a hash mismatch."""
+ class BlobPropertiesValidationError(ValueError):
+     """Raised when the properties of a blob do not match the expected values."""
+
+
+ class HashMismatchError(BlobPropertiesValidationError):
+     """Raised when the hash of a file does not match the expected value."""
+
+
+ class ContentLengthMismatchError(BlobPropertiesValidationError):
+     """Raised when the content length of a file does not match the expected value as retrieved from the server."""


  def is_blob_not_found(exc: Exception) -> bool:
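A sketch of handling the new exception hierarchy: both mismatch errors derive from BlobPropertiesValidationError (itself a ValueError), so one handler can cover hash and content-length failures; the old MD5MismatchError name is gone. The classify helper is hypothetical, for illustration only.

```python
from thds.adls.errors import (
    BlobPropertiesValidationError,
    ContentLengthMismatchError,
    HashMismatchError,
)

def classify(exc: Exception) -> str:
    # order matters: check the most specific subclasses first
    if isinstance(exc, HashMismatchError):
        return "hash mismatch - a pinned hash may need updating"
    if isinstance(exc, ContentLengthMismatchError):
        return "content length mismatch - likely a truncated transfer (also retried)"
    if isinstance(exc, BlobPropertiesValidationError):
        return "blob properties failed validation"
    return "unrelated error"
```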
@@ -1,3 +1,5 @@
+ import typing as ty
+
  from azure.core.exceptions import AzureError, ResourceNotFoundError
  from azure.storage.blob import BlobProperties
  from azure.storage.filedatalake import FileProperties
@@ -27,6 +29,12 @@ def get_blob_properties(fqn: AdlsFqn) -> BlobProperties:
      )


+ class PropertiesP(ty.Protocol):
+     name: ty.Any
+     metadata: ty.Any
+     content_settings: ty.Any
+
+
  # At some point it may make sense to separate file and blob property modules,
  # but they also are very closely tied together. AFAIK all files are blobs, and given our usage of ADLS,
  # I don't know if we ever deal with things that are blobs but not files.
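A sketch of why the new PropertiesP protocol helps: FileProperties and BlobProperties are distinct SDK types, but both expose name, metadata, and content_settings, so a structural Protocol lets one helper accept either without a Union. The describe helper is hypothetical; the Protocol is restated here only to keep the example self-contained.

```python
import typing as ty

class PropertiesP(ty.Protocol):
    name: ty.Any
    metadata: ty.Any
    content_settings: ty.Any

def describe(props: PropertiesP) -> str:
    # works for FileProperties and BlobProperties alike
    md5 = getattr(props.content_settings, "content_md5", None)
    return f"{props.name}: metadata={props.metadata!r}, content_md5={md5!r}"
```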