thds.adls 4.4.20251117191451__py3-none-any.whl → 4.5.20260110021526__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thds/adls/__init__.py CHANGED
@@ -3,6 +3,7 @@ from thds import core
3
3
  from . import ( # noqa: F401
4
4
  abfss,
5
5
  blob_meta,
6
+ blobs,
6
7
  defaults,
7
8
  etag,
8
9
  fqn,
thds/adls/_etag.py ADDED
@@ -0,0 +1,46 @@
1
+ # this module is for handling some new functionality related to using etags as a fallback
2
+ # for file hashing when the file properties do not include locally-verifiable hash information.
3
+ import typing as ty
4
+ from pathlib import Path
5
+
6
+ import xxhash
7
+
8
+ from thds.core import config, hash_cache, home, log, types
9
+
10
+ ETAG_FAKE_HASH_NAME = "adls-azure-etag-fake"
11
+ logger = log.getLogger(__name__)
12
+
13
+
14
def extract_etag_bytes(etag_str: str) -> bytes:
    """Decode an ADLS etag string (e.g. '"0x8DB..."') into its raw big-endian bytes.

    ADLS etags may or may not be surrounded by double quotes depending on which
    API produced them: list_blobs returns them unquoted, while the
    get_*_properties calls return them quoted. We normalize by stripping the
    quotes before parsing.
    """
    unquoted = etag_str.strip('"')
    # The '0x' prefix accounts for 2 characters; every remaining hex digit is
    # half a byte, rounded up.
    num_bytes = (len(unquoted) - 1) // 2
    return int(unquoted, 16).to_bytes(num_bytes, byteorder="big")
20
+
21
+
22
# On-disk cache mapping an xxhash3-128 digest (hex) of a local file onto the raw
# etag bytes last observed for it remotely; location configurable via `cache-path`.
_ETAG_CACHE = config.item("cache-path", home.HOMEDIR() / ".thds/adls/xxhash-onto-etag", parse=Path)
23
+
24
+
25
def add_to_etag_cache(local_path: types.StrOrPath, etag: bytes) -> hash_cache.Hash:
    """Record the remote etag for a local file, keyed by the file's xxhash3-128 digest.

    The etag bytes are written into the on-disk etag cache so that a later call
    to `hash_file_fake_etag` on the same (unchanged) file can recover them.
    Returns the etag wrapped as a fake Hash under ETAG_FAKE_HASH_NAME.
    """
    file_digest = hash_cache.hash_file(local_path, xxhash.xxh3_128())
    cache_entry = _ETAG_CACHE() / file_digest.hex()
    cache_entry.parent.mkdir(parents=True, exist_ok=True)
    logger.debug("Writing etag 'hash' to path at %s", cache_entry)
    cache_entry.write_bytes(etag)
    return hash_cache.Hash(ETAG_FAKE_HASH_NAME, etag)
32
+
33
+
34
def hash_file_fake_etag(local_path: types.StrOrPath) -> ty.Optional[hash_cache.Hash]:
    """Look up a previously-cached etag 'fake hash' for a local file.

    Hashes the file with xxhash3-128 and checks the etag cache for a matching
    entry. Returns None when the file does not exist or when no etag has been
    cached for its current contents.
    """
    try:
        file_digest = hash_cache.hash_file(local_path, xxhash.xxh3_128())
    except FileNotFoundError:
        return None

    cache_entry = _ETAG_CACHE() / file_digest.hex()
    if not cache_entry.is_file():
        return None

    logger.debug("Reusing etag 'fake hash' from path at %s", cache_entry)
    return hash_cache.Hash(ETAG_FAKE_HASH_NAME, cache_entry.read_bytes())
thds/adls/blobs.py ADDED
@@ -0,0 +1,23 @@
1
+ """TODO: better organize the blobs-related modules in thds.adls"""
2
+
3
+ import heapq
4
+
5
+ from thds.core import log, source
6
+
7
+ from .fqn import AdlsFqn
8
+ from .impl import ADLSFileSystem
9
+ from .source import get_with_hash
10
+
11
+ _logger = log.getLogger(__name__)
12
+
13
+
14
def most_recent_blobs(blobs_fqn: AdlsFqn, top_n: int = 1) -> list[source.Source]:
    """Get the top `top_n` most recently-created blobs in the directory at `blobs_fqn`.

    Raises ValueError if no blobs are found in the directory.
    """
    _logger.info(f"Enumerating the most recent blobs in {blobs_fqn}")
    fs = ADLSFileSystem(blobs_fqn.sa, blobs_fqn.container)
    blob_infos = fs.get_directory_info(blobs_fqn.path, recursive=False)
    if not blob_infos:
        raise ValueError(f"No blobs found in {blobs_fqn}")
    # Sort blobs with no creation_time last. The previous `creation_time or -1`
    # fallback raised TypeError whenever -1 was compared against a datetime
    # (presumably creation_time is a datetime - confirm against ADLSFileSystem).
    # A (has_time, time) tuple key never compares None/-1 against a datetime:
    # tuple comparison only inspects the second element when the first is equal.
    top_blobs = heapq.nlargest(
        top_n, blob_infos, key=lambda info: (info.creation_time is not None, info.creation_time)
    )

    return [get_with_hash(blobs_fqn.root() / info.name) for info in top_blobs if info.name]
thds/adls/copy.py CHANGED
@@ -10,6 +10,7 @@ from azure.storage.blob import BlobSasPermissions, BlobServiceClient, UserDelega
10
10
 
11
11
  from thds.core import cache, log, parallel, thunks
12
12
 
13
+ from ._etag import ETAG_FAKE_HASH_NAME
13
14
  from .file_properties import exists, get_blob_properties, get_file_properties, is_directory
14
15
  from .fqn import AdlsFqn
15
16
  from .global_client import get_global_blob_container_client, get_global_blob_service_client
@@ -60,7 +61,18 @@ def _copy_file(
60
61
  def hashes_exist_and_are_equal() -> bool:
61
62
  src_blob_props = src_blob_client.get_blob_properties()
62
63
  dest_blob_props = dest_blob_client.get_blob_properties()
63
- return extract_hashes_from_props(src_blob_props) == extract_hashes_from_props(dest_blob_props)
64
+ # exclude etag from comparison since it's unique per blob and will always differ
65
+ src_hashes = {
66
+ k: v
67
+ for k, v in extract_hashes_from_props(src_blob_props).items()
68
+ if k != ETAG_FAKE_HASH_NAME
69
+ }
70
+ dest_hashes = {
71
+ k: v
72
+ for k, v in extract_hashes_from_props(dest_blob_props).items()
73
+ if k != ETAG_FAKE_HASH_NAME
74
+ }
75
+ return src_hashes == dest_hashes
64
76
 
65
77
  if dest_blob_client.exists():
66
78
  if hashes_exist_and_are_equal():
thds/adls/download.py CHANGED
@@ -118,7 +118,7 @@ def _attempt_cache_hit(
118
118
  with log.logger_context(hash_for="before-download-dest"):
119
119
  local_hash = hash_path_if_exists(local_path)
120
120
  if local_hash == expected_hash:
121
- logger.debug("Local path matches %s - no need to look further", expected_hash.algo)
121
+ logger.debug("Local path matches '%s' hash - no need to look further", expected_hash.algo)
122
122
  if cache:
123
123
  cache_path = cache.path(fqn)
124
124
  with log.logger_context(hash_for="before-download-cache"):
@@ -235,6 +235,12 @@ def _download_or_use_verified_cached_coroutine( # noqa: C901
235
235
 
236
236
  # attempt cache hits before taking a lock, to avoid contention for existing files.
237
237
  if file_result := attempt_cache_hit():
238
+ logger.debug(
239
+ "No download - found cached version of %s using expected %s at %s",
240
+ fqn,
241
+ expected_hash,
242
+ file_result.hit,
243
+ )
238
244
  return file_result # noqa: B901
239
245
 
240
246
  # No cache hit, so its time to prepare to download. if a cache was provided, we will
@@ -344,11 +350,15 @@ def download_or_use_verified(
344
350
  *,
345
351
  expected_hash: ty.Optional[hashing.Hash] = None,
346
352
  cache: ty.Optional[Cache] = None,
353
+ set_remote_hash: bool = True,
347
354
  ) -> ty.Optional[Path]:
348
355
  """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.
349
356
 
350
357
  Note that you will get a logged warning if `local_path` already exists when you call
351
358
  this function.
359
+
360
+ If set_remote_hash is False, the function will not attempt to set hash metadata on the
361
+ remote file after download. This is useful when downloading from read-only locations.
352
362
  """
353
363
  file_properties = None
354
364
  try:
@@ -372,7 +382,9 @@ def download_or_use_verified(
372
382
  else:
373
383
  raise ValueError(f"Unexpected coroutine request: {co_request}")
374
384
  except StopIteration as si:
375
- if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
385
+ if set_remote_hash and (
386
+ meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash)
387
+ ):
376
388
  try:
377
389
  logger.info(f"Setting missing {si.value.hash.algo} hash for {remote_key}")
378
390
  assert file_properties
@@ -399,6 +411,7 @@ async def async_download_or_use_verified(
399
411
  *,
400
412
  expected_hash: ty.Optional[hashing.Hash] = None,
401
413
  cache: ty.Optional[Cache] = None,
414
+ set_remote_hash: bool = True,
402
415
  ) -> ty.Optional[Path]:
403
416
  file_properties = None
404
417
  try:
@@ -429,7 +442,9 @@ async def async_download_or_use_verified(
429
442
  raise ValueError(f"Unexpected coroutine request: {co_request}")
430
443
 
431
444
  except StopIteration as si:
432
- if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
445
+ if set_remote_hash and (
446
+ meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash)
447
+ ):
433
448
  try:
434
449
  logger.info(f"Setting missing Hash for {remote_key}")
435
450
  assert file_properties
@@ -37,6 +37,10 @@ class PropertiesP(ty.Protocol):
37
37
  name: ty.Any
38
38
  metadata: ty.Any
39
39
 
40
+ @property
41
+ def etag(self) -> ty.Union[str, None, ty.Any]:
42
+ pass
43
+
40
44
  @property
41
45
  def content_settings(self) -> ContentSettingsP:
42
46
  pass
thds/adls/hashes.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import contextlib
2
2
  import os
3
+ import sys
3
4
  import typing as ty
4
5
  from functools import partial
5
6
 
@@ -9,6 +10,7 @@ from thds.core import hash_cache, hashing, log, source, types
9
10
  from thds.core.hashing import Hash, SomehowReadable
10
11
 
11
12
  from . import errors, file_properties
13
+ from ._etag import ETAG_FAKE_HASH_NAME, add_to_etag_cache, extract_etag_bytes, hash_file_fake_etag
12
14
  from .fqn import AdlsFqn
13
15
 
14
16
  logger = log.getLogger(__name__)
@@ -53,6 +55,9 @@ def hash_path_for_algo(
53
55
  algo: str,
54
56
  ) -> ty.Callable[[types.StrOrPath], ty.Optional[hashing.Hash]]:
55
57
  """Return a function that hashes a path for the given algorithm."""
58
+ if algo == ETAG_FAKE_HASH_NAME:
59
+ return hash_file_fake_etag
60
+
56
61
  return partial(_hash_path_if_exists, partial(hash_cache.filehash, algo))
57
62
 
58
63
 
@@ -77,6 +82,13 @@ def extract_hashes_from_props(
77
82
  hashes = list(extract_hashes_from_metadata(props.metadata or dict()))
78
83
  if props.content_settings and props.content_settings.content_md5:
79
84
  hashes.append(hashing.Hash("md5", bytes(props.content_settings.content_md5)))
85
+
86
+ if props.etag:
87
+ # this is the final fallback. it cannot be checked locally, but at least
88
+ # it can be checked against what exists remotely the next time we want to use it.
89
+ if etag_bytes := extract_etag_bytes(props.etag):
90
+ hashes.append(hashing.Hash(sys.intern(ETAG_FAKE_HASH_NAME), etag_bytes))
91
+
80
92
  return {h.algo: h for h in hashes}
81
93
 
82
94
 
@@ -87,10 +99,6 @@ def verify_hashes_before_and_after_download(
87
99
  fqn: AdlsFqn,
88
100
  local_dest: types.StrOrPath,
89
101
  ) -> ty.Iterator[None]:
90
- # if expected_hash:
91
- # check_reasonable_md5b64(expected_md5b64)
92
- # if remote_md5b64:
93
- # check_reasonable_md5b64(remote_md5b64)
94
102
  if remote_hash and expected_hash and remote_hash != expected_hash:
95
103
  raise errors.HashMismatchError(
96
104
  f"ADLS thinks the {remote_hash.algo} of {fqn} is {hashing.b64(remote_hash.bytes)},"
@@ -105,11 +113,16 @@ def verify_hashes_before_and_after_download(
105
113
  expected_algo = remote_hash.algo
106
114
 
107
115
  if not expected_algo:
108
- # if we have neither a user-provided hash nor a remotely-foun9d hash, then we have nothing to check.
116
+ # if we have neither a user-provided hash nor a remotely-found hash, then we have nothing to check.
109
117
  return
110
118
 
119
+ assert expected_hash or remote_hash, "At least one of expected or remote hash must be present."
111
120
  with log.logger_context(hash_for="after-download"):
112
- local_hash = hash_cache.filehash(expected_algo, local_dest)
121
+ if expected_algo == ETAG_FAKE_HASH_NAME:
122
+ assert remote_hash, f"An Etag hash should always originate remotely: {fqn}"
123
+ local_hash = add_to_etag_cache(local_dest, remote_hash.bytes)
124
+ else:
125
+ local_hash = hash_cache.filehash(expected_algo, local_dest)
113
126
 
114
127
  if remote_hash and remote_hash != local_hash:
115
128
  raise errors.HashMismatchError(
@@ -142,6 +155,10 @@ def create_hash_metadata_if_missing(
142
155
  # without file properties, we can't match the etag when we try to set this.
143
156
  return dict()
144
157
 
158
+ if new_hash.algo == ETAG_FAKE_HASH_NAME:
159
+ # we never want to write etag-based hashes into metadata.
160
+ return dict()
161
+
145
162
  existing_metadata = file_properties.metadata or dict()
146
163
  if metadata_hash_b64_key(new_hash.algo) not in existing_metadata:
147
164
  return {**existing_metadata, **metadata_hash_dict(new_hash)}
thds/adls/source.py CHANGED
@@ -6,6 +6,7 @@ from thds.core import source
6
6
  from thds.core.hashing import Hash
7
7
 
8
8
  from . import cached, hashes, md5
9
+ from .cached import upload_through_cache
9
10
  from .errors import blob_not_found_translation
10
11
  from .file_properties import get_file_properties
11
12
  from .fqn import AdlsFqn
@@ -71,3 +72,17 @@ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
71
72
  def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
72
73
  """Meant for older use cases where we had an MD5"""
73
74
  return from_adls(uri_or_fqn, md5.to_hash(md5b64) if md5b64 else None)
75
+
76
+
77
def _upload_handler(dest_uri: str) -> ty.Optional[source.Uploader]:
    """Return an Uploader for ADLS destination URIs, or None for other schemes."""
    dest_fqn = resolve_uri(dest_uri)
    if not dest_fqn:
        return None

    # NOTE: the `hash` parameter is part of the Uploader callback signature but
    # is unused here - upload_through_cache computes whatever it requires.
    def upload_to_adls(local_path: Path, hash: ty.Optional[Hash]) -> None:
        upload_through_cache(dest_fqn, local_path)

    return upload_to_adls


source.register_upload_handler("thds.adls", _upload_handler)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: thds.adls
3
- Version: 4.4.20251117191451
3
+ Version: 4.5.20260110021526
4
4
  Summary: ADLS tools
5
5
  Author-email: Trilliant Health <info@trillianthealth.com>
6
6
  License: MIT
@@ -1,22 +1,24 @@
1
- thds/adls/__init__.py,sha256=CzNcmTdP5RHrD-9K1boDU4Z3pbJAvEAFfFhXGFwpEXw,1088
1
+ thds/adls/__init__.py,sha256=MXsKVIZ3uDayyKEXApWn-huhK9JcqlSl-12wE_lUcyo,1099
2
+ thds/adls/_etag.py,sha256=amzbykSwmt5S426M_GXXr2vjwI1NhxYO_GvY-rA7E3Y,1779
2
3
  thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
3
4
  thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
4
5
  thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
5
6
  thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
6
7
  thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
8
+ thds/adls/blobs.py,sha256=Rzw1gDlvI-CswUS8Wd-ebWxGxoKAkR7kC_OKn-QRxzc,869
7
9
  thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
8
10
  thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
9
- thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
11
+ thds/adls/copy.py,sha256=-_5eDKRfhFfR7pGPs257cQL2x0JJTIXKDi3AB-fAtqc,7007
10
12
  thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
11
13
  thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
12
- thds/adls/download.py,sha256=IPg5nz_sGE7dX8DUQyWjG2D9z54PXLScap-pZzTUFTk,19142
14
+ thds/adls/download.py,sha256=jdg8t5lTHhJmH7qLbwxUCCPSErPdEtHUEVQFLSFgRe4,19672
13
15
  thds/adls/errors.py,sha256=6NMLHtVNsWBRDXaes9yzHj9cwKOD9t1dwL4BltdtjhU,1895
14
16
  thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
15
17
  thds/adls/file_lock.py,sha256=yLak5XDpnIYwfUNdpGFbIGG64uEs98-yVscNpJlqMxM,3176
16
- thds/adls/file_properties.py,sha256=dhRtbsMNOYfExkEiy76wrLfrJ6IMQeN1Z3LIxgKceqY,2042
18
+ thds/adls/file_properties.py,sha256=xtI2a0ahcqcJRernoDipeEbn2r_I_pMyR0ZSoapkDgc,2121
17
19
  thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
18
20
  thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
19
- thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
21
+ thds/adls/hashes.py,sha256=t2EZHWNN7N0VkkH1CyE1l5BNAjmn78F5k03fUFErWK0,6289
20
22
  thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
21
23
  thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
22
24
  thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
@@ -25,7 +27,7 @@ thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
27
  thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
26
28
  thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
27
29
  thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
28
- thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
30
+ thds/adls/source.py,sha256=G9C5ncWSxbLCARDoPnhsQIgvTlFuNlSucMUiUoRmt60,3056
29
31
  thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
30
32
  thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
31
33
  thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
@@ -39,8 +41,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
39
41
  thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
40
42
  thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
41
43
  thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
42
- thds_adls-4.4.20251117191451.dist-info/METADATA,sha256=phV7EH6lnptlnQYY5TSfyZZk0Wiv0wqZ6L6o7pcP4UM,4586
43
- thds_adls-4.4.20251117191451.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- thds_adls-4.4.20251117191451.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
45
- thds_adls-4.4.20251117191451.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
46
- thds_adls-4.4.20251117191451.dist-info/RECORD,,
44
+ thds_adls-4.5.20260110021526.dist-info/METADATA,sha256=f95s20SMLUIvjdtM6b4Y56_UWxEFVrwNskJ8fkEHkWY,4586
45
+ thds_adls-4.5.20260110021526.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
46
+ thds_adls-4.5.20260110021526.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
47
+ thds_adls-4.5.20260110021526.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
48
+ thds_adls-4.5.20260110021526.dist-info/RECORD,,