thds.adls 4.3.20251014213630__py3-none-any.whl → 4.5.20260110021526__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thds/adls/__init__.py CHANGED
@@ -2,6 +2,8 @@ from thds import core
 
 from . import ( # noqa: F401
     abfss,
+    blob_meta,
+    blobs,
     defaults,
     etag,
     fqn,
thds/adls/_etag.py ADDED
@@ -0,0 +1,46 @@
+# this module is for handling some new functionality related to using etags as a fallback
+# for file hashing when the file properties do not include locally-verifiable hash information.
+import typing as ty
+from pathlib import Path
+
+import xxhash
+
+from thds.core import config, hash_cache, home, log, types
+
+ETAG_FAKE_HASH_NAME = "adls-azure-etag-fake"
+logger = log.getLogger(__name__)
+
+
+def extract_etag_bytes(etag_str: str) -> bytes:
+    # ADLS etags may or may not be quoted depending on the API used:
+    # list_blobs returns unquoted, get_*_properties returns quoted.
+    # Strip quotes first, then calculate byte length from the stripped string.
+    stripped = etag_str.strip('"')
+    return int(stripped, 16).to_bytes((len(stripped) - 2 + 1) // 2, byteorder="big")
+
+
+_ETAG_CACHE = config.item("cache-path", home.HOMEDIR() / ".thds/adls/xxhash-onto-etag", parse=Path)
+
+
+def add_to_etag_cache(local_path: types.StrOrPath, etag: bytes) -> hash_cache.Hash:
+    xxh_bytes = hash_cache.hash_file(local_path, xxhash.xxh3_128())
+    etag_path = _ETAG_CACHE() / xxh_bytes.hex()
+    etag_path.parent.mkdir(parents=True, exist_ok=True)
+    etag_path.write_bytes(etag)
+    logger.debug("Writing etag 'hash' to path at %s", etag_path)
+    return hash_cache.Hash(ETAG_FAKE_HASH_NAME, etag)
+
+
+def hash_file_fake_etag(local_path: types.StrOrPath) -> ty.Optional[hash_cache.Hash]:
+    try:
+        xxh_bytes = hash_cache.hash_file(local_path, xxhash.xxh3_128())
+    except FileNotFoundError:
+        return None
+
+    etag_path = _ETAG_CACHE() / xxh_bytes.hex()
+    if etag_path.is_file():
+        etag_bytes = etag_path.read_bytes()
+        logger.debug("Reusing etag 'fake hash' from path at %s", etag_path)
+        return hash_cache.Hash(ETAG_FAKE_HASH_NAME, etag_bytes)
+
+    return None
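The comments at the top of the new module describe the etag-as-fallback-hash scheme; the sketch below is not part of the package and uses a placeholder file path and a made-up etag value, but it shows how the helpers are meant to fit together:

```python
# Hypothetical round trip through the helpers defined in thds/adls/_etag.py.
from thds.adls._etag import add_to_etag_cache, extract_etag_bytes, hash_file_fake_etag

etag_from_adls = '"0x8DDA12B3C4D5E6F"'           # quoted form, as returned by get_*_properties
etag_bytes = extract_etag_bytes(etag_from_adls)  # strips the quotes and decodes the hex into bytes

# After downloading the blob to /tmp/blob.bin, remember its etag keyed by the file's xxh3 digest:
add_to_etag_cache("/tmp/blob.bin", etag_bytes)

# On a later run the "fake hash" can be recovered from the local file alone and compared
# against whatever etag ADLS currently reports for the blob, without re-downloading anything:
cached = hash_file_fake_etag("/tmp/blob.bin")
assert cached is not None
```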
thds/adls/blob_meta.py ADDED
@@ -0,0 +1,38 @@
+import typing as ty
+from dataclasses import dataclass
+
+from azure.storage.blob import BlobProperties, ContainerClient
+
+from thds.core import hashing
+
+from . import hashes
+
+
+@dataclass
+class BlobMeta:
+    path: str
+    size: int
+    hash: ty.Optional[hashing.Hash]
+    metadata: dict[str, str]
+
+
+def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
+    return BlobMeta(
+        blob_props.name,
+        blob_props.size,
+        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
+        blob_props.metadata or {},
+    )
+
+
+def is_dir(blob_meta: BlobMeta) -> bool:
+    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
+
+
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
+def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
+    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
+        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
+        yield to_blob_meta(blob_props)
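A brief usage sketch for the new module (the storage account, container, and prefix are placeholders, not values from this diff):

```python
from thds.adls import blob_meta, global_client

client = global_client.get_global_blob_container_client("myaccount", "mycontainer")
for meta in blob_meta.yield_blob_meta(client, "datasets/2026-01/"):
    if blob_meta.is_dir(meta):  # directory placeholder blobs carry the hdi_isfolder marker
        continue
    print(meta.path, meta.size, meta.hash.algo if meta.hash else "<no hash>")
```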
thds/adls/blobs.py ADDED
@@ -0,0 +1,23 @@
+"""TODO: better organize the blobs-related modules in thds.adls"""
+
+import heapq
+
+from thds.core import log, source
+
+from .fqn import AdlsFqn
+from .impl import ADLSFileSystem
+from .source import get_with_hash
+
+_logger = log.getLogger(__name__)
+
+
+def most_recent_blobs(blobs_fqn: AdlsFqn, top_n: int = 1) -> list[source.Source]:
+    """Gets top n most recently-created blob in the directory at `blobs_fqn`."""
+    _logger.info(f"Enumerating the most recent blobs in {blobs_fqn}")
+    fs = ADLSFileSystem(blobs_fqn.sa, blobs_fqn.container)
+    snapshots = fs.get_directory_info(blobs_fqn.path, recursive=False)
+    if not snapshots:
+        raise ValueError(f"No blobs found in {blobs_fqn}")
+    top_blobs = heapq.nlargest(top_n, snapshots, key=lambda x: x.creation_time or -1)
+
+    return [get_with_hash(blobs_fqn.root() / item.name) for item in top_blobs if item.name]
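As a sketch (the URI is a placeholder), the new helper returns `thds.core` Sources for the newest blobs in a directory:

```python
from thds.adls import blobs, uri

snapshots_dir = uri.parse_any("adls://myaccount/mycontainer/models/snapshots/")
for src in blobs.most_recent_blobs(snapshots_dir, top_n=2):
    print(src.uri)
```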
thds/adls/copy.py CHANGED
@@ -10,6 +10,7 @@ from azure.storage.blob import BlobSasPermissions, BlobServiceClient, UserDelega
 
 from thds.core import cache, log, parallel, thunks
 
+from ._etag import ETAG_FAKE_HASH_NAME
 from .file_properties import exists, get_blob_properties, get_file_properties, is_directory
 from .fqn import AdlsFqn
 from .global_client import get_global_blob_container_client, get_global_blob_service_client
@@ -60,7 +61,18 @@ def _copy_file(
     def hashes_exist_and_are_equal() -> bool:
         src_blob_props = src_blob_client.get_blob_properties()
         dest_blob_props = dest_blob_client.get_blob_properties()
-        return extract_hashes_from_props(src_blob_props) == extract_hashes_from_props(dest_blob_props)
+        # exclude etag from comparison since it's unique per blob and will always differ
+        src_hashes = {
+            k: v
+            for k, v in extract_hashes_from_props(src_blob_props).items()
+            if k != ETAG_FAKE_HASH_NAME
+        }
+        dest_hashes = {
+            k: v
+            for k, v in extract_hashes_from_props(dest_blob_props).items()
+            if k != ETAG_FAKE_HASH_NAME
+        }
+        return src_hashes == dest_hashes
 
     if dest_blob_client.exists():
         if hashes_exist_and_are_equal():
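The intent of the new filtering is that two blobs whose real hashes agree should still compare equal even though their etags never will. A toy illustration (the values are made up, and the real code compares `Hash` objects rather than raw dicts):

```python
from thds.adls._etag import ETAG_FAKE_HASH_NAME

def drop_etag(hashes: dict) -> dict:
    # remove the per-blob etag pseudo-hash before comparing source and destination
    return {k: v for k, v in hashes.items() if k != ETAG_FAKE_HASH_NAME}

src = {"md5": b"\x01\x02", ETAG_FAKE_HASH_NAME: b"\xaa"}
dst = {"md5": b"\x01\x02", ETAG_FAKE_HASH_NAME: b"\xbb"}  # etags always differ per blob
assert src != dst and drop_etag(src) == drop_etag(dst)    # so the copy can be skipped
```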
thds/adls/download.py CHANGED
@@ -118,7 +118,7 @@ def _attempt_cache_hit(
     with log.logger_context(hash_for="before-download-dest"):
         local_hash = hash_path_if_exists(local_path)
     if local_hash == expected_hash:
-        logger.debug("Local path matches %s - no need to look further", expected_hash.algo)
+        logger.debug("Local path matches '%s' hash - no need to look further", expected_hash.algo)
     if cache:
         cache_path = cache.path(fqn)
         with log.logger_context(hash_for="before-download-cache"):
@@ -235,6 +235,12 @@ def _download_or_use_verified_cached_coroutine( # noqa: C901
 
     # attempt cache hits before taking a lock, to avoid contention for existing files.
     if file_result := attempt_cache_hit():
+        logger.debug(
+            "No download - found cached version of %s using expected %s at %s",
+            fqn,
+            expected_hash,
+            file_result.hit,
+        )
         return file_result # noqa: B901
 
     # No cache hit, so its time to prepare to download. if a cache was provided, we will
@@ -344,11 +350,15 @@ def download_or_use_verified(
     *,
     expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
+    set_remote_hash: bool = True,
 ) -> ty.Optional[Path]:
     """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.
 
     Note that you will get a logged warning if `local_path` already exists when you call
     this function.
+
+    If set_remote_hash is False, the function will not attempt to set hash metadata on the
+    remote file after download. This is useful when downloading from read-only locations.
     """
     file_properties = None
     try:
@@ -372,7 +382,9 @@ def download_or_use_verified(
         else:
             raise ValueError(f"Unexpected coroutine request: {co_request}")
     except StopIteration as si:
-        if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
+        if set_remote_hash and (
+            meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash)
+        ):
             try:
                 logger.info(f"Setting missing {si.value.hash.algo} hash for {remote_key}")
                 assert file_properties
@@ -399,6 +411,7 @@ async def async_download_or_use_verified(
     *,
     expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
+    set_remote_hash: bool = True,
 ) -> ty.Optional[Path]:
     file_properties = None
     try:
@@ -429,7 +442,9 @@ async def async_download_or_use_verified(
             raise ValueError(f"Unexpected coroutine request: {co_request}")
 
     except StopIteration as si:
-        if meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash):
+        if set_remote_hash and (
+            meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash)
+        ):
             try:
                 logger.info(f"Setting missing Hash for {remote_key}")
                 assert file_properties
thds/adls/file_properties.py CHANGED
@@ -37,6 +37,10 @@ class PropertiesP(ty.Protocol):
     name: ty.Any
     metadata: ty.Any
 
+    @property
+    def etag(self) -> ty.Union[str, None, ty.Any]:
+        pass
+
     @property
     def content_settings(self) -> ContentSettingsP:
        pass
thds/adls/hashes.py CHANGED
@@ -1,5 +1,6 @@
 import contextlib
 import os
+import sys
 import typing as ty
 from functools import partial
 
@@ -9,6 +10,7 @@ from thds.core import hash_cache, hashing, log, source, types
 from thds.core.hashing import Hash, SomehowReadable
 
 from . import errors, file_properties
+from ._etag import ETAG_FAKE_HASH_NAME, add_to_etag_cache, extract_etag_bytes, hash_file_fake_etag
 from .fqn import AdlsFqn
 
 logger = log.getLogger(__name__)
@@ -53,6 +55,9 @@ def hash_path_for_algo(
     algo: str,
 ) -> ty.Callable[[types.StrOrPath], ty.Optional[hashing.Hash]]:
     """Return a function that hashes a path for the given algorithm."""
+    if algo == ETAG_FAKE_HASH_NAME:
+        return hash_file_fake_etag
+
     return partial(_hash_path_if_exists, partial(hash_cache.filehash, algo))
 
 
@@ -77,6 +82,13 @@ def extract_hashes_from_props(
     hashes = list(extract_hashes_from_metadata(props.metadata or dict()))
     if props.content_settings and props.content_settings.content_md5:
         hashes.append(hashing.Hash("md5", bytes(props.content_settings.content_md5)))
+
+    if props.etag:
+        # this is the final fallback. it cannot be checked locally, but at least
+        # it can be checked against what exists remotely the next time we want to use it.
+        if etag_bytes := extract_etag_bytes(props.etag):
+            hashes.append(hashing.Hash(sys.intern(ETAG_FAKE_HASH_NAME), etag_bytes))
+
     return {h.algo: h for h in hashes}
 
 
@@ -87,10 +99,6 @@ def verify_hashes_before_and_after_download(
     fqn: AdlsFqn,
     local_dest: types.StrOrPath,
 ) -> ty.Iterator[None]:
-    # if expected_hash:
-    #     check_reasonable_md5b64(expected_md5b64)
-    # if remote_md5b64:
-    #     check_reasonable_md5b64(remote_md5b64)
     if remote_hash and expected_hash and remote_hash != expected_hash:
         raise errors.HashMismatchError(
             f"ADLS thinks the {remote_hash.algo} of {fqn} is {hashing.b64(remote_hash.bytes)},"
@@ -105,11 +113,16 @@ def verify_hashes_before_and_after_download(
         expected_algo = remote_hash.algo
 
     if not expected_algo:
-        # if we have neither a user-provided hash nor a remotely-foun9d hash, then we have nothing to check.
+        # if we have neither a user-provided hash nor a remotely-found hash, then we have nothing to check.
         return
 
+    assert expected_hash or remote_hash, "At least one of expected or remote hash must be present."
    with log.logger_context(hash_for="after-download"):
-        local_hash = hash_cache.filehash(expected_algo, local_dest)
+        if expected_algo == ETAG_FAKE_HASH_NAME:
+            assert remote_hash, f"An Etag hash should always originate remotely: {fqn}"
+            local_hash = add_to_etag_cache(local_dest, remote_hash.bytes)
+        else:
+            local_hash = hash_cache.filehash(expected_algo, local_dest)
 
     if remote_hash and remote_hash != local_hash:
         raise errors.HashMismatchError(
@@ -142,6 +155,10 @@ def create_hash_metadata_if_missing(
         # without file properties, we can't match the etag when we try to set this.
         return dict()
 
+    if new_hash.algo == ETAG_FAKE_HASH_NAME:
+        # we never want to write etag-based hashes into metadata.
+        return dict()
+
     existing_metadata = file_properties.metadata or dict()
     if metadata_hash_b64_key(new_hash.algo) not in existing_metadata:
         return {**existing_metadata, **metadata_hash_dict(new_hash)}
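To illustrate the new etag fallback in `extract_hashes_from_props`, here is a sketch using a stand-in properties object; it assumes `extract_hashes_from_metadata` yields nothing for empty metadata (that function is not shown in this diff), and the etag value is made up:

```python
from types import SimpleNamespace

from thds.adls import hashes
from thds.adls._etag import ETAG_FAKE_HASH_NAME

# A blob with no md5 and no hash metadata: only the etag survives as a pseudo-hash.
props = SimpleNamespace(name="some/blob", metadata={}, content_settings=None, etag='"0x8DDA12B3C4D5E6F"')
by_algo = hashes.extract_hashes_from_props(props)
assert list(by_algo) == [ETAG_FAKE_HASH_NAME]
```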
thds/adls/list_fast.py CHANGED
@@ -6,20 +6,29 @@ client instead of the file system client.
 
 import typing as ty
 
-from thds.core import parallel, thunks
+from thds.core import log, parallel, source, thunks
 
-from . import global_client
+from . import blob_meta, global_client
+from . import source as adls_source
 from .fqn import AdlsFqn
-from .source_tree import BlobMeta, to_blob_meta, yield_blob_meta
+from .uri import UriIsh, parse_any
 
 R = ty.TypeVar("R")
 
 
+logger = log.getLogger(__name__)
+
+
 def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R]:
-    yield from (res for _, res in parallel.failfast(parallel.yield_all(parallel.create_keys(thunks))))
+    yield from (
+        res
+        for _, res in parallel.failfast(
+            parallel.yield_all(parallel.create_keys(thunks), progress_logger=logger.debug)
+        )
+    )
 
 
-def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
+def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[blob_meta.BlobMeta]:
     """A fast way to find all blobs in a directory tree; we do this in parallel on
     subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).
 
@@ -29,7 +38,7 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
     """
     if layers <= 0:
         # directly yield the blobs
-        yield from yield_blob_meta(
+        yield from blob_meta.yield_blob_meta(
            global_client.get_global_blob_container_client(fqn.sa, fqn.container),
            fqn.path.rstrip("/") + "/",
        )
@@ -69,8 +78,10 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
 
     blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
 
-    def _get_blob_meta(blob_name: str) -> BlobMeta:
-        return to_blob_meta(blob_container_client.get_blob_client(blob_name).get_blob_properties())
+    def _get_blob_meta(blob_name: str) -> blob_meta.BlobMeta:
+        return blob_meta.to_blob_meta(
+            blob_container_client.get_blob_client(blob_name).get_blob_properties()
+        )
 
     for blob_meta_iter in (
         _failfast_parallel((thunks.thunking(_get_blob_meta)(file) for file in files)),
@@ -86,10 +97,22 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
         yield from blob_meta_iter
 
 
-def is_dir(blob_meta: BlobMeta) -> bool: # TODO move to blob_meta.py once it exists
-    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
-
-
-def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
+def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[blob_meta.BlobMeta]:
     """Only for use within multi_layer_yield_blobs."""
     return list(multilayer_yield_blob_meta(fqn, layers))
+
+
+def multilayer_yield_sources(
+    fqn_or_uri: UriIsh,
+    layers: int = 1,
+    filter_: ty.Callable[[blob_meta.BlobMeta], bool] = lambda _: True,
+) -> ty.Iterator[source.Source]:
+    """
+    if you want to list directories and files, use `multilayer_yield_blob_meta` instead
+    """
+    fqn = parse_any(fqn_or_uri)
+    root = fqn.root()
+    for blob in multilayer_yield_blob_meta(fqn, layers):
+        if not blob_meta.is_dir(blob) and filter_(blob):
+            # ^ a "dir" Source would not make sense
+            yield adls_source.from_adls(root / blob.path, hash=blob.hash, size=blob.size)
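A usage sketch for the new `multilayer_yield_sources` (the URI is a placeholder; the filter keeps only parquet blobs):

```python
from thds.adls import list_fast

for src in list_fast.multilayer_yield_sources(
    "adls://myaccount/mycontainer/datasets/2026-01",
    layers=1,
    filter_=lambda blob: blob.path.endswith(".parquet"),
):
    print(src.uri)
```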
thds/adls/source.py CHANGED
@@ -6,6 +6,7 @@ from thds.core import source
 from thds.core.hashing import Hash
 
 from . import cached, hashes, md5
+from .cached import upload_through_cache
 from .errors import blob_not_found_translation
 from .file_properties import get_file_properties
 from .fqn import AdlsFqn
@@ -71,3 +72,17 @@ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
 def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
     """Meant for older use cases where we had an MD5"""
     return from_adls(uri_or_fqn, md5.to_hash(md5b64) if md5b64 else None)
+
+
+def _upload_handler(dest_uri: str) -> ty.Optional[source.Uploader]:
+    if dest_fqn := resolve_uri(dest_uri):
+
+        def upload_to_adls(local_path: Path, hash: ty.Optional[Hash]) -> None:
+            upload_through_cache(dest_fqn, local_path)
+
+        return upload_to_adls
+
+    return None
+
+
+source.register_upload_handler("thds.adls", _upload_handler)
thds/adls/source_tree.py CHANGED
@@ -1,53 +1,6 @@
-import typing as ty
-from dataclasses import dataclass
-
-from azure.storage.blob import BlobProperties, ContainerClient
-
-from thds.core import hashing
 from thds.core.source.tree import SourceTree
 
-from . import fqn, global_client, hashes, source, uri
-
-# TODO refactor BlobMeta into its own module.
-
-
-@dataclass
-class BlobMeta:
-    path: str
-    size: int
-    hash: ty.Optional[hashing.Hash]
-    metadata: dict[str, str]
-
-
-def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
-    return BlobMeta(
-        blob_props.name,
-        blob_props.size,
-        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
-        blob_props.metadata or {},
-    )
-
-
-def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
-    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
-        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
-        yield to_blob_meta(blob_props)
-
-
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
-def list_blob_meta(
-    container_client: ContainerClient, root_dir: str, match_suffix: str = ""
-) -> ty.List[BlobMeta]:
-    """Gets the path (relative to the SA/container root), size, and _a_ hash (if available) of all blobs in a directory."""
-    return [
-        blob_meta
-        for blob_meta in yield_blob_meta(container_client, root_dir)
-        if blob_meta.size > 0
-        # container client lists directories as blobs with size 0
-        and blob_meta.path.endswith(match_suffix)
-    ]
+from . import fqn, list_fast, uri
 
 
 def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
@@ -56,12 +9,12 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
     """
     root_fqn = uri.parse_any(adls_path)
 
-    container_client = global_client.get_global_blob_container_client(root_fqn.sa, root_fqn.container)
-    container_root = root_fqn.root()
     return SourceTree(
-        sources=[
-            source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash, size=blob_meta.size)
-            for blob_meta in list_blob_meta(container_client, root_fqn.path, match_suffix=match_suffix)
-        ],
+        sources=sorted(
+            list_fast.multilayer_yield_sources(
+                root_fqn, layers=0, filter_=lambda blob: blob.path.endswith(match_suffix)
+            ),
+            key=lambda src: src.uri,
+        ),
         higher_logical_root=fqn.split(root_fqn)[-1],
     )
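A sketch of the rewritten `from_path` entry point (the path is a placeholder, and iterating `tree.sources` assumes the attribute matches the constructor keyword shown above):

```python
from thds.adls import source_tree

tree = source_tree.from_path("adls://myaccount/mycontainer/reference/tables", match_suffix=".csv")
for src in tree.sources:
    print(src.uri)
```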
thds_adls-4.5.20260110021526.dist-info/METADATA ADDED
@@ -0,0 +1,79 @@
+Metadata-Version: 2.4
+Name: thds.adls
+Version: 4.5.20260110021526
+Summary: ADLS tools
+Author-email: Trilliant Health <info@trillianthealth.com>
+License: MIT
+Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.1
+Requires-Dist: aiostream>=0.4.5
+Requires-Dist: azure-identity>=1.9
+Requires-Dist: azure-storage-file-datalake>=12.6
+Requires-Dist: blake3
+Requires-Dist: filelock>=3.0
+Requires-Dist: xxhash
+Requires-Dist: thds-core
+
+# thds.adls
+
+A high-performance Azure Data Lake Storage (ADLS Gen2) client for the THDS monorepo. It wraps the Azure
+SDK with hash-aware caching, azcopy acceleration, and shared client/credential plumbing so applications
+can transfer large blob datasets quickly and reliably.
+
+## Highlights
+
+- **Environment-aware paths first:** Almost every consumer starts by importing `fqn`, `AdlsFqn`, and
+  `defaults.env_root()` to build storage-account/container URIs that follow the current THDS environment.
+- **Cache-backed reads:** `download_to_cache` is the standard entry point for pulling blobs down with a
+  verified hash so local workflows, tests, and pipelines can operate on read-only copies.
+- **Bulk filesystem helpers:** `ADLSFileSystem` powers scripts and jobs that need to walk directories,
+  fetch batches of files, or mirror hive tables without re-implementing Azure SDK plumbing.
+- **Spark/Databricks bridges:** `abfss` and `uri` conversions keep analytics code agnostic to whether it
+  needs an `adls://`, `abfss://`, `https://`, or `dbfs://` view of the same path.
+- **Composable utilities:** Higher-level modules (cache, upload, copy, list) layer on top of those
+  imports so teams can opt into more advanced behavior without leaving the public API surface.
+
+## Key Modules
+
+| Component                              | Typical usage in the monorepo                                                                              |
+| -------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
+| `fqn`                                  | Parse, validate, and join ADLS paths; used when materializing model datasets and configuring pipelines.    |
+| `AdlsFqn`                              | Strongly typed value passed between tasks and tests to represent a single blob or directory.               |
+| `defaults` / `named_roots`             | Resolve environment-specific storage roots (`defaults.env_root()`, `named_roots.require(...)`).            |
+| `download_to_cache` (`cached` module)  | Bring a blob down to the shared read-only cache before analytics, feature builds, or test fixtures run.    |
+| `ADLSFileSystem` (`impl` module)       | Fetch or list entire directory trees and integrate with caching inside scripts and notebooks.              |
+| `abfss`                                | Translate `AdlsFqn` objects into `abfss://` URIs for Spark/Databricks jobs.                                 |
+| `uri`                                  | Normalize `adls://`, `abfss://`, `https://`, and `dbfs://` strings into `AdlsFqn` values (and vice versa). |
+| `global_client` / `shared_credential`  | Shared, fork-safe Azure clients and credentials backing the public helpers above.                          |
+
+## Example Usage
+
+1. Use the caching helpers and Source integration:
+
+   ```python
+   from thds.adls import cached, upload, source
+
+   cache_path = cached.download_to_cache("adls://acct/container/path/to/file")
+   src = upload("adls://acct/container/path/out.parquet", cache_path)
+   verified = source.get_with_hash(src.uri)
+   ```
+
+1. For CLI usage, run (from repo root):
+
+   ```bash
+   uv run python -m thds.adls.tools.download adls://acct/container/path/file
+   ```
+
+## Operational Notes
+
+- **Hash metadata:** Uploads attach `hash_xxh3_128_b64` automatically when the bytes are known. Download
+  completion back-fills missing hashes when permissions allow.
+- **Locks and concurrency:** Large transfers acquire per-path file locks to keep azcopy instances
+  cooperative. Global HTTP connection pools default to 100 but are configurable via `thds.core.config`.
+- **Error handling:** `BlobNotFoundError` and other ADLS-specific exceptions translate into custom error
+  types to simplify retries and diagnostics.
+- **Extensibility:** Additional hash algorithms can be registered by importing dependent packages (e.g.,
+  `blake3`). Named roots can be populated dynamically via environment-specific modules
+  (`thds.adls._thds_defaults` hook).
thds_adls-4.5.20260110021526.dist-info/RECORD CHANGED
@@ -1,31 +1,34 @@
-thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
+thds/adls/__init__.py,sha256=MXsKVIZ3uDayyKEXApWn-huhK9JcqlSl-12wE_lUcyo,1099
+thds/adls/_etag.py,sha256=amzbykSwmt5S426M_GXXr2vjwI1NhxYO_GvY-rA7E3Y,1779
 thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
 thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
 thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
 thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
+thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
+thds/adls/blobs.py,sha256=Rzw1gDlvI-CswUS8Wd-ebWxGxoKAkR7kC_OKn-QRxzc,869
 thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
 thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
-thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
+thds/adls/copy.py,sha256=-_5eDKRfhFfR7pGPs257cQL2x0JJTIXKDi3AB-fAtqc,7007
 thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
 thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
-thds/adls/download.py,sha256=IPg5nz_sGE7dX8DUQyWjG2D9z54PXLScap-pZzTUFTk,19142
+thds/adls/download.py,sha256=jdg8t5lTHhJmH7qLbwxUCCPSErPdEtHUEVQFLSFgRe4,19672
 thds/adls/errors.py,sha256=6NMLHtVNsWBRDXaes9yzHj9cwKOD9t1dwL4BltdtjhU,1895
 thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
 thds/adls/file_lock.py,sha256=yLak5XDpnIYwfUNdpGFbIGG64uEs98-yVscNpJlqMxM,3176
-thds/adls/file_properties.py,sha256=dhRtbsMNOYfExkEiy76wrLfrJ6IMQeN1Z3LIxgKceqY,2042
+thds/adls/file_properties.py,sha256=xtI2a0ahcqcJRernoDipeEbn2r_I_pMyR0ZSoapkDgc,2121
 thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
 thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
-thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
+thds/adls/hashes.py,sha256=t2EZHWNN7N0VkkH1CyE1l5BNAjmn78F5k03fUFErWK0,6289
 thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
-thds/adls/list_fast.py,sha256=yk0ydFiBa7U5JU3BCcIGCcrnS-J3yJaZbaZQ_Xj9xWU,4207
+thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
 thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
 thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
 thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
 thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
 thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
-thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
-thds/adls/source_tree.py,sha256=FqVXgvfYPiowrWhRsXBItjvB7t41JRI3sCFVAHxjwgI,2610
+thds/adls/source.py,sha256=G9C5ncWSxbLCARDoPnhsQIgvTlFuNlSucMUiUoRmt60,3056
+thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
 thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
 thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
 thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
@@ -38,8 +41,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
 thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
 thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
 thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
-thds_adls-4.3.20251014213630.dist-info/METADATA,sha256=oJ4p2IS6hLfeR1BEG7-hCKl8uvfGCifr82ZPJpjIYGY,587
-thds_adls-4.3.20251014213630.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-thds_adls-4.3.20251014213630.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
-thds_adls-4.3.20251014213630.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
-thds_adls-4.3.20251014213630.dist-info/RECORD,,
+thds_adls-4.5.20260110021526.dist-info/METADATA,sha256=f95s20SMLUIvjdtM6b4Y56_UWxEFVrwNskJ8fkEHkWY,4586
+thds_adls-4.5.20260110021526.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.5.20260110021526.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
+thds_adls-4.5.20260110021526.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.5.20260110021526.dist-info/RECORD,,
thds_adls-4.3.20251014213630.dist-info/METADATA DELETED
@@ -1,21 +0,0 @@
-Metadata-Version: 2.4
-Name: thds.adls
-Version: 4.3.20251014213630
-Summary: ADLS tools
-Author-email: Trilliant Health <info@trillianthealth.com>
-License: MIT
-Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.1
-Requires-Dist: aiostream>=0.4.5
-Requires-Dist: azure-identity>=1.9
-Requires-Dist: azure-storage-file-datalake>=12.6
-Requires-Dist: blake3
-Requires-Dist: filelock>=3.0
-Requires-Dist: xxhash
-Requires-Dist: thds-core
-
-# adls Library
-
-A port of `core.adls`.