thds.adls 4.3.20251014213630__py3-none-any.whl → 4.5.20260110021526__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/adls/__init__.py +2 -0
- thds/adls/_etag.py +46 -0
- thds/adls/blob_meta.py +38 -0
- thds/adls/blobs.py +23 -0
- thds/adls/copy.py +13 -1
- thds/adls/download.py +18 -3
- thds/adls/file_properties.py +4 -0
- thds/adls/hashes.py +23 -6
- thds/adls/list_fast.py +36 -13
- thds/adls/source.py +15 -0
- thds/adls/source_tree.py +7 -54
- thds_adls-4.5.20260110021526.dist-info/METADATA +79 -0
- {thds_adls-4.3.20251014213630.dist-info → thds_adls-4.5.20260110021526.dist-info}/RECORD +16 -13
- thds_adls-4.3.20251014213630.dist-info/METADATA +0 -21
- {thds_adls-4.3.20251014213630.dist-info → thds_adls-4.5.20260110021526.dist-info}/WHEEL +0 -0
- {thds_adls-4.3.20251014213630.dist-info → thds_adls-4.5.20260110021526.dist-info}/entry_points.txt +0 -0
- {thds_adls-4.3.20251014213630.dist-info → thds_adls-4.5.20260110021526.dist-info}/top_level.txt +0 -0
thds/adls/__init__.py
CHANGED
thds/adls/_etag.py
ADDED
@@ -0,0 +1,46 @@
+# this module is for handling some new functionality related to using etags as a fallback
+# for file hashing when the file properties do not include locally-verifiable hash information.
+import typing as ty
+from pathlib import Path
+
+import xxhash
+
+from thds.core import config, hash_cache, home, log, types
+
+ETAG_FAKE_HASH_NAME = "adls-azure-etag-fake"
+logger = log.getLogger(__name__)
+
+
+def extract_etag_bytes(etag_str: str) -> bytes:
+    # ADLS etags may or may not be quoted depending on the API used:
+    # list_blobs returns unquoted, get_*_properties returns quoted.
+    # Strip quotes first, then calculate byte length from the stripped string.
+    stripped = etag_str.strip('"')
+    return int(stripped, 16).to_bytes((len(stripped) - 2 + 1) // 2, byteorder="big")
+
+
+_ETAG_CACHE = config.item("cache-path", home.HOMEDIR() / ".thds/adls/xxhash-onto-etag", parse=Path)
+
+
+def add_to_etag_cache(local_path: types.StrOrPath, etag: bytes) -> hash_cache.Hash:
+    xxh_bytes = hash_cache.hash_file(local_path, xxhash.xxh3_128())
+    etag_path = _ETAG_CACHE() / xxh_bytes.hex()
+    etag_path.parent.mkdir(parents=True, exist_ok=True)
+    etag_path.write_bytes(etag)
+    logger.debug("Writing etag 'hash' to path at %s", etag_path)
+    return hash_cache.Hash(ETAG_FAKE_HASH_NAME, etag)
+
+
+def hash_file_fake_etag(local_path: types.StrOrPath) -> ty.Optional[hash_cache.Hash]:
+    try:
+        xxh_bytes = hash_cache.hash_file(local_path, xxhash.xxh3_128())
+    except FileNotFoundError:
+        return None
+
+    etag_path = _ETAG_CACHE() / xxh_bytes.hex()
+    if etag_path.is_file():
+        etag_bytes = etag_path.read_bytes()
+        logger.debug("Reusing etag 'fake hash' from path at %s", etag_path)
+        return hash_cache.Hash(ETAG_FAKE_HASH_NAME, etag_bytes)
+
+    return None
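For orientation: the etag is treated as a last-resort "fake hash". It cannot be recomputed from local bytes, so the module remembers which etag a local file corresponds to, keyed by the file's xxh3-128 digest. A minimal sketch of that round trip, with an illustrative local path and a made-up etag value of the shape ADLS returns:

```python
from thds.adls._etag import add_to_etag_cache, extract_etag_bytes, hash_file_fake_etag

local_file = "/tmp/example.parquet"  # assumed to exist locally after a download

# quoted form, as returned by get_*_properties; the value here is illustrative
etag_bytes = extract_etag_bytes('"0x8DC926AE88A9A1D"')

# record "this local content corresponds to that remote etag"
fake_hash = add_to_etag_cache(local_file, etag_bytes)

# later, the etag can be recovered from the local bytes alone, without calling ADLS
assert hash_file_fake_etag(local_file) == fake_hash
```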
thds/adls/blob_meta.py
ADDED
@@ -0,0 +1,38 @@
+import typing as ty
+from dataclasses import dataclass
+
+from azure.storage.blob import BlobProperties, ContainerClient
+
+from thds.core import hashing
+
+from . import hashes
+
+
+@dataclass
+class BlobMeta:
+    path: str
+    size: int
+    hash: ty.Optional[hashing.Hash]
+    metadata: dict[str, str]
+
+
+def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
+    return BlobMeta(
+        blob_props.name,
+        blob_props.size,
+        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
+        blob_props.metadata or {},
+    )
+
+
+def is_dir(blob_meta: BlobMeta) -> bool:
+    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
+
+
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
+def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
+    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
+        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
+        yield to_blob_meta(blob_props)
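This is the BlobMeta/`yield_blob_meta` code that previously lived in `source_tree.py` (removed later in this diff), now with an `is_dir` helper. A hypothetical listing loop over the new module might look like this; the storage account, container, and prefix are placeholders:

```python
from thds.adls import global_client
from thds.adls.blob_meta import is_dir, yield_blob_meta

client = global_client.get_global_blob_container_client("myaccount", "mycontainer")
for meta in yield_blob_meta(client, "datasets/2026/"):
    if is_dir(meta):  # hierarchical-namespace folders are listed as blobs with hdi_isfolder metadata
        continue
    print(meta.path, meta.size, meta.hash.algo if meta.hash else "<no hash>")
```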
thds/adls/blobs.py
ADDED
@@ -0,0 +1,23 @@
+"""TODO: better organize the blobs-related modules in thds.adls"""
+
+import heapq
+
+from thds.core import log, source
+
+from .fqn import AdlsFqn
+from .impl import ADLSFileSystem
+from .source import get_with_hash
+
+_logger = log.getLogger(__name__)
+
+
+def most_recent_blobs(blobs_fqn: AdlsFqn, top_n: int = 1) -> list[source.Source]:
+    """Gets top n most recently-created blob in the directory at `blobs_fqn`."""
+    _logger.info(f"Enumerating the most recent blobs in {blobs_fqn}")
+    fs = ADLSFileSystem(blobs_fqn.sa, blobs_fqn.container)
+    snapshots = fs.get_directory_info(blobs_fqn.path, recursive=False)
+    if not snapshots:
+        raise ValueError(f"No blobs found in {blobs_fqn}")
+    top_blobs = heapq.nlargest(top_n, snapshots, key=lambda x: x.creation_time or -1)
+
+    return [get_with_hash(blobs_fqn.root() / item.name) for item in top_blobs if item.name]
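A hedged usage sketch, assuming a directory of timestamped snapshot blobs; the `adls://` URI is illustrative and is parsed into an `AdlsFqn` with the package's own `parse_any` helper:

```python
from thds.adls.blobs import most_recent_blobs
from thds.adls.uri import parse_any

snapshots_fqn = parse_any("adls://myaccount/mycontainer/snapshots")
for src in most_recent_blobs(snapshots_fqn, top_n=3):
    print(src.uri)  # each result is a thds.core Source, carrying a hash when one is available
```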
thds/adls/copy.py
CHANGED
@@ -10,6 +10,7 @@ from azure.storage.blob import BlobSasPermissions, BlobServiceClient, UserDelega
 
 from thds.core import cache, log, parallel, thunks
 
+from ._etag import ETAG_FAKE_HASH_NAME
 from .file_properties import exists, get_blob_properties, get_file_properties, is_directory
 from .fqn import AdlsFqn
 from .global_client import get_global_blob_container_client, get_global_blob_service_client
@@ -60,7 +61,18 @@ def _copy_file(
     def hashes_exist_and_are_equal() -> bool:
         src_blob_props = src_blob_client.get_blob_properties()
         dest_blob_props = dest_blob_client.get_blob_properties()
-
+        # exclude etag from comparison since it's unique per blob and will always differ
+        src_hashes = {
+            k: v
+            for k, v in extract_hashes_from_props(src_blob_props).items()
+            if k != ETAG_FAKE_HASH_NAME
+        }
+        dest_hashes = {
+            k: v
+            for k, v in extract_hashes_from_props(dest_blob_props).items()
+            if k != ETAG_FAKE_HASH_NAME
+        }
+        return src_hashes == dest_hashes
 
     if dest_blob_client.exists():
         if hashes_exist_and_are_equal():
thds/adls/download.py
CHANGED
@@ -118,7 +118,7 @@ def _attempt_cache_hit(
     with log.logger_context(hash_for="before-download-dest"):
         local_hash = hash_path_if_exists(local_path)
     if local_hash == expected_hash:
-        logger.debug("Local path matches %s - no need to look further", expected_hash.algo)
+        logger.debug("Local path matches '%s' hash - no need to look further", expected_hash.algo)
         if cache:
             cache_path = cache.path(fqn)
             with log.logger_context(hash_for="before-download-cache"):
@@ -235,6 +235,12 @@ def _download_or_use_verified_cached_coroutine( # noqa: C901
 
     # attempt cache hits before taking a lock, to avoid contention for existing files.
     if file_result := attempt_cache_hit():
+        logger.debug(
+            "No download - found cached version of %s using expected %s at %s",
+            fqn,
+            expected_hash,
+            file_result.hit,
+        )
         return file_result # noqa: B901
 
     # No cache hit, so its time to prepare to download. if a cache was provided, we will
@@ -344,11 +350,15 @@ def download_or_use_verified(
     *,
     expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
+    set_remote_hash: bool = True,
 ) -> ty.Optional[Path]:
     """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.
 
     Note that you will get a logged warning if `local_path` already exists when you call
     this function.
+
+    If set_remote_hash is False, the function will not attempt to set hash metadata on the
+    remote file after download. This is useful when downloading from read-only locations.
     """
     file_properties = None
     try:
@@ -372,7 +382,9 @@ def download_or_use_verified(
         else:
             raise ValueError(f"Unexpected coroutine request: {co_request}")
     except StopIteration as si:
-        if
+        if set_remote_hash and (
+            meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash)
+        ):
             try:
                 logger.info(f"Setting missing {si.value.hash.algo} hash for {remote_key}")
                 assert file_properties
@@ -399,6 +411,7 @@ async def async_download_or_use_verified(
     *,
     expected_hash: ty.Optional[hashing.Hash] = None,
     cache: ty.Optional[Cache] = None,
+    set_remote_hash: bool = True,
 ) -> ty.Optional[Path]:
     file_properties = None
     try:
@@ -429,7 +442,9 @@ async def async_download_or_use_verified(
             raise ValueError(f"Unexpected coroutine request: {co_request}")
 
     except StopIteration as si:
-        if
+        if set_remote_hash and (
+            meta := hashes.create_hash_metadata_if_missing(file_properties, si.value.hash)
+        ):
             try:
                 logger.info(f"Setting missing Hash for {remote_key}")
                 assert file_properties
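The net effect of the new `set_remote_hash` flag is a guard around the post-download metadata back-fill. A distilled restatement of that gate (not code from the package; the callback and metadata key are illustrative) shows the two ways the remote write gets skipped:

```python
from typing import Callable, Mapping


def maybe_backfill_remote_hash(
    set_remote_hash: bool,
    missing_hash_metadata: Mapping[str, str],
    set_blob_metadata: Callable[[Mapping[str, str]], None],
) -> bool:
    """Write hash metadata back to the remote blob only when the caller allows it
    AND there is actually something missing to write (create_hash_metadata_if_missing
    returns an empty dict for etag-based or already-present hashes)."""
    if set_remote_hash and missing_hash_metadata:
        set_blob_metadata(missing_hash_metadata)
        return True
    return False


# a read-only location simply passes set_remote_hash=False, so nothing is ever written:
assert maybe_backfill_remote_hash(False, {"hash_xxh3_128_b64": "..."}, lambda m: None) is False
```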
thds/adls/file_properties.py
CHANGED
thds/adls/hashes.py
CHANGED
@@ -1,5 +1,6 @@
 import contextlib
 import os
+import sys
 import typing as ty
 from functools import partial
 
@@ -9,6 +10,7 @@ from thds.core import hash_cache, hashing, log, source, types
 from thds.core.hashing import Hash, SomehowReadable
 
 from . import errors, file_properties
+from ._etag import ETAG_FAKE_HASH_NAME, add_to_etag_cache, extract_etag_bytes, hash_file_fake_etag
 from .fqn import AdlsFqn
 
 logger = log.getLogger(__name__)
@@ -53,6 +55,9 @@ def hash_path_for_algo(
     algo: str,
 ) -> ty.Callable[[types.StrOrPath], ty.Optional[hashing.Hash]]:
     """Return a function that hashes a path for the given algorithm."""
+    if algo == ETAG_FAKE_HASH_NAME:
+        return hash_file_fake_etag
+
     return partial(_hash_path_if_exists, partial(hash_cache.filehash, algo))
 
 
@@ -77,6 +82,13 @@ def extract_hashes_from_props(
     hashes = list(extract_hashes_from_metadata(props.metadata or dict()))
     if props.content_settings and props.content_settings.content_md5:
         hashes.append(hashing.Hash("md5", bytes(props.content_settings.content_md5)))
+
+    if props.etag:
+        # this is the final fallback. it cannot be checked locally, but at least
+        # it can be checked against what exists remotely the next time we want to use it.
+        if etag_bytes := extract_etag_bytes(props.etag):
+            hashes.append(hashing.Hash(sys.intern(ETAG_FAKE_HASH_NAME), etag_bytes))
+
     return {h.algo: h for h in hashes}
 
 
@@ -87,10 +99,6 @@ def verify_hashes_before_and_after_download(
     fqn: AdlsFqn,
     local_dest: types.StrOrPath,
 ) -> ty.Iterator[None]:
-    # if expected_hash:
-    #     check_reasonable_md5b64(expected_md5b64)
-    # if remote_md5b64:
-    #     check_reasonable_md5b64(remote_md5b64)
     if remote_hash and expected_hash and remote_hash != expected_hash:
         raise errors.HashMismatchError(
             f"ADLS thinks the {remote_hash.algo} of {fqn} is {hashing.b64(remote_hash.bytes)},"
@@ -105,11 +113,16 @@ def verify_hashes_before_and_after_download(
         expected_algo = remote_hash.algo
 
     if not expected_algo:
-        # if we have neither a user-provided hash nor a remotely-
+        # if we have neither a user-provided hash nor a remotely-found hash, then we have nothing to check.
         return
 
+    assert expected_hash or remote_hash, "At least one of expected or remote hash must be present."
     with log.logger_context(hash_for="after-download"):
-
+        if expected_algo == ETAG_FAKE_HASH_NAME:
+            assert remote_hash, f"An Etag hash should always originate remotely: {fqn}"
+            local_hash = add_to_etag_cache(local_dest, remote_hash.bytes)
+        else:
+            local_hash = hash_cache.filehash(expected_algo, local_dest)
 
     if remote_hash and remote_hash != local_hash:
         raise errors.HashMismatchError(
@@ -142,6 +155,10 @@ def create_hash_metadata_if_missing(
         # without file properties, we can't match the etag when we try to set this.
         return dict()
 
+    if new_hash.algo == ETAG_FAKE_HASH_NAME:
+        # we never want to write etag-based hashes into metadata.
+        return dict()
+
     existing_metadata = file_properties.metadata or dict()
     if metadata_hash_b64_key(new_hash.algo) not in existing_metadata:
        return {**existing_metadata, **metadata_hash_dict(new_hash)}
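One practical consequence: `extract_hashes_from_props` will now almost always return at least one entry, because the etag pseudo-algorithm is appended whenever ADLS reports an etag. A consumer that wants a locally verifiable digest first might filter like this (illustrative only, not code from the package):

```python
from thds.adls._etag import ETAG_FAKE_HASH_NAME
from thds.adls.hashes import extract_hashes_from_props


def preferred_hash(props):
    """props is an Azure BlobProperties/FileProperties object; prefer any real,
    locally recomputable hash and fall back to the etag pseudo-hash last."""
    available = extract_hashes_from_props(props)
    for algo, candidate in available.items():
        if algo != ETAG_FAKE_HASH_NAME:
            return candidate
    return available.get(ETAG_FAKE_HASH_NAME)
```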
thds/adls/list_fast.py
CHANGED
@@ -6,20 +6,29 @@ client instead of the file system client.
 
 import typing as ty
 
-from thds.core import parallel, thunks
+from thds.core import log, parallel, source, thunks
 
-from . import global_client
+from . import blob_meta, global_client
+from . import source as adls_source
 from .fqn import AdlsFqn
-from .
+from .uri import UriIsh, parse_any
 
 R = ty.TypeVar("R")
 
 
+logger = log.getLogger(__name__)
+
+
 def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R]:
-    yield from (
+    yield from (
+        res
+        for _, res in parallel.failfast(
+            parallel.yield_all(parallel.create_keys(thunks), progress_logger=logger.debug)
+        )
+    )
 
 
-def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
+def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[blob_meta.BlobMeta]:
     """A fast way to find all blobs in a directory tree; we do this in parallel on
     subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).
 
@@ -29,7 +38,7 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
     """
     if layers <= 0:
         # directly yield the blobs
-        yield from yield_blob_meta(
+        yield from blob_meta.yield_blob_meta(
            global_client.get_global_blob_container_client(fqn.sa, fqn.container),
            fqn.path.rstrip("/") + "/",
        )
@@ -69,8 +78,10 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
 
     blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
 
-    def _get_blob_meta(blob_name: str) -> BlobMeta:
-        return to_blob_meta(
+    def _get_blob_meta(blob_name: str) -> blob_meta.BlobMeta:
+        return blob_meta.to_blob_meta(
+            blob_container_client.get_blob_client(blob_name).get_blob_properties()
+        )
 
     for blob_meta_iter in (
         _failfast_parallel((thunks.thunking(_get_blob_meta)(file) for file in files)),
@@ -86,10 +97,22 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
         yield from blob_meta_iter
 
 
-def
-    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
-
-
-def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
+def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[blob_meta.BlobMeta]:
     """Only for use within multi_layer_yield_blobs."""
     return list(multilayer_yield_blob_meta(fqn, layers))
+
+
+def multilayer_yield_sources(
+    fqn_or_uri: UriIsh,
+    layers: int = 1,
+    filter_: ty.Callable[[blob_meta.BlobMeta], bool] = lambda _: True,
+) -> ty.Iterator[source.Source]:
+    """
+    if you want to list directories and files, use `multilayer_yield_blob_meta` instead
+    """
+    fqn = parse_any(fqn_or_uri)
+    root = fqn.root()
+    for blob in multilayer_yield_blob_meta(fqn, layers):
+        if not blob_meta.is_dir(blob) and filter_(blob):
+            # ^ a "dir" Source would not make sense
+            yield adls_source.from_adls(root / blob.path, hash=blob.hash, size=blob.size)
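A usage sketch of the new helper; the URI is a placeholder, and passing an `adls://` string relies on `parse_any` accepting it (the rewritten `from_path` in `source_tree.py` below passes an `AdlsFqn` the same way):

```python
from thds.adls.list_fast import multilayer_yield_sources

# list only parquet blobs under an illustrative prefix, parallelizing over first-level subdirs
for src in multilayer_yield_sources(
    "adls://myaccount/mycontainer/datasets/2026",
    layers=1,
    filter_=lambda blob: blob.path.endswith(".parquet"),
):
    print(src.uri)
```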
thds/adls/source.py
CHANGED
@@ -6,6 +6,7 @@ from thds.core import source
 from thds.core.hashing import Hash
 
 from . import cached, hashes, md5
+from .cached import upload_through_cache
 from .errors import blob_not_found_translation
 from .file_properties import get_file_properties
 from .fqn import AdlsFqn
@@ -71,3 +72,17 @@ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
 def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
     """Meant for older use cases where we had an MD5"""
     return from_adls(uri_or_fqn, md5.to_hash(md5b64) if md5b64 else None)
+
+
+def _upload_handler(dest_uri: str) -> ty.Optional[source.Uploader]:
+    if dest_fqn := resolve_uri(dest_uri):
+
+        def upload_to_adls(local_path: Path, hash: ty.Optional[Hash]) -> None:
+            upload_through_cache(dest_fqn, local_path)
+
+        return upload_to_adls
+
+    return None
+
+
+source.register_upload_handler("thds.adls", _upload_handler)
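The module-level registration is what lets thds.core treat `adls://` destinations as uploadable. As an illustration only (not an API guarantee; the URI and local path are placeholders), the handler maps a destination URI onto an uploader callable, or returns None for URIs it does not own:

```python
from pathlib import Path

from thds.adls import source as adls_source  # importing the module performs the registration

uploader = adls_source._upload_handler("adls://myaccount/mycontainer/out/data.parquet")
if uploader is not None:
    uploader(Path("/tmp/data.parquet"), None)  # would push the local file through upload_through_cache
```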
thds/adls/source_tree.py
CHANGED
@@ -1,53 +1,6 @@
-import typing as ty
-from dataclasses import dataclass
-
-from azure.storage.blob import BlobProperties, ContainerClient
-
-from thds.core import hashing
 from thds.core.source.tree import SourceTree
 
-from . import fqn,
-
-# TODO refactor BlobMeta into its own module.
-
-
-@dataclass
-class BlobMeta:
-    path: str
-    size: int
-    hash: ty.Optional[hashing.Hash]
-    metadata: dict[str, str]
-
-
-def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
-    return BlobMeta(
-        blob_props.name,
-        blob_props.size,
-        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
-        blob_props.metadata or {},
-    )
-
-
-def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
-    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
-        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
-        yield to_blob_meta(blob_props)
-
-
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
-def list_blob_meta(
-    container_client: ContainerClient, root_dir: str, match_suffix: str = ""
-) -> ty.List[BlobMeta]:
-    """Gets the path (relative to the SA/container root), size, and _a_ hash (if available) of all blobs in a directory."""
-    return [
-        blob_meta
-        for blob_meta in yield_blob_meta(container_client, root_dir)
-        if blob_meta.size > 0
-        # container client lists directories as blobs with size 0
-        and blob_meta.path.endswith(match_suffix)
-    ]
+from . import fqn, list_fast, uri
 
 
 def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
@@ -56,12 +9,12 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
     """
     root_fqn = uri.parse_any(adls_path)
 
-    container_client = global_client.get_global_blob_container_client(root_fqn.sa, root_fqn.container)
-    container_root = root_fqn.root()
     return SourceTree(
-        sources=
-
-
-
+        sources=sorted(
+            list_fast.multilayer_yield_sources(
+                root_fqn, layers=0, filter_=lambda blob: blob.path.endswith(match_suffix)
+            ),
+            key=lambda src: src.uri,
+        ),
         higher_logical_root=fqn.split(root_fqn)[-1],
     )
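The rewritten `from_path` now delegates listing to `list_fast.multilayer_yield_sources` and sorts sources by URI, so the resulting `SourceTree` is deterministic. A usage sketch with a placeholder URI:

```python
from thds.adls import source_tree

# builds a SourceTree of every blob under the prefix whose path ends in .parquet
tree = source_tree.from_path("adls://myaccount/mycontainer/datasets/2026", match_suffix=".parquet")
```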
thds_adls-4.5.20260110021526.dist-info/METADATA
ADDED
@@ -0,0 +1,79 @@
+Metadata-Version: 2.4
+Name: thds.adls
+Version: 4.5.20260110021526
+Summary: ADLS tools
+Author-email: Trilliant Health <info@trillianthealth.com>
+License: MIT
+Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.1
+Requires-Dist: aiostream>=0.4.5
+Requires-Dist: azure-identity>=1.9
+Requires-Dist: azure-storage-file-datalake>=12.6
+Requires-Dist: blake3
+Requires-Dist: filelock>=3.0
+Requires-Dist: xxhash
+Requires-Dist: thds-core
+
+# thds.adls
+
+A high-performance Azure Data Lake Storage (ADLS Gen2) client for the THDS monorepo. It wraps the Azure
+SDK with hash-aware caching, azcopy acceleration, and shared client/credential plumbing so applications
+can transfer large blob datasets quickly and reliably.
+
+## Highlights
+
+- **Environment-aware paths first:** Almost every consumer starts by importing `fqn`, `AdlsFqn`, and
+  `defaults.env_root()` to build storage-account/container URIs that follow the current THDS environment.
+- **Cache-backed reads:** `download_to_cache` is the standard entry point for pulling blobs down with a
+  verified hash so local workflows, tests, and pipelines can operate on read-only copies.
+- **Bulk filesystem helpers:** `ADLSFileSystem` powers scripts and jobs that need to walk directories,
+  fetch batches of files, or mirror hive tables without re-implementing Azure SDK plumbing.
+- **Spark/Databricks bridges:** `abfss` and `uri` conversions keep analytics code agnostic to whether it
+  needs an `adls://`, `abfss://`, `https://`, or `dbfs://` view of the same path.
+- **Composable utilities:** Higher-level modules (cache, upload, copy, list) layer on top of those
+  imports so teams can opt into more advanced behavior without leaving the public API surface.
+
+## Key Modules
+
+| Component | Typical usage in the monorepo |
+| ------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
+| `fqn` | Parse, validate, and join ADLS paths; used when materializing model datasets and configuring pipelines. |
+| `AdlsFqn` | Strongly typed value passed between tasks and tests to represent a single blob or directory. |
+| `defaults` / `named_roots` | Resolve environment-specific storage roots (`defaults.env_root()`, `named_roots.require(...)`). |
+| `download_to_cache` (`cached` module) | Bring a blob down to the shared read-only cache before analytics, feature builds, or test fixtures run. |
+| `ADLSFileSystem` (`impl` module) | Fetch or list entire directory trees and integrate with caching inside scripts and notebooks. |
+| `abfss` | Translate `AdlsFqn` objects into `abfss://` URIs for Spark/Databricks jobs. |
+| `uri` | Normalize `adls://`, `abfss://`, `https://`, and `dbfs://` strings into `AdlsFqn` values (and vice versa). |
+| `global_client` / `shared_credential` | Shared, fork-safe Azure clients and credentials backing the public helpers above. |
+
+## Example Usage
+
+1. Use the caching helpers and Source integration:
+
+   ```python
+   from thds.adls import cached, upload, source
+
+   cache_path = cached.download_to_cache("adls://acct/container/path/to/file")
+   src = upload("adls://acct/container/path/out.parquet", cache_path)
+   verified = source.get_with_hash(src.uri)
+   ```
+
+1. For CLI usage, run (from repo root):
+
+   ```bash
+   uv run python -m thds.adls.tools.download adls://acct/container/path/file
+   ```
+
+## Operational Notes
+
+- **Hash metadata:** Uploads attach `hash_xxh3_128_b64` automatically when the bytes are known. Download
+  completion back-fills missing hashes when permissions allow.
+- **Locks and concurrency:** Large transfers acquire per-path file locks to keep azcopy instances
+  cooperative. Global HTTP connection pools default to 100 but are configurable via `thds.core.config`.
+- **Error handling:** `BlobNotFoundError` and other ADLS-specific exceptions translate into custom error
+  types to simplify retries and diagnostics.
+- **Extensibility:** Additional hash algorithms can be registered by importing dependent packages (e.g.,
+  `blake3`). Named roots can be populated dynamically via environment-specific modules
+  (`thds.adls._thds_defaults` hook).
{thds_adls-4.3.20251014213630.dist-info → thds_adls-4.5.20260110021526.dist-info}/RECORD
RENAMED
@@ -1,31 +1,34 @@
-thds/adls/__init__.py,sha256=
+thds/adls/__init__.py,sha256=MXsKVIZ3uDayyKEXApWn-huhK9JcqlSl-12wE_lUcyo,1099
+thds/adls/_etag.py,sha256=amzbykSwmt5S426M_GXXr2vjwI1NhxYO_GvY-rA7E3Y,1779
 thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
 thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
 thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
 thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
+thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
+thds/adls/blobs.py,sha256=Rzw1gDlvI-CswUS8Wd-ebWxGxoKAkR7kC_OKn-QRxzc,869
 thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
 thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
-thds/adls/copy.py,sha256
+thds/adls/copy.py,sha256=-_5eDKRfhFfR7pGPs257cQL2x0JJTIXKDi3AB-fAtqc,7007
 thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
 thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
-thds/adls/download.py,sha256=
+thds/adls/download.py,sha256=jdg8t5lTHhJmH7qLbwxUCCPSErPdEtHUEVQFLSFgRe4,19672
 thds/adls/errors.py,sha256=6NMLHtVNsWBRDXaes9yzHj9cwKOD9t1dwL4BltdtjhU,1895
 thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
 thds/adls/file_lock.py,sha256=yLak5XDpnIYwfUNdpGFbIGG64uEs98-yVscNpJlqMxM,3176
-thds/adls/file_properties.py,sha256=
+thds/adls/file_properties.py,sha256=xtI2a0ahcqcJRernoDipeEbn2r_I_pMyR0ZSoapkDgc,2121
 thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
 thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
-thds/adls/hashes.py,sha256=
+thds/adls/hashes.py,sha256=t2EZHWNN7N0VkkH1CyE1l5BNAjmn78F5k03fUFErWK0,6289
 thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
-thds/adls/list_fast.py,sha256=
+thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
 thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
 thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
 thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
 thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
 thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
-thds/adls/source.py,sha256=
-thds/adls/source_tree.py,sha256=
+thds/adls/source.py,sha256=G9C5ncWSxbLCARDoPnhsQIgvTlFuNlSucMUiUoRmt60,3056
+thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
 thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
 thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
 thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
@@ -38,8 +41,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
 thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
 thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
 thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
-thds_adls-4.
-thds_adls-4.
-thds_adls-4.
-thds_adls-4.
-thds_adls-4.
+thds_adls-4.5.20260110021526.dist-info/METADATA,sha256=f95s20SMLUIvjdtM6b4Y56_UWxEFVrwNskJ8fkEHkWY,4586
+thds_adls-4.5.20260110021526.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.5.20260110021526.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
+thds_adls-4.5.20260110021526.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.5.20260110021526.dist-info/RECORD,,
thds_adls-4.3.20251014213630.dist-info/METADATA
DELETED
@@ -1,21 +0,0 @@
-Metadata-Version: 2.4
-Name: thds.adls
-Version: 4.3.20251014213630
-Summary: ADLS tools
-Author-email: Trilliant Health <info@trillianthealth.com>
-License: MIT
-Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.1
-Requires-Dist: aiostream>=0.4.5
-Requires-Dist: azure-identity>=1.9
-Requires-Dist: azure-storage-file-datalake>=12.6
-Requires-Dist: blake3
-Requires-Dist: filelock>=3.0
-Requires-Dist: xxhash
-Requires-Dist: thds-core
-
-# adls Library
-
-A port of `core.adls`.
{thds_adls-4.3.20251014213630.dist-info → thds_adls-4.5.20260110021526.dist-info}/WHEEL
RENAMED
File without changes
{thds_adls-4.3.20251014213630.dist-info → thds_adls-4.5.20260110021526.dist-info}/entry_points.txt
RENAMED
File without changes
{thds_adls-4.3.20251014213630.dist-info → thds_adls-4.5.20260110021526.dist-info}/top_level.txt
RENAMED
File without changes