thds.adls 4.2.20250926202021-py3-none-any.whl → 4.4.20251117191451-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/adls/__init__.py +1 -0
- thds/adls/_upload.py +15 -10
- thds/adls/blob_meta.py +38 -0
- thds/adls/download.py +3 -3
- thds/adls/{download_lock.py → file_lock.py} +12 -12
- thds/adls/list_fast.py +37 -14
- thds/adls/source.py +8 -4
- thds/adls/source_tree.py +7 -54
- thds/adls/upload.py +14 -14
- thds_adls-4.4.20251117191451.dist-info/METADATA +79 -0
- {thds_adls-4.2.20250926202021.dist-info → thds_adls-4.4.20251117191451.dist-info}/RECORD +14 -13
- thds_adls-4.2.20250926202021.dist-info/METADATA +0 -21
- {thds_adls-4.2.20250926202021.dist-info → thds_adls-4.4.20251117191451.dist-info}/WHEEL +0 -0
- {thds_adls-4.2.20250926202021.dist-info → thds_adls-4.4.20251117191451.dist-info}/entry_points.txt +0 -0
- {thds_adls-4.2.20250926202021.dist-info → thds_adls-4.4.20251117191451.dist-info}/top_level.txt +0 -0
thds/adls/__init__.py
CHANGED
thds/adls/_upload.py
CHANGED
@@ -38,17 +38,22 @@ def _try_default_hash(data: hashes.AnyStrSrc) -> ty.Optional[hashing.Hash]:
     return None
 
 
+UploadSrc = ty.Union[Path, bytes, ty.IO[bytes], ty.Iterable[bytes]]
+
+
+def upload_src_len(upload_src: UploadSrc, default: int = 0) -> int:
+    if isinstance(upload_src, Path) and upload_src.exists():
+        return upload_src.stat().st_size
+    try:
+        return len(upload_src)  # type: ignore
+    except TypeError as te:
+        logger.debug(f"failed to get length? {repr(te)} for {upload_src!r}")
+    return default
+
+
 def _too_small_to_skip_upload(data: hashes.AnyStrSrc, min_size_for_remote_check: int) -> bool:
-    …
-    …
-            return data.stat().st_size
-        try:
-            return len(data)  # type: ignore
-        except TypeError as te:
-            logger.debug(f"failed to get length? {repr(te)} for {data}")
-            return min_size_for_remote_check + 1
-    …
-    return _len() < min_size_for_remote_check
+    len_ = upload_src_len(data) or min_size_for_remote_check + 1
+    return len_ < min_size_for_remote_check
 
 
 class UploadDecision(ty.NamedTuple):
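
The new `upload_src_len` helper centralizes the length probing that `_too_small_to_skip_upload` used to do inline. A small sketch of how it resolves each member of the `UploadSrc` union, based only on the logic shown in the hunk above (the sample values are made up):

```python
from io import BytesIO
from pathlib import Path

from thds.adls._upload import upload_src_len

upload_src_len(b"abc")                          # bytes support len() -> 3
upload_src_len(Path("/tmp/nonexistent"))        # missing Path -> len() fails -> default (0)
upload_src_len(BytesIO(b"abc"))                 # streams have no len() -> default (0)
upload_src_len(iter([b"a", b"b"]), default=-1)  # chunk iterators -> -1
```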
thds/adls/blob_meta.py
ADDED
@@ -0,0 +1,38 @@
+import typing as ty
+from dataclasses import dataclass
+
+from azure.storage.blob import BlobProperties, ContainerClient
+
+from thds.core import hashing
+
+from . import hashes
+
+
+@dataclass
+class BlobMeta:
+    path: str
+    size: int
+    hash: ty.Optional[hashing.Hash]
+    metadata: dict[str, str]
+
+
+def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
+    return BlobMeta(
+        blob_props.name,
+        blob_props.size,
+        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
+        blob_props.metadata or {},
+    )
+
+
+def is_dir(blob_meta: BlobMeta) -> bool:
+    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
+
+
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
+def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
+    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
+        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
+        yield to_blob_meta(blob_props)
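
`BlobMeta`, `to_blob_meta`, and `yield_blob_meta` move out of `source_tree.py` into this dedicated module, and `is_dir` now lives beside them. A hedged sketch of listing a prefix and skipping the zero-byte directory-marker blobs; the storage account, container, and prefix are placeholders:

```python
from thds.adls import global_client
from thds.adls.blob_meta import is_dir, yield_blob_meta

# "myaccount" / "mycontainer" / "datasets/2025/" are placeholders.
client = global_client.get_global_blob_container_client("myaccount", "mycontainer")

for meta in yield_blob_meta(client, "datasets/2025/"):
    if is_dir(meta):  # directory markers carry hdi_isfolder=true metadata
        continue
    print(meta.path, meta.size, meta.hash)
```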
thds/adls/download.py
CHANGED
@@ -16,7 +16,7 @@ from thds.core.types import StrOrPath
 
 from . import azcopy, errors, etag, hashes
 from ._progress import report_download_progress
-from .download_lock import download_lock
+from .file_lock import file_lock
 from .fqn import AdlsFqn
 from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache
 
@@ -240,12 +240,12 @@ def _download_or_use_verified_cached_coroutine( # noqa: C901
     # No cache hit, so its time to prepare to download. if a cache was provided, we will
     # _put_ the resulting file in it.
 
-    …
+    file_lock_str = str(cache.path(fqn) if cache else local_path)
     # create lockfile name from the (shared) cache path if present, otherwise the final
     # destination. Non-cache users may then still incur multiple downloads in parallel,
     # but if you wanted to coordinate then you should probably have been using the global
     # cache in the first place.
-    _dl_scope.enter(…
+    _dl_scope.enter(file_lock(file_lock_str))
 
     # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
     if file_result := attempt_cache_hit():

thds/adls/{download_lock.py → file_lock.py}
RENAMED

@@ -9,18 +9,18 @@ from thds.core import config, home, log
 
 from .md5 import hex_md5_str
 
-…
+FILELOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/file-locks", parse=Path)
 _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
 _CLEAN_UP_LOCKFILES_EVERY = timedelta(hours=1).total_seconds()
 _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic() - _CLEAN_UP_LOCKFILES_EVERY
 logger = log.getLogger(__name__)
 
 
-def _clean_download_locks() -> int:
+def _clean_file_locks() -> int:
     deleted = 0
     deletion_threshold = time.time() - _CLEAN_UP_LOCKFILES_AFTER_TIME.total_seconds()
     try:
-        for f in …
+        for f in FILELOCKS_DIR().rglob("*"):
             fstat = f.stat()
             if stat.S_ISREG(fstat.st_mode) and fstat.st_mtime < deletion_threshold:
                 f.unlink()
@@ -29,20 +29,20 @@ def _clean_download_locks() -> int:
         # this should be, hopefully, both very rare and completely inconsequential as to
         # program correctness. if you see this happen multiple times, you may have some
         # read-only files or something and want to manually clean up this directory.
-        logger.exception("Failed to clean …
+        logger.exception("Failed to clean file locks directory.")
     return deleted
 
 
-def …
+def _occasionally_clean_file_locks():
     global _LAST_CLEANED_BY_THIS_PROCESS
     # do this about once an hour
     if time.monotonic() > _LAST_CLEANED_BY_THIS_PROCESS + _CLEAN_UP_LOCKFILES_EVERY:
         _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic()
         # minor race condition with other threads but it doesn't really matter.
-        …
+        _clean_file_locks()
 
 
-def download_lock(download_unique_str: str) -> FileLock:
+def file_lock(lock_unique_str: str, locktype: str = "download") -> FileLock:
     """Note that the lockfiles will never be deleted automatically.
     https://py-filelock.readthedocs.io/en/latest/api.html#filelock.BaseFileLock.release
 
@@ -50,7 +50,7 @@ def download_lock(download_unique_str: str) -> FileLock:
     https://stackoverflow.com/questions/58098634/why-does-the-python-filelock-library-delete-lockfiles-on-windows-but-not-unix
 
     This means local developers would have a whole bunch of zero-byte files in their
-    …
+    file locks directory. So, we take a slightly idiosyncratic approach to cleaning
     this up: not wanting to run this code on every download, but also not wanting
     developers to see an infinitely-growing mess. Since parallel downloads will
    (generally) not constitute a correctness issue, the 'safest' time to clean it up will
@@ -58,11 +58,11 @@ def download_lock(download_unique_str: str) -> FileLock:
     we can get rid of old lockfiles after they've existed for more than 24 hours, since
     it's quite rare that a download would last that long.
     """
-    …
-    …
+    lock_type_dir = FILELOCKS_DIR() / locktype
+    lock_type_dir.mkdir(parents=True, exist_ok=True)
+    _occasionally_clean_file_locks()
     return FileLock(
-        …
-        / (download_unique_str.split("/")[-1][:50] + hex_md5_str(download_unique_str)),
+        lock_type_dir / (lock_unique_str.split("/")[-1][:50] + hex_md5_str(lock_unique_str)),
         # is_singleton=True,
         # critical for keeping this reentrant without passing the lock around.
         # see https://github.com/tox-dev/filelock/issues/315#issuecomment-2016797681
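
The rename generalizes `download_lock` into `file_lock(lock_unique_str, locktype=...)`, with lockfiles grouped under `~/.thds/adls/file-locks/<locktype>/`. A minimal sketch of serializing work on one destination across processes, assuming the returned `filelock.FileLock` is used as a context manager (its normal API); the URI is a placeholder:

```python
from thds.adls.file_lock import file_lock

dest = "adls://myaccount/mycontainer/path/big-file.parquet"  # placeholder

# Two processes locking the same string contend on the same lockfile,
# which is how upload.py and download.py coordinate after this change.
with file_lock(dest, locktype="upload"):
    pass  # perform the transfer while holding the lock
```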
thds/adls/list_fast.py
CHANGED
@@ -6,20 +6,29 @@ client instead of the file system client.
 
 import typing as ty
 
-from thds.core import parallel, thunks
+from thds.core import log, parallel, source, thunks
 
-from . import global_client
+from . import blob_meta, global_client
+from . import source as adls_source
 from .fqn import AdlsFqn
-from .…
+from .uri import UriIsh, parse_any
 
 R = ty.TypeVar("R")
 
 
+logger = log.getLogger(__name__)
+
+
 def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R]:
-    yield from (…
+    yield from (
+        res
+        for _, res in parallel.failfast(
+            parallel.yield_all(parallel.create_keys(thunks), progress_logger=logger.debug)
+        )
+    )
 
 
-def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
+def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[blob_meta.BlobMeta]:
     """A fast way to find all blobs in a directory tree; we do this in parallel on
     subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).
 
@@ -29,9 +38,9 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
     """
     if layers <= 0:
         # directly yield the blobs
-        yield from yield_blob_meta(
+        yield from blob_meta.yield_blob_meta(
             global_client.get_global_blob_container_client(fqn.sa, fqn.container),
-            fqn.path,
+            fqn.path.rstrip("/") + "/",
         )
         return
 
@@ -69,8 +78,10 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
 
     blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
 
-    def _get_blob_meta(blob_name: str) -> BlobMeta:
-        return to_blob_meta(…
+    def _get_blob_meta(blob_name: str) -> blob_meta.BlobMeta:
+        return blob_meta.to_blob_meta(
+            blob_container_client.get_blob_client(blob_name).get_blob_properties()
+        )
 
     for blob_meta_iter in (
         _failfast_parallel((thunks.thunking(_get_blob_meta)(file) for file in files)),
@@ -86,10 +97,22 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
         yield from blob_meta_iter
 
 
-def …
-    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
-
-
-def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
+def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[blob_meta.BlobMeta]:
     """Only for use within multi_layer_yield_blobs."""
     return list(multilayer_yield_blob_meta(fqn, layers))
+
+
+def multilayer_yield_sources(
+    fqn_or_uri: UriIsh,
+    layers: int = 1,
+    filter_: ty.Callable[[blob_meta.BlobMeta], bool] = lambda _: True,
+) -> ty.Iterator[source.Source]:
+    """
+    if you want to list directories and files, use `multilayer_yield_blob_meta` instead
+    """
+    fqn = parse_any(fqn_or_uri)
+    root = fqn.root()
+    for blob in multilayer_yield_blob_meta(fqn, layers):
+        if not blob_meta.is_dir(blob) and filter_(blob):
+            # ^ a "dir" Source would not make sense
+            yield adls_source.from_adls(root / blob.path, hash=blob.hash, size=blob.size)
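
`multilayer_yield_sources` is the new Source-producing wrapper over `multilayer_yield_blob_meta`: it skips directory blobs and applies an optional filter. A hedged sketch with a placeholder URI, filtering to parquet blobs:

```python
from thds.adls import list_fast

# placeholder URI; anything parse_any accepts (adls://, abfss://, https://, ...) should work
for src in list_fast.multilayer_yield_sources(
    "adls://myaccount/mycontainer/datasets/2025/",
    layers=1,
    filter_=lambda blob: blob.path.endswith(".parquet"),
):
    print(src.uri)
```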
thds/adls/source.py
CHANGED
@@ -30,7 +30,9 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
 source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
 
 
-def from_adls(…
+def from_adls(
+    uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None, size: int = 0
+) -> source.Source:
     """Flexible, public interface to creating Sources from any ADLS-like reference.
 
     Does NOT automatically fetch a checksumming hash from the ADLS URI if it's not
@@ -40,7 +42,7 @@ def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None
     r_fqn = resolve_any(uri_or_fqn)
     if not r_fqn:
         raise ValueError(f"Could not resolve {uri_or_fqn} to an ADLS FQN")
-    return source.Source(str(r_fqn), hash)
+    return source.Source(str(r_fqn), hash, size)
 
 
 source.register_from_uri_handler(
@@ -55,13 +57,15 @@ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
     """
     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
     with blob_not_found_translation(fqn):
-        …
+        props = get_file_properties(fqn)
+        uri_hashes = hashes.extract_hashes_from_props(props)
     if not uri_hashes:
         raise ValueError(
             f"ADLS file {fqn} must have a hash to use this function. "
             "If you know the hash, use `from_adls` with the hash parameter."
         )
-    …
+    size = int(props.get("size")) or 0
+    return from_adls(fqn, next(iter(uri_hashes.values())), size)
 
 
 def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
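
`from_adls` gains a `size` argument, and `get_with_hash` now forwards both the hash and the size pulled from the blob's properties. A small sketch of both entry points; the URI is a placeholder, and it assumes the returned `Source` exposes `hash` and `size` attributes as the positional usage in the hunk above suggests:

```python
from thds.adls import source as adls_source

uri = "adls://myaccount/mycontainer/models/weights.bin"  # placeholder

# Reads the blob properties and fails if no hash is stored on the blob:
src = adls_source.get_with_hash(uri)

# If the hash and size are already known, skip the properties round-trip:
same = adls_source.from_adls(uri, hash=src.hash, size=src.size)
```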
thds/adls/source_tree.py
CHANGED
@@ -1,53 +1,6 @@
-import typing as ty
-from dataclasses import dataclass
-
-from azure.storage.blob import BlobProperties, ContainerClient
-
-from thds.core import hashing
 from thds.core.source.tree import SourceTree
 
-from . import fqn, …
-
-# TODO refactor BlobMeta into its own module.
-
-
-@dataclass
-class BlobMeta:
-    path: str
-    size: int
-    hash: ty.Optional[hashing.Hash]
-    metadata: dict[str, str]
-
-
-def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
-    return BlobMeta(
-        blob_props.name,
-        blob_props.size,
-        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
-        blob_props.metadata or {},
-    )
-
-
-def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
-    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
-        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
-        yield to_blob_meta(blob_props)
-
-
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
-def list_blob_meta(
-    container_client: ContainerClient, root_dir: str, match_suffix: str = ""
-) -> ty.List[BlobMeta]:
-    """Gets the path (relative to the SA/container root), size, and _a_ hash (if available) of all blobs in a directory."""
-    return [
-        blob_meta
-        for blob_meta in yield_blob_meta(container_client, root_dir)
-        if blob_meta.size > 0
-        # container client lists directories as blobs with size 0
-        and blob_meta.path.endswith(match_suffix)
-    ]
+from . import fqn, list_fast, uri
 
 
 def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
@@ -56,12 +9,12 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
     """
     root_fqn = uri.parse_any(adls_path)
 
-    container_client = global_client.get_global_blob_container_client(root_fqn.sa, root_fqn.container)
-    container_root = root_fqn.root()
     return SourceTree(
-        sources=…
-        …
-        …
-        …
+        sources=sorted(
+            list_fast.multilayer_yield_sources(
+                root_fqn, layers=0, filter_=lambda blob: blob.path.endswith(match_suffix)
+            ),
+            key=lambda src: src.uri,
+        ),
         higher_logical_root=fqn.split(root_fqn)[-1],
     )
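
`from_path` now builds the tree from `list_fast.multilayer_yield_sources` (with `layers=0`) and sorts the results by URI. A hedged usage sketch with a placeholder path, assuming `SourceTree` exposes its `sources` the same way they are passed to the constructor:

```python
from thds.adls import source_tree

# placeholder path; match_suffix keeps only blobs whose paths end with ".parquet"
tree = source_tree.from_path(
    "adls://myaccount/mycontainer/tables/claims/", match_suffix=".parquet"
)
for src in tree.sources:
    print(src.uri)
```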
thds/adls/upload.py
CHANGED
@@ -15,8 +15,9 @@ from thds.core import files, fretry, link, log, scope, source, tmp
 
 from . import azcopy, hashes
 from ._progress import report_upload_progress
-from ._upload import upload_decision_and_metadata
+from ._upload import UploadSrc, upload_decision_and_metadata, upload_src_len
 from .conf import UPLOAD_FILE_MAX_CONCURRENCY
+from .file_lock import file_lock
 from .fqn import AdlsFqn
 from .global_client import get_global_blob_container_client
 from .ro_cache import Cache
@@ -25,9 +26,6 @@ logger = log.getLogger(__name__)
 _SLOW_CONNECTION_WORKAROUND = 14400  # seconds
 
 
-UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
-
-
 def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
     @scope.bound
     def _try_write_through() -> bool:
@@ -40,8 +38,8 @@ def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Op
         out = scope.enter(tmp.temppath_same_fs(local_cache_path))
         if hasattr(data, "read") and hasattr(data, "seek"):
             with open(out, "wb") as f:
-                f.write(data.read())
-                data.seek(0)
+                f.write(data.read())
+                data.seek(0)
         link.link_or_copy(out, local_cache_path)
         return True
 
@@ -101,9 +99,12 @@ def upload(
     # we always use the original source file to upload, not the cached path,
     # because uploading from a shared location risks race conditions.
 
+    scope.enter(file_lock(str(dest_), locktype="upload"))
+
     blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
     blob_client = blob_container_client.get_blob_client(dest_.path)
     decision = upload_decision_and_metadata(blob_client.get_blob_properties, src)  # type: ignore [arg-type]
+    n_bytes = upload_src_len(src, default=0)
 
     def source_from_meta() -> source.Source:
         best_hash = next(iter(hashes.extract_hashes_from_metadata(decision.metadata)), None)
@@ -111,17 +112,14 @@ def upload(
             assert best_hash, "A hash should always be calculable for a local path."
             return source.from_file(src, hash=best_hash, uri=str(dest_))
 
-        return source.from_uri(str(dest_), hash=best_hash)
+        return source.from_uri(str(dest_), hash=best_hash, size=n_bytes)
 
     if decision.upload_required:
         # set up some bookkeeping
-        n_bytes = None  # if we pass 0 to upload_blob, it truncates the write now
         bytes_src: ty.Union[bytes, ty.IO, ty.Iterable[bytes]]
         if isinstance(src, Path):
-            n_bytes = src.stat().st_size
             bytes_src = scope.enter(open(src, "rb"))
         elif isinstance(src, bytes):
-            n_bytes = len(src)
             bytes_src = src
         else:
             bytes_src = src
@@ -129,7 +127,7 @@ def upload(
         if "metadata" in upload_data_kwargs:
             decision.metadata.update(upload_data_kwargs.pop("metadata"))
 
-        if azcopy.upload.should_use_azcopy(n_bytes…
+        if azcopy.upload.should_use_azcopy(n_bytes) and isinstance(src, Path):
             logger.info("Using azcopy to upload %s to %s", src, dest_)
             try:
                 azcopy.upload.run(
@@ -137,7 +135,7 @@ def upload(
                        src, dest_, content_type=content_type, metadata=decision.metadata, overwrite=True
                    ),
                    dest_,
-                    n_bytes…
+                    n_bytes,
                )
                return source_from_meta()
 
@@ -155,9 +153,11 @@ def upload(
         # This is both faster, as well as simpler to reason about, and
         # in fact was the behavior I had been assuming all along...
         blob_client.upload_blob(
-            report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes…
+            report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes),
             overwrite=True,
-            length=…
+            length=(
+                n_bytes if n_bytes > 0 else None
+            ),  # if we pass 0 to upload_blob, it truncates the write now
             content_settings=upload_content_settings,
             connection_timeout=_SLOW_CONNECTION_WORKAROUND,
             max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
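
Uploads now take a per-destination `file_lock(..., locktype="upload")` and compute `n_bytes` once up front via `upload_src_len`; when the length is unknown, `upload_blob` is called with `length=None` because a literal 0 would truncate the write. A sketch of the caller-visible surface, mirroring the README example further down; the paths are placeholders:

```python
from pathlib import Path

from thds.adls.upload import upload

dest = "adls://myaccount/mycontainer/reports/report.parquet"  # placeholder

src1 = upload(dest, Path("out/report.parquet"))  # Path: size known, azcopy may be used
src2 = upload(dest, b"small payload")            # bytes: len() supplies the size
# A stream or chunk iterator of unknown length uploads with length=None,
# so the blob is not truncated to zero bytes.
```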
thds_adls-4.4.20251117191451.dist-info/METADATA
ADDED

@@ -0,0 +1,79 @@
+Metadata-Version: 2.4
+Name: thds.adls
+Version: 4.4.20251117191451
+Summary: ADLS tools
+Author-email: Trilliant Health <info@trillianthealth.com>
+License: MIT
+Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.1
+Requires-Dist: aiostream>=0.4.5
+Requires-Dist: azure-identity>=1.9
+Requires-Dist: azure-storage-file-datalake>=12.6
+Requires-Dist: blake3
+Requires-Dist: filelock>=3.0
+Requires-Dist: xxhash
+Requires-Dist: thds-core
+
+# thds.adls
+
+A high-performance Azure Data Lake Storage (ADLS Gen2) client for the THDS monorepo. It wraps the Azure
+SDK with hash-aware caching, azcopy acceleration, and shared client/credential plumbing so applications
+can transfer large blob datasets quickly and reliably.
+
+## Highlights
+
+- **Environment-aware paths first:** Almost every consumer starts by importing `fqn`, `AdlsFqn`, and
+  `defaults.env_root()` to build storage-account/container URIs that follow the current THDS environment.
+- **Cache-backed reads:** `download_to_cache` is the standard entry point for pulling blobs down with a
+  verified hash so local workflows, tests, and pipelines can operate on read-only copies.
+- **Bulk filesystem helpers:** `ADLSFileSystem` powers scripts and jobs that need to walk directories,
+  fetch batches of files, or mirror hive tables without re-implementing Azure SDK plumbing.
+- **Spark/Databricks bridges:** `abfss` and `uri` conversions keep analytics code agnostic to whether it
+  needs an `adls://`, `abfss://`, `https://`, or `dbfs://` view of the same path.
+- **Composable utilities:** Higher-level modules (cache, upload, copy, list) layer on top of those
+  imports so teams can opt into more advanced behavior without leaving the public API surface.
+
+## Key Modules
+
+| Component                              | Typical usage in the monorepo                                                                               |
+| -------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| `fqn`                                  | Parse, validate, and join ADLS paths; used when materializing model datasets and configuring pipelines.    |
+| `AdlsFqn`                              | Strongly typed value passed between tasks and tests to represent a single blob or directory.               |
+| `defaults` / `named_roots`             | Resolve environment-specific storage roots (`defaults.env_root()`, `named_roots.require(...)`).            |
+| `download_to_cache` (`cached` module)  | Bring a blob down to the shared read-only cache before analytics, feature builds, or test fixtures run.    |
+| `ADLSFileSystem` (`impl` module)       | Fetch or list entire directory trees and integrate with caching inside scripts and notebooks.              |
+| `abfss`                                | Translate `AdlsFqn` objects into `abfss://` URIs for Spark/Databricks jobs.                                 |
+| `uri`                                  | Normalize `adls://`, `abfss://`, `https://`, and `dbfs://` strings into `AdlsFqn` values (and vice versa). |
+| `global_client` / `shared_credential`  | Shared, fork-safe Azure clients and credentials backing the public helpers above.                          |
+
+## Example Usage
+
+1. Use the caching helpers and Source integration:
+
+   ```python
+   from thds.adls import cached, upload, source
+
+   cache_path = cached.download_to_cache("adls://acct/container/path/to/file")
+   src = upload("adls://acct/container/path/out.parquet", cache_path)
+   verified = source.get_with_hash(src.uri)
+   ```
+
+1. For CLI usage, run (from repo root):
+
+   ```bash
+   uv run python -m thds.adls.tools.download adls://acct/container/path/file
+   ```
+
+## Operational Notes
+
+- **Hash metadata:** Uploads attach `hash_xxh3_128_b64` automatically when the bytes are known. Download
+  completion back-fills missing hashes when permissions allow.
+- **Locks and concurrency:** Large transfers acquire per-path file locks to keep azcopy instances
+  cooperative. Global HTTP connection pools default to 100 but are configurable via `thds.core.config`.
+- **Error handling:** `BlobNotFoundError` and other ADLS-specific exceptions translate into custom error
+  types to simplify retries and diagnostics.
+- **Extensibility:** Additional hash algorithms can be registered by importing dependent packages (e.g.,
+  `blake3`). Named roots can be populated dynamically via environment-specific modules
+  (`thds.adls._thds_defaults` hook).
{thds_adls-4.2.20250926202021.dist-info → thds_adls-4.4.20251117191451.dist-info}/RECORD
RENAMED

@@ -1,32 +1,33 @@
-thds/adls/__init__.py,sha256=…
+thds/adls/__init__.py,sha256=CzNcmTdP5RHrD-9K1boDU4Z3pbJAvEAFfFhXGFwpEXw,1088
 thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
 thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
-thds/adls/_upload.py,sha256=…
+thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
 thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
+thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
 thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
 thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
 thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
 thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
 thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
-thds/adls/download.py,sha256=…
-thds/adls/download_lock.py,sha256=tgT48l4C5_qmArGeq05gl7VlxT22dZBH2Xwxx0itE9o,3176
+thds/adls/download.py,sha256=IPg5nz_sGE7dX8DUQyWjG2D9z54PXLScap-pZzTUFTk,19142
 thds/adls/errors.py,sha256=6NMLHtVNsWBRDXaes9yzHj9cwKOD9t1dwL4BltdtjhU,1895
 thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
+thds/adls/file_lock.py,sha256=yLak5XDpnIYwfUNdpGFbIGG64uEs98-yVscNpJlqMxM,3176
 thds/adls/file_properties.py,sha256=dhRtbsMNOYfExkEiy76wrLfrJ6IMQeN1Z3LIxgKceqY,2042
 thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
 thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
 thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
 thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
-thds/adls/list_fast.py,sha256=…
+thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
 thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
 thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
 thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
 thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
 thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
-thds/adls/source.py,sha256=…
-thds/adls/source_tree.py,sha256=…
-thds/adls/upload.py,sha256=…
+thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
+thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
+thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
 thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
 thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
 thds/adls/azcopy/download.py,sha256=FOtYyYh7ZXNWNdkj04yTV26lxcKOVj-YhS2p_EclYxA,6526
@@ -38,8 +39,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
 thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
 thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
 thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
-thds_adls-4.2.20250926202021.dist-info/…
-thds_adls-4.2.20250926202021.dist-info/…
-thds_adls-4.2.20250926202021.dist-info/…
-thds_adls-4.2.20250926202021.dist-info/…
-thds_adls-4.2.20250926202021.dist-info/…
+thds_adls-4.4.20251117191451.dist-info/METADATA,sha256=phV7EH6lnptlnQYY5TSfyZZk0Wiv0wqZ6L6o7pcP4UM,4586
+thds_adls-4.4.20251117191451.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.4.20251117191451.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
+thds_adls-4.4.20251117191451.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.4.20251117191451.dist-info/RECORD,,
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: thds.adls
|
|
3
|
-
Version: 4.2.20250926202021
|
|
4
|
-
Summary: ADLS tools
|
|
5
|
-
Author-email: Trilliant Health <info@trillianthealth.com>
|
|
6
|
-
License: MIT
|
|
7
|
-
Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
|
|
8
|
-
Requires-Python: >=3.9
|
|
9
|
-
Description-Content-Type: text/markdown
|
|
10
|
-
Requires-Dist: aiohttp>=3.8.1
|
|
11
|
-
Requires-Dist: aiostream>=0.4.5
|
|
12
|
-
Requires-Dist: azure-identity>=1.9
|
|
13
|
-
Requires-Dist: azure-storage-file-datalake>=12.6
|
|
14
|
-
Requires-Dist: blake3
|
|
15
|
-
Requires-Dist: filelock>=3.0
|
|
16
|
-
Requires-Dist: xxhash
|
|
17
|
-
Requires-Dist: thds-core
|
|
18
|
-
|
|
19
|
-
# adls Library
|
|
20
|
-
|
|
21
|
-
A port of `core.adls`.
|

{thds_adls-4.2.20250926202021.dist-info → thds_adls-4.4.20251117191451.dist-info}/WHEEL
RENAMED
File without changes

{thds_adls-4.2.20250926202021.dist-info → thds_adls-4.4.20251117191451.dist-info}/entry_points.txt
RENAMED
File without changes

{thds_adls-4.2.20250926202021.dist-info → thds_adls-4.4.20251117191451.dist-info}/top_level.txt
RENAMED
File without changes