thds.adls 4.1.20250701001205__py3-none-any.whl → 4.1.20250701190349__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.adls might be problematic.
- thds/adls/__init__.py +10 -5
- thds/adls/_upload.py +54 -41
- thds/adls/azcopy/__init__.py +1 -1
- thds/adls/azcopy/download.py +66 -100
- thds/adls/azcopy/login.py +39 -0
- thds/adls/azcopy/progress.py +49 -0
- thds/adls/azcopy/system_resources.py +26 -0
- thds/adls/azcopy/upload.py +95 -0
- thds/adls/{cached_up_down.py → cached.py} +21 -16
- thds/adls/conf.py +1 -0
- thds/adls/download.py +129 -152
- thds/adls/download_lock.py +9 -2
- thds/adls/errors.py +10 -2
- thds/adls/file_properties.py +8 -0
- thds/adls/hashes.py +147 -0
- thds/adls/impl.py +3 -4
- thds/adls/md5.py +5 -52
- thds/adls/ro_cache.py +1 -2
- thds/adls/source.py +37 -34
- thds/adls/tools/download.py +3 -3
- thds/adls/tools/upload.py +3 -4
- thds/adls/upload.py +162 -0
- thds/adls/uri.py +6 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250701190349.dist-info}/METADATA +1 -1
- thds_adls-4.1.20250701190349.dist-info/RECORD +42 -0
- thds/adls/resource/__init__.py +0 -36
- thds/adls/resource/core.py +0 -77
- thds/adls/resource/file_pointers.py +0 -54
- thds/adls/resource/up_down.py +0 -242
- thds_adls-4.1.20250701001205.dist-info/RECORD +0 -40
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250701190349.dist-info}/WHEEL +0 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250701190349.dist-info}/entry_points.txt +0 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250701190349.dist-info}/top_level.txt +0 -0
thds/adls/hashes.py
ADDED
@@ -0,0 +1,147 @@
+import contextlib
+import os
+import typing as ty
+from functools import partial
+
+import xxhash
+
+from thds.core import hash_cache, hashing, log, source, types
+from thds.core.hashing import Hash, SomehowReadable
+
+from . import errors, file_properties
+from .fqn import AdlsFqn
+
+logger = log.getLogger(__name__)
+
+PREFERRED_ALGOS: ty.Final = ("xxh3_128", "blake3")
+AnyStrSrc = ty.Union[SomehowReadable, ty.Iterable[ty.AnyStr]]
+# this type closely corresponds to what the underlying DataLakeStorageClient will accept for upload_data.
+
+
+def default_hasher() -> hashing.Hasher:
+    return xxhash.xxh3_128()
+
+
+def _xxhash_hasher(algo: str) -> hashing.Hasher:
+    return getattr(xxhash, algo)()
+
+
+def register_hashes():
+    for algo in xxhash.algorithms_available:
+        hashing.add_named_hash(algo, _xxhash_hasher)
+    source.set_file_autohash(PREFERRED_ALGOS[0], _xxhash_hasher)
+
+    try:
+        from blake3 import blake3
+
+        hashing.add_named_hash("blake3", lambda _: blake3())  # type: ignore
+    except ModuleNotFoundError:
+        pass
+
+
+def _hash_path_if_exists(
+    file_hasher: ty.Callable[[types.StrOrPath], hashing.Hash], path: types.StrOrPath
+) -> ty.Optional[hashing.Hash]:
+    if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
+        return None
+    return file_hasher(path)
+
+
+def hash_path_for_algo(
+    algo: str,
+) -> ty.Callable[[types.StrOrPath], ty.Optional[hashing.Hash]]:
+    """Return a function that hashes a path for the given algorithm."""
+    return partial(_hash_path_if_exists, partial(hash_cache.filehash, algo))
+
+
+def metadata_hash_b64_key(algo: str) -> str:
+    return f"hash_{algo}_b64"
+
+
+def extract_hashes_from_metadata(metadata: dict) -> ty.Iterable[hashing.Hash]:
+    # NOTE! the order here is critical, because we want to _prefer_ the faster hash if it exists.
+    for hash_algo in PREFERRED_ALGOS:
+        md_key = metadata_hash_b64_key(hash_algo)
+        if metadata and md_key in metadata:
+            yield hashing.Hash(hash_algo, hashing.db64(metadata[md_key]))
+
+
+def extract_hashes_from_props(
+    props: ty.Optional[file_properties.PropertiesP],
+) -> dict[str, hashing.Hash]:
+    if not props:
+        return dict()
+
+    hashes = list(extract_hashes_from_metadata(props.metadata or dict()))
+    if props.content_settings and props.content_settings.content_md5:
+        hashes.append(hashing.Hash("md5", props.content_settings.content_md5))
+    return {h.algo: h for h in hashes}
+
+
+@contextlib.contextmanager
+def verify_hashes_before_and_after_download(
+    remote_hash: ty.Optional[Hash],
+    expected_hash: ty.Optional[Hash],
+    fqn: AdlsFqn,
+    local_dest: types.StrOrPath,
+) -> ty.Iterator[None]:
+    # if expected_hash:
+    #     check_reasonable_md5b64(expected_md5b64)
+    # if remote_md5b64:
+    #     check_reasonable_md5b64(remote_md5b64)
+    if remote_hash and expected_hash and remote_hash != expected_hash:
+        raise errors.HashMismatchError(
+            f"ADLS thinks the {remote_hash.algo} of {fqn} is {hashing.b64(remote_hash.bytes)},"
+            f" but we expected {hashing.b64(expected_hash.bytes)}."
+            " This may indicate that we need to update a hash in the codebase."
+        )
+
+    yield  # perform download
+
+    expected_algo = expected_hash.algo if expected_hash else None
+    if not expected_algo and remote_hash:
+        expected_algo = remote_hash.algo
+
+    if not expected_algo:
+        # if we have neither a user-provided hash nor a remotely-found hash, then we have nothing to check.
+        return
+
+    with log.logger_context(hash_for="after-download"):
+        local_hash = hash_cache.filehash(expected_algo, local_dest)
+
+    if remote_hash and remote_hash != local_hash:
+        raise errors.HashMismatchError(
+            f"The {local_hash.algo} of the downloaded file {local_dest} is {hashing.b64(local_hash.bytes)},"
+            f" but the remote ({fqn}) says it should be {hashing.b64(remote_hash.bytes)}."
+            f" This may indicate that ADLS has an erroneous {remote_hash.algo} for {fqn}."
+        )
+
+    if expected_hash and local_hash != expected_hash:
+        raise errors.HashMismatchError(
+            f"The {local_hash.algo} of the downloaded file {local_dest} is {hashing.b64(local_hash.bytes)},"
+            f" but we expected it to be {hashing.b64(expected_hash.bytes)}."
+            f" This probably indicates a corrupted download of {fqn}"
+        )
+
+    all_hashes = dict(local=local_hash, remote=remote_hash, expected=expected_hash)
+    real_hashes = list(filter(None, all_hashes.values()))
+    assert len(real_hashes) > 0, all_hashes
+    assert all(real_hashes[0] == h for h in real_hashes), all_hashes
+
+
+def metadata_hash_dict(hash: Hash) -> dict[str, str]:
+    return {metadata_hash_b64_key(hash.algo): hashing.b64(hash.bytes)}
+
+
+def create_hash_metadata_if_missing(
+    file_properties: ty.Optional[file_properties.FileProperties], new_hash: ty.Optional[Hash]
+) -> dict:
+    if not (file_properties and new_hash):
+        # without file properties, we can't match the etag when we try to set this.
+        return dict()
+
+    existing_metadata = file_properties.metadata or dict()
+    if metadata_hash_b64_key(new_hash.algo) not in existing_metadata:
+        return {**existing_metadata, **metadata_hash_dict(new_hash)}
+
+    return dict()
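The new hashes module above centralizes how hashes are stored in blob metadata (a "hash_<algo>_b64" key) and read back, preferring xxh3_128 over blake3 over Content-MD5. A minimal usage sketch, not part of the package, using only functions shown in this diff:

    # sketch: round-trip a hash through the metadata dict format used above
    from thds.core import hashing
    from thds.adls import hashes

    h = hashing.Hash("xxh3_128", b"\x00" * 16)  # dummy 16-byte digest, for illustration only
    metadata = hashes.metadata_hash_dict(h)     # {"hash_xxh3_128_b64": "<base64 of the digest>"}
    recovered = list(hashes.extract_hashes_from_metadata(metadata))
    assert recovered and recovered[0].algo == "xxh3_128"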
thds/adls/impl.py
CHANGED
@@ -31,7 +31,7 @@ from azure.storage.filedatalake.aio import DataLakeServiceClient, FileSystemClie
 
 from thds.core import lazy, log
 
-from ._upload import
+from ._upload import async_upload_decision_and_metadata
 from .conf import CONNECTION_TIMEOUT, UPLOAD_CHUNK_SIZE
 from .download import async_download_or_use_verified
 from .errors import translate_azure_error
@@ -330,15 +330,14 @@ class ADLSFileSystem:
 
         async with file_system_client.get_file_client(remote_path) as file_client:
            with open(local_path, "rb") as fp:
-                decision = await
+                decision = await async_upload_decision_and_metadata(file_client.get_file_properties, fp)
                if decision.upload_required:
                    await file_client.upload_data(
                        fp,
                        overwrite=True,
-                        content_settings=decision.content_settings,
                        connection_timeout=CONNECTION_TIMEOUT(),
                        chunk_size=UPLOAD_CHUNK_SIZE(),
-                        metadata={**
+                        metadata={**decision.metadata, **(metadata or {})},
                    )
 
        return remote_path
thds/adls/md5.py
CHANGED
@@ -1,60 +1,13 @@
-"""Why MD5 when it's no longer a good choice for most use cases?
-Because Azure/ADLS support Content-MD5 but nothing else, and I don't
-want to lie to them and get us confused later.
-
-Thankfully, there are no real security concerns for us with purely
-internal code and data sets.
-
-That said, please _do not_ use MD5 for non-Azure things. Prefer SHA256
-if at all possible.
-"""
 import hashlib
-import typing as ty
-from pathlib import Path
-
-from thds.core.hash_cache import hash_file
-from thds.core.hashing import SomehowReadable, hash_anything, hash_using
-from thds.core.types import StrOrPath
-
-AnyStrSrc = ty.Union[SomehowReadable, ty.Iterable[ty.AnyStr]]
-# this type closely corresponds to what the underlying DataLakeStorageClient will accept for upload_data.
-
-
-def md5_file(file: StrOrPath) -> bytes:
-    """Raise exception if it cannot be read.
 
-
-    determine staleness).
-    """
-    return hash_file(file, hashlib.md5())
+from thds.core.hashing import Hash, db64, hash_using
 
 
 def hex_md5_str(string: str) -> str:
     return hash_using(string.encode(), hashlib.md5()).hexdigest()
 
 
-def
-    """
-
-
-    stream does not exist in its entirety before the upload begins.
-    """
-    if isinstance(data, Path):
-        return md5_file(data)
-    res = hash_anything(data, hashlib.md5())
-    if res:
-        return res.digest()
-    return None
-
-
-def is_reasonable_b64(md5: str):
-    if len(md5) == 22:
-        return True
-    if len(md5) == 24 and md5.endswith("=="):
-        return True
-    return False
-
-
-def check_reasonable_md5b64(maybe_md5: str):
-    if not is_reasonable_b64(maybe_md5):
-        raise ValueError(f"MD5 '{maybe_md5}' is not a reasonable MD5.")
+def to_hash(md5b64: str) -> Hash:
+    """Convert a base64-encoded MD5 hash to a hex string."""
+    assert md5b64, "MD5 base64 string cannot be empty"
+    return Hash(algo="md5", bytes=db64(md5b64))
thds/adls/ro_cache.py
CHANGED
@@ -12,7 +12,7 @@ from thds.core.link import LinkType, link_or_copy
 from .fqn import AdlsFqn
 from .md5 import hex_md5_str
 
-GLOBAL_CACHE_PATH = config.item("global-cache-path", HOMEDIR() / ".adls
+GLOBAL_CACHE_PATH = config.item("global-cache-path", HOMEDIR() / ".thds/adls/ro-cache", parse=Path)
 MAX_FILENAME_LEN = config.item("max-filename-len", 255, parse=int)  # safe on most local filesystems?
 MAX_TOTAL_PATH_LEN = config.item(
     "max-total-path-len", 1023 if sys.platform == "darwin" else 4095, parse=int
@@ -39,7 +39,6 @@ class Cache(ty.NamedTuple):
 
 
 def global_cache(link: LinkOpts = ("ref", "hard")) -> Cache:
-    """This is the recommended caching configuration."""
     return Cache(GLOBAL_CACHE_PATH(), link)
 
 
thds/adls/source.py
CHANGED
@@ -1,14 +1,14 @@
-import base64
 import typing as ty
 from functools import partial
 from pathlib import Path
 
 from thds.core import source
-from thds.core.hashing import Hash
+from thds.core.hashing import Hash
 
-from .
+from . import cached, hashes, md5
+from .errors import blob_not_found_translation
+from .file_properties import get_file_properties
 from .fqn import AdlsFqn
-from .resource import AdlsHashedResource
 from .uri import resolve_any, resolve_uri
 
 
@@ -19,17 +19,10 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
 
     def download(hash: ty.Optional[Hash]) -> Path:
         assert fqn
-
-
-
-
-        return download_to_cache(fqn, b64(hash.bytes))
-
-        # we don't validate this hash, because we already have md5 validation
-        # happening inside the download_to_cache function. the Source hash
-        # is actually mostly for use by systems that want to do content addressing,
-        # and not necessarily intended to be a runtime check in all scenarios.
-        return download_to_cache(fqn)
+        # this 'extra' check just allows us to short-circuit a download
+        # where the hash at this URI is known not to match what we expect.
+        # It's no safer than the non-md5 hash check that Source performs after download.
+        return cached.download_to_cache(fqn, expected_hash=hash)
 
     return download
 
@@ -37,30 +30,40 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
 source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
 
 
-def from_adls(
-    uri_or_fqn_or_ahr: ty.Union[str, AdlsFqn, AdlsHashedResource], hash: ty.Optional[Hash] = None
-) -> source.Source:
+def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None) -> source.Source:
     """Flexible, public interface to creating Sources from any ADLS-like reference.
 
-    Does NOT automatically fetch
-    you know you want to include that,
-    `source.
+    Does NOT automatically fetch a checksumming hash from the ADLS URI if it's not
+    provided. If you know you want to include that, instead call:
+    `source.get_with_hash(uri_or_fqn)`.
     """
-
-
-
-
-            raise ValueError(f"Resource Hash mismatch for {fqn}: {hash} != {res_hash}")
-        hash = res_hash
-    else:
-        r_fqn = resolve_any(uri_or_fqn_or_ahr)
-        if not r_fqn:
-            raise ValueError(f"Could not resolve {uri_or_fqn_or_ahr} to an ADLS FQN")
-        fqn = r_fqn
-
-    return source.Source(str(fqn), hash)
+    r_fqn = resolve_any(uri_or_fqn)
+    if not r_fqn:
+        raise ValueError(f"Could not resolve {uri_or_fqn} to an ADLS FQN")
+    return source.Source(str(r_fqn), hash)
 
 
 source.register_from_uri_handler(
     "thds.adls", lambda uri: partial(from_adls, uri) if resolve_uri(uri) else None
 )
+
+
+def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
+    """Creates a Source from a remote-only file, with MD5 or other hash.
+
+    The file _must_ have a pre-existing hash!
+    """
+    fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+    with blob_not_found_translation(fqn):
+        uri_hashes = hashes.extract_hashes_from_props(get_file_properties(fqn))
+    if not uri_hashes:
+        raise ValueError(
+            f"ADLS file {fqn} must have a hash to use this function. "
+            "If you know the hash, use `from_adls` with the hash parameter."
+        )
+    return from_adls(fqn, next(iter(uri_hashes.values())))
+
+
+def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
+    """Meant for older use cases where we had an MD5"""
+    return from_adls(uri_or_fqn, md5.to_hash(md5b64) if md5b64 else None)
thds/adls/tools/download.py
CHANGED
@@ -1,7 +1,7 @@
 import argparse
 from pathlib import Path
 
-from thds.adls
+from thds.adls import cached
 from thds.adls.file_properties import get_file_properties, is_directory
 from thds.adls.impl import ADLSFileSystem
 from thds.adls.uri import resolve_uri
@@ -39,9 +39,9 @@ def main():
         cache_path = fs.fetch_file(args.adls_fqn.path)
     else:
         if is_dir:
-            cache_path = download_directory(args.adls_fqn)
+            cache_path = cached.download_directory(args.adls_fqn)
         else:
-            cache_path = download_to_cache(args.adls_fqn)
+            cache_path = cached.download_to_cache(args.adls_fqn)
 
     if args.copy_to:
         link(cache_path, args.copy_to)
thds/adls/tools/upload.py
CHANGED
@@ -1,17 +1,16 @@
 import argparse
 from pathlib import Path
 
-from thds.adls
-from thds.adls.uri import resolve_uri
+from thds.adls import cached, uri
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("path", type=Path, help="A local file you want to upload.")
-    parser.add_argument("uri", type=resolve_uri, help="A fully qualified path to an ADLS location")
+    parser.add_argument("uri", type=uri.resolve_uri, help="A fully qualified path to an ADLS location")
     args = parser.parse_args()
 
-    upload_through_cache(args.uri, args.path)
+    cached.upload_through_cache(args.uri, args.path)
 
 
 if __name__ == "__main__":
thds/adls/upload.py
ADDED
@@ -0,0 +1,162 @@
+"""API for uploading files to Azure Data Lake Storage (ADLS) Gen2.
+
+We hash anything that we possibly can, since it's a fast verification step that we
+can do later during downloads.
+"""
+
+import subprocess
+import typing as ty
+from pathlib import Path
+
+from azure.core.exceptions import ResourceModifiedError
+from azure.storage.blob import ContentSettings
+
+from thds.core import files, fretry, link, log, scope, source, tmp
+
+from . import azcopy, hashes
+from ._progress import report_upload_progress
+from ._upload import upload_decision_and_metadata
+from .conf import UPLOAD_FILE_MAX_CONCURRENCY
+from .fqn import AdlsFqn
+from .global_client import get_global_blob_container_client
+from .ro_cache import Cache
+
+logger = log.getLogger(__name__)
+_SLOW_CONNECTION_WORKAROUND = 14400  # seconds
+
+
+UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
+
+
+def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
+    @scope.bound
+    def _try_write_through() -> bool:
+        if isinstance(data, Path) and data.exists():
+            link.link_or_copy(data, local_cache_path, "ref")
+            return True
+        out = scope.enter(tmp.temppath_same_fs(local_cache_path))
+        if hasattr(data, "read") and hasattr(data, "seek"):
+            with open(out, "wb") as f:
+                f.write(data.read())  # type: ignore
+            data.seek(0)  # type: ignore
+            link.link_or_copy(out, local_cache_path)
+            return True
+        if isinstance(data, bytes):
+            with open(out, "wb") as f:
+                f.write(data)
+            link.link_or_copy(out, local_cache_path)
+            return True
+        return False
+
+    if _try_write_through():
+        try:
+            # it's a reflink or a copy, so the cache now owns its copy
+            # and we don't want to allow anyone to write to its copy.
+            files.set_read_only(local_cache_path)
+            return local_cache_path
+        except FileNotFoundError:
+            # may have hit a race condition.
+            # don't fail upload just because we couldn't write through the cache.
+            pass
+    return None
+
+
+@scope.bound
+@fretry.retry_sleep(
+    # ADLS lib has a bug where parallel uploads of the same thing will
+    # hit a race condition and error. this will detect that scenario
+    # and avoid re-uploading as well.
+    fretry.is_exc(ResourceModifiedError),
+    fretry.expo(retries=5),
+)
+def upload(
+    dest: ty.Union[AdlsFqn, str],
+    src: UploadSrc,
+    write_through_cache: ty.Optional[Cache] = None,
+    *,
+    content_type: str = "",
+    **upload_data_kwargs: ty.Any,
+) -> source.Source:
+    """Uploads only if the remote does not exist or does not match
+    xxhash.
+
+    Always embeds xxhash in the blob metadata if at all possible. In very rare cases
+    it may not be possible for us to calculate one. Will always be possible if the passed
+    data was a Path. If one can be calculated, it will be returned in the Source.
+
+    Can write through a local cache, which may save you a download later.
+
+    content_type and all upload_data_kwargs will be ignored if the file
+    has already been uploaded and the hash matches.
+    """
+    dest_ = AdlsFqn.parse(dest) if isinstance(dest, str) else dest
+    if write_through_cache:
+        _write_through_local_cache(write_through_cache.path(dest_), src)
+        # we always use the original source file to upload, not the cached path,
+        # because uploading from a shared location risks race conditions.
+
+    blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
+    blob_client = blob_container_client.get_blob_client(dest_.path)
+    decision = upload_decision_and_metadata(blob_client.get_blob_properties, src)  # type: ignore [arg-type]
+
+    def source_from_meta() -> source.Source:
+        best_hash = next(iter(hashes.extract_hashes_from_metadata(decision.metadata)), None)
+        if isinstance(src, Path):
+            assert best_hash, "A hash should always be calculable for a local path."
+            return source.from_file(src, hash=best_hash, uri=str(dest_))
+
+        return source.from_uri(str(dest_), hash=best_hash)
+
+    if decision.upload_required:
+        # set up some bookkeeping
+        n_bytes = None  # if we pass 0 to upload_blob, it truncates the write now
+        bytes_src: ty.Union[bytes, ty.IO, ty.Iterable[bytes]]
+        if isinstance(src, Path):
+            n_bytes = src.stat().st_size
+            bytes_src = scope.enter(open(src, "rb"))
+        elif isinstance(src, bytes):
+            n_bytes = len(src)
+            bytes_src = src
+        else:
+            bytes_src = src
+
+        if "metadata" in upload_data_kwargs:
+            decision.metadata.update(upload_data_kwargs.pop("metadata"))
+
+        if azcopy.upload.should_use_azcopy(n_bytes or 0) and isinstance(src, Path):
+            logger.info("Using azcopy to upload %s to %s", src, dest_)
+            try:
+                azcopy.upload.run(
+                    azcopy.upload.build_azcopy_upload_command(
+                        src, dest_, content_type=content_type, metadata=decision.metadata, overwrite=True
+                    ),
+                    dest_,
+                    n_bytes or 0,
+                )
+                return source_from_meta()
+
+            except subprocess.SubprocessError:
+                logger.warning("Azcopy upload failed, falling back to SDK upload")
+
+        upload_content_settings = ContentSettings()
+        if content_type:
+            upload_content_settings.content_type = content_type
+
+        # we are now using blob_client instead of file system client
+        # because blob client (as of 2024-06-24) does actually do
+        # some one-step, atomic uploads, wherein there is not a separate
+        # create/truncate action associated with an overwrite.
+        # This is both faster, as well as simpler to reason about, and
+        # in fact was the behavior I had been assuming all along...
+        blob_client.upload_blob(
+            report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes or 0),
+            overwrite=True,
+            length=n_bytes,
+            content_settings=upload_content_settings,
+            connection_timeout=_SLOW_CONNECTION_WORKAROUND,
+            max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
+            metadata=decision.metadata,
+            **upload_data_kwargs,
+        )
+
+    return source_from_meta()
thds/adls/uri.py
CHANGED
@@ -36,3 +36,9 @@ def resolve_any(fqn_or_uri: UriIsh) -> ty.Optional[fqn.AdlsFqn]:
 
 def parse_any(fqn_or_uri: UriIsh) -> fqn.AdlsFqn:
     return parse_uri(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+
+
+def to_blob_windows_url(uri: UriIsh) -> str:
+    """Convert an ADLS URI to a Windows network path."""
+    fqn = parse_any(uri)
+    return f"https://{fqn.sa}.blob.core.windows.net/{fqn.container}/{fqn.path}"
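The new to_blob_windows_url helper simply reshapes a parsed FQN into the standard blob endpoint URL. For example (placeholder account, container, path, and URI scheme):

    # sketch: FQN/URI -> https://<account>.blob.core.windows.net/<container>/<path>
    from thds.adls import uri

    print(uri.to_blob_windows_url("adls://myaccount/mycontainer/reports/q2.pdf"))
    # -> https://myaccount.blob.core.windows.net/mycontainer/reports/q2.pdf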
thds_adls-4.1.20250701190349.dist-info/RECORD
ADDED
@@ -0,0 +1,42 @@
+thds/adls/__init__.py,sha256=g2Zb0EAAH-JzPMYHAub9liU4qa5pfqQDnILfEhmObGo,1036
+thds/adls/_progress.py,sha256=ZzCHn_G7nHakioNFxdvoJZRr-jN6ymsp5JXf-iReROM,6580
+thds/adls/_upload.py,sha256=mhTdWiQroaugYuwQg7R8CEgdfCYF4xvJthlsqO0jlnE,4692
+thds/adls/abfss.py,sha256=ZRJOLjDuXmS4bIbQAQpQxWWWeu74N9NKEKCNfXQek80,726
+thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
+thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
+thds/adls/copy.py,sha256=jUWbGvTpb4B3yRGS0nhGSbDzqRPzUqYgH0z1lFRJB3k,6365
+thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
+thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
+thds/adls/download.py,sha256=N8JqNqD5ioHsEHcTl2bNJt3Bb187yyvZAXn4xW3flfU,18090
+thds/adls/download_lock.py,sha256=tgT48l4C5_qmArGeq05gl7VlxT22dZBH2Xwxx0itE9o,3176
+thds/adls/errors.py,sha256=6cLg2E4SB8ic46PBzA3ynRH4b1oR8qRb07RBgKGJRxY,1783
+thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
+thds/adls/file_properties.py,sha256=C9Kl3a5wuBNWYgZYnZbkH04u8uxadEcjVJIm3UevUM0,1912
+thds/adls/fqn.py,sha256=0zHmHhBWN7GEfKRB3fBC1NVhaiIHHifBdCRanyT01X8,5822
+thds/adls/global_client.py,sha256=f4VJw5y_Yh__8gQUcdSYTh1aU6iEPlauMchVirSAwDQ,3716
+thds/adls/hashes.py,sha256=RDQS-C38wskUhxXGFGLJ4ox8vm7ofurxSsUk13Ywijo,5309
+thds/adls/impl.py,sha256=4rZAGlhU_UojPy1FC7j3KEFIj6BWSbCDAVV1FCyki3s,42496
+thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
+thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
+thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
+thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
+thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
+thds/adls/source.py,sha256=8HVMYuxDn1XYGwFFSBowMlvQ6r2Jm2CQlpu4h85JvsE,2559
+thds/adls/source_tree.py,sha256=yP_v2XrKxXqUOdZ-x8kqHhBFAuur3AlAq3zi4hHj4AE,2235
+thds/adls/upload.py,sha256=gS_S66gorzdW83eavPUVJ3UYrv5u3HnftDXjdwEZOo8,6441
+thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
+thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
+thds/adls/azcopy/download.py,sha256=J7QAoBehpxsY58ofgGQur-MtIwM0NEnV9_Cw4i_X3y8,6007
+thds/adls/azcopy/login.py,sha256=923UaewVMPFzkDSgCQsbl-_g7qdFhpXpF0MGNIy3T_A,1538
+thds/adls/azcopy/progress.py,sha256=K7TVmSiWfu561orL3GuOnlQX9VtVxWVECAq9NiweYNo,1387
+thds/adls/azcopy/system_resources.py,sha256=okgDEKAp0oWGQF7OKikbgJ9buBeiOgNaDYy-36j6dHo,761
+thds/adls/azcopy/upload.py,sha256=bvtYdbaFsZkOHFLDpeBlTKqw63P3_kbImInI04ZlekM,2601
+thds/adls/tools/download.py,sha256=Dmt-EBZUEF-gVfUcwjAD8VRKR5rhw-oozxl40lZHmdw,1562
+thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
+thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
+thds_adls-4.1.20250701190349.dist-info/METADATA,sha256=gJNup1vZrpFp-0nor96kwmz__Ij_Zc5pkytWoEslYMU,587
+thds_adls-4.1.20250701190349.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.1.20250701190349.dist-info/entry_points.txt,sha256=uTqreT1AIwqJboMfLv5w6sviM8mNbAkln765gIjzoA4,152
+thds_adls-4.1.20250701190349.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.1.20250701190349.dist-info/RECORD,,
thds/adls/resource/__init__.py
DELETED
@@ -1,36 +0,0 @@
-"""The reason for a hashed resource is that it enables worry-free caching.
-
-If under any circumstances we re-use a name/URI with different bytes,
-then having captured a hash will enable us to transparently detect the
-situation and re-download.
-
-It is strongly recommended that you construct these using `of`, as
-that will avoid the accidental, invalid creation of an
-AdlsHashedResource containing an empty hash.
-
-How to get the hash itself?
-
-From our experience, it seems that any file uploaded using Azure
-Storage Explorer will have an MD5 calculated locally before upload and
-that will be embedded in the remote file. You can look in the
-properties of the uploaded file for Content-MD5 and copy-paste that
-into whatever you're writing.
-
-Programmatically, you can instead use `resource.upload`, which will
-return to you an in-memory AdlsHashedResource object. If you want to
-store it programmatically rather than in the source code, it's
-recommended that you use `resource.to_path`, and then load it using
-`resource.from_path`.
-
-Prefer importing this module `as resource` or `from thds.adls
-import resource`, and then using it as a namespace,
-e.g. `resource.of(uri)`.
-"""
-from .core import AdlsHashedResource, from_source, get, of, parse, serialize, to_source  # noqa: F401
-from .file_pointers import resource_from_path as from_path  # noqa: F401
-from .file_pointers import resource_to_path as to_path  # noqa: F401
-from .file_pointers import validate_resource as validate  # noqa: F401
-from .up_down import get_read_only, upload  # noqa: F401
-from .up_down import verify_or_create_resource as verify_or_create  # noqa: F401
-
-AHR = AdlsHashedResource  # just an alias