thds.adls 4.1.20250701001205__py3-none-any.whl → 4.1.20250701190349__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


thds/adls/hashes.py ADDED
@@ -0,0 +1,147 @@
+ import contextlib
+ import os
+ import typing as ty
+ from functools import partial
+
+ import xxhash
+
+ from thds.core import hash_cache, hashing, log, source, types
+ from thds.core.hashing import Hash, SomehowReadable
+
+ from . import errors, file_properties
+ from .fqn import AdlsFqn
+
+ logger = log.getLogger(__name__)
+
+ PREFERRED_ALGOS: ty.Final = ("xxh3_128", "blake3")
+ AnyStrSrc = ty.Union[SomehowReadable, ty.Iterable[ty.AnyStr]]
+ # this type closely corresponds to what the underlying DataLakeStorageClient will accept for upload_data.
+
+
+ def default_hasher() -> hashing.Hasher:
+     return xxhash.xxh3_128()
+
+
+ def _xxhash_hasher(algo: str) -> hashing.Hasher:
+     return getattr(xxhash, algo)()
+
+
+ def register_hashes():
+     for algo in xxhash.algorithms_available:
+         hashing.add_named_hash(algo, _xxhash_hasher)
+     source.set_file_autohash(PREFERRED_ALGOS[0], _xxhash_hasher)
+
+     try:
+         from blake3 import blake3
+
+         hashing.add_named_hash("blake3", lambda _: blake3())  # type: ignore
+     except ModuleNotFoundError:
+         pass
+
+
+ def _hash_path_if_exists(
+     file_hasher: ty.Callable[[types.StrOrPath], hashing.Hash], path: types.StrOrPath
+ ) -> ty.Optional[hashing.Hash]:
+     if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
+         return None
+     return file_hasher(path)
+
+
+ def hash_path_for_algo(
+     algo: str,
+ ) -> ty.Callable[[types.StrOrPath], ty.Optional[hashing.Hash]]:
+     """Return a function that hashes a path for the given algorithm."""
+     return partial(_hash_path_if_exists, partial(hash_cache.filehash, algo))
+
+
+ def metadata_hash_b64_key(algo: str) -> str:
+     return f"hash_{algo}_b64"
+
+
+ def extract_hashes_from_metadata(metadata: dict) -> ty.Iterable[hashing.Hash]:
+     # NOTE! the order here is critical, because we want to _prefer_ the faster hash if it exists.
+     for hash_algo in PREFERRED_ALGOS:
+         md_key = metadata_hash_b64_key(hash_algo)
+         if metadata and md_key in metadata:
+             yield hashing.Hash(hash_algo, hashing.db64(metadata[md_key]))
+
+
+ def extract_hashes_from_props(
+     props: ty.Optional[file_properties.PropertiesP],
+ ) -> dict[str, hashing.Hash]:
+     if not props:
+         return dict()
+
+     hashes = list(extract_hashes_from_metadata(props.metadata or dict()))
+     if props.content_settings and props.content_settings.content_md5:
+         hashes.append(hashing.Hash("md5", props.content_settings.content_md5))
+     return {h.algo: h for h in hashes}
+
+
+ @contextlib.contextmanager
+ def verify_hashes_before_and_after_download(
+     remote_hash: ty.Optional[Hash],
+     expected_hash: ty.Optional[Hash],
+     fqn: AdlsFqn,
+     local_dest: types.StrOrPath,
+ ) -> ty.Iterator[None]:
+     # if expected_hash:
+     #     check_reasonable_md5b64(expected_md5b64)
+     # if remote_md5b64:
+     #     check_reasonable_md5b64(remote_md5b64)
+     if remote_hash and expected_hash and remote_hash != expected_hash:
+         raise errors.HashMismatchError(
+             f"ADLS thinks the {remote_hash.algo} of {fqn} is {hashing.b64(remote_hash.bytes)},"
+             f" but we expected {hashing.b64(expected_hash.bytes)}."
+             " This may indicate that we need to update a hash in the codebase."
+         )
+
+     yield  # perform download
+
+     expected_algo = expected_hash.algo if expected_hash else None
+     if not expected_algo and remote_hash:
+         expected_algo = remote_hash.algo
+
+     if not expected_algo:
+         # if we have neither a user-provided hash nor a remotely-found hash, then we have nothing to check.
+         return
+
+     with log.logger_context(hash_for="after-download"):
+         local_hash = hash_cache.filehash(expected_algo, local_dest)
+
+     if remote_hash and remote_hash != local_hash:
+         raise errors.HashMismatchError(
+             f"The {local_hash.algo} of the downloaded file {local_dest} is {hashing.b64(local_hash.bytes)},"
+             f" but the remote ({fqn}) says it should be {hashing.b64(remote_hash.bytes)}."
+             f" This may indicate that ADLS has an erroneous {remote_hash.algo} for {fqn}."
+         )
+
+     if expected_hash and local_hash != expected_hash:
+         raise errors.HashMismatchError(
+             f"The {local_hash.algo} of the downloaded file {local_dest} is {hashing.b64(local_hash.bytes)},"
+             f" but we expected it to be {hashing.b64(expected_hash.bytes)}."
+             f" This probably indicates a corrupted download of {fqn}"
+         )
+
+     all_hashes = dict(local=local_hash, remote=remote_hash, expected=expected_hash)
+     real_hashes = list(filter(None, all_hashes.values()))
+     assert len(real_hashes) > 0, all_hashes
+     assert all(real_hashes[0] == h for h in real_hashes), all_hashes
+
+
+ def metadata_hash_dict(hash: Hash) -> dict[str, str]:
+     return {metadata_hash_b64_key(hash.algo): hashing.b64(hash.bytes)}
+
+
+ def create_hash_metadata_if_missing(
+     file_properties: ty.Optional[file_properties.FileProperties], new_hash: ty.Optional[Hash]
+ ) -> dict:
+     if not (file_properties and new_hash):
+         # without file properties, we can't match the etag when we try to set this.
+         return dict()
+
+     existing_metadata = file_properties.metadata or dict()
+     if metadata_hash_b64_key(new_hash.algo) not in existing_metadata:
+         return {**existing_metadata, **metadata_hash_dict(new_hash)}
+
+     return dict()
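The hash metadata round-trip in this new module is symmetric: metadata_hash_dict produces the blob-metadata entry that extract_hashes_from_metadata later recovers, preferring xxh3_128, then blake3, with Content-MD5 only coming in via extract_hashes_from_props. A minimal sketch of that round trip (the digest value below is a placeholder, not a real hash):

from thds.core import hashing
from thds.adls import hashes

xxh = hashing.Hash("xxh3_128", bytes(16))  # placeholder 128-bit digest, illustration only
metadata = hashes.metadata_hash_dict(xxh)  # {"hash_xxh3_128_b64": "<base64 of the digest>"}

# a later download reads the same key back out of the blob's metadata
recovered = next(iter(hashes.extract_hashes_from_metadata(metadata)))
# recovered should equal xxh, assuming Hash compares by value
# (verify_hashes_before_and_after_download relies on that comparison).
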
thds/adls/impl.py CHANGED
@@ -31,7 +31,7 @@ from azure.storage.filedatalake.aio import DataLakeServiceClient, FileSystemClie
 
  from thds.core import lazy, log
 
- from ._upload import async_upload_decision_and_settings, metadata_for_upload
+ from ._upload import async_upload_decision_and_metadata
  from .conf import CONNECTION_TIMEOUT, UPLOAD_CHUNK_SIZE
  from .download import async_download_or_use_verified
  from .errors import translate_azure_error
@@ -330,15 +330,14 @@ class ADLSFileSystem:
 
          async with file_system_client.get_file_client(remote_path) as file_client:
              with open(local_path, "rb") as fp:
-                 decision = await async_upload_decision_and_settings(file_client.get_file_properties, fp)
+                 decision = await async_upload_decision_and_metadata(file_client.get_file_properties, fp)
                  if decision.upload_required:
                      await file_client.upload_data(
                          fp,
                          overwrite=True,
-                         content_settings=decision.content_settings,
                          connection_timeout=CONNECTION_TIMEOUT(),
                          chunk_size=UPLOAD_CHUNK_SIZE(),
-                         metadata={**metadata_for_upload(), **(metadata or {})},
+                         metadata={**decision.metadata, **(metadata or {})},
                      )
 
          return remote_path
thds/adls/md5.py CHANGED
@@ -1,60 +1,13 @@
- """Why MD5 when it's no longer a good choice for most use cases?
- Because Azure/ADLS support Content-MD5 but nothing else, and I don't
- want to lie to them and get us confused later.
-
- Thankfully, there are no real security concerns for us with purely
- internal code and data sets.
-
- That said, please _do not_ use MD5 for non-Azure things. Prefer SHA256
- if at all possible.
- """
  import hashlib
- import typing as ty
- from pathlib import Path
-
- from thds.core.hash_cache import hash_file
- from thds.core.hashing import SomehowReadable, hash_anything, hash_using
- from thds.core.types import StrOrPath
-
- AnyStrSrc = ty.Union[SomehowReadable, ty.Iterable[ty.AnyStr]]
- # this type closely corresponds to what the underlying DataLakeStorageClient will accept for upload_data.
-
-
- def md5_file(file: StrOrPath) -> bytes:
-     """Raise exception if it cannot be read.
 
-     Safely caches the hash on your local filesystem (uses mtimes to
-     determine staleness).
-     """
-     return hash_file(file, hashlib.md5())
+ from thds.core.hashing import Hash, db64, hash_using
 
 
  def hex_md5_str(string: str) -> str:
      return hash_using(string.encode(), hashlib.md5()).hexdigest()
 
 
- def try_md5(data: AnyStrSrc) -> ty.Optional[bytes]:
-     """Ideally, we calculate an MD5 sum for all data that we upload.
-
-     The only circumstances under which we cannot do this are if the
-     stream does not exist in its entirety before the upload begins.
-     """
-     if isinstance(data, Path):
-         return md5_file(data)
-     res = hash_anything(data, hashlib.md5())
-     if res:
-         return res.digest()
-     return None
-
-
- def is_reasonable_b64(md5: str):
-     if len(md5) == 22:
-         return True
-     if len(md5) == 24 and md5.endswith("=="):
-         return True
-     return False
-
-
- def check_reasonable_md5b64(maybe_md5: str):
-     if not is_reasonable_b64(maybe_md5):
-         raise ValueError(f"MD5 '{maybe_md5}' is not a reasonable MD5.")
+ def to_hash(md5b64: str) -> Hash:
+     """Wrap a base64-encoded MD5 digest in a Hash object."""
+     assert md5b64, "MD5 base64 string cannot be empty"
+     return Hash(algo="md5", bytes=db64(md5b64))
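A quick sketch of the new helper: it converts the base64 Content-MD5 that ADLS (and Azure Storage Explorer) report into a thds.core Hash. The digest below is the well-known MD5 of an empty file, used purely as an example value:

from thds.adls import md5

h = md5.to_hash("1B2M2Y8AsgTpgAmY7PhCfg==")
assert h.algo == "md5" and len(h.bytes) == 16  # 16 raw bytes, not a hex string
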
thds/adls/ro_cache.py CHANGED
@@ -12,7 +12,7 @@ from thds.core.link import LinkType, link_or_copy
  from .fqn import AdlsFqn
  from .md5 import hex_md5_str
 
- GLOBAL_CACHE_PATH = config.item("global-cache-path", HOMEDIR() / ".adls-md5-ro-cache", parse=Path)
+ GLOBAL_CACHE_PATH = config.item("global-cache-path", HOMEDIR() / ".thds/adls/ro-cache", parse=Path)
  MAX_FILENAME_LEN = config.item("max-filename-len", 255, parse=int)  # safe on most local filesystems?
  MAX_TOTAL_PATH_LEN = config.item(
      "max-total-path-len", 1023 if sys.platform == "darwin" else 4095, parse=int
@@ -39,7 +39,6 @@ class Cache(ty.NamedTuple):
 
 
  def global_cache(link: LinkOpts = ("ref", "hard")) -> Cache:
-     """This is the recommended caching configuration."""
      return Cache(GLOBAL_CACHE_PATH(), link)
 
 
thds/adls/source.py CHANGED
@@ -1,14 +1,14 @@
- import base64
  import typing as ty
  from functools import partial
  from pathlib import Path
 
  from thds.core import source
- from thds.core.hashing import Hash, b64
+ from thds.core.hashing import Hash
 
- from .cached_up_down import download_to_cache
+ from . import cached, hashes, md5
+ from .errors import blob_not_found_translation
+ from .file_properties import get_file_properties
  from .fqn import AdlsFqn
- from .resource import AdlsHashedResource
  from .uri import resolve_any, resolve_uri
 
 
@@ -19,17 +19,10 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
 
      def download(hash: ty.Optional[Hash]) -> Path:
          assert fqn
-         if hash and hash.algo == "md5":
-             # this 'extra' check just allows us to short-circuit a download
-             # where the hash at this URI is known not to match what we expect.
-             # It's no safer than the non-md5 hash check that Source performs after download.
-             return download_to_cache(fqn, b64(hash.bytes))
-
-         # we don't validate this hash, because we already have md5 validation
-         # happening inside the download_to_cache function. the Source hash
-         # is actually mostly for use by systems that want to do content addressing,
-         # and not necessarily intended to be a runtime check in all scenarios.
-         return download_to_cache(fqn)
+         # this 'extra' check just allows us to short-circuit a download
+         # where the hash at this URI is known not to match what we expect.
+         # It's no safer than the non-md5 hash check that Source performs after download.
+         return cached.download_to_cache(fqn, expected_hash=hash)
 
      return download
 
@@ -37,30 +30,40 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
  source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
 
 
- def from_adls(
-     uri_or_fqn_or_ahr: ty.Union[str, AdlsFqn, AdlsHashedResource], hash: ty.Optional[Hash] = None
- ) -> source.Source:
+ def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None) -> source.Source:
      """Flexible, public interface to creating Sources from any ADLS-like reference.
 
-     Does NOT automatically fetch an MD5 hash from the ADLS URI if it's not provided. If
-     you know you want to include that, combine this with `resource.get`:
-     `source.from_adls(resource.get(uri))`
+     Does NOT automatically fetch a checksumming hash from the ADLS URI if it's not
+     provided. If you know you want to include that, instead call:
+     `source.get_with_hash(uri_or_fqn)`.
      """
-     if isinstance(uri_or_fqn_or_ahr, AdlsHashedResource):
-         fqn = uri_or_fqn_or_ahr.fqn
-         res_hash = Hash("md5", base64.b64decode(uri_or_fqn_or_ahr.md5b64))
-         if hash and hash != res_hash:
-             raise ValueError(f"Resource Hash mismatch for {fqn}: {hash} != {res_hash}")
-         hash = res_hash
-     else:
-         r_fqn = resolve_any(uri_or_fqn_or_ahr)
-         if not r_fqn:
-             raise ValueError(f"Could not resolve {uri_or_fqn_or_ahr} to an ADLS FQN")
-         fqn = r_fqn
-
-     return source.Source(str(fqn), hash)
+     r_fqn = resolve_any(uri_or_fqn)
+     if not r_fqn:
+         raise ValueError(f"Could not resolve {uri_or_fqn} to an ADLS FQN")
+     return source.Source(str(r_fqn), hash)
 
 
  source.register_from_uri_handler(
      "thds.adls", lambda uri: partial(from_adls, uri) if resolve_uri(uri) else None
  )
+
+
+ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
+     """Creates a Source from a remote-only file, with MD5 or other hash.
+
+     The file _must_ have a pre-existing hash!
+     """
+     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+     with blob_not_found_translation(fqn):
+         uri_hashes = hashes.extract_hashes_from_props(get_file_properties(fqn))
+     if not uri_hashes:
+         raise ValueError(
+             f"ADLS file {fqn} must have a hash to use this function. "
+             "If you know the hash, use `from_adls` with the hash parameter."
+         )
+     return from_adls(fqn, next(iter(uri_hashes.values())))
+
+
+ def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
+     """Meant for older use cases where we had an MD5"""
+     return from_adls(uri_or_fqn, md5.to_hash(md5b64) if md5b64 else None)
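For reference, the three entry points now cover the common cases. A hedged sketch follows; the adls://account/container/path URI form is an assumption for illustration, and any reference that resolve_any accepts would work:

from thds.adls import source

# URI only -- no hash recorded on the Source:
src = source.from_adls("adls://myaccount/mycontainer/data/part-000.parquet")

# ask ADLS for an existing xxh3_128/blake3/Content-MD5 hash and embed it:
# src = source.get_with_hash("adls://myaccount/mycontainer/data/part-000.parquet")

# older call sites that still carry a base64 MD5:
# src = source.with_md5b64("adls://myaccount/mycontainer/data/part-000.parquet", md5b64="...")
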
thds/adls/tools/download.py CHANGED
@@ -1,7 +1,7 @@
  import argparse
  from pathlib import Path
 
- from thds.adls.cached_up_down import download_directory, download_to_cache
+ from thds.adls import cached
  from thds.adls.file_properties import get_file_properties, is_directory
  from thds.adls.impl import ADLSFileSystem
  from thds.adls.uri import resolve_uri
@@ -39,9 +39,9 @@ def main():
          cache_path = fs.fetch_file(args.adls_fqn.path)
      else:
          if is_dir:
-             cache_path = download_directory(args.adls_fqn)
+             cache_path = cached.download_directory(args.adls_fqn)
          else:
-             cache_path = download_to_cache(args.adls_fqn)
+             cache_path = cached.download_to_cache(args.adls_fqn)
 
      if args.copy_to:
          link(cache_path, args.copy_to)
thds/adls/tools/upload.py CHANGED
@@ -1,17 +1,16 @@
  import argparse
  from pathlib import Path
 
- from thds.adls.cached_up_down import upload_through_cache
- from thds.adls.uri import resolve_uri
+ from thds.adls import cached, uri
 
 
  def main():
      parser = argparse.ArgumentParser()
      parser.add_argument("path", type=Path, help="A local file you want to upload.")
-     parser.add_argument("uri", type=resolve_uri, help="A fully qualified path to an ADLS location")
+     parser.add_argument("uri", type=uri.resolve_uri, help="A fully qualified path to an ADLS location")
      args = parser.parse_args()
 
-     upload_through_cache(args.uri, args.path)
+     cached.upload_through_cache(args.uri, args.path)
 
 
  if __name__ == "__main__":
thds/adls/upload.py ADDED
@@ -0,0 +1,162 @@
+ """API for uploading files to Azure Data Lake Storage (ADLS) Gen2.
+
+ We hash anything that we possibly can, since it's a fast verification step that we
+ can do later during downloads.
+ """
+
+ import subprocess
+ import typing as ty
+ from pathlib import Path
+
+ from azure.core.exceptions import ResourceModifiedError
+ from azure.storage.blob import ContentSettings
+
+ from thds.core import files, fretry, link, log, scope, source, tmp
+
+ from . import azcopy, hashes
+ from ._progress import report_upload_progress
+ from ._upload import upload_decision_and_metadata
+ from .conf import UPLOAD_FILE_MAX_CONCURRENCY
+ from .fqn import AdlsFqn
+ from .global_client import get_global_blob_container_client
+ from .ro_cache import Cache
+
+ logger = log.getLogger(__name__)
+ _SLOW_CONNECTION_WORKAROUND = 14400  # seconds
+
+
+ UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
+
+
+ def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
+     @scope.bound
+     def _try_write_through() -> bool:
+         if isinstance(data, Path) and data.exists():
+             link.link_or_copy(data, local_cache_path, "ref")
+             return True
+         out = scope.enter(tmp.temppath_same_fs(local_cache_path))
+         if hasattr(data, "read") and hasattr(data, "seek"):
+             with open(out, "wb") as f:
+                 f.write(data.read())  # type: ignore
+             data.seek(0)  # type: ignore
+             link.link_or_copy(out, local_cache_path)
+             return True
+         if isinstance(data, bytes):
+             with open(out, "wb") as f:
+                 f.write(data)
+             link.link_or_copy(out, local_cache_path)
+             return True
+         return False
+
+     if _try_write_through():
+         try:
+             # it's a reflink or a copy, so the cache now owns its copy
+             # and we don't want to allow anyone to write to its copy.
+             files.set_read_only(local_cache_path)
+             return local_cache_path
+         except FileNotFoundError:
+             # may have hit a race condition.
+             # don't fail upload just because we couldn't write through the cache.
+             pass
+     return None
+
+
+ @scope.bound
+ @fretry.retry_sleep(
+     # ADLS lib has a bug where parallel uploads of the same thing will
+     # hit a race condition and error. this will detect that scenario
+     # and avoid re-uploading as well.
+     fretry.is_exc(ResourceModifiedError),
+     fretry.expo(retries=5),
+ )
+ def upload(
+     dest: ty.Union[AdlsFqn, str],
+     src: UploadSrc,
+     write_through_cache: ty.Optional[Cache] = None,
+     *,
+     content_type: str = "",
+     **upload_data_kwargs: ty.Any,
+ ) -> source.Source:
+     """Uploads only if the remote does not exist or does not match
+     xxhash.
+
+     Always embeds xxhash in the blob metadata if at all possible. In very rare cases
+     it may not be possible for us to calculate one. Will always be possible if the passed
+     data was a Path. If one can be calculated, it will be returned in the Source.
+
+     Can write through a local cache, which may save you a download later.
+
+     content_type and all upload_data_kwargs will be ignored if the file
+     has already been uploaded and the hash matches.
+     """
+     dest_ = AdlsFqn.parse(dest) if isinstance(dest, str) else dest
+     if write_through_cache:
+         _write_through_local_cache(write_through_cache.path(dest_), src)
+         # we always use the original source file to upload, not the cached path,
+         # because uploading from a shared location risks race conditions.
+
+     blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
+     blob_client = blob_container_client.get_blob_client(dest_.path)
+     decision = upload_decision_and_metadata(blob_client.get_blob_properties, src)  # type: ignore [arg-type]
+
+     def source_from_meta() -> source.Source:
+         best_hash = next(iter(hashes.extract_hashes_from_metadata(decision.metadata)), None)
+         if isinstance(src, Path):
+             assert best_hash, "A hash should always be calculable for a local path."
+             return source.from_file(src, hash=best_hash, uri=str(dest_))
+
+         return source.from_uri(str(dest_), hash=best_hash)
+
+     if decision.upload_required:
+         # set up some bookkeeping
+         n_bytes = None  # if we pass 0 to upload_blob, it truncates the write now
+         bytes_src: ty.Union[bytes, ty.IO, ty.Iterable[bytes]]
+         if isinstance(src, Path):
+             n_bytes = src.stat().st_size
+             bytes_src = scope.enter(open(src, "rb"))
+         elif isinstance(src, bytes):
+             n_bytes = len(src)
+             bytes_src = src
+         else:
+             bytes_src = src
+
+         if "metadata" in upload_data_kwargs:
+             decision.metadata.update(upload_data_kwargs.pop("metadata"))
+
+         if azcopy.upload.should_use_azcopy(n_bytes or 0) and isinstance(src, Path):
+             logger.info("Using azcopy to upload %s to %s", src, dest_)
+             try:
+                 azcopy.upload.run(
+                     azcopy.upload.build_azcopy_upload_command(
+                         src, dest_, content_type=content_type, metadata=decision.metadata, overwrite=True
+                     ),
+                     dest_,
+                     n_bytes or 0,
+                 )
+                 return source_from_meta()
+
+             except subprocess.SubprocessError:
+                 logger.warning("Azcopy upload failed, falling back to SDK upload")
+
+         upload_content_settings = ContentSettings()
+         if content_type:
+             upload_content_settings.content_type = content_type
+
+         # we are now using blob_client instead of file system client
+         # because blob client (as of 2024-06-24) does actually do
+         # some one-step, atomic uploads, wherein there is not a separate
+         # create/truncate action associated with an overwrite.
+         # This is both faster, as well as simpler to reason about, and
+         # in fact was the behavior I had been assuming all along...
+         blob_client.upload_blob(
+             report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes or 0),
+             overwrite=True,
+             length=n_bytes,
+             content_settings=upload_content_settings,
+             connection_timeout=_SLOW_CONNECTION_WORKAROUND,
+             max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
+             metadata=decision.metadata,
+             **upload_data_kwargs,
+         )
+
+     return source_from_meta()
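A hedged usage sketch of the new module (the destination URI and filenames are made up, and the adls:// form accepted by AdlsFqn.parse is an assumption): uploading from a local Path means the xxh3_128 digest can always be computed, embedded in the blob metadata, and returned on the Source; a second call with identical bytes should find the matching hash and skip the transfer.

from pathlib import Path
from thds.adls import upload

src = upload.upload(
    "adls://myaccount/mycontainer/datasets/example.csv",  # assumed URI form
    Path("example.csv"),
    content_type="text/csv",
)
print(src)  # a thds.core Source carrying the destination URI and the computed hash
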
thds/adls/uri.py CHANGED
@@ -36,3 +36,9 @@ def resolve_any(fqn_or_uri: UriIsh) -> ty.Optional[fqn.AdlsFqn]:
 
  def parse_any(fqn_or_uri: UriIsh) -> fqn.AdlsFqn:
      return parse_uri(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+
+
+ def to_blob_windows_url(uri: UriIsh) -> str:
+     """Convert an ADLS URI to its https://<account>.blob.core.windows.net URL."""
+     fqn = parse_any(uri)
+     return f"https://{fqn.sa}.blob.core.windows.net/{fqn.container}/{fqn.path}"
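A quick sketch of the new helper; the adls:// URI passed to parse_any is an assumed form for illustration:

from thds.adls import uri

fqn = uri.parse_any("adls://myaccount/mycontainer/reports/2025.csv")  # assumed URI scheme
print(uri.to_blob_windows_url(fqn))
# -> https://myaccount.blob.core.windows.net/mycontainer/reports/2025.csv
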
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: thds.adls
- Version: 4.1.20250701001205
+ Version: 4.1.20250701190349
  Summary: ADLS tools
  Author-email: Trilliant Health <info@trillianthealth.com>
  License: MIT
@@ -0,0 +1,42 @@
+ thds/adls/__init__.py,sha256=g2Zb0EAAH-JzPMYHAub9liU4qa5pfqQDnILfEhmObGo,1036
+ thds/adls/_progress.py,sha256=ZzCHn_G7nHakioNFxdvoJZRr-jN6ymsp5JXf-iReROM,6580
+ thds/adls/_upload.py,sha256=mhTdWiQroaugYuwQg7R8CEgdfCYF4xvJthlsqO0jlnE,4692
+ thds/adls/abfss.py,sha256=ZRJOLjDuXmS4bIbQAQpQxWWWeu74N9NKEKCNfXQek80,726
+ thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
+ thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
+ thds/adls/copy.py,sha256=jUWbGvTpb4B3yRGS0nhGSbDzqRPzUqYgH0z1lFRJB3k,6365
+ thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
+ thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
+ thds/adls/download.py,sha256=N8JqNqD5ioHsEHcTl2bNJt3Bb187yyvZAXn4xW3flfU,18090
+ thds/adls/download_lock.py,sha256=tgT48l4C5_qmArGeq05gl7VlxT22dZBH2Xwxx0itE9o,3176
+ thds/adls/errors.py,sha256=6cLg2E4SB8ic46PBzA3ynRH4b1oR8qRb07RBgKGJRxY,1783
+ thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
+ thds/adls/file_properties.py,sha256=C9Kl3a5wuBNWYgZYnZbkH04u8uxadEcjVJIm3UevUM0,1912
+ thds/adls/fqn.py,sha256=0zHmHhBWN7GEfKRB3fBC1NVhaiIHHifBdCRanyT01X8,5822
+ thds/adls/global_client.py,sha256=f4VJw5y_Yh__8gQUcdSYTh1aU6iEPlauMchVirSAwDQ,3716
+ thds/adls/hashes.py,sha256=RDQS-C38wskUhxXGFGLJ4ox8vm7ofurxSsUk13Ywijo,5309
+ thds/adls/impl.py,sha256=4rZAGlhU_UojPy1FC7j3KEFIj6BWSbCDAVV1FCyki3s,42496
+ thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
+ thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
+ thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
+ thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
+ thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
+ thds/adls/source.py,sha256=8HVMYuxDn1XYGwFFSBowMlvQ6r2Jm2CQlpu4h85JvsE,2559
+ thds/adls/source_tree.py,sha256=yP_v2XrKxXqUOdZ-x8kqHhBFAuur3AlAq3zi4hHj4AE,2235
+ thds/adls/upload.py,sha256=gS_S66gorzdW83eavPUVJ3UYrv5u3HnftDXjdwEZOo8,6441
+ thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
+ thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
+ thds/adls/azcopy/download.py,sha256=J7QAoBehpxsY58ofgGQur-MtIwM0NEnV9_Cw4i_X3y8,6007
+ thds/adls/azcopy/login.py,sha256=923UaewVMPFzkDSgCQsbl-_g7qdFhpXpF0MGNIy3T_A,1538
+ thds/adls/azcopy/progress.py,sha256=K7TVmSiWfu561orL3GuOnlQX9VtVxWVECAq9NiweYNo,1387
+ thds/adls/azcopy/system_resources.py,sha256=okgDEKAp0oWGQF7OKikbgJ9buBeiOgNaDYy-36j6dHo,761
+ thds/adls/azcopy/upload.py,sha256=bvtYdbaFsZkOHFLDpeBlTKqw63P3_kbImInI04ZlekM,2601
+ thds/adls/tools/download.py,sha256=Dmt-EBZUEF-gVfUcwjAD8VRKR5rhw-oozxl40lZHmdw,1562
+ thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
+ thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
+ thds_adls-4.1.20250701190349.dist-info/METADATA,sha256=gJNup1vZrpFp-0nor96kwmz__Ij_Zc5pkytWoEslYMU,587
+ thds_adls-4.1.20250701190349.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ thds_adls-4.1.20250701190349.dist-info/entry_points.txt,sha256=uTqreT1AIwqJboMfLv5w6sviM8mNbAkln765gIjzoA4,152
+ thds_adls-4.1.20250701190349.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+ thds_adls-4.1.20250701190349.dist-info/RECORD,,
@@ -1,36 +0,0 @@
- """The reason for a hashed resource is that it enables worry-free caching.
-
- If under any circumstances we re-use a name/URI with different bytes,
- then having captured a hash will enable us to transparently detect the
- situation and re-download.
-
- It is strongly recommended that you construct these using `of`, as
- that will avoid the accidental, invalid creation of an
- AdlsHashedResource containing an empty hash.
-
- How to get the hash itself?
-
- From our experience, it seems that any file uploaded using Azure
- Storage Explorer will have an MD5 calculated locally before upload and
- that will be embedded in the remote file. You can look in the
- properties of the uploaded file for Content-MD5 and copy-paste that
- into whatever you're writing.
-
- Programmatically, you can instead use `resource.upload`, which will
- return to you an in-memory AdlsHashedResource object. If you want to
- store it programmatically rather than in the source code, it's
- recommended that you use `resource.to_path`, and then load it using
- `resource.from_path`.
-
- Prefer importing this module `as resource` or `from thds.adls
- import resource`, and then using it as a namespace,
- e.g. `resource.of(uri)`.
- """
- from .core import AdlsHashedResource, from_source, get, of, parse, serialize, to_source  # noqa: F401
- from .file_pointers import resource_from_path as from_path  # noqa: F401
- from .file_pointers import resource_to_path as to_path  # noqa: F401
- from .file_pointers import validate_resource as validate  # noqa: F401
- from .up_down import get_read_only, upload  # noqa: F401
- from .up_down import verify_or_create_resource as verify_or_create  # noqa: F401
-
- AHR = AdlsHashedResource  # just an alias