thds.adls 4.1.20250701001205__py3-none-any.whl → 4.1.20250702194306__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of thds.adls might be problematic.

thds/adls/hashes.py ADDED
@@ -0,0 +1,147 @@
+ import contextlib
+ import os
+ import typing as ty
+ from functools import partial
+
+ import xxhash
+
+ from thds.core import hash_cache, hashing, log, source, types
+ from thds.core.hashing import Hash, SomehowReadable
+
+ from . import errors, file_properties
+ from .fqn import AdlsFqn
+
+ logger = log.getLogger(__name__)
+
+ PREFERRED_ALGOS: ty.Final = ("xxh3_128", "blake3")
+ AnyStrSrc = ty.Union[SomehowReadable, ty.Iterable[ty.AnyStr]]
+ # this type closely corresponds to what the underlying DataLakeStorageClient will accept for upload_data.
+
+
+ def default_hasher() -> hashing.Hasher:
+     return xxhash.xxh3_128()
+
+
+ def _xxhash_hasher(algo: str) -> hashing.Hasher:
+     return getattr(xxhash, algo)()
+
+
+ def register_hashes():
+     for algo in xxhash.algorithms_available:
+         hashing.add_named_hash(algo, _xxhash_hasher)
+     source.set_file_autohash(PREFERRED_ALGOS[0], _xxhash_hasher)
+
+     try:
+         from blake3 import blake3
+
+         hashing.add_named_hash("blake3", lambda _: blake3())  # type: ignore
+     except ModuleNotFoundError:
+         pass
+
+
+ def _hash_path_if_exists(
+     file_hasher: ty.Callable[[types.StrOrPath], hashing.Hash], path: types.StrOrPath
+ ) -> ty.Optional[hashing.Hash]:
+     if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
+         return None
+     return file_hasher(path)
+
+
+ def hash_path_for_algo(
+     algo: str,
+ ) -> ty.Callable[[types.StrOrPath], ty.Optional[hashing.Hash]]:
+     """Return a function that hashes a path for the given algorithm."""
+     return partial(_hash_path_if_exists, partial(hash_cache.filehash, algo))
+
+
+ def metadata_hash_b64_key(algo: str) -> str:
+     return f"hash_{algo}_b64"
+
+
+ def extract_hashes_from_metadata(metadata: dict) -> ty.Iterable[hashing.Hash]:
+     # NOTE! the order here is critical, because we want to _prefer_ the faster hash if it exists.
+     for hash_algo in PREFERRED_ALGOS:
+         md_key = metadata_hash_b64_key(hash_algo)
+         if metadata and md_key in metadata:
+             yield hashing.Hash(hash_algo, hashing.db64(metadata[md_key]))
+
+
+ def extract_hashes_from_props(
+     props: ty.Optional[file_properties.PropertiesP],
+ ) -> dict[str, hashing.Hash]:
+     if not props:
+         return dict()
+
+     hashes = list(extract_hashes_from_metadata(props.metadata or dict()))
+     if props.content_settings and props.content_settings.content_md5:
+         hashes.append(hashing.Hash("md5", props.content_settings.content_md5))
+     return {h.algo: h for h in hashes}
+
+
+ @contextlib.contextmanager
+ def verify_hashes_before_and_after_download(
+     remote_hash: ty.Optional[Hash],
+     expected_hash: ty.Optional[Hash],
+     fqn: AdlsFqn,
+     local_dest: types.StrOrPath,
+ ) -> ty.Iterator[None]:
+     # if expected_hash:
+     #     check_reasonable_md5b64(expected_md5b64)
+     # if remote_md5b64:
+     #     check_reasonable_md5b64(remote_md5b64)
+     if remote_hash and expected_hash and remote_hash != expected_hash:
+         raise errors.HashMismatchError(
+             f"ADLS thinks the {remote_hash.algo} of {fqn} is {hashing.b64(remote_hash.bytes)},"
+             f" but we expected {hashing.b64(expected_hash.bytes)}."
+             " This may indicate that we need to update a hash in the codebase."
+         )
+
+     yield  # perform download
+
+     expected_algo = expected_hash.algo if expected_hash else None
+     if not expected_algo and remote_hash:
+         expected_algo = remote_hash.algo
+
+     if not expected_algo:
+         # if we have neither a user-provided hash nor a remotely-found hash, then we have nothing to check.
+         return
+
+     with log.logger_context(hash_for="after-download"):
+         local_hash = hash_cache.filehash(expected_algo, local_dest)
+
+     if remote_hash and remote_hash != local_hash:
+         raise errors.HashMismatchError(
+             f"The {local_hash.algo} of the downloaded file {local_dest} is {hashing.b64(local_hash.bytes)},"
+             f" but the remote ({fqn}) says it should be {hashing.b64(remote_hash.bytes)}."
+             f" This may indicate that ADLS has an erroneous {remote_hash.algo} for {fqn}."
+         )
+
+     if expected_hash and local_hash != expected_hash:
+         raise errors.HashMismatchError(
+             f"The {local_hash.algo} of the downloaded file {local_dest} is {hashing.b64(local_hash.bytes)},"
+             f" but we expected it to be {hashing.b64(expected_hash.bytes)}."
+             f" This probably indicates a corrupted download of {fqn}"
+         )
+
+     all_hashes = dict(local=local_hash, remote=remote_hash, expected=expected_hash)
+     real_hashes = list(filter(None, all_hashes.values()))
+     assert len(real_hashes) > 0, all_hashes
+     assert all(real_hashes[0] == h for h in real_hashes), all_hashes
+
+
+ def metadata_hash_dict(hash: Hash) -> dict[str, str]:
+     return {metadata_hash_b64_key(hash.algo): hashing.b64(hash.bytes)}
+
+
+ def create_hash_metadata_if_missing(
+     file_properties: ty.Optional[file_properties.FileProperties], new_hash: ty.Optional[Hash]
+ ) -> dict:
+     if not (file_properties and new_hash):
+         # without file properties, we can't match the etag when we try to set this.
+         return dict()
+
+     existing_metadata = file_properties.metadata or dict()
+     if metadata_hash_b64_key(new_hash.algo) not in existing_metadata:
+         return {**existing_metadata, **metadata_hash_dict(new_hash)}
+
+     return dict()
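For orientation, here is a minimal sketch of how the new metadata convention round-trips a hash, using the names defined in the module above. The sample bytes are illustrative, and `Hash` equality is assumed to compare algo and digest, as the module's own `!=` checks imply.

import xxhash

from thds.adls import hashes
from thds.core import hashing

# compute an xxh3_128 digest over some sample bytes
h = hashing.Hash("xxh3_128", xxhash.xxh3_128(b"example bytes").digest())

# embed it the way uploads do: {"hash_xxh3_128_b64": "<base64 digest>"}
metadata = hashes.metadata_hash_dict(h)

# recover it the way downloads do; PREFERRED_ALGOS ordering means
# xxh3_128 wins over blake3 when both keys are present.
recovered = next(iter(hashes.extract_hashes_from_metadata(metadata)))
assert recovered == h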
thds/adls/impl.py CHANGED
@@ -31,7 +31,7 @@ from azure.storage.filedatalake.aio import DataLakeServiceClient, FileSystemClie
  
  from thds.core import lazy, log
  
- from ._upload import async_upload_decision_and_settings, metadata_for_upload
+ from ._upload import async_upload_decision_and_metadata
  from .conf import CONNECTION_TIMEOUT, UPLOAD_CHUNK_SIZE
  from .download import async_download_or_use_verified
  from .errors import translate_azure_error
@@ -330,15 +330,14 @@ class ADLSFileSystem:
  
          async with file_system_client.get_file_client(remote_path) as file_client:
              with open(local_path, "rb") as fp:
-                 decision = await async_upload_decision_and_settings(file_client.get_file_properties, fp)
+                 decision = await async_upload_decision_and_metadata(file_client.get_file_properties, fp)
                  if decision.upload_required:
                      await file_client.upload_data(
                          fp,
                          overwrite=True,
-                         content_settings=decision.content_settings,
                          connection_timeout=CONNECTION_TIMEOUT(),
                          chunk_size=UPLOAD_CHUNK_SIZE(),
-                         metadata={**metadata_for_upload(), **(metadata or {})},
+                         metadata={**decision.metadata, **(metadata or {})},
                      )
  
          return remote_path
thds/adls/md5.py CHANGED
@@ -1,60 +1,13 @@
- """Why MD5 when it's no longer a good choice for most use cases?
- Because Azure/ADLS support Content-MD5 but nothing else, and I don't
- want to lie to them and get us confused later.
-
- Thankfully, there are no real security concerns for us with purely
- internal code and data sets.
-
- That said, please _do not_ use MD5 for non-Azure things. Prefer SHA256
- if at all possible.
- """
  import hashlib
- import typing as ty
- from pathlib import Path
-
- from thds.core.hash_cache import hash_file
- from thds.core.hashing import SomehowReadable, hash_anything, hash_using
- from thds.core.types import StrOrPath
-
- AnyStrSrc = ty.Union[SomehowReadable, ty.Iterable[ty.AnyStr]]
- # this type closely corresponds to what the underlying DataLakeStorageClient will accept for upload_data.
-
-
- def md5_file(file: StrOrPath) -> bytes:
-     """Raise exception if it cannot be read.
  
-     Safely caches the hash on your local filesystem (uses mtimes to
-     determine staleness).
-     """
-     return hash_file(file, hashlib.md5())
+ from thds.core.hashing import Hash, db64, hash_using
  
  
  def hex_md5_str(string: str) -> str:
      return hash_using(string.encode(), hashlib.md5()).hexdigest()
  
  
- def try_md5(data: AnyStrSrc) -> ty.Optional[bytes]:
-     """Ideally, we calculate an MD5 sum for all data that we upload.
-
-     The only circumstances under which we cannot do this are if the
-     stream does not exist in its entirety before the upload begins.
-     """
-     if isinstance(data, Path):
-         return md5_file(data)
-     res = hash_anything(data, hashlib.md5())
-     if res:
-         return res.digest()
-     return None
-
-
- def is_reasonable_b64(md5: str):
-     if len(md5) == 22:
-         return True
-     if len(md5) == 24 and md5.endswith("=="):
-         return True
-     return False
-
-
- def check_reasonable_md5b64(maybe_md5: str):
-     if not is_reasonable_b64(maybe_md5):
-         raise ValueError(f"MD5 '{maybe_md5}' is not a reasonable MD5.")
+ def to_hash(md5b64: str) -> Hash:
+     """Convert a base64-encoded MD5 digest to a Hash."""
+     assert md5b64, "MD5 base64 string cannot be empty"
+     return Hash(algo="md5", bytes=db64(md5b64))
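A short usage sketch for the replacement API. The sample digest is illustrative; ADLS reports Content-MD5 base64-encoded, which is what `to_hash` expects.

import base64
import hashlib

from thds.adls import md5

# build a base64 Content-MD5 the way ADLS would report it
md5b64 = base64.b64encode(hashlib.md5(b"example bytes").digest()).decode()

h = md5.to_hash(md5b64)
assert h.algo == "md5" and len(h.bytes) == 16  # raw 16-byte digest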
thds/adls/ro_cache.py CHANGED
@@ -12,7 +12,7 @@ from thds.core.link import LinkType, link_or_copy
  from .fqn import AdlsFqn
  from .md5 import hex_md5_str
  
- GLOBAL_CACHE_PATH = config.item("global-cache-path", HOMEDIR() / ".adls-md5-ro-cache", parse=Path)
+ GLOBAL_CACHE_PATH = config.item("global-cache-path", HOMEDIR() / ".thds/adls/ro-cache", parse=Path)
  MAX_FILENAME_LEN = config.item("max-filename-len", 255, parse=int)  # safe on most local filesystems?
  MAX_TOTAL_PATH_LEN = config.item(
      "max-total-path-len", 1023 if sys.platform == "darwin" else 4095, parse=int
@@ -39,7 +39,6 @@ class Cache(ty.NamedTuple):
  
  
  def global_cache(link: LinkOpts = ("ref", "hard")) -> Cache:
-     """This is the recommended caching configuration."""
      return Cache(GLOBAL_CACHE_PATH(), link)
  
  
thds/adls/source.py CHANGED
@@ -1,14 +1,14 @@
- import base64
  import typing as ty
  from functools import partial
  from pathlib import Path
  
  from thds.core import source
- from thds.core.hashing import Hash, b64
+ from thds.core.hashing import Hash
  
- from .cached_up_down import download_to_cache
+ from . import cached, hashes, md5
+ from .errors import blob_not_found_translation
+ from .file_properties import get_file_properties
  from .fqn import AdlsFqn
- from .resource import AdlsHashedResource
  from .uri import resolve_any, resolve_uri
  
  
@@ -19,17 +19,10 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
  
      def download(hash: ty.Optional[Hash]) -> Path:
          assert fqn
-         if hash and hash.algo == "md5":
-             # this 'extra' check just allows us to short-circuit a download
-             # where the hash at this URI is known not to match what we expect.
-             # It's no safer than the non-md5 hash check that Source performs after download.
-             return download_to_cache(fqn, b64(hash.bytes))
-
-         # we don't validate this hash, because we already have md5 validation
-         # happening inside the download_to_cache function. the Source hash
-         # is actually mostly for use by systems that want to do content addressing,
-         # and not necessarily intended to be a runtime check in all scenarios.
-         return download_to_cache(fqn)
+         # this 'extra' check just allows us to short-circuit a download
+         # where the hash at this URI is known not to match what we expect.
+         # It's no safer than the non-md5 hash check that Source performs after download.
+         return cached.download_to_cache(fqn, expected_hash=hash)
  
      return download
  
@@ -37,30 +30,40 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
  source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
  
  
- def from_adls(
-     uri_or_fqn_or_ahr: ty.Union[str, AdlsFqn, AdlsHashedResource], hash: ty.Optional[Hash] = None
- ) -> source.Source:
+ def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None) -> source.Source:
      """Flexible, public interface to creating Sources from any ADLS-like reference.
  
-     Does NOT automatically fetch an MD5 hash from the ADLS URI if it's not provided. If
-     you know you want to include that, combine this with `resource.get`:
-     `source.from_adls(resource.get(uri))`
+     Does NOT automatically fetch a checksumming hash from the ADLS URI if it's not
+     provided. If you know you want to include that, instead call:
+     `source.get_with_hash(uri_or_fqn)`.
      """
-     if isinstance(uri_or_fqn_or_ahr, AdlsHashedResource):
-         fqn = uri_or_fqn_or_ahr.fqn
-         res_hash = Hash("md5", base64.b64decode(uri_or_fqn_or_ahr.md5b64))
-         if hash and hash != res_hash:
-             raise ValueError(f"Resource Hash mismatch for {fqn}: {hash} != {res_hash}")
-         hash = res_hash
-     else:
-         r_fqn = resolve_any(uri_or_fqn_or_ahr)
-         if not r_fqn:
-             raise ValueError(f"Could not resolve {uri_or_fqn_or_ahr} to an ADLS FQN")
-         fqn = r_fqn
-
-     return source.Source(str(fqn), hash)
+     r_fqn = resolve_any(uri_or_fqn)
+     if not r_fqn:
+         raise ValueError(f"Could not resolve {uri_or_fqn} to an ADLS FQN")
+     return source.Source(str(r_fqn), hash)
  
  
  source.register_from_uri_handler(
      "thds.adls", lambda uri: partial(from_adls, uri) if resolve_uri(uri) else None
  )
+
+
+ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
+     """Creates a Source from a remote-only file, with MD5 or other hash.
+
+     The file _must_ have a pre-existing hash!
+     """
+     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+     with blob_not_found_translation(fqn):
+         uri_hashes = hashes.extract_hashes_from_props(get_file_properties(fqn))
+     if not uri_hashes:
+         raise ValueError(
+             f"ADLS file {fqn} must have a hash to use this function. "
+             "If you know the hash, use `from_adls` with the hash parameter."
+         )
+     return from_adls(fqn, next(iter(uri_hashes.values())))
+
+
+ def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
+     """Meant for older use cases where we had an MD5"""
+     return from_adls(uri_or_fqn, md5.to_hash(md5b64) if md5b64 else None)
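Taken together, the new entry points look roughly like this. Storage account, container, and paths are hypothetical; the base64 MD5 is the digest of b"hello world".

from thds.adls import source as adls_source

# no network call; the hash stays None unless you supply one
src = adls_source.from_adls("adls://mysa/mycontainer/data/file.parquet")

# reads blob properties; raises if no hash is already recorded remotely
src_with_hash = adls_source.get_with_hash("adls://mysa/mycontainer/data/file.parquet")

# for legacy call sites that still carry a base64 MD5 string around
legacy = adls_source.with_md5b64(
    "adls://mysa/mycontainer/data/file.parquet", md5b64="XrY7u+Ae7tCTyyK7j1rNww=="
)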
thds/adls/tools/download.py CHANGED
@@ -1,7 +1,7 @@
  import argparse
  from pathlib import Path
  
- from thds.adls.cached_up_down import download_directory, download_to_cache
+ from thds.adls import cached
  from thds.adls.file_properties import get_file_properties, is_directory
  from thds.adls.impl import ADLSFileSystem
  from thds.adls.uri import resolve_uri
@@ -16,9 +16,10 @@ def main():
          help="A fully qualified path to an ADLS location. Accepts adls://, https:// and abfss:// URIs.",
      )
      parser.add_argument(
-         "--copy-to",
-         "-c",
+         "copy_to",
+         nargs="?",
          type=Path,
+         default=None,
          help="This will create a link to the cached download at the specified location",
      )
      parser.add_argument(
@@ -39,9 +40,9 @@ def main():
          cache_path = fs.fetch_file(args.adls_fqn.path)
      else:
          if is_dir:
-             cache_path = cached.download_directory(args.adls_fqn)
+             cache_path = cached.download_directory(args.adls_fqn)
          else:
-             cache_path = download_to_cache(args.adls_fqn)
+             cache_path = cached.download_to_cache(args.adls_fqn)
  
      if args.copy_to:
          link(cache_path, args.copy_to)
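The net effect on the CLI is that the copy destination is now a positional argument rather than a flag. Assuming the tool is run as a module (the console-script names in entry_points.txt are not shown here), the invocation changes roughly like so:

python -m thds.adls.tools.download adls://mysa/mycontainer/some/file --copy-to ./file   # old
python -m thds.adls.tools.download adls://mysa/mycontainer/some/file ./file             # new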
thds/adls/tools/upload.py CHANGED
@@ -1,17 +1,16 @@
  import argparse
  from pathlib import Path
  
- from thds.adls.cached_up_down import upload_through_cache
- from thds.adls.uri import resolve_uri
+ from thds.adls import cached, uri
  
  
  def main():
      parser = argparse.ArgumentParser()
      parser.add_argument("path", type=Path, help="A local file you want to upload.")
-     parser.add_argument("uri", type=resolve_uri, help="A fully qualified path to an ADLS location")
+     parser.add_argument("uri", type=uri.resolve_uri, help="A fully qualified path to an ADLS location")
      args = parser.parse_args()
  
-     upload_through_cache(args.uri, args.path)
+     cached.upload_through_cache(args.uri, args.path)
  
  
  if __name__ == "__main__":
thds/adls/upload.py ADDED
@@ -0,0 +1,168 @@
+ """API for uploading files to Azure Data Lake Storage (ADLS) Gen2.
+
+ We hash anything that we possibly can, since it's a fast verification step that we
+ can do later during downloads.
+ """
+
+ import subprocess
+ import typing as ty
+ from pathlib import Path
+
+ from azure.core.exceptions import ResourceModifiedError
+ from azure.storage.blob import ContentSettings
+
+ from thds.core import files, fretry, link, log, scope, source, tmp
+
+ from . import azcopy, hashes
+ from ._progress import report_upload_progress
+ from ._upload import upload_decision_and_metadata
+ from .conf import UPLOAD_FILE_MAX_CONCURRENCY
+ from .fqn import AdlsFqn
+ from .global_client import get_global_blob_container_client
+ from .ro_cache import Cache
+
+ logger = log.getLogger(__name__)
+ _SLOW_CONNECTION_WORKAROUND = 14400  # seconds
+
+
+ UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
+
+
+ def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
+     @scope.bound
+     def _try_write_through() -> bool:
+         if isinstance(data, Path) and data.exists():
+             # we don't do hard or soft links because they share file permissions,
+             # and it's not up to us to change permissions on the src file.
+             link.link_or_copy(data, local_cache_path, "ref")
+             return True
+
+         out = scope.enter(tmp.temppath_same_fs(local_cache_path))
+         if hasattr(data, "read") and hasattr(data, "seek"):
+             with open(out, "wb") as f:
+                 f.write(data.read())  # type: ignore
+             data.seek(0)  # type: ignore
+             link.link_or_copy(out, local_cache_path)
+             return True
+
+         if isinstance(data, bytes):
+             with open(out, "wb") as f:
+                 f.write(data)
+             link.link_or_copy(out, local_cache_path)
+             return True
+
+         return False
+
+     if _try_write_through():
+         try:
+             # it's a reflink or a copy, so the cache now owns its copy
+             # and we don't want to allow anyone to write to its copy.
+             files.set_read_only(local_cache_path)
+             return local_cache_path
+
+         except FileNotFoundError:
+             # may have hit a race condition.
+             # don't fail upload just because we couldn't write through the cache.
+             pass
+     return None
+
+
+ @scope.bound
+ @fretry.retry_sleep(
+     # ADLS lib has a bug where parallel uploads of the same thing will
+     # hit a race condition and error. this will detect that scenario
+     # and avoid re-uploading as well.
+     fretry.is_exc(ResourceModifiedError),
+     fretry.expo(retries=5),
+ )
+ def upload(
+     dest: ty.Union[AdlsFqn, str],
+     src: UploadSrc,
+     write_through_cache: ty.Optional[Cache] = None,
+     *,
+     content_type: str = "",
+     **upload_data_kwargs: ty.Any,
+ ) -> source.Source:
+     """Uploads only if the remote does not exist or does not match the xxhash.
+
+     Always embeds an xxhash in the blob metadata if at all possible. In very rare cases
+     it may not be possible for us to calculate one. It will always be possible if the
+     passed data was a Path. If one can be calculated, it will be returned in the Source.
+
+     Can write through a local cache, which may save you a download later.
+
+     content_type and all upload_data_kwargs will be ignored if the file
+     has already been uploaded and the hash matches.
+     """
+     dest_ = AdlsFqn.parse(dest) if isinstance(dest, str) else dest
+     if write_through_cache:
+         _write_through_local_cache(write_through_cache.path(dest_), src)
+         # we always use the original source file to upload, not the cached path,
+         # because uploading from a shared location risks race conditions.
+
+     blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
+     blob_client = blob_container_client.get_blob_client(dest_.path)
+     decision = upload_decision_and_metadata(blob_client.get_blob_properties, src)  # type: ignore [arg-type]
+
+     def source_from_meta() -> source.Source:
+         best_hash = next(iter(hashes.extract_hashes_from_metadata(decision.metadata)), None)
+         if isinstance(src, Path):
+             assert best_hash, "A hash should always be calculable for a local path."
+             return source.from_file(src, hash=best_hash, uri=str(dest_))
+
+         return source.from_uri(str(dest_), hash=best_hash)
+
+     if decision.upload_required:
+         # set up some bookkeeping
+         n_bytes = None  # if we pass 0 to upload_blob, it truncates the write now
+         bytes_src: ty.Union[bytes, ty.IO, ty.Iterable[bytes]]
+         if isinstance(src, Path):
+             n_bytes = src.stat().st_size
+             bytes_src = scope.enter(open(src, "rb"))
+         elif isinstance(src, bytes):
+             n_bytes = len(src)
+             bytes_src = src
+         else:
+             bytes_src = src
+
+         if "metadata" in upload_data_kwargs:
+             decision.metadata.update(upload_data_kwargs.pop("metadata"))
+
+         if azcopy.upload.should_use_azcopy(n_bytes or 0) and isinstance(src, Path):
+             logger.info("Using azcopy to upload %s to %s", src, dest_)
+             try:
+                 azcopy.upload.run(
+                     azcopy.upload.build_azcopy_upload_command(
+                         src, dest_, content_type=content_type, metadata=decision.metadata, overwrite=True
+                     ),
+                     dest_,
+                     n_bytes or 0,
+                 )
+                 return source_from_meta()
+
+             except subprocess.SubprocessError:
+                 logger.warning("Azcopy upload failed, falling back to SDK upload")
+
+         upload_content_settings = ContentSettings()
+         if content_type:
+             upload_content_settings.content_type = content_type
+
+         # we are now using blob_client instead of file system client
+         # because blob client (as of 2024-06-24) does actually do
+         # some one-step, atomic uploads, wherein there is not a separate
+         # create/truncate action associated with an overwrite.
+         # This is both faster, as well as simpler to reason about, and
+         # in fact was the behavior I had been assuming all along...
+         blob_client.upload_blob(
+             report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes or 0),
+             overwrite=True,
+             length=n_bytes,
+             content_settings=upload_content_settings,
+             connection_timeout=_SLOW_CONNECTION_WORKAROUND,
+             max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
+             metadata=decision.metadata,
+             **upload_data_kwargs,
+         )
+
+     return source_from_meta()
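A hedged usage sketch of the new `upload` API. The destination URI and local filename are hypothetical; `global_cache` comes from `ro_cache`, which `upload` already imports.

from pathlib import Path

from thds.adls.ro_cache import global_cache
from thds.adls.upload import upload

# uploads only if the remote is missing or its recorded xxhash differs;
# returns a thds.core.source.Source carrying the best available hash.
src = upload(
    "adls://mysa/mycontainer/datasets/example.parquet",
    Path("example.parquet"),
    write_through_cache=global_cache(),
    content_type="application/octet-stream",
)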
thds/adls/uri.py CHANGED
@@ -36,3 +36,9 @@ def resolve_any(fqn_or_uri: UriIsh) -> ty.Optional[fqn.AdlsFqn]:
  
  
  def parse_any(fqn_or_uri: UriIsh) -> fqn.AdlsFqn:
      return parse_uri(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+
+
+ def to_blob_windows_url(uri: UriIsh) -> str:
+     """Convert an ADLS URI to an HTTPS blob.core.windows.net URL."""
+     fqn = parse_any(uri)
+     return f"https://{fqn.sa}.blob.core.windows.net/{fqn.container}/{fqn.path}"
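Given the f-string above, the new helper behaves as follows (account and container names are hypothetical, and adls:// strings are assumed to parse here as they do elsewhere in the package):

from thds.adls.uri import to_blob_windows_url

assert (
    to_blob_windows_url("adls://mysa/mycontainer/some/path.txt")
    == "https://mysa.blob.core.windows.net/mycontainer/some/path.txt"
)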
thds_adls-4.1.20250701001205.dist-info/METADATA → thds_adls-4.1.20250702194306.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: thds.adls
- Version: 4.1.20250701001205
+ Version: 4.1.20250702194306
  Summary: ADLS tools
  Author-email: Trilliant Health <info@trillianthealth.com>
  License: MIT
thds_adls-4.1.20250702194306.dist-info/RECORD ADDED
@@ -0,0 +1,42 @@
+ thds/adls/__init__.py,sha256=g2Zb0EAAH-JzPMYHAub9liU4qa5pfqQDnILfEhmObGo,1036
+ thds/adls/_progress.py,sha256=ZzCHn_G7nHakioNFxdvoJZRr-jN6ymsp5JXf-iReROM,6580
+ thds/adls/_upload.py,sha256=mhTdWiQroaugYuwQg7R8CEgdfCYF4xvJthlsqO0jlnE,4692
+ thds/adls/abfss.py,sha256=ZRJOLjDuXmS4bIbQAQpQxWWWeu74N9NKEKCNfXQek80,726
+ thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
+ thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
+ thds/adls/copy.py,sha256=jUWbGvTpb4B3yRGS0nhGSbDzqRPzUqYgH0z1lFRJB3k,6365
+ thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
+ thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
+ thds/adls/download.py,sha256=HzmhHM0FAmxtCRkK9M7NajsIzIuHD74GuxP3dyLoP1Q,18266
+ thds/adls/download_lock.py,sha256=tgT48l4C5_qmArGeq05gl7VlxT22dZBH2Xwxx0itE9o,3176
+ thds/adls/errors.py,sha256=6cLg2E4SB8ic46PBzA3ynRH4b1oR8qRb07RBgKGJRxY,1783
+ thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
+ thds/adls/file_properties.py,sha256=C9Kl3a5wuBNWYgZYnZbkH04u8uxadEcjVJIm3UevUM0,1912
+ thds/adls/fqn.py,sha256=0zHmHhBWN7GEfKRB3fBC1NVhaiIHHifBdCRanyT01X8,5822
+ thds/adls/global_client.py,sha256=f4VJw5y_Yh__8gQUcdSYTh1aU6iEPlauMchVirSAwDQ,3716
+ thds/adls/hashes.py,sha256=RDQS-C38wskUhxXGFGLJ4ox8vm7ofurxSsUk13Ywijo,5309
+ thds/adls/impl.py,sha256=4rZAGlhU_UojPy1FC7j3KEFIj6BWSbCDAVV1FCyki3s,42496
+ thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
+ thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
+ thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
+ thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
+ thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
+ thds/adls/source.py,sha256=8HVMYuxDn1XYGwFFSBowMlvQ6r2Jm2CQlpu4h85JvsE,2559
+ thds/adls/source_tree.py,sha256=yP_v2XrKxXqUOdZ-x8kqHhBFAuur3AlAq3zi4hHj4AE,2235
+ thds/adls/upload.py,sha256=MRHK9Am-x5FKBPh1SXLTbPC1r0Xk0bGWNU8CcNuUMLo,6602
+ thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
+ thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
+ thds/adls/azcopy/download.py,sha256=8shLbizgKr5WLmOitQ8TY28EVj2IdT7iSRmRgqFNLAg,6008
+ thds/adls/azcopy/login.py,sha256=923UaewVMPFzkDSgCQsbl-_g7qdFhpXpF0MGNIy3T_A,1538
+ thds/adls/azcopy/progress.py,sha256=K7TVmSiWfu561orL3GuOnlQX9VtVxWVECAq9NiweYNo,1387
+ thds/adls/azcopy/system_resources.py,sha256=okgDEKAp0oWGQF7OKikbgJ9buBeiOgNaDYy-36j6dHo,761
+ thds/adls/azcopy/upload.py,sha256=bvtYdbaFsZkOHFLDpeBlTKqw63P3_kbImInI04ZlekM,2601
+ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1587
+ thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
+ thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
+ thds_adls-4.1.20250702194306.dist-info/METADATA,sha256=zgZubxCu37Sqrjn7b5NTJNlPxZbRgbcx-omuFBiVfMg,587
+ thds_adls-4.1.20250702194306.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ thds_adls-4.1.20250702194306.dist-info/entry_points.txt,sha256=uTqreT1AIwqJboMfLv5w6sviM8mNbAkln765gIjzoA4,152
+ thds_adls-4.1.20250702194306.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+ thds_adls-4.1.20250702194306.dist-info/RECORD,,