thds.adls-3.0.20250116223841-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.adls might be problematic.

thds/adls/shared_credential.py ADDED
@@ -0,0 +1,107 @@
+ import json
+ import logging
+ import os
+ import platform
+ import random
+ import threading
+ import time
+ from pathlib import Path
+ from typing import Dict
+
+ from azure.core.credentials import AccessToken, TokenCredential
+ from azure.identity import AzureCliCredential, DefaultAzureCredential
+ from azure.identity._constants import EnvironmentVariables
+
+ from thds.core import config, files, log
+ from thds.core.lazy import Lazy
+
+ # suppress noisy Azure identity logs
+ logging.getLogger("azure.identity").setLevel(logging.WARNING)
+ logger = log.getLogger(__name__)
+ DISABLE_FAST_CACHED_CREDENTIAL = config.item("disable_fast", default=False, parse=config.tobool)
+
+
+ class ThreadSafeAzureCredential(DefaultAzureCredential):
+     """This is a workaround for how terrible the Azure SDK is.
+
+     They do _not_ play nicely with many threads all trying to load the
+     credentials at once. Thankfully we can just shim in there and make
+     sure that we're not making thousands of parallel requests to the
+     refresh token API...
+     """
+
+     def __init__(self, *args, **kwargs):
+         self.lock = threading.Lock()
+         super().__init__(*args, **kwargs)
+
+     def get_token(self, *args, **kwargs):
+         with self.lock:
+             return super().get_token(*args, **kwargs)
+
+     def close(self):
+         with self.lock:
+             return super().close()
+
+
+ _CACHED_TOKEN_PATH = "~/.azure/thds-cache-azure-cli-tokens"
+
+
+ class FastCachedAzureCliCredential(AzureCliCredential):
+     # AzureCliCredential only allows a single scope per token, so even though
+     # the abstract concept might name *args as *scopes, for our implementation
+     # it would be an error to try to fetch more than one scope at a time anyway.
+     def get_token(self, scope: str, *args, **kwargs):
+         cached_path = Path(os.path.expanduser(_CACHED_TOKEN_PATH)) / scope.replace("/", "|")
+         # the | (pipe) is just to make the local file paths not-weird.
+         # i chose it essentially at random, and while technically this is a form of compression,
+         # I very much doubt any collisions are possible in the real set of scopes Azure offers.
+         try:
+             token_dict = json.loads(open(cached_path).read())
+             expires_on = token_dict["expires_on"]
+             if expires_on - random.randint(50, 150) < time.time():
+                 # we conservatively grab a new token even if we're within 50 to 150 seconds
+                 # _before_ the current token expires.
+                 # The randomness helps avoid a whole bunch of processes all grabbing it at once,
+                 # assuming there's a _lot_ of Azure activity going on in parallel.
+                 # There is no correctness concern with this; just optimizations.
+                 raise ValueError("Expired Token")
+             return AccessToken(**token_dict)
+         except (FileNotFoundError, TypeError, KeyError, ValueError):
+             fresh_token = super().get_token(scope, *args, **kwargs)
+             # we write atomically b/c we know other processes may be reading or writing
+             # this file. it's really just a best practice for almost all file writes
+             # where there's any chance of other readers or writers.
+             with files.atomic_write_path(cached_path) as wp, open(wp, "w") as wf:
+                 wf.write(json.dumps(fresh_token._asdict()))
+             return fresh_token
+         except Exception as e:
+             logger.exception(f"failed to get fast credential: {e}")
+             raise
+
+
+ def _has_workload_identity_creds() -> bool:
+     workload_identity_vars = [
+         EnvironmentVariables.AZURE_TENANT_ID,
+         EnvironmentVariables.AZURE_CLIENT_ID,
+         EnvironmentVariables.AZURE_FEDERATED_TOKEN_FILE,
+     ]
+     return all(var in os.environ for var in workload_identity_vars)
+
+
+ def get_credential_kwargs() -> Dict[str, bool]:
+     if _has_workload_identity_creds():
+         # in K8s, we use various forms of credentials, but not the EnvironmentCredential,
+         # and that one gets tried early and then warns us about something we don't care about.
+         return dict(exclude_environment_credential=True)
+     # exclusion due to storage explorer credentials issue when trying to hit East SAs
+     return dict(exclude_shared_token_cache_credential=True)
+
+
+ def _SharedCredential() -> TokenCredential:
+     if platform.system() == "Darwin" and not DISABLE_FAST_CACHED_CREDENTIAL():
+         # only try this crazy optimization on our local laptops
+         return FastCachedAzureCliCredential()  # type: ignore
+     return ThreadSafeAzureCredential(**get_credential_kwargs())
+
+
+ SharedCredential = Lazy(_SharedCredential)
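
For orientation, a minimal sketch of how this lazily constructed credential might be consumed, assuming `Lazy` makes the wrapped factory callable and memoizes its result (that class is not part of this diff). The storage account name is a placeholder; `DataLakeServiceClient` comes from the azure-storage-file-datalake dependency declared in the package metadata below.

    from azure.storage.filedatalake import DataLakeServiceClient

    from thds.adls.shared_credential import SharedCredential

    # Assumed behavior: calling the Lazy wrapper builds the credential on first
    # use and returns the same TokenCredential on every subsequent call.
    credential = SharedCredential()

    # Hypothetical ADLS Gen2 account URL; any Gen2 endpoint works the same way.
    client = DataLakeServiceClient(
        account_url="https://myaccount.dfs.core.windows.net", credential=credential
    )
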
thds/adls/source.py ADDED
@@ -0,0 +1,66 @@
+ import base64
+ import typing as ty
+ from functools import partial
+ from pathlib import Path
+
+ from thds.core import source
+ from thds.core.hashing import Hash, b64
+
+ from .cached_up_down import download_to_cache
+ from .fqn import AdlsFqn
+ from .resource import AdlsHashedResource
+ from .uri import resolve_any, resolve_uri
+
+
+ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader]:
+     fqn = resolve_uri(uri)
+     if not fqn:
+         return None
+
+     def download(hash: ty.Optional[Hash]) -> Path:
+         assert fqn
+         if hash and hash.algo == "md5":
+             # this 'extra' check just allows us to short-circuit a download
+             # where the hash at this URI is known not to match what we expect.
+             # It's no safer than the non-md5 hash check that Source performs after download.
+             return download_to_cache(fqn, b64(hash.bytes))
+
+         # we don't validate this hash, because we already have md5 validation
+         # happening inside the download_to_cache function. the Source hash
+         # is actually mostly for use by systems that want to do content addressing,
+         # and not necessarily intended to be a runtime check in all scenarios.
+         return download_to_cache(fqn)
+
+     return download
+
+
+ source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
+
+
+ def from_adls(
+     uri_or_fqn_or_ahr: ty.Union[str, AdlsFqn, AdlsHashedResource], hash: ty.Optional[Hash] = None
+ ) -> source.Source:
+     """Flexible, public interface to creating Sources from any ADLS-like reference.
+
+     Does NOT automatically fetch an MD5 hash from the ADLS URI if it's not provided. If
+     you know you want to include that, combine this with `resource.get`:
+     `source.from_adls(resource.get(uri))`
+     """
+     if isinstance(uri_or_fqn_or_ahr, AdlsHashedResource):
+         fqn = uri_or_fqn_or_ahr.fqn
+         res_hash = Hash("md5", base64.b64decode(uri_or_fqn_or_ahr.md5b64))
+         if hash and hash != res_hash:
+             raise ValueError(f"Resource Hash mismatch for {fqn}: {hash} != {res_hash}")
+         hash = res_hash
+     else:
+         r_fqn = resolve_any(uri_or_fqn_or_ahr)
+         if not r_fqn:
+             raise ValueError(f"Could not resolve {uri_or_fqn_or_ahr} to an ADLS FQN")
+         fqn = r_fqn
+
+     return source.Source(str(fqn), hash)
+
+
+ source.register_from_uri_handler(
+     "thds.adls", lambda uri: partial(from_adls, uri) if resolve_uri(uri) else None
+ )
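
As a usage sketch (not taken from the package itself), `from_adls` accepts any of the reference types above; the account, container, and blob names below are placeholders, and the `adls://` form is assumed to be what `fqn.parse` understands (see uri.py later in this diff).

    from thds.adls.source import from_adls

    # Hypothetical URI; an AdlsFqn or an AdlsHashedResource would also be accepted.
    src = from_adls("adls://myaccount/mycontainer/data/example.parquet")

    # The returned thds.core Source carries the stringified FQN plus an optional
    # md5 Hash, which the download handler registered above forwards to
    # download_to_cache when present.
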
thds/adls/tools/download.py ADDED
@@ -0,0 +1,35 @@
+ import argparse
+ from pathlib import Path
+
+ from thds.adls.cached_up_down import download_directory, download_to_cache
+ from thds.adls.file_properties import get_file_properties, is_directory
+ from thds.adls.uri import resolve_uri
+ from thds.core.link import link
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("uri", type=resolve_uri, help="A fully qualified path to an ADLS location")
+     parser.add_argument(
+         "--copy-to",
+         "-c",
+         type=Path,
+         help="This will create a link to the cached download at the specified location",
+     )
+
+     args = parser.parse_args()
+
+     if is_directory(get_file_properties(args.uri)):
+         cache_path = download_directory(args.uri)
+     else:
+         cache_path = download_to_cache(args.uri)
+
+     if args.copy_to:
+         link(cache_path, args.copy_to)
+         print(args.copy_to.resolve())
+     else:
+         print(cache_path.resolve())
+
+
+ if __name__ == "__main__":
+     main()
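
The same cached-download machinery the tool uses can be called directly; a small sketch under the assumption that `download_to_cache` accepts the AdlsFqn produced by `resolve_uri` (as it does above) and returns a local pathlib.Path. The URI is a placeholder.

    from thds.adls.cached_up_down import download_to_cache
    from thds.adls.uri import resolve_uri

    # resolve_uri returns None for strings it cannot interpret, so guard it.
    fqn = resolve_uri("adls://myaccount/mycontainer/reports/2024.csv")
    assert fqn is not None
    local_path = download_to_cache(fqn)
    print(local_path.resolve())
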
thds/adls/tools/ls.py ADDED
@@ -0,0 +1,38 @@
+ import argparse
+
+ from thds.adls import ADLSFileSystem
+ from thds.adls.file_properties import get_file_properties, is_directory
+ from thds.adls.uri import resolve_uri
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("uri", type=resolve_uri, help="A fully qualified path to an ADLS location")
+     parser.add_argument(
+         "--verbose",
+         "-v",
+         default=False,
+         action="store_true",
+         help="Verbose mode will display ADLS metadata associated with contents of the specified URI",
+     )
+
+     args = parser.parse_args()
+
+     fs = ADLSFileSystem(args.uri.sa, args.uri.container)
+
+     files_info = (
+         fs.get_directory_info(args.uri.path)
+         if is_directory(get_file_properties(args.uri))
+         else fs.get_files_info([args.uri.path])
+     )
+     for f in files_info:
+         if args.verbose:
+             print(f["name"])
+             [print(f" {key}: {value}") for key, value in sorted(f.items()) if key != "name"]
+             print("\n")
+         else:
+             print(f["name"])
+
+
+ if __name__ == "__main__":
+     main()
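
The listing can also be done programmatically, mirroring the calls the tool makes; the account and container names are placeholders, and the exact set of keys in each returned dict (beyond "name") is not shown in this diff.

    from thds.adls import ADLSFileSystem

    # Hypothetical storage account and container.
    fs = ADLSFileSystem("myaccount", "mycontainer")

    # get_directory_info yields one metadata dict per entry under the path;
    # ls.py above relies on at least a "name" key being present.
    for info in fs.get_directory_info("some/directory"):
        print(info["name"])
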
thds/adls/uri.py ADDED
@@ -0,0 +1,38 @@
+ import typing as ty
+
+ from . import abfss, dbfs, fqn
+
+ HTTPS_SCHEME = "https"
+
+ UriIsh = ty.Union[str, fqn.AdlsFqn]
+ # Could be an `AdlsFqn`, a supported URI scheme, or potentially any string.
+
+
+ def resolve_uri(uri: str) -> ty.Optional[fqn.AdlsFqn]:
+     if uri.startswith(fqn.ADLS_SCHEME):
+         return fqn.parse(uri)
+     elif uri.startswith(abfss.ABFSS_SCHEME):
+         return abfss.to_adls_fqn(uri)
+     elif uri.startswith(HTTPS_SCHEME):
+         uri = uri.replace("https://", "adls://")
+         uri = uri.replace(".blob.core.windows.net", ".dfs.core.windows.net")
+         uri = uri.replace(".dfs.core.windows.net", "")
+         return fqn.parse(uri)
+     elif uri.startswith(dbfs.DBFS_SCHEME):
+         return dbfs.to_adls_fqn(uri)
+     return None
+
+
+ def parse_uri(uri: str) -> fqn.AdlsFqn:
+     """Strict/raises error if not a known ADLS URI"""
+     if name := resolve_uri(uri):
+         return name
+     raise fqn.NotAdlsUri(uri)
+
+
+ def resolve_any(fqn_or_uri: UriIsh) -> ty.Optional[fqn.AdlsFqn]:
+     return resolve_uri(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+
+
+ def parse_any(fqn_or_uri: UriIsh) -> fqn.AdlsFqn:
+     return parse_uri(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
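
A few illustrative resolutions, with placeholder account and container names; the printed values depend on AdlsFqn's repr, which is not part of this diff. Note that the https branch rewrites both blob and dfs endpoints onto the adls scheme before parsing, so all three forms below should resolve to the same FQN.

    from thds.adls.uri import parse_uri, resolve_uri

    print(resolve_uri("adls://myaccount/mycontainer/some/path"))
    print(resolve_uri("https://myaccount.dfs.core.windows.net/mycontainer/some/path"))
    print(resolve_uri("https://myaccount.blob.core.windows.net/mycontainer/some/path"))

    # parse_uri raises fqn.NotAdlsUri instead of returning None for unknown schemes.
    try:
        parse_uri("s3://not-adls/bucket/key")
    except Exception as exc:
        print(type(exc).__name__)  # NotAdlsUri
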
thds.adls-3.0.20250116223841.dist-info/METADATA ADDED
@@ -0,0 +1,16 @@
+ Metadata-Version: 2.2
+ Name: thds.adls
+ Version: 3.0.20250116223841
+ Summary: ADLS tools
+ Author: Trilliant Health
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.8.1
+ Requires-Dist: aiostream>=0.4.5
+ Requires-Dist: azure-identity>=1.9
+ Requires-Dist: azure-storage-file-datalake>=12.6
+ Requires-Dist: filelock>=3.0
+ Requires-Dist: thds.core>=1.31
+
+ # adls Library
+
+ A port of `core.adls`.
thds.adls-3.0.20250116223841.dist-info/RECORD ADDED
@@ -0,0 +1,35 @@
+ thds/adls/__init__.py,sha256=20oVbjjKkXWMsuhZo9I5NxQDUl5fL4r10hD-jIKPjeM,715
+ thds/adls/_progress.py,sha256=SVOiVRvfvOfUjp-y-UUgA6n0FhAnpKesxltaQ16kmOk,6394
+ thds/adls/_upload.py,sha256=q6Sk0CRnNcAjUOUPiBj4CfO4tJD196SQY0lT25CTSE4,4364
+ thds/adls/abfss.py,sha256=ZRJOLjDuXmS4bIbQAQpQxWWWeu74N9NKEKCNfXQek80,726
+ thds/adls/cached_up_down.py,sha256=qTMrsHHrDq3YZ0gaLMFIPnGNraz0ctG9yoFd3HCVit4,1896
+ thds/adls/conf.py,sha256=GmVO1L8nEvC0J0Tv7gUfC7II3yHZkejIddXEDPcCLM4,1522
+ thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
+ thds/adls/defaults.py,sha256=VxyaGz1gCcz2rGTR9acNRybbxGOYhTldvD_-SdGLBh0,652
+ thds/adls/download.py,sha256=Bw4caFGNq5QD9TMzSOVCNV5HVcQsW28WlNDhNItbb0U,16461
+ thds/adls/download_lock.py,sha256=_JZj-kjCUfHk9FvrmEuYpJYknmbam5eReFhGNDgzdLQ,2520
+ thds/adls/errors.py,sha256=IQfkdLVg0feAUd-xQzkS4-3Lp5_Ld9V9-SqiXeWLRtA,1335
+ thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
+ thds/adls/file_properties.py,sha256=i3O2VNyAsySRcZwc9Si-V4r0TPdFAbVblZ6Nl_b3KfE,506
+ thds/adls/fqn.py,sha256=pGPBAWKrXorXs1DGkH-XXeDgwwQk3S_heQE7VkPAbW4,5598
+ thds/adls/global_client.py,sha256=kl1miL5W7r0k6P0_E3U__zuIEfBbMQlURCokjoli0Rs,3433
+ thds/adls/impl.py,sha256=x1nSqc8W4NeuX8-JGOp2MRkK8ff6GnelTWedGxPs-qY,42494
+ thds/adls/md5.py,sha256=qOX4_7WUj1QkbH_IwREcQNHvvZccOj-HpHZBfsKn1gY,1846
+ thds/adls/meta.json,sha256=d0EYRqOCSexOHBq31_qHQTXHzst_Bt6QEBg0GjWpbbU,218
+ thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
+ thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ thds/adls/ro_cache.py,sha256=F0uXol0t95mcRuBukNg3A7wt7XXQxpD5Sy09d9sl8f0,4825
+ thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
+ thds/adls/source.py,sha256=1JYliDqafqpLewOQ2XMKp4T2WD7b5Rl7XYViyLYtzX8,2437
+ thds/adls/uri.py,sha256=pDH956p_VEHnjLLUnjWY6sGgRqksp9gdpc9KOW4gEP0,1205
+ thds/adls/resource/__init__.py,sha256=IZ7_aRf1b3jEp7wXOxqHop0gV2gUcf9SOLeEEjIWlCU,1669
+ thds/adls/resource/core.py,sha256=uU31tAVkwDcLC_TerZWq58xi4D791vNLDUfP0Y1BAoo,2716
+ thds/adls/resource/file_pointers.py,sha256=PLru_3lwut_ZvrX5Keu-wJkPOt5o7UGf-OOT4ixaXME,2049
+ thds/adls/resource/up_down.py,sha256=3uNlTvm2gVhSyYdQTBwsGecOgwtINQfINckR-awwV0Y,9907
+ thds/adls/tools/download.py,sha256=ZZ8t1g9MRRy-aENGjr10E4aAqyVkKq2OqpyH70pCD_8,965
+ thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
+ thds.adls-3.0.20250116223841.dist-info/METADATA,sha256=kpsfheQUb9dA-EW5PlWdgInYStAnIv1VkCNrTN7NnJo,397
+ thds.adls-3.0.20250116223841.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ thds.adls-3.0.20250116223841.dist-info/entry_points.txt,sha256=-dLxohfrxO0LTHZPGLF8vVXhb05qujJbF8c4fcKqpLQ,106
+ thds.adls-3.0.20250116223841.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+ thds.adls-3.0.20250116223841.dist-info/RECORD,,
thds.adls-3.0.20250116223841.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (75.8.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
thds.adls-3.0.20250116223841.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [console_scripts]
+ adls-download-uri = thds.adls.tools.download:main
+ adls-ls-uri = thds.adls.tools.ls:main
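
Because these are console_scripts entry points, installing the wheel puts `adls-download-uri` and `adls-ls-uri` on the PATH, wired respectively to the `main()` functions in thds/adls/tools/download.py and thds/adls/tools/ls.py shown earlier in this diff.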