thds.adls 3.1.20250204215147__py3-none-any.whl → 3.1.20250205222211__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.adls might be problematic. Click here for more details.

thds/adls/__init__.py CHANGED
@@ -2,6 +2,7 @@ from thds.core import meta
2
2
 
3
3
  from . import abfss, defaults, etag, fqn, named_roots, resource, source, uri # noqa: F401
4
4
  from .cached_up_down import download_directory, download_to_cache, upload_through_cache # noqa: F401
5
+ from .copy import copy_file, copy_files, wait_for_copy # noqa: F401
5
6
  from .errors import BlobNotFoundError # noqa: F401
6
7
  from .fqn import * # noqa: F401,F403
7
8
  from .global_client import get_global_client, get_global_fs_client # noqa: F401
thds/adls/conf.py CHANGED
@@ -31,3 +31,12 @@ UPLOAD_FILE_MAX_CONCURRENCY = config.item("upload_file_max_concurrency", 10, par
31
31
  UPLOAD_CHUNK_SIZE = config.item("upload_chunk_size", 2**20 * 100, parse=int) # 100 MB
32
32
 
33
33
  CONNECTION_TIMEOUT = config.item("connection_timeout", 2000, parse=int) # seconds
34
+
35
+ # below are for SAS tokens
36
+ _SAS_EXPIRY_DEFAULT = 900 # seconds
37
+ # 15 minutes seems like a decent default for the time it would take to complete a SAS-requiring operation
38
+ USER_DELEGATION_KEY_EXPIRY = config.ConfigItem(
39
+ "users_delegation_key_expiry", _SAS_EXPIRY_DEFAULT, parse=int
40
+ ) # seconds
41
+ # above key needs to remain valid for signed SAS tokens to remain valid
42
+ BLOB_SAS_EXPIRY = config.ConfigItem("blob_sas_expiry", _SAS_EXPIRY_DEFAULT, parse=int) # seconds
thds/adls/copy.py ADDED
@@ -0,0 +1,183 @@
1
+ """Functions for copying blobs across remote locations."""
2
+ import datetime
3
+ import random
4
+ import time
5
+ import typing as ty
6
+
7
+ from azure.storage.blob import BlobSasPermissions, BlobServiceClient, UserDelegationKey
8
+
9
+ from thds.core import cache, log, parallel, thunks
10
+
11
+ from .file_properties import exists, get_blob_properties, get_file_properties, is_directory
12
+ from .fqn import AdlsFqn
13
+ from .global_client import get_global_blob_container_client, get_global_blob_service_client
14
+ from .sas_tokens import gen_blob_sas_token, get_user_delegation_key
15
+ from .uri import UriIsh, parse_any
16
+
17
+ logger = log.getLogger(__name__)
18
+
19
+ _WAIT_TIMEOUT = 300 # seconds
20
+ # purposely set shorter than the SAS token expiry, as this is a reasonable amount of time for a human to wait
21
+ # rather than a reasonable amount of time for a file to finish copying.
22
+ # the functions using this constant use it as a default argument, so it can be changed as needed.
23
+
24
+
25
+ OverwriteMethod = ty.Literal["error", "warn", "skip", "silent"]
26
+
27
+ CopyRequest = ty.Dict[str, ty.Union[str, datetime.datetime]]
28
+
29
+
30
+ class CopyInfo(ty.NamedTuple):
31
+ src: AdlsFqn
32
+ dest: AdlsFqn
33
+ request: CopyRequest
34
+
35
+ @property
36
+ def copy_occurred(self) -> bool:
37
+ return bool(self.request)
38
+
39
+
40
+ def _copy_file(
41
+ src: AdlsFqn,
42
+ dest: AdlsFqn,
43
+ account_key: ty.Union[str, UserDelegationKey],
44
+ overwrite_method: OverwriteMethod = "error",
45
+ ) -> CopyInfo:
46
+ if not exists(src):
47
+ raise ValueError(f"{src} does not exist!")
48
+
49
+ if is_directory(get_file_properties(src)):
50
+ raise ValueError(f"{src} is a directory!")
51
+
52
+ src_blob_client = get_global_blob_container_client(src.sa, src.container).get_blob_client(src.path)
53
+ dest_blob_client = get_global_blob_container_client(dest.sa, dest.container).get_blob_client(
54
+ dest.path
55
+ )
56
+
57
+ def md5s_exist_and_are_equal() -> bool:
58
+ if (
59
+ src_md5 := src_blob_client.get_blob_properties().content_settings.content_md5
60
+ ) and src_md5 == dest_blob_client.get_blob_properties().content_settings.content_md5:
61
+ return True
62
+ return False
63
+
64
+ if dest_blob_client.exists():
65
+ if md5s_exist_and_are_equal():
66
+ # no point in copying if the files are the same
67
+ logger.info(
68
+ "%s already exists with the same md5 as the file at %s, no copy will occur", dest, src
69
+ )
70
+ return CopyInfo(src, dest, dict())
71
+
72
+ logger.info(
73
+ "%s already exists but it and/or the file at %s have unknown md5s, or the md5s differ. "
74
+ "The overwrite method provided will determine the copy behavior.",
75
+ dest,
76
+ src,
77
+ )
78
+
79
+ if overwrite_method == "error":
80
+ raise ValueError(f"{dest} already exists!")
81
+ elif overwrite_method == "warn":
82
+ logger.warning("%s will be overwritten with the file from %s", dest, src)
83
+ elif overwrite_method == "skip":
84
+ logger.warning("%s already exists, skipping copy from %s", dest, src)
85
+ return CopyInfo(src, dest, dict())
86
+
87
+ sas_token = gen_blob_sas_token(
88
+ src, account_key=account_key, permissions=BlobSasPermissions(read=True)
89
+ )
90
+ # we may not always need a SAS token for the copy, but it would be hard to reason about when they are(n't)
91
+
92
+ logger.info("Copying %s to %s...", src, dest)
93
+ return CopyInfo(
94
+ src, dest, dest_blob_client.start_copy_from_url(f"{src_blob_client.url}?{sas_token}")
95
+ )
96
+
97
+
98
+ def wait_for_copy(fqn: AdlsFqn, timeout: int = _WAIT_TIMEOUT) -> AdlsFqn:
99
+ """Set timeout to 0 or a negative int to wait indefinitely.
100
+
101
+ Keep in mind that, if the copy is authorized using a SAS token, that token expiry exists separate from the wait
102
+ timeout.
103
+ """
104
+ start_time = time.monotonic()
105
+
106
+ while (time.monotonic() - start_time < timeout) if timeout > 0 else True:
107
+ blob_copy_props = get_blob_properties(fqn).copy
108
+ # blob copy properties actually show copy progress so we could add a progress bar
109
+
110
+ if blob_copy_props.status == "success":
111
+ return fqn
112
+ elif blob_copy_props.status is None:
113
+ raise ValueError(f"{fqn} is not an a pending or completed copy.")
114
+ elif blob_copy_props.status != "pending":
115
+ raise ValueError(
116
+ f"The copy to {fqn} failed with the status: {blob_copy_props.status}. "
117
+ f"See blob copy properties: {blob_copy_props}"
118
+ )
119
+
120
+ time.sleep(random.uniform(1, 3))
121
+
122
+ raise TimeoutError(
123
+ f"Copying to {fqn} did not finish within {timeout} seconds. It may still be copying."
124
+ )
125
+
126
+
127
+ def copy_file(
128
+ src: UriIsh,
129
+ dest: UriIsh,
130
+ overwrite_method: OverwriteMethod = "error",
131
+ wait: bool = True,
132
+ timeout: int = _WAIT_TIMEOUT,
133
+ get_account_key: ty.Callable[
134
+ [BlobServiceClient], ty.Union[str, UserDelegationKey]
135
+ ] = get_user_delegation_key,
136
+ ) -> CopyInfo:
137
+ src_fqn = parse_any(src)
138
+ dest_fqn = parse_any(dest)
139
+
140
+ func = thunks.thunking(_copy_file)(
141
+ src_fqn,
142
+ dest_fqn,
143
+ account_key=get_account_key(get_global_blob_service_client(src_fqn.sa)),
144
+ overwrite_method=overwrite_method,
145
+ )
146
+
147
+ if wait:
148
+ copy_info = func()
149
+ wait_for_copy(dest_fqn, timeout=timeout)
150
+ logger.info("Finished copying %s to %s", src_fqn, dest_fqn)
151
+ return copy_info
152
+
153
+ return func()
154
+
155
+
156
+ def copy_files(
157
+ src_dest_pairs: ty.Collection[ty.Tuple[UriIsh, UriIsh]],
158
+ overwrite_method: OverwriteMethod = "error",
159
+ wait: bool = True,
160
+ timeout: int = _WAIT_TIMEOUT,
161
+ get_account_key: ty.Callable[
162
+ [BlobServiceClient], ty.Union[str, UserDelegationKey]
163
+ ] = get_user_delegation_key,
164
+ ) -> ty.List[CopyInfo]:
165
+ src_dest_fqn_pairs = [(parse_any(src), parse_any(dest)) for src, dest in src_dest_pairs]
166
+ get_account_key = cache.locking(get_account_key)
167
+
168
+ def copy_wrapper(src: AdlsFqn, dest: AdlsFqn) -> CopyInfo:
169
+ return copy_file(
170
+ src,
171
+ dest,
172
+ overwrite_method=overwrite_method,
173
+ wait=wait,
174
+ timeout=timeout,
175
+ get_account_key=get_account_key,
176
+ )
177
+
178
+ # would be cool to do this async, but using threads for quicker dev
179
+ return list(
180
+ parallel.yield_results(
181
+ [thunks.thunking(copy_wrapper)(src, dest) for src, dest in src_dest_fqn_pairs]
182
+ )
183
+ )
@@ -1,7 +1,10 @@
1
+ from azure.core.exceptions import AzureError, ResourceNotFoundError
2
+ from azure.storage.blob import BlobProperties
1
3
  from azure.storage.filedatalake import FileProperties
2
4
 
5
+ from .errors import translate_azure_error
3
6
  from .fqn import AdlsFqn
4
- from .global_client import get_global_fs_client
7
+ from .global_client import get_global_blob_container_client, get_global_fs_client
5
8
 
6
9
 
7
10
  def is_directory(info: FileProperties) -> bool:
@@ -11,3 +14,28 @@ def is_directory(info: FileProperties) -> bool:
11
14
 
12
15
  def get_file_properties(fqn: AdlsFqn) -> FileProperties:
13
16
  return get_global_fs_client(fqn.sa, fqn.container).get_file_client(fqn.path).get_file_properties()
17
+
18
+
19
+ def get_blob_properties(fqn: AdlsFqn) -> BlobProperties:
20
+ return (
21
+ get_global_blob_container_client(fqn.sa, fqn.container)
22
+ .get_blob_client(fqn.path)
23
+ .get_blob_properties()
24
+ )
25
+
26
+
27
+ # At some point it may make sense to separate file and blob property modules,
28
+ # but they also are very closely tied together. AFAIK all files are blobs, and given our usage of ADLS,
29
+ # I don't know if we ever deal with things that are blobs but not files.
30
+
31
+
32
+ def exists(fqn: AdlsFqn) -> bool:
33
+ try:
34
+ get_blob_properties(fqn)
35
+ # could generally use `get_file_properties` interchangeably,
36
+ # but blobs are a lower-level primitive than the ADLS file abstraction
37
+ return True
38
+ except ResourceNotFoundError:
39
+ return False
40
+ except AzureError as err:
41
+ translate_azure_error(get_global_fs_client(fqn.sa, fqn.container), fqn.path, err)
@@ -58,12 +58,9 @@ get_global_client = cache.locking(adls_fs_client)
58
58
  get_global_fs_client = get_global_client
59
59
 
60
60
 
61
- def adls_blob_container_client(
62
- storage_account: str, container: str, connpool_size: int = DEFAULT_CONNECTION_POOL_SIZE()
63
- ) -> ContainerClient:
64
- """This seems to support an atomic write operation,
65
- which is in theory going to be faster than two separate network requests.
66
- """
61
+ def adls_blob_service_client(
62
+ storage_account: str, connpool_size: int = DEFAULT_CONNECTION_POOL_SIZE()
63
+ ) -> BlobServiceClient:
67
64
  return BlobServiceClient(
68
65
  account_url=f"https://{storage_account}.blob.core.windows.net",
69
66
  credential=SharedCredential(),
@@ -72,7 +69,19 @@ def adls_blob_container_client(
72
69
  max_chunk_get_size=conf.MAX_CHUNK_GET_SIZE(), # for downloads
73
70
  max_block_size=conf.MAX_BLOCK_SIZE(), # for uploads
74
71
  max_single_put_size=conf.MAX_SINGLE_PUT_SIZE(), # for_uploads
75
- ).get_container_client(container)
72
+ )
73
+
74
+
75
+ get_global_blob_service_client = cache.locking(adls_blob_service_client)
76
+
77
+
78
+ def adls_blob_container_client(
79
+ storage_account: str, container: str, connpool_size: int = DEFAULT_CONNECTION_POOL_SIZE()
80
+ ) -> ContainerClient:
81
+ """This seems to support an atomic write operation,
82
+ which is in theory going to be faster than two separate network requests.
83
+ """
84
+ return get_global_blob_service_client(storage_account, connpool_size).get_container_client(container)
76
85
 
77
86
 
78
87
  get_global_blob_container_client = cache.locking(adls_blob_container_client)
thds/adls/meta.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
- "git_commit": "3cf733adc3c92ae7ff6ebd4bfbeea9d2de71f673",
2
+ "git_commit": "33bfc2c90d8350529a8067c6bf5f6115a35fd8de",
3
3
  "git_branch": "main",
4
4
  "git_is_clean": true,
5
- "pyproject_version": "3.1.20250204215147",
5
+ "pyproject_version": "3.1.20250205222211",
6
6
  "thds_user": "runner",
7
7
  "misc": {}
8
8
  }
@@ -0,0 +1,61 @@
1
+ import datetime
2
+ import typing as ty
3
+
4
+ from azure.storage.blob import (
5
+ BlobSasPermissions,
6
+ BlobServiceClient,
7
+ UserDelegationKey,
8
+ generate_blob_sas,
9
+ )
10
+
11
+ from .conf import BLOB_SAS_EXPIRY, USER_DELEGATION_KEY_EXPIRY
12
+ from .fqn import AdlsFqn
13
+
14
+
15
+ def get_user_delegation_key(
16
+ blob_service_client: BlobServiceClient,
17
+ expiry: ty.Union[int, ty.Callable[[], int]] = USER_DELEGATION_KEY_EXPIRY,
18
+ ) -> UserDelegationKey:
19
+ """For using as an account key to generate SAS tokens.
20
+
21
+ This key must remain valid for the SAS tokens it signs to remain valid.
22
+
23
+ `expiry` is in seconds.
24
+ """
25
+ now = datetime.datetime.now(datetime.timezone.utc)
26
+
27
+ return blob_service_client.get_user_delegation_key(
28
+ key_start_time=now,
29
+ key_expiry_time=now + datetime.timedelta(seconds=expiry() if callable(expiry) else expiry),
30
+ )
31
+
32
+
33
+ def gen_blob_sas_token(
34
+ fqn: AdlsFqn,
35
+ account_key: ty.Union[str, UserDelegationKey],
36
+ permissions: BlobSasPermissions,
37
+ expiry: ty.Union[int, ty.Callable[[], int]] = BLOB_SAS_EXPIRY,
38
+ ) -> str:
39
+ """Generates a SAS token for the blob present at the `fqn`.
40
+
41
+ If `account_key` is a `UserDelegationKey`, the expiry will be set to the `UserDelegationKey`'s expiry,
42
+ as when that key expires, the SAS tokens it signs will also expire.
43
+
44
+ `expiry` is in seconds.
45
+ """
46
+ if isinstance(account_key, UserDelegationKey):
47
+ expiry_datetime = account_key.signed_expiry
48
+ else:
49
+ expiry_datetime = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(
50
+ seconds=expiry() if callable(expiry) else expiry
51
+ )
52
+
53
+ return generate_blob_sas(
54
+ fqn.sa,
55
+ fqn.container,
56
+ fqn.path,
57
+ account_key=account_key, # type: ignore[arg-type]
58
+ # the Azure SDK has too restrictive a type here
59
+ permission=permissions,
60
+ expiry=expiry_datetime,
61
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: thds.adls
3
- Version: 3.1.20250204215147
3
+ Version: 3.1.20250205222211
4
4
  Summary: ADLS tools
5
5
  Author: Trilliant Health
6
6
  Description-Content-Type: text/markdown
@@ -1,24 +1,26 @@
1
- thds/adls/__init__.py,sha256=20oVbjjKkXWMsuhZo9I5NxQDUl5fL4r10hD-jIKPjeM,715
1
+ thds/adls/__init__.py,sha256=Tc23Gy292GO7hC8qwxiAnVE3jGqLcEjHed2MKlSSpvI,784
2
2
  thds/adls/_progress.py,sha256=SVOiVRvfvOfUjp-y-UUgA6n0FhAnpKesxltaQ16kmOk,6394
3
3
  thds/adls/_upload.py,sha256=q6Sk0CRnNcAjUOUPiBj4CfO4tJD196SQY0lT25CTSE4,4364
4
4
  thds/adls/abfss.py,sha256=ZRJOLjDuXmS4bIbQAQpQxWWWeu74N9NKEKCNfXQek80,726
5
5
  thds/adls/cached_up_down.py,sha256=qTMrsHHrDq3YZ0gaLMFIPnGNraz0ctG9yoFd3HCVit4,1896
6
- thds/adls/conf.py,sha256=GmVO1L8nEvC0J0Tv7gUfC7II3yHZkejIddXEDPcCLM4,1522
6
+ thds/adls/conf.py,sha256=q1SPrgb46NpobVzwt_Oyv71-BvsIbZLq9nRWS3LZjz0,1990
7
+ thds/adls/copy.py,sha256=jUWbGvTpb4B3yRGS0nhGSbDzqRPzUqYgH0z1lFRJB3k,6365
7
8
  thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
8
9
  thds/adls/defaults.py,sha256=VxyaGz1gCcz2rGTR9acNRybbxGOYhTldvD_-SdGLBh0,652
9
10
  thds/adls/download.py,sha256=Bw4caFGNq5QD9TMzSOVCNV5HVcQsW28WlNDhNItbb0U,16461
10
11
  thds/adls/download_lock.py,sha256=_JZj-kjCUfHk9FvrmEuYpJYknmbam5eReFhGNDgzdLQ,2520
11
12
  thds/adls/errors.py,sha256=IQfkdLVg0feAUd-xQzkS4-3Lp5_Ld9V9-SqiXeWLRtA,1335
12
13
  thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
13
- thds/adls/file_properties.py,sha256=i3O2VNyAsySRcZwc9Si-V4r0TPdFAbVblZ6Nl_b3KfE,506
14
+ thds/adls/file_properties.py,sha256=JFkobkxcAaRGAh3TJ9mfc3X872gqrGuw4Xk_HSTj1sY,1582
14
15
  thds/adls/fqn.py,sha256=pGPBAWKrXorXs1DGkH-XXeDgwwQk3S_heQE7VkPAbW4,5598
15
- thds/adls/global_client.py,sha256=kl1miL5W7r0k6P0_E3U__zuIEfBbMQlURCokjoli0Rs,3433
16
+ thds/adls/global_client.py,sha256=f4VJw5y_Yh__8gQUcdSYTh1aU6iEPlauMchVirSAwDQ,3716
16
17
  thds/adls/impl.py,sha256=x1nSqc8W4NeuX8-JGOp2MRkK8ff6GnelTWedGxPs-qY,42494
17
18
  thds/adls/md5.py,sha256=qOX4_7WUj1QkbH_IwREcQNHvvZccOj-HpHZBfsKn1gY,1846
18
- thds/adls/meta.json,sha256=khEuo_U7-2O_vNDBQrDYJgsyGGw16NlYe4UgaHCzvuA,195
19
+ thds/adls/meta.json,sha256=orDAfHUVcBKGEXl3Q-KNeW71DtfLtvKSp8mipWILANA,195
19
20
  thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
20
21
  thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
22
  thds/adls/ro_cache.py,sha256=F0uXol0t95mcRuBukNg3A7wt7XXQxpD5Sy09d9sl8f0,4825
23
+ thds/adls/sas_tokens.py,sha256=tO7uUh5EVp2F_NITzz1Vks78KCZwVxT7C0faLj8UJqA,1889
22
24
  thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
23
25
  thds/adls/source.py,sha256=1JYliDqafqpLewOQ2XMKp4T2WD7b5Rl7XYViyLYtzX8,2437
24
26
  thds/adls/uri.py,sha256=pDH956p_VEHnjLLUnjWY6sGgRqksp9gdpc9KOW4gEP0,1205
@@ -29,8 +31,8 @@ thds/adls/resource/up_down.py,sha256=3uNlTvm2gVhSyYdQTBwsGecOgwtINQfINckR-awwV0Y
29
31
  thds/adls/tools/download.py,sha256=ZZ8t1g9MRRy-aENGjr10E4aAqyVkKq2OqpyH70pCD_8,965
30
32
  thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
31
33
  thds/adls/tools/upload.py,sha256=eMk4pdug1aCMPDDWpIE3Zoq77i5APp9Uuh-sVCCDNJE,493
32
- thds.adls-3.1.20250204215147.dist-info/METADATA,sha256=7wYzQn1dTM8JLN4lGYyBGr5jZZPNf3jwEl3Y89vW0q8,397
33
- thds.adls-3.1.20250204215147.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
34
- thds.adls-3.1.20250204215147.dist-info/entry_points.txt,sha256=uTqreT1AIwqJboMfLv5w6sviM8mNbAkln765gIjzoA4,152
35
- thds.adls-3.1.20250204215147.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
36
- thds.adls-3.1.20250204215147.dist-info/RECORD,,
34
+ thds.adls-3.1.20250205222211.dist-info/METADATA,sha256=pRGWlQvbXlQGBSdYpf_AZRl3KpRCXAY-Jifi5sbf9pw,397
35
+ thds.adls-3.1.20250205222211.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
36
+ thds.adls-3.1.20250205222211.dist-info/entry_points.txt,sha256=uTqreT1AIwqJboMfLv5w6sviM8mNbAkln765gIjzoA4,152
37
+ thds.adls-3.1.20250205222211.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
38
+ thds.adls-3.1.20250205222211.dist-info/RECORD,,