thds.adls 4.1.20250701001205__py3-none-any.whl → 4.1.20250701190349__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of thds.adls might be problematic.

@@ -1,77 +0,0 @@
-import json
-import typing as ty
-
-from thds.core import hashing, log, source
-from thds.core.hashing import b64
-
-from ..errors import blob_not_found_translation
-from ..file_properties import get_file_properties
-from ..fqn import AdlsFqn
-from ..md5 import check_reasonable_md5b64
-
-logger = log.getLogger(__name__)
-
-
-class AdlsHashedResource(ty.NamedTuple):
-    """See the containing package for documentation on how to use this and its motivation."""
-
-    fqn: AdlsFqn
-    md5b64: str
-
-    @property
-    def serialized(self) -> str:
-        return serialize(self)
-
-    @staticmethod
-    def of(fqn_or_uri: ty.Union[AdlsFqn, str], md5b64: str) -> "AdlsHashedResource":
-        return of(fqn_or_uri, md5b64)
-
-    @staticmethod
-    def parse(serialized_dict: str) -> "AdlsHashedResource":
-        return parse(serialized_dict)
-
-
-def of(fqn_or_uri: ty.Union[AdlsFqn, str], md5b64: str) -> AdlsHashedResource:
-    assert md5b64, "md5b64 must be non-empty"
-    fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
-    return AdlsHashedResource(fqn, md5b64)
-
-
-def from_source(source: source.Source) -> AdlsHashedResource:
-    assert source.hash, "Source must have a hash"
-    assert source.hash.algo == "md5", f"Source Hash type must be MD5! Got: {source.hash.algo}"
-    return of(source.uri, hashing.b64(source.hash.bytes))
-
-
-def to_source(resource: AdlsHashedResource) -> source.Source:
-    return source.from_uri(
-        str(resource.fqn),
-        hash=hashing.Hash("md5", hashing.db64(resource.md5b64)),
-    )
-
-
-def serialize(resource: AdlsHashedResource) -> str:
-    d = resource._asdict()
-    # we use uri instead of fqn in order to make these a more generic format
-    return json.dumps(dict(uri=str(d["fqn"]), md5b64=d["md5b64"]))
-
-
-def parse(serialized_dict: str) -> AdlsHashedResource:
-    actual_dict = json.loads(serialized_dict)
-    # accept either uri or fqn
-    uri = actual_dict["uri"] if "uri" in actual_dict else actual_dict["fqn"]
-    md5b64 = actual_dict["md5b64"]
-    check_reasonable_md5b64(md5b64)
-    return AdlsHashedResource.of(AdlsFqn.parse(uri), md5b64)
-
-
-def get(fqn_or_uri: ty.Union[AdlsFqn, str]) -> AdlsHashedResource:
-    """Creates an AdlsHashedResource from a remote-only file.
-
-    The file _must_ have a pre-existing Content MD5!
-    """
-    fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
-    with blob_not_found_translation(fqn):
-        props = get_file_properties(fqn)
-    assert props.content_settings.content_md5, "ADLS file has empty Content-MD5!"
-    return AdlsHashedResource.of(fqn, b64(props.content_settings.content_md5))
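
The removed core.py defines the pointer type itself: an (fqn, md5b64) pair serialized as a small JSON document keyed on "uri" so the format stays storage-agnostic. A self-contained sketch of that placeholder format using only the standard library; the adls:// URI and file contents below are made up for illustration:

import base64
import hashlib
import json

def md5b64_of(data: bytes) -> str:
    # serialize() stores the blob's MD5 digest base64-encoded, matching
    # Azure's Content-MD5 convention, rather than hex-encoded.
    return base64.b64encode(hashlib.md5(data).digest()).decode()

placeholder = json.dumps(
    {
        "uri": "adls://myaccount/mycontainer/data/model.bin",  # hypothetical URI
        "md5b64": md5b64_of(b"example file contents"),
    }
)
parsed = json.loads(placeholder)
uri = parsed["uri"] if "uri" in parsed else parsed["fqn"]  # parse() accepts either key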
@@ -1,54 +0,0 @@
-import os
-import typing as ty
-from pathlib import Path
-
-from thds.core.hashing import b64
-
-from ..file_properties import get_file_properties
-from .core import AdlsHashedResource, parse, serialize
-
-_AZURE_PLACEHOLDER_SIZE_LIMIT = 4096
-# it is assumed that no placeholder will ever need to be larger than 4 KB.
-
-
-def resource_from_path(path: ty.Union[str, Path]) -> AdlsHashedResource:
-    """Raises if the path does not represent a serialized AdlsHashedResource."""
-    with open(path) as maybe_json_file:
-        json_str = maybe_json_file.read(_AZURE_PLACEHOLDER_SIZE_LIMIT)
-        return parse(json_str)
-
-
-class MustCommitResourceLocally(Exception):
-    """Means you need to read the message and commit the change to the repo."""
-
-
-def _check_ci(resource_json_path: ty.Union[str, Path], resource: AdlsHashedResource):
-    if os.getenv("CI"):
-        print("\n" + resource.serialized + "\n")
-        # Because we can't modify+commit in CI, this should fail
-        # if it's recreating a resource in a CI environment.
-        raise MustCommitResourceLocally(
-            "In CI, a newly built resource cannot be recorded."
-            " However, it did successfully build and upload!"
-            f" You should paste '{resource.serialized}'"
-            f" into '{resource_json_path}' locally, and commit."
-        )
-
-
-def resource_to_path(
-    path: ty.Union[str, Path], resource: AdlsHashedResource, *, check_ci: bool = False
-) -> None:
-    if check_ci:
-        _check_ci(path, resource)
-    with open(path, "w") as json_file:
-        json_file.write(serialize(resource) + "\n")
-
-
-def validate_resource(srcfile: ty.Union[str, Path]) -> AdlsHashedResource:
-    res = resource_from_path(srcfile)
-    fqn, md5b64 = res
-    props = get_file_properties(fqn)
-    md5 = props.content_settings.content_md5
-    assert md5, f"{fqn} was incorrectly uploaded to ADLS without an MD5 embedded."
-    assert md5b64 == b64(md5), f"You probably need to update the MD5 in {srcfile}"
-    return res
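
The removed file_pointers.py turns those JSON blobs into small placeholder files (at most 4 KB) that can be committed to a repo, with _check_ci refusing to silently regenerate one in CI. A sketch of the round-trip against the removed module; this only runs with the old wheel (4.1.20250701001205), and the URI and md5b64 values are hypothetical since AdlsFqn's exact parse format lives in fqn.py:

from pathlib import Path

from thds.adls.resource.core import of
from thds.adls.resource.file_pointers import resource_from_path, resource_to_path

# Hypothetical pointer: a made-up URI plus a base64-encoded MD5 digest.
res = of("adls://myaccount/mycontainer/models/v1.bin", "rL0Y20zC+Fzt72VPzMSk2A==")
ptr = Path("model.adls.json")
resource_to_path(ptr, res)             # writes the one-line JSON placeholder
assert resource_from_path(ptr) == res  # reads back at most 4096 bytes and parses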
@@ -1,242 +0,0 @@
-"""Tools for using and creating locally-cached resources."""
-
-import typing as ty
-from pathlib import Path
-
-from azure.core.exceptions import HttpResponseError, ResourceModifiedError
-from azure.storage.blob import ContentSettings
-
-from thds.core import files, fretry, hashing, link, log, scope, tmp
-
-from .._progress import report_upload_progress
-from .._upload import metadata_for_upload, upload_decision_and_settings
-from ..conf import UPLOAD_FILE_MAX_CONCURRENCY
-from ..download import download_or_use_verified
-from ..errors import BlobNotFoundError
-from ..file_properties import get_file_properties
-from ..fqn import AdlsFqn
-from ..global_client import get_global_blob_container_client, get_global_fs_client
-from ..ro_cache import Cache, global_cache
-from .file_pointers import AdlsHashedResource, resource_from_path, resource_to_path
-
-logger = log.getLogger(__name__)
-_SLOW_CONNECTION_WORKAROUND = 14400  # seconds
-
-
-# DOWNLOAD
-def get_read_only(
-    resource: AdlsHashedResource,
-    local_path: ty.Optional[Path] = None,
-    cache: Cache = global_cache(),
-) -> Path:
-    """Downloads a read-only resource if it is not already present in
-    the cache or at the local_path.
-
-    Because the resource includes a hash, we can save a lot of
-    bandwidth if we can detect that it already is present locally.
-
-    By default, downloads through the machine-global cache. Caching
-    cannot be disabled, but the location of the cache can be changed.
-    """
-    cache = cache or global_cache()
-    local_path = local_path or cache.path(resource.fqn)
-    download_or_use_verified(
-        get_global_fs_client(resource.fqn.sa, resource.fqn.container),
-        resource.fqn.path,
-        local_path,
-        md5b64=resource.md5b64,
-        cache=cache,
-    )
-    return local_path
-
-
-# UPLOAD
-UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
-
-
-def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
-    @scope.bound
-    def _try_write_through() -> bool:
-        if isinstance(data, Path) and data.exists():
-            link.link_or_copy(data, local_cache_path, "ref")
-            return True
-        out = scope.enter(tmp.temppath_same_fs(local_cache_path))
-        if hasattr(data, "read") and hasattr(data, "seek"):
-            with open(out, "wb") as f:
-                f.write(data.read())  # type: ignore
-            data.seek(0)  # type: ignore
-            link.link_or_copy(out, local_cache_path)
-            return True
-        if isinstance(data, bytes):
-            with open(out, "wb") as f:
-                f.write(data)
-            link.link_or_copy(out, local_cache_path)
-            return True
-        return False
-
-    if _try_write_through():
-        try:
-            # it's a reflink or a copy, so the cache now owns its copy
-            # and we don't want to allow anyone to write to its copy.
-            files.set_read_only(local_cache_path)
-            return local_cache_path
-        except FileNotFoundError:
-            # may have hit a race condition.
-            # don't fail upload just because we couldn't write through the cache.
-            pass
-    return None
-
-
-@scope.bound
-@fretry.retry_sleep(
-    # ADLS lib has a bug where parallel uploads of the same thing will
-    # hit a race condition and error. this will detect that scenario
-    # and avoid re-uploading as well.
-    fretry.is_exc(ResourceModifiedError),
-    fretry.expo(retries=5),
-)
-def upload(
-    dest: ty.Union[AdlsFqn, str],
-    src: UploadSrc,
-    write_through_cache: ty.Optional[Cache] = None,
-    *,
-    content_type: str = "",
-    **upload_data_kwargs: ty.Any,
-) -> ty.Optional[AdlsHashedResource]:
-    """Uploads only if the remote does not exist or does not match
-    md5.
-
-    Always embeds md5 in upload if at all possible. In very rare cases
-    it may not be possible for us to calculate one. Will always
-    be possible if the passed data was a Path. If one can be
-    calculated, an AdlsHashedResource is returned.
-
-    Can write through a local cache, which may save you a download later.
-
-    content_type and all upload_data_kwargs will be ignored if the file
-    has already been uploaded and the md5 matches.
-    """
-    dest_ = AdlsFqn.parse(dest) if isinstance(dest, str) else dest
-    if write_through_cache:
-        _write_through_local_cache(write_through_cache.path(dest_), src)
-        # we always use the original source file to upload, not the cached path,
-        # because uploading from a shared location risks race conditions.
-
-    blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
-    blob_client = blob_container_client.get_blob_client(dest_.path)
-    decision = upload_decision_and_settings(blob_client.get_blob_properties, src)  # type: ignore [arg-type]
-    if decision.upload_required:
-        # set up some bookkeeping
-        n_bytes = None  # if we pass 0 to upload_blob, it truncates the write now
-        if isinstance(src, Path):
-            n_bytes = src.stat().st_size
-            src = scope.enter(open(src, "rb"))
-        elif isinstance(src, bytes):
-            n_bytes = len(src)
-
-        adls_meta = metadata_for_upload()
-        if "metadata" in upload_data_kwargs:
-            adls_meta.update(upload_data_kwargs.pop("metadata"))
-
-        upload_content_settings = decision.content_settings or ContentSettings()
-        if content_type:
-            upload_content_settings.content_type = content_type
-
-        # we are now using blob_client instead of file system client
-        # because blob client (as of 2024-06-24) does actually do
-        # some one-step, atomic uploads, wherein there is not a separate
-        # create/truncate action associated with an overwrite.
-        # This is both faster, as well as simpler to reason about, and
-        # in fact was the behavior I had been assuming all along...
-        blob_client.upload_blob(
-            report_upload_progress(ty.cast(ty.IO, src), str(dest_), n_bytes or 0),
-            overwrite=True,
-            length=n_bytes,
-            content_settings=upload_content_settings,
-            connection_timeout=_SLOW_CONNECTION_WORKAROUND,
-            max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
-            metadata=adls_meta,
-            **upload_data_kwargs,
-        )
-
-    # if at all possible (if the md5 is known), return a resource containing it.
-    if decision.content_settings and decision.content_settings.content_md5:
-        return AdlsHashedResource.of(dest_, hashing.b64(decision.content_settings.content_md5))
-    return None
-
-
-def verify_remote_md5(resource: AdlsHashedResource) -> bool:
-    try:
-        props = get_file_properties(resource.fqn)
-        if props.content_settings.content_md5:
-            return hashing.b64(props.content_settings.content_md5) == resource.md5b64
-    except HttpResponseError:
-        return False
-    except Exception:
-        logger.exception("Unable to verify remote md5")
-    return False
-
-
-# DOWNLOAD if exists, else CREATE and UPLOAD
-
-
-def verify_or_create_resource(
-    resource_json_path: Path,
-    get_adls_fqn: ty.Callable[[], AdlsFqn],
-    creator: ty.Callable[[], Path],
-    cache: ty.Optional[Cache] = global_cache(),
-) -> AdlsHashedResource:
-    """Return an MD5-verified resource if it already exists and
-    matches the FQN _and_ the MD5 embedded in the resource JSON file.
-
-    Does not download the actual resource if it can be verified to
-    exist and match what is expected.
-
-    If it does not exist or does not match, creates the resource and
-    uploads it to the requested path as well.
-
-    Basically an idempotent get-or-create pattern, applied to things
-    that are more expensive to build than to upload, and that result
-    in a single file.
-
-    For this to work correctly, you will need the FQN itself to change
-    based on some kind of key that can be considered unique to a
-    particular version of the resource. Think of it like a cache
-    key. For instance, if you have a resource that gets changed every
-    time your library gets built, then your library version could be
-    part of the Adls FQN.
-
-    If running in CI, will create and upload your resource from CI,
-    but will then raise an exception, since you need to commit the
-    serialized resource to the resource_json_path.
-    """
-    remote_fqn = get_adls_fqn()  # this is lazy to allow partial application at a module level.
-    if resource_json_path.exists():
-        try:
-            resource = resource_from_path(resource_json_path)
-            try:
-                if resource.fqn == remote_fqn:
-                    if verify_remote_md5(resource):
-                        return resource
-                    logger.info("Resource MD5 does not match; must recreate.")
-                else:
-                    logger.info("Resource FQN does not match - it needs to be recreated.")
-            except BlobNotFoundError:
-                logger.info(f"Resource does not exist at {resource.fqn}; will create.")
-            except Exception:
-                logger.exception(f"Failed to get resource from {resource.fqn}, will recreate.")
-        except Exception:
-            logger.exception(f"Unable to parse a resource from {resource_json_path}; must recreate.")
-
-    logger.info(f"Creating resource for {remote_fqn}...")
-    created_path = creator()
-    sz_mb = created_path.stat().st_size / 2**20  # 1 MB
-    logger.info(
-        f"Uploading created resource of size {sz_mb:.1f} MB to {remote_fqn} from {created_path} ..."
-    )
-    uploaded_resource = upload(remote_fqn, created_path, write_through_cache=cache)
-    assert (
-        uploaded_resource
-    ), "Cannot create a shared resource without being able to calculate MD5 prior to upload."
-    resource_to_path(resource_json_path, uploaded_resource, check_ci=True)
-    return uploaded_resource
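
The removed up_down.py composes the pieces into an idempotent get-or-create flow. A hypothetical sketch of how verify_or_create_resource was called; the account, container, path, and builder are illustrative, and AdlsFqn is assumed to be constructible positionally as (storage account, container, path), as suggested by the .sa/.container/.path field accesses above:

from pathlib import Path

from thds.adls.fqn import AdlsFqn
from thds.adls.resource.up_down import verify_or_create_resource

def build_artifact() -> Path:
    # stand-in for an expensive build step that produces a single file
    out = Path("artifact-v3.bin")
    out.write_bytes(b"...expensive-to-build bytes...")
    return out

resource = verify_or_create_resource(
    Path("artifact.adls.json"),  # the committed pointer file
    # lazy, version-keyed FQN: a new build lands at a new remote path
    lambda: AdlsFqn("myaccount", "mycontainer", "artifacts/v3.bin"),
    build_artifact,
)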
@@ -1,40 +0,0 @@
-thds/adls/__init__.py,sha256=er14MoCC9PlJMxWVS4G1hAeMJItaJj4EAsrTZlvlb0M,797
-thds/adls/_progress.py,sha256=ZzCHn_G7nHakioNFxdvoJZRr-jN6ymsp5JXf-iReROM,6580
-thds/adls/_upload.py,sha256=XyP6tDM7s-A3G0SPSVlXRT4IZYsPqpOE4TeqtxP5i-I,4375
-thds/adls/abfss.py,sha256=ZRJOLjDuXmS4bIbQAQpQxWWWeu74N9NKEKCNfXQek80,726
-thds/adls/cached_up_down.py,sha256=3rdTuD_qKodFEo7vsl2mN5QMHSjDCGf3UaRKta6BTaI,2955
-thds/adls/conf.py,sha256=q1SPrgb46NpobVzwt_Oyv71-BvsIbZLq9nRWS3LZjz0,1990
-thds/adls/copy.py,sha256=jUWbGvTpb4B3yRGS0nhGSbDzqRPzUqYgH0z1lFRJB3k,6365
-thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
-thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
-thds/adls/download.py,sha256=c1GapH8e7H5AmGR7S2u6CJ-XqQaDdtVQ6tKIwjuMTio,18510
-thds/adls/download_lock.py,sha256=ttD2GhPNRnITNoV1XH2PvKbMsHppZirjy3RZ4P4kgKM,2826
-thds/adls/errors.py,sha256=B_rMsQvQnNmP_sf-x8kmGsv2vIeOh4G9kVbdNVyk350,1469
-thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
-thds/adls/file_properties.py,sha256=DYUu4zI_5amwTSMbhvUVimSEfEsfpgoqOawQ4Z4W-nY,1790
-thds/adls/fqn.py,sha256=0zHmHhBWN7GEfKRB3fBC1NVhaiIHHifBdCRanyT01X8,5822
-thds/adls/global_client.py,sha256=f4VJw5y_Yh__8gQUcdSYTh1aU6iEPlauMchVirSAwDQ,3716
-thds/adls/impl.py,sha256=4qa70w1sehzp60CI6lW82NLDK-lsM1uUfhPmZnYJaw0,42589
-thds/adls/md5.py,sha256=qOX4_7WUj1QkbH_IwREcQNHvvZccOj-HpHZBfsKn1gY,1846
-thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
-thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-thds/adls/ro_cache.py,sha256=F0uXol0t95mcRuBukNg3A7wt7XXQxpD5Sy09d9sl8f0,4825
-thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
-thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
-thds/adls/source.py,sha256=1JYliDqafqpLewOQ2XMKp4T2WD7b5Rl7XYViyLYtzX8,2437
-thds/adls/source_tree.py,sha256=yP_v2XrKxXqUOdZ-x8kqHhBFAuur3AlAq3zi4hHj4AE,2235
-thds/adls/uri.py,sha256=pDH956p_VEHnjLLUnjWY6sGgRqksp9gdpc9KOW4gEP0,1205
-thds/adls/azcopy/__init__.py,sha256=nTNbgz2GcEiGeswYbAgy4oPhivnzl_5crF3HqCdWWiw,31
-thds/adls/azcopy/download.py,sha256=49Vv5z_bBTzkcCg8IP2fvQnMvrxqHS4AfxPWQ6CSEh0,6610
-thds/adls/resource/__init__.py,sha256=IZ7_aRf1b3jEp7wXOxqHop0gV2gUcf9SOLeEEjIWlCU,1669
-thds/adls/resource/core.py,sha256=u6iaJAbnk88Xfn6SQBgITDnvHpEFdMjUtE_fOgebbTU,2627
-thds/adls/resource/file_pointers.py,sha256=2g5lUtauA5vCZ4-skxIdmLeOp4cLZ7Mtv-MAhq6nIsA,1983
-thds/adls/resource/up_down.py,sha256=eTQMbeqM6edfVz18uHXRiOe0oEzOAaW1TBvym6lctNM,9822
-thds/adls/tools/download.py,sha256=vvBO8lSDl9oPugv75qpCkoemT9pOM9BV6yeExlkyG08,1594
-thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
-thds/adls/tools/upload.py,sha256=eMk4pdug1aCMPDDWpIE3Zoq77i5APp9Uuh-sVCCDNJE,493
-thds_adls-4.1.20250701001205.dist-info/METADATA,sha256=86BiNStEl525bbhbeqOzlIOzXy9-v-sESVUD9vQJuK0,587
-thds_adls-4.1.20250701001205.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-thds_adls-4.1.20250701001205.dist-info/entry_points.txt,sha256=uTqreT1AIwqJboMfLv5w6sviM8mNbAkln765gIjzoA4,152
-thds_adls-4.1.20250701001205.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
-thds_adls-4.1.20250701001205.dist-info/RECORD,,