thds.adls 3.0.20250116223841 (py3-none-any.whl)


thds/adls/md5.py ADDED
@@ -0,0 +1,60 @@
+ """Why MD5 when it's no longer a good choice for most use cases?
+ Because Azure/ADLS support Content-MD5 but nothing else, and I don't
+ want to lie to them and get us confused later.
+
+ Thankfully, there are no real security concerns for us with purely
+ internal code and data sets.
+
+ That said, please _do not_ use MD5 for non-Azure things. Prefer SHA256
+ if at all possible.
+ """
+ import hashlib
+ import typing as ty
+ from pathlib import Path
+
+ from thds.core.hash_cache import hash_file
+ from thds.core.hashing import SomehowReadable, hash_anything, hash_using
+ from thds.core.types import StrOrPath
+
+ AnyStrSrc = ty.Union[SomehowReadable, ty.Iterable[ty.AnyStr]]
+ # this type closely corresponds to what the underlying DataLakeStorageClient will accept for upload_data.
+
+
+ def md5_file(file: StrOrPath) -> bytes:
+     """Raise exception if it cannot be read.
+
+     Safely caches the hash on your local filesystem (uses mtimes to
+     determine staleness).
+     """
+     return hash_file(file, hashlib.md5())
+
+
+ def hex_md5_str(string: str) -> str:
+     return hash_using(string.encode(), hashlib.md5()).hexdigest()
+
+
+ def try_md5(data: AnyStrSrc) -> ty.Optional[bytes]:
+     """Ideally, we calculate an MD5 sum for all data that we upload.
+
+     The only circumstances under which we cannot do this are if the
+     stream does not exist in its entirety before the upload begins.
+     """
+     if isinstance(data, Path):
+         return md5_file(data)
+     res = hash_anything(data, hashlib.md5())
+     if res:
+         return res.digest()
+     return None
+
+
+ def is_reasonable_b64(md5: str):
+     if len(md5) == 22:
+         return True
+     if len(md5) == 24 and md5.endswith("=="):
+         return True
+     return False
+
+
+ def check_reasonable_md5b64(maybe_md5: str):
+     if not is_reasonable_b64(maybe_md5):
+         raise ValueError(f"MD5 '{maybe_md5}' is not a reasonable MD5.")
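
A minimal usage sketch of this module; the file path is illustrative only, and the base64 step uses the standard library rather than any helper from thds.core:

    import base64
    from pathlib import Path

    from thds.adls.md5 import check_reasonable_md5b64, md5_file

    digest = md5_file(Path("data/example.parquet"))  # raises if the file cannot be read; result is cached by mtime
    md5b64 = base64.b64encode(digest).decode()       # ADLS stores Content-MD5 as base64
    check_reasonable_md5b64(md5b64)                  # raises ValueError if this does not look like a base64 MD5
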
thds/adls/meta.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "git_commit": "8119ce98e26d99335cda51ac5cbebbcd6d87c416",
+   "git_branch": "task/ci/open-source",
+   "git_is_clean": true,
+   "pyproject_version": "3.0.20250116223841",
+   "thds_user": "peter.gaultney",
+   "misc": {}
+ }
@@ -0,0 +1,26 @@
+ """Just a utility for keeping actual Storage Account+Container pairs (AdlsRoots) defined
+ in a central location and referencing those by name throughout your codebase.
+ """
+
+ import typing as ty
+
+ from .fqn import AdlsRoot
+
+ _NAMED_ROOTS: ty.Dict[str, AdlsRoot] = dict()
+
+
+ def add(**named_roots: AdlsRoot) -> None:
+     """Globally sets some named roots, as a layer of indirection for ADLS URIs."""
+     _NAMED_ROOTS.update(named_roots)
+
+
+ def require(name: str) -> AdlsRoot:
+     if name not in _NAMED_ROOTS:
+         raise ValueError(f"Unknown named root: {name}")
+
+     return _NAMED_ROOTS[name]
+
+
+ def require_uri(name: str) -> str:
+     """For use when a system expects a URI rather than the in-house AdlsRoot representation."""
+     return str(require(name))
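
A hedged sketch of how this indirection might be wired up. The module's file name is not shown in this diff, and AdlsRoot's constructor lives in thds/adls/fqn.py (also not shown), so both the import path and the two-argument construction below are assumptions:

    # hypothetical import path and AdlsRoot construction -- verify against thds/adls/fqn.py
    from thds.adls import named_roots
    from thds.adls.fqn import AdlsRoot

    named_roots.add(prod=AdlsRoot("mystorageaccount", "mycontainer"))

    root = named_roots.require("prod")     # raises ValueError for unknown names
    uri = named_roots.require_uri("prod")  # str(AdlsRoot), for systems that want a plain URI
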
thds/adls/py.typed ADDED
File without changes
@@ -0,0 +1,36 @@
1
+ """The reason for a hashed resource is that it enables worry-free caching.
2
+
3
+ If under any circumstances we re-use a name/URI with different bytes,
4
+ then having captured a hash will enable us to transparently detect the
5
+ situation and re-download.
6
+
7
+ It is strongly recommended that you construct these using `of`, as
8
+ that will avoid the accidental, invalid creation of an
9
+ AdlsHashedResource containing an empty hash.
10
+
11
+ How to get the hash itself?
12
+
13
+ From our experience, it seems that any file uploaded using Azure
14
+ Storage Explorer will have an MD5 calculated locally before upload and
15
+ that will be embedded in the remote file. You can look in the
16
+ properties of the uploaded file for Content-MD5 and copy-paste that
17
+ into whatever you're writing.
18
+
19
+ Programmatically, you can instead use `resource.upload`, which will
20
+ return to you an in-memory AdlsHashedResource object. If you want to
21
+ store it programmatically rather than in the source code, it's
22
+ recommended that you use `resource.to_path`, and then load it using
23
+ `resource.from_path`.
24
+
25
+ Prefer importing this module `as resource` or `from thds.adls
26
+ import resource`, and then using it as a namespace,
27
+ e.g. `resource.of(uri)`.
28
+ """
29
+ from .core import AdlsHashedResource, from_source, get, of, parse, serialize, to_source # noqa: F401
30
+ from .file_pointers import resource_from_path as from_path # noqa: F401
31
+ from .file_pointers import resource_to_path as to_path # noqa: F401
32
+ from .file_pointers import validate_resource as validate # noqa: F401
33
+ from .up_down import get_read_only, upload # noqa: F401
34
+ from .up_down import verify_or_create_resource as verify_or_create # noqa: F401
35
+
36
+ AHR = AdlsHashedResource # just an alias
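
Following the docstring's suggested import style, a small local sketch (no ADLS access needed). The URI scheme accepted by AdlsFqn.parse is defined in thds/adls/fqn.py, which is not shown in this diff, so the URI and MD5 values below are placeholders:

    from thds.adls import resource

    res = resource.of("adls://mysa/mycontainer/models/model.bin", "3q2+7wAAAAAAAAAAAAAAAA==")
    print(res.serialized)                         # JSON with "uri" and "md5b64" keys
    assert resource.parse(res.serialized) == res  # round-trips, assuming AdlsFqn str()/parse() are inverses
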
@@ -0,0 +1,79 @@
+ import json
+ import typing as ty
+
+ from thds.core import hashing, log, source
+ from thds.core.hashing import b64
+
+ from ..errors import blob_not_found_translation
+ from ..fqn import AdlsFqn
+ from ..global_client import get_global_fs_client
+ from ..md5 import check_reasonable_md5b64
+
+ logger = log.getLogger(__name__)
+
+
+ class AdlsHashedResource(ty.NamedTuple):
+     """See the containing package for documentation on how to use this and its motivation."""
+
+     fqn: AdlsFqn
+     md5b64: str
+
+     @property
+     def serialized(self) -> str:
+         return serialize(self)
+
+     @staticmethod
+     def of(fqn_or_uri: ty.Union[AdlsFqn, str], md5b64: str) -> "AdlsHashedResource":
+         return of(fqn_or_uri, md5b64)
+
+     @staticmethod
+     def parse(serialized_dict: str) -> "AdlsHashedResource":
+         return parse(serialized_dict)
+
+
+ def of(fqn_or_uri: ty.Union[AdlsFqn, str], md5b64: str) -> AdlsHashedResource:
+     assert md5b64, "md5b64 must be non-empty"
+     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+     return AdlsHashedResource(fqn, md5b64)
+
+
+ def from_source(source: source.Source) -> AdlsHashedResource:
+     assert source.hash, "Source must have a hash"
+     assert source.hash.algo == "md5", f"Source Hash type must be MD5! Got: {source.hash.algo}"
+     return of(source.uri, hashing.b64(source.hash.bytes))
+
+
+ def to_source(resource: AdlsHashedResource) -> source.Source:
+     return source.from_uri(
+         str(resource.fqn),
+         hash=source.Hash("md5", hashing.db64(resource.md5b64)),
+     )
+
+
+ def serialize(resource: AdlsHashedResource) -> str:
+     d = resource._asdict()
+     # we use uri instead of fqn in order to make these a more generic format
+     return json.dumps(dict(uri=str(d["fqn"]), md5b64=d["md5b64"]))
+
+
+ def parse(serialized_dict: str) -> AdlsHashedResource:
+     actual_dict = json.loads(serialized_dict)
+     # accept either uri or fqn
+     uri = actual_dict["uri"] if "uri" in actual_dict else actual_dict["fqn"]
+     md5b64 = actual_dict["md5b64"]
+     check_reasonable_md5b64(md5b64)
+     return AdlsHashedResource.of(AdlsFqn.parse(uri), md5b64)
+
+
+ def get(fqn_or_uri: ty.Union[AdlsFqn, str]) -> AdlsHashedResource:
+     """Creates an AdlsHashedResource from a remote-only file.
+
+     The file _must_ have a pre-existing Content MD5!
+     """
+     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
+     with blob_not_found_translation(fqn):
+         props = (
+             get_global_fs_client(fqn.sa, fqn.container).get_file_client(fqn.path).get_file_properties()
+         )
+     assert props.content_settings.content_md5, "ADLS file has empty Content-MD5!"
+     return AdlsHashedResource.of(fqn, b64(props.content_settings.content_md5))
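
A sketch of the Source interop defined above. The submodule path thds.adls.resource.core is inferred from the package's relative imports, the URI and MD5 are placeholders, and the round trip assumes thds.core.hashing.b64/db64 and AdlsFqn str()/parse() are inverses:

    from thds.adls.resource import core

    res = core.of("adls://mysa/mycontainer/models/model.bin", "3q2+7wAAAAAAAAAAAAAAAA==")
    src = core.to_source(res)            # a thds.core Source carrying an md5 Hash
    assert core.from_source(src) == res  # from_source re-encodes the hash bytes to base64
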
@@ -0,0 +1,54 @@
+ import os
+ import typing as ty
+ from pathlib import Path
+
+ from thds.core.hashing import b64
+
+ from ..global_client import get_global_fs_client
+ from .core import AdlsHashedResource, parse, serialize
+
+ _AZURE_PLACEHOLDER_SIZE_LIMIT = 4096
+ # it is assumed that no placeholder will ever need to be larger than 4 KB.
+
+
+ def resource_from_path(path: ty.Union[str, Path]) -> AdlsHashedResource:
+     """Raises if the path does not represent a serialized AdlsHashedResource."""
+     with open(path) as maybe_json_file:
+         json_str = maybe_json_file.read(_AZURE_PLACEHOLDER_SIZE_LIMIT)
+     return parse(json_str)
+
+
+ class MustCommitResourceLocally(Exception):
+     """Means you need to read the message and commit the change to the repo."""
+
+
+ def _check_ci(resource_json_path: ty.Union[str, Path], resource: AdlsHashedResource):
+     if os.getenv("CI"):
+         print("\n" + resource.serialized + "\n")
+         # Because we can't modify+commit in CI, this should fail
+         # if it's recreating a resource in a CI environment.
+         raise MustCommitResourceLocally(
+             "In CI, a newly built resource cannot be recorded."
+             " However, it did successfully build and upload!"
+             f" You should paste '{resource.serialized}'"
+             f" into '{resource_json_path}' locally, and commit."
+         )
+
+
+ def resource_to_path(
+     path: ty.Union[str, Path], resource: AdlsHashedResource, *, check_ci: bool = False
+ ) -> None:
+     if check_ci:
+         _check_ci(path, resource)
+     with open(path, "w") as json_file:
+         json_file.write(serialize(resource) + "\n")
+
+
+ def validate_resource(srcfile: ty.Union[str, Path]) -> AdlsHashedResource:
+     res = resource_from_path(srcfile)
+     fqn, md5b64 = res
+     props = get_global_fs_client(fqn.sa, fqn.container).get_file_client(fqn.path).get_file_properties()
+     md5 = props.content_settings.content_md5
+     assert md5, f"{fqn} was incorrectly uploaded to ADLS without an MD5 embedded."
+     assert md5b64 == b64(md5), f"You probably need to update the MD5 in {srcfile}"
+     return res
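
A local round trip of the JSON pointer file, using the aliases re-exported by the resource package shown earlier; the URI and MD5 are placeholders, and no ADLS access is needed for this part:

    from pathlib import Path

    from thds.adls import resource

    res = resource.of("adls://mysa/mycontainer/data/build-1234/artifact.tar.gz", "3q2+7wAAAAAAAAAAAAAAAA==")
    pointer = Path("artifact.resource.json")
    resource.to_path(pointer, res)             # writes the serialized JSON plus a trailing newline
    assert resource.from_path(pointer) == res  # must stay under the 4096-byte placeholder limit
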
@@ -0,0 +1,245 @@
+ """Tools for using and creating locally-cached resources."""
+
+ import typing as ty
+ from pathlib import Path
+
+ from azure.core.exceptions import HttpResponseError, ResourceModifiedError
+ from azure.storage.blob import ContentSettings
+
+ from thds.core import files, fretry, hashing, link, log, scope, tmp
+
+ from .._progress import report_upload_progress
+ from .._upload import metadata_for_upload, upload_decision_and_settings
+ from ..conf import UPLOAD_FILE_MAX_CONCURRENCY
+ from ..download import download_or_use_verified
+ from ..errors import BlobNotFoundError
+ from ..fqn import AdlsFqn
+ from ..global_client import get_global_blob_container_client, get_global_fs_client
+ from ..ro_cache import Cache, global_cache
+ from .file_pointers import AdlsHashedResource, resource_from_path, resource_to_path
+
+ logger = log.getLogger(__name__)
+ _SLOW_CONNECTION_WORKAROUND = 14400  # seconds
+
+
+ # DOWNLOAD
+ def get_read_only(
+     resource: AdlsHashedResource,
+     local_path: ty.Optional[Path] = None,
+     cache: Cache = global_cache(),
+ ) -> Path:
+     """Downloads a read-only resource if it is not already present in
+     the cache or at the local_path.
+
+     Because the resource includes a hash, we can save a lot of
+     bandwidth if we can detect that it already is present locally.
+
+     By default, downloads through the machine-global cache. Caching
+     cannot be disabled, but the location of the cache can be changed.
+     """
+     cache = cache or global_cache()
+     local_path = local_path or cache.path(resource.fqn)
+     download_or_use_verified(
+         get_global_fs_client(resource.fqn.sa, resource.fqn.container),
+         resource.fqn.path,
+         local_path,
+         md5b64=resource.md5b64,
+         cache=cache,
+     )
+     return local_path
+
+
+ # UPLOAD
+ UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
+
+
+ def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
+     @scope.bound
+     def _try_write_through() -> bool:
+         if isinstance(data, Path) and data.exists():
+             link.link_or_copy(data, local_cache_path, "ref")
+             return True
+         out = scope.enter(tmp.temppath_same_fs(local_cache_path))
+         if hasattr(data, "read") and hasattr(data, "seek"):
+             with open(out, "wb") as f:
+                 f.write(data.read())  # type: ignore
+             data.seek(0)  # type: ignore
+             link.link_or_copy(out, local_cache_path)
+             return True
+         if isinstance(data, bytes):
+             with open(out, "wb") as f:
+                 f.write(data)
+             link.link_or_copy(out, local_cache_path)
+             return True
+         return False
+
+     if _try_write_through():
+         try:
+             # it's a reflink or a copy, so the cache now owns its copy
+             # and we don't want to allow anyone to write to its copy.
+             files.set_read_only(local_cache_path)
+             return local_cache_path
+         except FileNotFoundError:
+             # may have hit a race condition.
+             # don't fail upload just because we couldn't write through the cache.
+             pass
+     return None
+
+
+ @scope.bound
+ @fretry.retry_sleep(
+     # ADLS lib has a bug where parallel uploads of the same thing will
+     # hit a race condition and error. this will detect that scenario
+     # and avoid re-uploading as well.
+     fretry.is_exc(ResourceModifiedError),
+     fretry.expo(retries=5),
+ )
+ def upload(
+     dest: ty.Union[AdlsFqn, str],
+     src: UploadSrc,
+     write_through_cache: ty.Optional[Cache] = None,
+     *,
+     content_type: str = "",
+     **upload_data_kwargs: ty.Any,
+ ) -> ty.Optional[AdlsHashedResource]:
+     """Uploads only if the remote does not exist or does not match
+     md5.
+
+     Always embeds md5 in upload if at all possible. In very rare cases
+     it may not be possible for us to calculate one. Will always
+     be possible if the passed data was a Path. If one can be
+     calculated, an AdlsHashedResource is returned.
+
+     Can write through a local cache, which may save you a download later.
+
+     content_type and all upload_data_kwargs will be ignored if the file
+     has already been uploaded and the md5 matches.
+     """
+     dest_ = AdlsFqn.parse(dest) if isinstance(dest, str) else dest
+     if write_through_cache:
+         _write_through_local_cache(write_through_cache.path(dest_), src)
+         # we always use the original source file to upload, not the cached path,
+         # because uploading from a shared location risks race conditions.
+
+     blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
+     blob_client = blob_container_client.get_blob_client(dest_.path)
+     decision = upload_decision_and_settings(blob_client.get_blob_properties, src)  # type: ignore [arg-type]
+     if decision.upload_required:
+         # set up some bookkeeping
+         n_bytes = None  # if we pass 0 to upload_blob, it truncates the write now
+         if isinstance(src, Path):
+             n_bytes = src.stat().st_size
+             src = scope.enter(open(src, "rb"))
+         elif isinstance(src, bytes):
+             n_bytes = len(src)
+
+         adls_meta = metadata_for_upload()
+         if "metadata" in upload_data_kwargs:
+             adls_meta.update(upload_data_kwargs.pop("metadata"))
+
+         upload_content_settings = decision.content_settings or ContentSettings()
+         if content_type:
+             upload_content_settings.content_type = content_type
+
+         # we are now using blob_client instead of file system client
+         # because blob client (as of 2024-06-24) does actually do
+         # some one-step, atomic uploads, wherein there is not a separate
+         # create/truncate action associated with an overwrite.
+         # This is both faster, as well as simpler to reason about, and
+         # in fact was the behavior I had been assuming all along...
+         blob_client.upload_blob(
+             report_upload_progress(ty.cast(ty.IO, src), str(dest_), n_bytes or 0),
+             overwrite=True,
+             length=n_bytes,
+             content_settings=upload_content_settings,
+             connection_timeout=_SLOW_CONNECTION_WORKAROUND,
+             max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
+             metadata=adls_meta,
+             **upload_data_kwargs,
+         )
+
+     # if at all possible (if the md5 is known), return a resource containing it.
+     if decision.content_settings and decision.content_settings.content_md5:
+         return AdlsHashedResource.of(dest_, hashing.b64(decision.content_settings.content_md5))
+     return None
+
+
+ def verify_remote_md5(resource: AdlsHashedResource) -> bool:
+     try:
+         props = (
+             get_global_fs_client(resource.fqn.sa, resource.fqn.container)
+             .get_file_client(resource.fqn.path)
+             .get_file_properties()
+         )
+         if props.content_settings.content_md5:
+             return hashing.b64(props.content_settings.content_md5) == resource.md5b64
+     except HttpResponseError:
+         return False
+     except Exception:
+         logger.exception("Unable to verify remote md5")
+     return False
+
+
+ # DOWNLOAD if exists, else CREATE and UPLOAD
+
+
+ def verify_or_create_resource(
+     resource_json_path: Path,
+     get_adls_fqn: ty.Callable[[], AdlsFqn],
+     creator: ty.Callable[[], Path],
+     cache: ty.Optional[Cache] = global_cache(),
+ ) -> AdlsHashedResource:
+     """Return an MD5-verified resource if it already exists and
+     matches the FQN _and_ the MD5 embedded in the resource JSON file.
+
+     Does not download the actual resource if it can be verified to
+     exist and match what is expected.
+
+     If it does not exist or does not match, creates the resource and
+     uploads it to the requested path as well.
+
+     Basically an idempotent get-or-create pattern, applied to things
+     that are more expensive to build than to upload, and that result
+     in a single file.
+
+     For this to work correctly, you will need the FQN itself to change
+     based on some kind of key that can be considered unique to a
+     particular version of the resource. Think of it like a cache
+     key. For instance, if you have a resource that gets changed every
+     time your library gets built, then your library version could be
+     part of the Adls FQN.
+
+     If running in CI, will create and upload your resource from CI,
+     but will then raise an exception, since you need to commit the
+     serialized resource to the resource_json_path.
+     """
+     remote_fqn = get_adls_fqn()  # this is lazy to allow partial application at a module level.
+     if resource_json_path.exists():
+         try:
+             resource = resource_from_path(resource_json_path)
+             try:
+                 if resource.fqn == remote_fqn:
+                     if verify_remote_md5(resource):
+                         return resource
+                     else:
+                         logger.info("Resource MD5 does not match; must recreate.")
+                 logger.info("Resource FQN does not match - it needs to be recreated.")
+             except BlobNotFoundError:
+                 logger.info(f"Resource does not exist at {resource.fqn}; will create.")
+             except Exception:
+                 logger.exception(f"Failed to get resource from {resource.fqn}, will recreate.")
+         except Exception:
+             logger.exception(f"Unable to parse a resource from {resource_json_path}; must recreate.")
+
+     logger.info(f"Creating resource for {remote_fqn}...")
+     created_path = creator()
+     sz_mb = created_path.stat().st_size / 2**20  # 1 MB
+     logger.info(
+         f"Uploading created resource of size {sz_mb:.1f} MB to {remote_fqn} from {created_path} ..."
+     )
+     uploaded_resource = upload(remote_fqn, created_path, write_through_cache=cache)
+     assert (
+         uploaded_resource
+     ), "Cannot create a shared resource without being able to calculate MD5 prior to upload."
+     resource_to_path(resource_json_path, uploaded_resource, check_ci=True)
+     return uploaded_resource
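
A hedged sketch of the get-or-create pattern described in the docstring above, using the aliases re-exported by the resource package; the storage account, URI scheme, and build function are placeholders, and real use requires ADLS credentials:

    from pathlib import Path

    from thds.adls import resource
    from thds.adls.fqn import AdlsFqn

    def build_model() -> Path:
        out = Path("model-v42.bin")
        out.write_bytes(b"expensive-to-build artifact bytes")
        return out

    res = resource.verify_or_create(
        Path("model-v42.resource.json"),                            # pointer file committed to the repo
        lambda: AdlsFqn.parse("adls://mysa/models/model-v42.bin"),  # version-specific "cache key"
        build_model,
    )
    local_copy = resource.get_read_only(res)  # MD5-verified, served from the machine-global cache when possible
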
thds/adls/ro_cache.py ADDED
@@ -0,0 +1,126 @@
+ import os
+ import sys
+ import typing as ty
+ from pathlib import Path
+
+ from thds.core import config, log
+ from thds.core import types as ct
+ from thds.core.files import set_read_only
+ from thds.core.home import HOMEDIR
+ from thds.core.link import LinkType, link_or_copy
+
+ from .fqn import AdlsFqn
+ from .md5 import hex_md5_str
+
+ GLOBAL_CACHE_PATH = config.item("global-cache-path", HOMEDIR() / ".adls-md5-ro-cache", parse=Path)
+ MAX_FILENAME_LEN = config.item("max-filename-len", 255, parse=int)  # safe on most local filesystems?
+ MAX_TOTAL_PATH_LEN = config.item(
+     "max-total-path-len", 1023 if sys.platform == "darwin" else 4095, parse=int
+ )
+ logger = log.getLogger(__name__)
+
+ LinkOpts = ty.Union[bool, ty.Tuple[LinkType, ...]]
+ # if True, order is reflink, hardlink, softlink, copy
+
+
+ class Cache(ty.NamedTuple):
+     """Immutable struct declaring what cache behavior is desired."""
+
+     root: Path
+     link: LinkOpts
+
+     def path(self, fqn: AdlsFqn) -> Path:
+         p = _cache_path_for_fqn(self, fqn)
+         # we do not call this function anywhere unless we're planning
+         # on downloading, so in general this should not create
+         # empty directories that we didn't expect to use.
+         p.parent.mkdir(parents=True, exist_ok=True)
+         return p
+
+
+ def global_cache(link: LinkOpts = ("ref", "hard")) -> Cache:
+     """This is the recommended caching configuration."""
+     return Cache(GLOBAL_CACHE_PATH(), link)
+
+
+ def _compress_long_path_part(part: str, max_bytes: int) -> str:
+     md5_of_entire_part = "-md5-" + hex_md5_str(part) + "-"
+     start_of_excised_section = (len(part) - len(md5_of_entire_part)) // 2
+     end_of_excised_section = start_of_excised_section + len(md5_of_entire_part)
+
+     while True:
+         compressed_part = (
+             part[:start_of_excised_section] + md5_of_entire_part + part[end_of_excised_section:]
+         )
+         num_bytes_overage = len(compressed_part.encode()) - max_bytes
+         if num_bytes_overage <= 0:
+             return compressed_part
+
+         if len(part) - end_of_excised_section < start_of_excised_section:
+             start_of_excised_section -= 1
+         else:
+             end_of_excised_section += 1
+         # this is a very naive iterative approach to taking more 'bites' out of the middle of the filename.
+         # we can't easily reason about how many bytes each character is, but we also can't
+         # operate at the byte level directly, because removing bytes out of Unicode characters
+         # will inevitably lead to invalid UTF-8 sequences.
+
+         assert start_of_excised_section >= 0, (
+             part,
+             compressed_part,
+             start_of_excised_section,
+         )
+         assert end_of_excised_section <= len(part), (
+             part,
+             compressed_part,
+             end_of_excised_section,
+         )
+
+
+ def _cache_path_for_fqn(cache: Cache, fqn: AdlsFqn) -> Path:
+     """
+     On Linux, file paths can be 255 bytes per part, and the max full path limit is
+     4095, not including the NULL terminator. On Mac, the max total length is 1023, and
+     the max part length is 255.
+     """
+     # we assume that neither the SA nor the container will ever be more than MAX_FILENAME_LEN bytes.
+     # However, we know that sometimes the path parts _are_, so in rare
+     # cases we need a unique yet mostly readable abbreviation for those.
+     parts = list()
+     for part in fqn.path.split("/"):
+         part_bytes = part.encode()
+         if len(part_bytes) > MAX_FILENAME_LEN():
+             part = _compress_long_path_part(part, MAX_FILENAME_LEN())
+         parts.append(part)
+
+     full_path = str(Path(cache.root.resolve() / fqn.sa / fqn.container, *parts))
+     if len(full_path.encode()) > MAX_TOTAL_PATH_LEN():
+         full_path = _compress_long_path_part(full_path, MAX_TOTAL_PATH_LEN())
+
+     return Path(full_path)
+
+
+ def _opts_to_types(opts: LinkOpts) -> ty.Tuple[LinkType, ...]:
+     if opts is True:
+         return ("ref", "hard")
+     elif opts is False:
+         return tuple()
+     return opts
+
+
+ def from_cache_path_to_local(cache_path: ct.StrOrPath, local_path: ct.StrOrPath, link_opts: LinkOpts):
+     set_read_only(cache_path)
+
+     link_type = link_or_copy(cache_path, local_path, *_opts_to_types(link_opts))
+     if link_type in {"ref", "", "same"}:
+         # hard and soft links do not have their own permissions - they
+         # share the read-only permissions of their target. reflinks
+         # and copies will not, so those should not be marked as
+         # read-only since edits to them will not affect the original,
+         # cached copy.
+         os.chmod(local_path, 0o644)  # 0o644 == rw-r--r-- (user, group, all)
+
+
+ def from_local_path_to_cache(local_path: ct.StrOrPath, cache_path: ct.StrOrPath, link_opts: LinkOpts):
+     link_or_copy(local_path, cache_path, *_opts_to_types(link_opts))
+     set_read_only(cache_path)
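
A small sketch of choosing a cache location; the scratch path below is illustrative only:

    from pathlib import Path

    from thds.adls.ro_cache import Cache, global_cache

    default_cache = global_cache()  # by default ~/.adls-md5-ro-cache, preferring reflinks then hard links
    scratch_cache = Cache(Path("/tmp/adls-scratch-cache"), link=("hard",))  # hard links only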