thds.adls-3.0.20250116223841-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in the public registry.

Potentially problematic release.

This version of thds.adls might be problematic.

thds/adls/download.py ADDED
@@ -0,0 +1,394 @@
+ import contextlib
+ import enum
+ import os
+ import shutil
+ import typing as ty
+ from base64 import b64decode
+
+ from azure.core.exceptions import AzureError, HttpResponseError
+ from azure.storage.filedatalake import (
+     ContentSettings,
+     DataLakeFileClient,
+     FileProperties,
+     FileSystemClient,
+ )
+
+ from thds.core import log, scope, tmp
+ from thds.core.hashing import b64
+ from thds.core.types import StrOrPath
+
+ from ._progress import report_download_progress
+ from .conf import CONNECTION_TIMEOUT, DOWNLOAD_FILE_MAX_CONCURRENCY
+ from .download_lock import download_lock
+ from .errors import translate_azure_error
+ from .etag import match_etag
+ from .fqn import AdlsFqn
+ from .md5 import check_reasonable_md5b64, md5_file
+ from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache
+
+ logger = log.getLogger(__name__)
+
+
+ class MD5MismatchError(Exception):
+     """Indicates that something needs to be done by the developer to correct a hash mismatch."""
+
+
+ @contextlib.contextmanager
+ def _atomic_download_and_move(
+     fqn: AdlsFqn,
+     dest: StrOrPath,
+     properties: ty.Optional[FileProperties] = None,
+ ) -> ty.Iterator[ty.IO[bytes]]:
+     with tmp.temppath_same_fs(dest) as dpath:
+         with open(dpath, "wb") as f:
+             known_size = (properties.size or 0) if properties else 0
+             logger.debug("Downloading %s", fqn)
+             yield report_download_progress(f, str(dest), known_size)
+         try:
+             os.rename(dpath, dest)  # will succeed even if dest is read-only
+         except OSError as oserr:
+             if "Invalid cross-device link" in str(oserr):
+                 # this shouldn't ever happen because of temppath_same_fs, but just in case...
+                 shutil.copyfile(dpath, dest)
+             else:
+                 raise
+
+
+ @contextlib.contextmanager
+ def _verify_md5s_before_and_after_download(
+     remote_md5b64: str, expected_md5b64: str, fqn: AdlsFqn, local_dest: StrOrPath
+ ) -> ty.Iterator[None]:
+     if expected_md5b64:
+         check_reasonable_md5b64(expected_md5b64)
+     if remote_md5b64:
+         check_reasonable_md5b64(remote_md5b64)
+     if remote_md5b64 and expected_md5b64 and remote_md5b64 != expected_md5b64:
+         raise MD5MismatchError(
+             f"ADLS thinks the MD5 of {fqn} is {remote_md5b64}, but we expected {expected_md5b64}."
+             " This may indicate that we need to update a hash in the codebase."
+         )
+
+     yield  # perform download
+
+     with log.logger_context(hash_for="after-download"):
+         local_md5b64 = b64(md5_file(local_dest))
+     check_reasonable_md5b64(local_md5b64)  # must always exist
+     if remote_md5b64 and remote_md5b64 != local_md5b64:
+         raise MD5MismatchError(
+             f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
+             f" but the remote ({fqn}) says it should be {remote_md5b64}."
+             f" This may indicate that ADLS has an erroneous MD5 for {fqn}."
+         )
+     if expected_md5b64 and local_md5b64 != expected_md5b64:
+         raise MD5MismatchError(
+             f"The MD5 of the downloaded file {local_dest} is {local_md5b64},"
+             f" but we expected it to be {expected_md5b64}."
+             f" This probably indicates a corrupted download of {fqn}"
+         )
+     all_hashes = dict(local=local_md5b64, remote=remote_md5b64, expected=expected_md5b64)
+     assert 1 == len(set(filter(None, all_hashes.values()))), all_hashes
+
+
+ def _md5b64_path_if_exists(path: StrOrPath) -> ty.Optional[str]:
+     if not path or not os.path.exists(path):  # does not exist if it's a symlink with a bad referent.
+         return None
+     return b64(md5_file(path))
+
+
+ def _remote_md5b64(file_properties: FileProperties) -> str:
+     if file_properties.content_settings.content_md5:
+         return b64(file_properties.content_settings.content_md5)
+     return ""
+
+
+ # Async is weird.
+ #
+ # You cannot easily call an async function from within a standard/non-async function.
+ # And while you _can_ call a synchronous function from within an async one,
+ # it's highly discouraged if that function is doing network I/O,
+ # because your sync network IO will block the entire async green thread,
+ # grinding all 'async' work to a halt. It's very unneighborly.
+ #
+ # Unfortunately, this means that it's quite difficult to share the implementation
+ # of complex logic between async and non-async users. You can't use callbacks to abstract
+ # I/O, because those would themselves need to be either async or not.
+ #
+ # What you can do is a sort of 'functional core, imperative shell' approach,
+ # where the I/O parts are performed in a top level imperative shell, and the functional
+ # (logic) parts are performed in the shared core. As with many such things,
+ # the trick then is how to structure the functional core such that it 'makes sense' to a reader.
+ #
+ # One traditional means of doing this is breaking the functional core up into
+ # several functions to be called before and after the network calls.
+ # However, that does tend to impair readability, as those core functions
+ # each only do part of the work, and sometimes the 'part' doesn't make as much
+ # sense on its own as you might like.
+ #
+ # This is (to me) an entirely novel approach, and as such is an experiment.
+ # By writing a coroutine (the logic directly below) as the functional core,
+ # we can make the main 'logic' of the system readable in one go.
+ # What is required is a willingness to read the `yield` statements as
+ # essentially the 'inverse' of async/await - yield means "send this value to
+ # my controller and wait for their response". Once the response is sent, we can
+ # resume where we left off, and the logic flows reasonably nicely.
+ #
+ # One _additional_ advantage of this approach is that certain bits of IO
+ # can actually be re-requested at any time, and the coroutine's controller
+ # can in a sense 'cache' those responses. So instead of the core logic
+ # having to keep track of whether it has performed the IO, it can request the result
+ # again and rely on the controller to re-send the previously fetched result.
+
+
+ class _IoRequest(enum.Enum):
+     FILE_PROPERTIES = "file_properties"
+
+
+ IoRequest = ty.Union[_IoRequest, ty.IO[bytes]]
+ IoResponse = ty.Union[FileProperties, None]
+
+
+ class _FileResult(ty.NamedTuple):
+     md5b64: str
+     hit: bool
+
+
+ _dl_scope = scope.Scope("adls.download")
+
+
+ def _download_or_use_verified_cached_coroutine(  # noqa: C901
+     fqn: AdlsFqn,
+     local_path: StrOrPath,
+     md5b64: str = "",
+     cache: ty.Optional[Cache] = None,
+ ) -> ty.Generator[IoRequest, IoResponse, _FileResult]:
+     """Make a file on ADLS available at the local path provided.
+
+     When we download from ADLS we want to know for sure that we have
+     the bytes we expected. Sometimes we have a hash upfront that we
+     want to verify. Other times, we simply want to rely on the hash
+     ADLS has. If we have both, we should check anything we have the
+     opportunity to check.
+
+     Because we're verifying everything, we can optionally offer two
+     sorts of verified caching.
+
+     1. With no local cache, we can at least verify whether the file is
+     present at the local path and contains the expected bytes. If it
+     does, there's no need to re-download.
+
+     2. With a cache provided, we can also check that cache, and if the
+     file is present in the local cache, we can either hard/soft link or copy
+     it into the expected location, depending on the Cache configuration.
+
+     If the file is not present in either location, we finally download
+     the file from ADLS and then verify that the local hash matches
+     both what we expected and what ADLS told us it would be, if any.
+
+     The downloaded file will always be placed into the cache if a cache
+     is provided. The local path, if different from the cache path,
+     will either be linked or copied to, as selected by
+     `cache.link`. `link=True` will save storage space but is not what
+     you want if you intend to modify the file.
+
+     Files placed in the cache will be marked as read-only to prevent
+     _some_ types of accidents. This will not prevent you from
+     accidentally or maliciously moving files on top of existing cached
+     files, but it will prevent you from opening those files for
+     writing in a standard fashion.
+
+     Raises StopIteration when complete. StopIteration.value.hit will
+     be True if there was a cache hit, and False if a download was
+     required. `.value` will also contain the md5b64 of the downloaded
+     file, which may be used as desired.
+     """
+     if not local_path:
+         raise ValueError("Must provide a destination path.")
+
+     _dl_scope.enter(log.logger_context(dl=fqn))
+     file_properties = None
+     if not md5b64:
+         # we don't know what we expect, so attempt to retrieve an
+         # expectation from ADLS itself.
+         file_properties = yield _IoRequest.FILE_PROPERTIES
+         md5b64 = _remote_md5b64(file_properties)
+
+     def attempt_cache_hit() -> ty.Optional[_FileResult]:
+         if not md5b64:
+             return None
+
+         check_reasonable_md5b64(md5b64)
+         with log.logger_context(hash_for="before-download-dest"):
+             local_md5b64 = _md5b64_path_if_exists(local_path)
+         if local_md5b64 == md5b64:
+             logger.debug("Local path matches MD5 - no need to look further")
+             if cache:
+                 cache_path = cache.path(fqn)
+                 with log.logger_context(hash_for="before-download-cache"):
+                     if local_md5b64 != _md5b64_path_if_exists(cache_path):
+                         # only copy if the cache is out of date
+                         from_local_path_to_cache(local_path, cache_path, cache.link)
+             return _FileResult(local_md5b64, hit=True)
+
+         if local_md5b64:
+             logger.debug("Local path exists but does not match expected md5 %s", md5b64)
+         if cache:
+             cache_path = cache.path(fqn)
+             cache_md5b64 = _md5b64_path_if_exists(cache_path)
+             if cache_md5b64 == md5b64:  # file in cache matches!
+                 from_cache_path_to_local(cache_path, local_path, cache.link)
+                 return _FileResult(cache_md5b64, hit=True)
+
+             if cache_md5b64:
+                 logger.debug("Cache path exists but does not match expected md5 %s", md5b64)
+         return None
+
+     # attempt cache hit before taking a lock, to avoid contention for existing files.
+     if file_result := attempt_cache_hit():
+         return file_result  # noqa: B901
+
+     _dl_scope.enter(download_lock(str(cache.path(fqn) if cache else local_path)))
+     # create lockfile name from the (shared) cache path if present, otherwise the final
+     # destination. Non-cache users may then still incur multiple downloads in parallel,
+     # but if you wanted to coordinate then you should probably have been using the global
+     # cache in the first place.
+
+     # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
+     if file_result := attempt_cache_hit():
+         logger.debug("Got cache hit on the second attempt, after acquiring lock for %s", fqn)
+         return file_result  # noqa: B901
+
+     logger.debug("Unable to find a cached version anywhere that we looked...")
+     file_properties = yield _IoRequest.FILE_PROPERTIES
+     # no point in downloading if we've asked for hash X but ADLS only has hash Y.
+     with _verify_md5s_before_and_after_download(
+         _remote_md5b64(file_properties),
+         md5b64,
+         fqn,
+         local_path,
+     ):  # download new data directly to local path
+         with _atomic_download_and_move(fqn, local_path, file_properties) as tmpwriter:
+             yield tmpwriter
+     if cache:
+         from_local_path_to_cache(local_path, cache.path(fqn), cache.link)
+     return _FileResult(md5b64 or b64(md5_file(local_path)), hit=False)
+
+
+ # So ends the crazy download caching coroutine.
+ #
+ # Below this point are several helper functions, and after that are the two
+ # (async and non-async) coroutine controllers. While you can still see duplication
+ # between the two controllers, it is clearly much less code than would otherwise
+ # have to be duplicated in order to maintain an async and non-async
+ # implementation in parallel.
+
+
+ def _prep_download_coroutine(
+     fs_client: FileSystemClient,
+     remote_key: str,
+     local_path: StrOrPath,
+     md5b64: str = "",
+     cache: ty.Optional[Cache] = None,
+ ) -> ty.Tuple[
+     ty.Generator[IoRequest, IoResponse, _FileResult],
+     IoRequest,
+     ty.Optional[FileProperties],
+     DataLakeFileClient,
+ ]:
+     co = _download_or_use_verified_cached_coroutine(
+         AdlsFqn(fs_client.account_name, fs_client.file_system_name, remote_key),
+         local_path,
+         md5b64=md5b64,
+         cache=cache,
+     )
+     return co, co.send(None), None, fs_client.get_file_client(remote_key)
+
+
+ def _set_md5_if_missing(
+     file_properties: ty.Optional[FileProperties], md5b64: str
+ ) -> ty.Optional[ContentSettings]:
+     if not file_properties or file_properties.content_settings.content_md5:
+         return None
+     file_properties.content_settings.content_md5 = b64decode(md5b64)
+     return file_properties.content_settings
+
+
+ @_dl_scope.bound
+ def download_or_use_verified(
+     fs_client: FileSystemClient,
+     remote_key: str,
+     local_path: StrOrPath,
+     md5b64: str = "",
+     cache: ty.Optional[Cache] = None,
+ ) -> bool:
+     """Download a file or use the existing, cached copy if one exists in the cache and is verifiable.
+
+     Note that you will get a logged warning if `local_path` already exists when you call
+     this function.
+     """
+     file_properties = None
+     try:
+         co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
+             fs_client, remote_key, local_path, md5b64, cache
+         )
+         while True:
+             if co_request == _IoRequest.FILE_PROPERTIES:
+                 if not file_properties:
+                     # only fetch these if they haven't already been requested
+                     file_properties = dl_file_client.get_file_properties()
+                 co_request = co.send(file_properties)
+             else:  # needs file object
+                 dl_file_client.download_file(
+                     max_concurrency=DOWNLOAD_FILE_MAX_CONCURRENCY(),
+                     connection_timeout=CONNECTION_TIMEOUT(),
+                 ).readinto(co_request)
+                 co_request = co.send(None)
+     except StopIteration as si:
+         if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
+             try:
+                 logger.info(f"Setting missing MD5 for {remote_key}")
+                 assert file_properties
+                 dl_file_client.set_http_headers(cs, **match_etag(file_properties))
+             except HttpResponseError as hre:
+                 logger.info(f"Unable to set MD5 for {remote_key}: {hre}")
+         return si.value.hit
+     except AzureError as err:
+         translate_azure_error(fs_client, remote_key, err)
+
+
+ @_dl_scope.bound
+ async def async_download_or_use_verified(
+     fs_client: FileSystemClient,
+     remote_key: str,
+     local_path: StrOrPath,
+     md5b64: str = "",
+     cache: ty.Optional[Cache] = None,
+ ) -> bool:
+     file_properties = None
+     try:
+         co, co_request, file_properties, dl_file_client = _prep_download_coroutine(
+             fs_client, remote_key, local_path, md5b64, cache
+         )
+         while True:
+             if co_request == _IoRequest.FILE_PROPERTIES:
+                 if not file_properties:
+                     # only fetch these if they haven't already been requested
+                     file_properties = await dl_file_client.get_file_properties()
+                 co_request = co.send(file_properties)
+             else:  # needs file object
+                 reader = await dl_file_client.download_file(
+                     max_concurrency=DOWNLOAD_FILE_MAX_CONCURRENCY(),
+                     connection_timeout=CONNECTION_TIMEOUT(),
+                 )
+                 await reader.readinto(co_request)
+                 co_request = co.send(None)
+     except StopIteration as si:
+         if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
+             try:
+                 logger.info(f"Setting missing MD5 for {remote_key}")
+                 assert file_properties
+                 await dl_file_client.set_http_headers(cs, **match_etag(file_properties))
+             except HttpResponseError as hre:
+                 logger.info(f"Unable to set MD5 for {remote_key}: {hre}")
+         return si.value.hit
+     except AzureError as err:
+         translate_azure_error(fs_client, remote_key, err)
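
The long comments in download.py above describe a "functional core, imperative shell" coroutine protocol: the core yields I/O requests, a controller answers them via send(), and the final result arrives as StopIteration.value. As a reading aid only (this sketch is not part of the package), the same protocol in miniature:

    import typing as ty

    def double_remote_number() -> ty.Generator[str, int, int]:
        # functional core: pure logic that *requests* I/O by yielding
        n = yield "fetch_number"  # ask the controller for a value
        return n * 2  # delivered to the controller via StopIteration.value

    def run_sync() -> int:
        # imperative shell: performs the I/O the core asked for
        co = double_remote_number()
        request = co.send(None)  # prime the coroutine; receive its first request
        assert request == "fetch_number"
        try:
            co.send(21)  # answer the request; the core resumes and finishes
        except StopIteration as si:
            return si.value  # 42
        raise AssertionError("coroutine should have completed")

And a hedged usage sketch of the public entry point, assuming a FileSystemClient built elsewhere; the connection string and paths below are placeholders, not values from the package:

    from azure.storage.filedatalake import FileSystemClient
    from thds.adls.download import download_or_use_verified

    fs_client = FileSystemClient.from_connection_string("<conn-str>", "mycontainer")
    hit = download_or_use_verified(
        fs_client,
        "raw/2025/data.parquet",  # hypothetical remote key
        "/tmp/data.parquet",
        md5b64="",  # no expected hash: trust, then verify, the MD5 ADLS reports
    )
    print("cache hit" if hit else "downloaded")
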
thds/adls/download_lock.py ADDED
@@ -0,0 +1,57 @@
+ import random
+ import time
+ from datetime import timedelta
+ from pathlib import Path
+
+ from filelock import FileLock
+
+ from thds.core import config, home, log
+
+ from .md5 import hex_md5_str
+
+ DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".adls-md5-download-locks", parse=Path)
+ _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
+ logger = log.getLogger(__name__)
+
+
+ def _clean_download_locks() -> int:
+     deleted = 0
+     deletion_threshold = time.time() - _CLEAN_UP_LOCKFILES_AFTER_TIME.total_seconds()
+     try:
+         for f in DOWNLOAD_LOCKS_DIR().iterdir():
+             if f.is_file() and f.stat().st_mtime < deletion_threshold:
+                 f.unlink()
+                 deleted += 1
+     except Exception:
+         # this should be, hopefully, both very rare and completely inconsequential as to
+         # program correctness. if you see this happen multiple times, you may have some
+         # read-only files or something and want to manually clean up this directory.
+         logger.exception("Failed to clean download locks directory.")
+     return deleted
+
+
+ def _occasionally_clean_download_locks():
+     if random.random() < 0.005:  # do this about every 200 downloads
+         # random.random is considered to be very fast, and we have no need of cryptographic quality.
+         _clean_download_locks()
+
+
+ def download_lock(download_unique_str: str) -> FileLock:
+     """Note that the lockfiles will never be deleted automatically.
+     https://py-filelock.readthedocs.io/en/latest/api.html#filelock.BaseFileLock.release
+
+     also see:
+     https://stackoverflow.com/questions/58098634/why-does-the-python-filelock-library-delete-lockfiles-on-windows-but-not-unix
+
+     This means local developers would have a whole bunch of zero-byte files in their
+     download locks directory. So, we take a slightly idiosyncratic approach to cleaning
+     this up: not wanting to run this code on every download, but also not wanting
+     developers to see an infinitely-growing mess. Since parallel downloads will
+     (generally) not constitute a correctness issue, the 'safest' time to clean it up will
+     be when you don't have any downloads in progress, but in practice it seems likely that
+     we can get rid of old lockfiles after they've existed for more than 24 hours, since
+     it's quite rare that a download would last that long.
+     """
+     DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
+     _occasionally_clean_download_locks()
+     return FileLock(DOWNLOAD_LOCKS_DIR() / hex_md5_str(download_unique_str))
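
Since download_lock returns a standard FileLock, it can also be used directly as a context manager outside the scope machinery in download.py. A small sketch, assuming the module path thds.adls.download_lock (this hunk's file header is inferred from the `from .download_lock import download_lock` import above) and a placeholder cache path:

    from thds.adls.download_lock import download_lock

    # one lockfile per unique string; its name is the MD5 hex digest of the string
    with download_lock("/home/user/.cache/adls/mysa/mycontainer/big.parquet"):
        ...  # only one process at a time gets past this line for this path
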
thds/adls/errors.py ADDED
@@ -0,0 +1,44 @@
+ import typing as ty
+ from contextlib import contextmanager
+
+ from azure.core.exceptions import AzureError, HttpResponseError
+
+ from thds.core.log import getLogger
+
+ from .fqn import AdlsFqn
+
+ logger = getLogger(__name__)
+
+
+ class BlobNotFoundError(HttpResponseError):
+     def __init__(self, fqn: AdlsFqn, type_hint: str = "Blob"):
+         super().__init__(f"{type_hint} not found: {fqn}")
+
+
+ def is_blob_not_found(exc: Exception) -> bool:
+     return (isinstance(exc, HttpResponseError) and exc.status_code == 404) or isinstance(
+         exc, BlobNotFoundError
+     )
+
+
+ def translate_blob_not_found(hre: HttpResponseError, sa: str, container: str, path: str) -> ty.NoReturn:
+     if is_blob_not_found(hre):
+         raise BlobNotFoundError(AdlsFqn.of(sa, container, path)) from hre
+     raise hre
+
+
+ @contextmanager
+ def blob_not_found_translation(fqn: AdlsFqn) -> ty.Iterator[None]:
+     try:
+         yield
+     except HttpResponseError as hre:
+         translate_blob_not_found(hre, *fqn)
+
+
+ def translate_azure_error(client, key: str, err: AzureError) -> ty.NoReturn:
+     """We reserve the right to translate others in the future."""
+     fqn = AdlsFqn.of(client.account_name, client.file_system_name, key)
+     if is_blob_not_found(err):
+         raise BlobNotFoundError(fqn) from err
+     logger.error("Failed when operating on %s", fqn)
+     raise err
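
A brief sketch of how the translation helpers behave; the 404 here is simulated by hand (a real one would come from an Azure SDK call), so treat it as illustrative only:

    from azure.core.exceptions import HttpResponseError
    from thds.adls.errors import BlobNotFoundError, blob_not_found_translation
    from thds.adls.fqn import AdlsFqn

    fqn = AdlsFqn.parse("adls://mysa/mycontainer/missing/file.txt")  # hypothetical blob
    fake_404 = HttpResponseError("The specified blob does not exist.")
    fake_404.status_code = 404  # normally attached from the HTTP response

    try:
        with blob_not_found_translation(fqn):
            raise fake_404  # stand-in for a real SDK call
    except BlobNotFoundError as e:
        print(e)  # Blob not found: adls://mysa/mycontainer/missing/file.txt
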
thds/adls/etag.py ADDED
@@ -0,0 +1,6 @@
+ from azure.core import MatchConditions
+ from azure.storage.filedatalake import FileProperties
+
+
+ def match_etag(file_properties: FileProperties) -> dict:
+     return dict(etag=file_properties.etag, match_condition=MatchConditions.IfNotModified)
@@ -0,0 +1,13 @@
+ from azure.storage.filedatalake import FileProperties
+
+ from .fqn import AdlsFqn
+ from .global_client import get_global_fs_client
+
+
+ def is_directory(info: FileProperties) -> bool:
+     # from https://github.com/Azure/azure-sdk-for-python/issues/24814#issuecomment-1159280840
+     return str(info.metadata.get("hdi_isfolder", "")).lower() == "true"
+
+
+ def get_file_properties(fqn: AdlsFqn) -> FileProperties:
+     return get_global_fs_client(fqn.sa, fqn.container).get_file_client(fqn.path).get_file_properties()
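
The file header for this hunk is missing from the diff, so its module name is unknown; assuming its two helpers are importable, usage would look roughly like this (the account, container, and path are placeholders):

    from thds.adls.fqn import AdlsFqn

    props = get_file_properties(AdlsFqn.of("mysa", "mycontainer", "some/dir"))
    if is_directory(props):  # checks the hdi_isfolder metadata flag
        print("directory placeholder blob")
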
thds/adls/fqn.py ADDED
@@ -0,0 +1,169 @@
+ import re
+ from functools import reduce
+ from typing import NamedTuple, TypeVar, Union
+
+ ADLS_SCHEME = (
+     "adls://"  # this is our invention, but ADLS does not appear to define one suitable for general use.
+ )
+
+
+ def join(*parts: str) -> str:
+     """For joining ADLS paths together."""
+
+     def join_(prefix: str, suffix: str) -> str:
+         prefix = prefix.rstrip("/")
+         suffix = suffix.lstrip("/")
+         return f"{prefix}/{suffix}".rstrip("/")
+
+     return reduce(join_, parts)
+
+
+ class AdlsRoot(NamedTuple):
+     sa: str
+     container: str
+
+     def __str__(self) -> str:
+         return format_fqn(*self)
+
+     @staticmethod
+     def of(storage_account: str, container: str) -> "AdlsRoot":
+         return parse_fqn(format_fqn(storage_account, container, "/")).root()
+
+     @staticmethod
+     def parse(root_uri: str) -> "AdlsRoot":
+         if not root_uri.endswith("/"):
+             root_uri = root_uri + "/"
+         fqn = AdlsFqn.parse(root_uri)
+         assert not fqn.path, f"URI '{root_uri}' does not represent an ADLS root!"
+         return AdlsRoot(fqn.sa, fqn.container)
+
+     def join(self, *path: str) -> "AdlsFqn":
+         return AdlsFqn(self.sa, self.container, join("", *path))
+
+     def __truediv__(self, path: str) -> "AdlsFqn":
+         return self.join(path)
+
+     @property
+     def parent(self) -> "AdlsRoot":
+         return self
+
+
+ class AdlsFqn(NamedTuple):
+     """A fully-qualified ADLS path.
+
+     Represents a (Storage Account, Container) root, if path is empty.
+
+     Should usually be constructed via `parse`, `parse_fqn`, or `of`,
+     which will perform validation.
+     """
+
+     sa: str
+     container: str
+     path: str
+
+     def __str__(self) -> str:
+         return format_fqn(*self)
+
+     @staticmethod
+     def of(storage_account: str, container: str, path: str = "") -> "AdlsFqn":
+         """Expensive but includes validation."""
+         return of(storage_account, container, path)
+
+     @staticmethod
+     def parse(fully_qualified_name: str) -> "AdlsFqn":
+         return parse_fqn(fully_qualified_name)
+
+     def join(self, *path_suffix: str) -> "AdlsFqn":
+         return AdlsFqn(self.sa, self.container, join(self.path, *path_suffix))
+
+     def __truediv__(self, path: str) -> "AdlsFqn":
+         return self.join(path)
+
+     # Should be a property?
+     def root(self) -> AdlsRoot:
+         return AdlsRoot(self.sa, self.container)
+
+     @property
+     def parent(self) -> "AdlsFqn":
+         return parent(self)
+
+
+ FR = TypeVar("FR", bound=Union[AdlsFqn, AdlsRoot])
+
+
+ def of(storage_account: str, container: str, path: str = "") -> AdlsFqn:
+     return AdlsFqn.parse(format_fqn(storage_account, container, path))
+
+
+ def parent(fqn: FR) -> FR:
+     if isinstance(fqn, AdlsRoot):
+         return fqn  # type: ignore
+     assert isinstance(fqn, AdlsFqn)
+     if "/" not in fqn.path.strip("/"):
+         return AdlsFqn(fqn.sa, fqn.container, "")  # type: ignore
+     return AdlsFqn(fqn.sa, fqn.container, join(*fqn.path.split("/")[:-1]))  # type: ignore
+
+
+ SA_REGEX = re.compile(r"^[\w]{3,24}$")
+ # https://github.com/MicrosoftDocs/azure-docs/issues/64497#issuecomment-714380739
+ CONT_REGEX = re.compile(r"^\w[\w\-]{2,63}$")
+ # https://learn.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata#container-names
+
+
+ class NotAdlsUri(ValueError):
+     """This string does not represent an adls:// uri"""
+
+
+ def parse_fqn(fully_qualified_uri: str) -> AdlsFqn:
+     """There are many ways to represent a fully qualified ADLS path, and most of them are cumbersome.
+
+     This is an attempt to provide a standard way across our codebases
+     that keeps all parts together, but allows separating them for
+     passing into libraries.
+
+     Because Storage Account names can only include alphanumeric
+     characters, and Container names may only include alphanumerics
+     plus the dash character, this simple format turns out to be
+     unambiguous and easy for humans to read.
+
+     We accept formatted strings with or without the leading forward
+     slash in front of the path even though the formatter below
+     guarantees the leading forward slash, but we do require there to
+     be two spaces. If you wish to represent a Storage Account and
+     Container with no path, simply append a forward slash to the end
+     of your string, which represents the root of that SA and
+     container, because a single forward slash is not valid as a path
+     name for a blob in ADLS.
+     """
+     # an older, scheme-less version of format_fqn used spaces to separate sa and container.
+     if fully_qualified_uri.startswith(ADLS_SCHEME):
+         fully_qualified_uri = fully_qualified_uri[len(ADLS_SCHEME) :]
+         sep = "/"
+     else:
+         sep = None
+     try:
+         sa, container, path = fully_qualified_uri.split(sep, 2)
+     except ValueError as ve:
+         raise NotAdlsUri(
+             f"Cannot split {fully_qualified_uri} into at least three '/'-separated pieces."
+         ) from ve
+     assert SA_REGEX.match(sa), sa
+     assert CONT_REGEX.match(container), container
+     return AdlsFqn(sa, container, path.lstrip("/"))
+
+
+ parse = parse_fqn
+
+
+ def format_fqn(storage_account: str, container: str, path: str = "") -> str:
+     """Returns a fully-qualified ADLS name in URI format, with adls:// as a prefix.
+
+     When formatting, we will prefix your path with a forward-slash (/)
+     if it does not already have one, in order to allow empty paths to
+     be formatted and parsed simply.
+     """
+     assert SA_REGEX.match(storage_account), storage_account
+     assert CONT_REGEX.match(container), container
+     return f"{ADLS_SCHEME}{storage_account}/{container}/{path.lstrip('/')}"