thds.adls 4.4.20251020213233__py3-none-any.whl → 4.4.20251021162747__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.adls might be problematic. Click here for more details.
- thds/adls/__init__.py +1 -0
- thds/adls/blob_meta.py +38 -0
- thds/adls/list_fast.py +27 -12
- thds/adls/source_tree.py +7 -54
- {thds_adls-4.4.20251020213233.dist-info → thds_adls-4.4.20251021162747.dist-info}/METADATA +1 -1
- {thds_adls-4.4.20251020213233.dist-info → thds_adls-4.4.20251021162747.dist-info}/RECORD +9 -8
- {thds_adls-4.4.20251020213233.dist-info → thds_adls-4.4.20251021162747.dist-info}/WHEEL +0 -0
- {thds_adls-4.4.20251020213233.dist-info → thds_adls-4.4.20251021162747.dist-info}/entry_points.txt +0 -0
- {thds_adls-4.4.20251020213233.dist-info → thds_adls-4.4.20251021162747.dist-info}/top_level.txt +0 -0
thds/adls/__init__.py
CHANGED
thds/adls/blob_meta.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import typing as ty
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from azure.storage.blob import BlobProperties, ContainerClient
|
|
5
|
+
|
|
6
|
+
from thds.core import hashing
|
|
7
|
+
|
|
8
|
+
from . import hashes
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
|
|
12
|
+
class BlobMeta:
|
|
13
|
+
path: str
|
|
14
|
+
size: int
|
|
15
|
+
hash: ty.Optional[hashing.Hash]
|
|
16
|
+
metadata: dict[str, str]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
|
|
20
|
+
return BlobMeta(
|
|
21
|
+
blob_props.name,
|
|
22
|
+
blob_props.size,
|
|
23
|
+
next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
|
|
24
|
+
blob_props.metadata or {},
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def is_dir(blob_meta: BlobMeta) -> bool:
|
|
29
|
+
return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
|
|
33
|
+
# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
|
|
34
|
+
# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
|
|
35
|
+
def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
|
|
36
|
+
for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
|
|
37
|
+
# `list_blobs` does not include metadata by default, so we need to explicitly specify including it
|
|
38
|
+
yield to_blob_meta(blob_props)
|
thds/adls/list_fast.py
CHANGED
|
@@ -6,11 +6,12 @@ client instead of the file system client.
|
|
|
6
6
|
|
|
7
7
|
import typing as ty
|
|
8
8
|
|
|
9
|
-
from thds.core import log, parallel, thunks
|
|
9
|
+
from thds.core import log, parallel, source, thunks
|
|
10
10
|
|
|
11
|
-
from . import global_client
|
|
11
|
+
from . import blob_meta, global_client
|
|
12
|
+
from . import source as adls_source
|
|
12
13
|
from .fqn import AdlsFqn
|
|
13
|
-
from .source_tree import BlobMeta, to_blob_meta, yield_blob_meta
|
|
14
|
+
from .uri import UriIsh, parse_any
|
|
14
15
|
|
|
15
16
|
R = ty.TypeVar("R")
|
|
16
17
|
|
|
@@ -27,7 +28,7 @@ def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R
|
|
|
27
28
|
)
|
|
28
29
|
|
|
29
30
|
|
|
30
|
-
def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
|
|
31
|
+
def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[blob_meta.BlobMeta]:
|
|
31
32
|
"""A fast way to find all blobs in a directory tree; we do this in parallel on
|
|
32
33
|
subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).
|
|
33
34
|
|
|
@@ -37,7 +38,7 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
|
|
|
37
38
|
"""
|
|
38
39
|
if layers <= 0:
|
|
39
40
|
# directly yield the blobs
|
|
40
|
-
yield from yield_blob_meta(
|
|
41
|
+
yield from blob_meta.yield_blob_meta(
|
|
41
42
|
global_client.get_global_blob_container_client(fqn.sa, fqn.container),
|
|
42
43
|
fqn.path.rstrip("/") + "/",
|
|
43
44
|
)
|
|
@@ -77,8 +78,10 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
|
|
|
77
78
|
|
|
78
79
|
blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
|
|
79
80
|
|
|
80
|
-
def _get_blob_meta(blob_name: str) -> BlobMeta:
|
|
81
|
-
return to_blob_meta(
|
|
81
|
+
def _get_blob_meta(blob_name: str) -> blob_meta.BlobMeta:
|
|
82
|
+
return blob_meta.to_blob_meta(
|
|
83
|
+
blob_container_client.get_blob_client(blob_name).get_blob_properties()
|
|
84
|
+
)
|
|
82
85
|
|
|
83
86
|
for blob_meta_iter in (
|
|
84
87
|
_failfast_parallel((thunks.thunking(_get_blob_meta)(file) for file in files)),
|
|
@@ -94,10 +97,22 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
|
|
|
94
97
|
yield from blob_meta_iter
|
|
95
98
|
|
|
96
99
|
|
|
97
|
-
def is_dir(blob_meta: BlobMeta) -> bool:
|
|
98
|
-
return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
|
|
100
|
+
def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[blob_meta.BlobMeta]:
|
|
102
101
|
"""Only for use within multi_layer_yield_blobs."""
|
|
103
102
|
return list(multilayer_yield_blob_meta(fqn, layers))
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def multilayer_yield_sources(
|
|
106
|
+
fqn_or_uri: UriIsh,
|
|
107
|
+
layers: int = 1,
|
|
108
|
+
filter_: ty.Callable[[blob_meta.BlobMeta], bool] = lambda _: True,
|
|
109
|
+
) -> ty.Iterator[source.Source]:
|
|
110
|
+
"""
|
|
111
|
+
if you want to list directories and files, use `multilayer_yield_blob_meta` instead
|
|
112
|
+
"""
|
|
113
|
+
fqn = parse_any(fqn_or_uri)
|
|
114
|
+
root = fqn.root()
|
|
115
|
+
for blob in multilayer_yield_blob_meta(fqn, layers):
|
|
116
|
+
if not blob_meta.is_dir(blob) and filter_(blob):
|
|
117
|
+
# ^ a "dir" Source would not make sense
|
|
118
|
+
yield adls_source.from_adls(root / blob.path, hash=blob.hash, size=blob.size)
|
thds/adls/source_tree.py
CHANGED
|
@@ -1,53 +1,6 @@
|
|
|
1
|
-
import typing as ty
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
from azure.storage.blob import BlobProperties, ContainerClient
|
|
5
|
-
|
|
6
|
-
from thds.core import hashing
|
|
7
1
|
from thds.core.source.tree import SourceTree
|
|
8
2
|
|
|
9
|
-
from . import fqn,
|
|
10
|
-
|
|
11
|
-
# TODO refactor BlobMeta into its own module.
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
|
|
15
|
-
class BlobMeta:
|
|
16
|
-
path: str
|
|
17
|
-
size: int
|
|
18
|
-
hash: ty.Optional[hashing.Hash]
|
|
19
|
-
metadata: dict[str, str]
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
|
|
23
|
-
return BlobMeta(
|
|
24
|
-
blob_props.name,
|
|
25
|
-
blob_props.size,
|
|
26
|
-
next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
|
|
27
|
-
blob_props.metadata or {},
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
|
|
32
|
-
for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
|
|
33
|
-
# `list_blobs` does not include metadata by default, so we need to explicitly specify including it
|
|
34
|
-
yield to_blob_meta(blob_props)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
|
|
38
|
-
# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
|
|
39
|
-
# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
|
|
40
|
-
def list_blob_meta(
|
|
41
|
-
container_client: ContainerClient, root_dir: str, match_suffix: str = ""
|
|
42
|
-
) -> ty.List[BlobMeta]:
|
|
43
|
-
"""Gets the path (relative to the SA/container root), size, and _a_ hash (if available) of all blobs in a directory."""
|
|
44
|
-
return [
|
|
45
|
-
blob_meta
|
|
46
|
-
for blob_meta in yield_blob_meta(container_client, root_dir)
|
|
47
|
-
if blob_meta.size > 0
|
|
48
|
-
# container client lists directories as blobs with size 0
|
|
49
|
-
and blob_meta.path.endswith(match_suffix)
|
|
50
|
-
]
|
|
3
|
+
from . import fqn, list_fast, uri
|
|
51
4
|
|
|
52
5
|
|
|
53
6
|
def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
|
|
@@ -56,12 +9,12 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
|
|
|
56
9
|
"""
|
|
57
10
|
root_fqn = uri.parse_any(adls_path)
|
|
58
11
|
|
|
59
|
-
container_client = global_client.get_global_blob_container_client(root_fqn.sa, root_fqn.container)
|
|
60
|
-
container_root = root_fqn.root()
|
|
61
12
|
return SourceTree(
|
|
62
|
-
sources=
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
13
|
+
sources=sorted(
|
|
14
|
+
list_fast.multilayer_yield_sources(
|
|
15
|
+
root_fqn, layers=0, filter_=lambda blob: blob.path.endswith(match_suffix)
|
|
16
|
+
),
|
|
17
|
+
key=lambda src: src.uri,
|
|
18
|
+
),
|
|
66
19
|
higher_logical_root=fqn.split(root_fqn)[-1],
|
|
67
20
|
)
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
thds/adls/__init__.py,sha256=
|
|
1
|
+
thds/adls/__init__.py,sha256=CzNcmTdP5RHrD-9K1boDU4Z3pbJAvEAFfFhXGFwpEXw,1088
|
|
2
2
|
thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
|
|
3
3
|
thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
|
|
4
4
|
thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
|
|
5
5
|
thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
|
|
6
|
+
thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
|
|
6
7
|
thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
|
|
7
8
|
thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
|
|
8
9
|
thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
|
|
@@ -17,7 +18,7 @@ thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
|
|
|
17
18
|
thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
|
|
18
19
|
thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
|
|
19
20
|
thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
|
|
20
|
-
thds/adls/list_fast.py,sha256=
|
|
21
|
+
thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
|
|
21
22
|
thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
|
|
22
23
|
thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
|
|
23
24
|
thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -25,7 +26,7 @@ thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
|
|
|
25
26
|
thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
|
|
26
27
|
thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
|
|
27
28
|
thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
|
|
28
|
-
thds/adls/source_tree.py,sha256=
|
|
29
|
+
thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
|
|
29
30
|
thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
|
|
30
31
|
thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
|
|
31
32
|
thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
|
|
@@ -38,8 +39,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
|
|
|
38
39
|
thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
|
|
39
40
|
thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
|
|
40
41
|
thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
|
|
41
|
-
thds_adls-4.4.
|
|
42
|
-
thds_adls-4.4.
|
|
43
|
-
thds_adls-4.4.
|
|
44
|
-
thds_adls-4.4.
|
|
45
|
-
thds_adls-4.4.
|
|
42
|
+
thds_adls-4.4.20251021162747.dist-info/METADATA,sha256=oYeTcXzql2mAFAUzfgYfHkNOi5PSLxcVgT8HKo7gZkw,587
|
|
43
|
+
thds_adls-4.4.20251021162747.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
44
|
+
thds_adls-4.4.20251021162747.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
|
|
45
|
+
thds_adls-4.4.20251021162747.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
|
|
46
|
+
thds_adls-4.4.20251021162747.dist-info/RECORD,,
|
|
File without changes
|
{thds_adls-4.4.20251020213233.dist-info → thds_adls-4.4.20251021162747.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{thds_adls-4.4.20251020213233.dist-info → thds_adls-4.4.20251021162747.dist-info}/top_level.txt
RENAMED
|
File without changes
|