thds.adls 4.2.20250827225749__py3-none-any.whl → 4.2.20250902211626__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.adls might be problematic. Click here for more details.
- thds/adls/__init__.py +12 -1
- thds/adls/_progress.py +1 -2
- thds/adls/abfss.py +1 -0
- thds/adls/copy.py +1 -0
- thds/adls/list_fast.py +95 -0
- thds/adls/source_tree.py +2 -0
- thds/adls/tools/ls_fast.py +22 -0
- {thds_adls-4.2.20250827225749.dist-info → thds_adls-4.2.20250902211626.dist-info}/METADATA +1 -1
- {thds_adls-4.2.20250827225749.dist-info → thds_adls-4.2.20250902211626.dist-info}/RECORD +12 -10
- {thds_adls-4.2.20250827225749.dist-info → thds_adls-4.2.20250902211626.dist-info}/WHEEL +0 -0
- {thds_adls-4.2.20250827225749.dist-info → thds_adls-4.2.20250902211626.dist-info}/entry_points.txt +0 -0
- {thds_adls-4.2.20250827225749.dist-info → thds_adls-4.2.20250902211626.dist-info}/top_level.txt +0 -0
thds/adls/__init__.py
CHANGED
|
@@ -1,6 +1,17 @@
|
|
|
1
1
|
from thds import core
|
|
2
2
|
|
|
3
|
-
from . import
|
|
3
|
+
from . import ( # noqa: F401
|
|
4
|
+
abfss,
|
|
5
|
+
defaults,
|
|
6
|
+
etag,
|
|
7
|
+
fqn,
|
|
8
|
+
hashes,
|
|
9
|
+
list_fast,
|
|
10
|
+
named_roots,
|
|
11
|
+
source,
|
|
12
|
+
source_tree,
|
|
13
|
+
uri,
|
|
14
|
+
)
|
|
4
15
|
from .cached import download_directory, download_to_cache, upload_through_cache # noqa: F401
|
|
5
16
|
from .copy import copy_file, copy_files, wait_for_copy # noqa: F401
|
|
6
17
|
from .errors import BlobNotFoundError # noqa: F401
|
thds/adls/_progress.py
CHANGED
thds/adls/abfss.py
CHANGED
thds/adls/copy.py
CHANGED
thds/adls/list_fast.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""This module is roughly 10x as fast as ADLSFileSystem.get_directory_info.
|
|
2
|
+
|
|
3
|
+
Part of that is the parallelism, but part of it seems to be using the blob container
|
|
4
|
+
client instead of the file system client.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import typing as ty
|
|
8
|
+
|
|
9
|
+
from thds.core import parallel, thunks
|
|
10
|
+
|
|
11
|
+
from . import global_client
|
|
12
|
+
from .fqn import AdlsFqn
|
|
13
|
+
from .source_tree import BlobMeta, to_blob_meta, yield_blob_meta
|
|
14
|
+
|
|
15
|
+
R = ty.TypeVar("R")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R]:
    """Run zero-argument callables through ``thds.core.parallel`` and yield bare results.

    The callables are passed through ``parallel.create_keys``, ``parallel.yield_all``
    and ``parallel.failfast``, in that order. That pipeline produces 2-tuples; only the
    second element (the result) is yielded and the first is dropped.

    NOTE(review): presumably ``failfast`` re-raises the first failure instead of
    collecting errors - confirm against thds.core.parallel.
    """
    # Inside this function the parameter ``thunks`` shadows the imported ``thunks``
    # module; only the parameter is used here.
    keyed_results = parallel.failfast(parallel.yield_all(parallel.create_keys(thunks)))
    for _key, result in keyed_results:
        yield result
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
    """Yield a BlobMeta for every blob under ``fqn``, fanning out over subdirectories.

    ``layers`` is how many directory levels get expanded in parallel before falling
    back to a plain listing; one level should be enough for most cases.

    Results are not returned in any particular order.

    Directories are included in the output; use ``filter`` together with ``is_dir``
    if you want to drop them.
    """
    if layers <= 0:
        # Nothing left to fan out over: do one ordinary listing of this path.
        container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
        yield from yield_blob_meta(container_client, fqn.path)
        return

    # Why this is more involved than a single recursive listing:
    #
    # It is an optimization. The underlying Azure listing API caps how many paths
    # come back per request, and every following request needs the continuation
    # token from the previous one, so pagination is inherently serial.
    #
    # Walking tens of thousands of blobs that way is slow. Instead we fetch only
    # the direct children of the top-level directory (non-recursively) and then
    # list each subdirectory in parallel, which greatly speeds things up.
    #
    # Files that sit directly in the top-level directory are gathered separately
    # and their blob properties are fetched one by one (that is what Azure's SDK
    # offers us; at least those gets can run in parallel), then each is converted
    # to an adls.source_tree.BlobMeta.
    #
    # API reference: https://learn.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/path/list?view=rest-storageservices-datalakestoragegen2-2019-12-12

    # The container client cannot limit recursion depth, so the file system client
    # is used to get the direct children, and the container client is used to list
    # within each final subdir.
    fs_client = global_client.get_global_fs_client(fqn.sa, fqn.container)
    subdir_names: ty.Set[str] = set()  # direct child directories
    file_names: ty.Set[str] = set()  # direct child files
    for child in fs_client.get_paths(path=fqn.path, recursive=False):
        bucket = subdir_names if child.is_directory else file_names
        bucket.add(child.name)

    blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)

    def _get_blob_meta(blob_name: str) -> BlobMeta:
        blob_client = blob_container_client.get_blob_client(blob_name)
        return to_blob_meta(blob_client.get_blob_properties())

    # Lazy: no blob properties are requested until this iterator is consumed below.
    direct_file_metas = _failfast_parallel(
        thunks.thunking(_get_blob_meta)(file_name) for file_name in file_names
    )
    # Eager: every subdirectory is fully listed before anything is yielded. Each task
    # goes through _list_blob_meta (which returns a list) rather than handing back a
    # generator, because returning an iterator across a thread boundary doesn't work.
    subdir_listings = list(
        _failfast_parallel(
            thunks.thunking(_list_blob_meta)(AdlsFqn(fqn.sa, fqn.container, subdir), layers - 1)
            for subdir in subdir_names
        )
    )

    yield from direct_file_metas
    for listing in subdir_listings:
        yield from listing
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def is_dir(blob_meta: BlobMeta) -> bool:  # TODO move to blob_meta.py once it exists
    """Report whether a blob's metadata marks it as a folder.

    Returns True only when the ``hdi_isfolder`` metadata entry equals the string
    ``"true"``; a missing entry is treated as ``"false"``.
    """
    folder_flag = blob_meta.metadata.get("hdi_isfolder", "false")
    return folder_flag == "true"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.List[BlobMeta]:
    """Collect ``multilayer_yield_blob_meta(fqn, layers)`` into a list.

    Only for use within multilayer_yield_blob_meta, which runs this as a parallel
    task and therefore needs a fully materialised list instead of a generator.

    The return annotation uses ``ty.List`` to match the ``typing`` aliases used by
    the rest of this module; a builtin ``list[...]`` annotation is evaluated at
    definition time and fails on interpreters older than Python 3.9.
    """
    return list(multilayer_yield_blob_meta(fqn, layers))
|
thds/adls/source_tree.py
CHANGED
thds/adls/tools/ls_fast.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
from thds.adls import list_fast, uri
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main():
    """Command-line entry point: print the path of every blob found under an ADLS URI.

    Takes one positional ``uri`` argument, parsed with ``uri.parse_any``. With
    ``--verbose``/``-v`` each path is followed by the blob's size, hash and metadata
    and then a blank line.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("uri", type=uri.parse_any, help="A fully qualified path to an ADLS location")
    arg_parser.add_argument("--verbose", "-v", action="store_true")
    cli_args = arg_parser.parse_args()

    for blob_meta in list_fast.multilayer_yield_blob_meta(cli_args.uri):
        print(blob_meta.path)
        if not cli_args.verbose:
            continue
        # Read and print one attribute at a time, in the same order as before.
        for attr_name in ("size", "hash", "metadata"):
            print(" ", getattr(blob_meta, attr_name))
        print()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
if __name__ == "__main__":
|
|
22
|
+
main()
|
{thds_adls-4.2.20250827225749.dist-info → thds_adls-4.2.20250902211626.dist-info}/RECORD
RENAMED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
thds/adls/__init__.py,sha256=
|
|
1
|
+
thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
|
|
2
2
|
thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
|
|
3
|
-
thds/adls/_progress.py,sha256=
|
|
3
|
+
thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
|
|
4
4
|
thds/adls/_upload.py,sha256=mhTdWiQroaugYuwQg7R8CEgdfCYF4xvJthlsqO0jlnE,4692
|
|
5
|
-
thds/adls/abfss.py,sha256=
|
|
5
|
+
thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
|
|
6
6
|
thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
|
|
7
7
|
thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
|
|
8
|
-
thds/adls/copy.py,sha256=
|
|
8
|
+
thds/adls/copy.py,sha256=BGtzroE46ZrL10eFsW_a2ng6W9dUgG2tlk1wk0UNeDI,6366
|
|
9
9
|
thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
|
|
10
10
|
thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
|
|
11
11
|
thds/adls/download.py,sha256=LHxkv073T-pCmIRmiXIgYEXIEpEm_OIZKvNHp1e4_k4,19124
|
|
@@ -17,6 +17,7 @@ thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
|
|
|
17
17
|
thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
|
|
18
18
|
thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
|
|
19
19
|
thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
|
|
20
|
+
thds/adls/list_fast.py,sha256=7jHnln4DMWYVLHhejj-fdWMBWflBiWfynegKxcUlNDY,4189
|
|
20
21
|
thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
|
|
21
22
|
thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
|
|
22
23
|
thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -24,7 +25,7 @@ thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
|
|
|
24
25
|
thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
|
|
25
26
|
thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
|
|
26
27
|
thds/adls/source.py,sha256=8HVMYuxDn1XYGwFFSBowMlvQ6r2Jm2CQlpu4h85JvsE,2559
|
|
27
|
-
thds/adls/source_tree.py,sha256=
|
|
28
|
+
thds/adls/source_tree.py,sha256=gl2JLjxAduo4cGQBb8LqBnmRHHk2wqIC5yt-sqkXOEo,2589
|
|
28
29
|
thds/adls/upload.py,sha256=MRHK9Am-x5FKBPh1SXLTbPC1r0Xk0bGWNU8CcNuUMLo,6602
|
|
29
30
|
thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
|
|
30
31
|
thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
|
|
@@ -35,9 +36,10 @@ thds/adls/azcopy/system_resources.py,sha256=okgDEKAp0oWGQF7OKikbgJ9buBeiOgNaDYy-
|
|
|
35
36
|
thds/adls/azcopy/upload.py,sha256=RQLDJzS6qsMM12t5bykWJWBXs0UrmImrEFnPMxX2UlM,2767
|
|
36
37
|
thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1587
|
|
37
38
|
thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
|
|
39
|
+
thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
|
|
38
40
|
thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
|
|
39
|
-
thds_adls-4.2.
|
|
40
|
-
thds_adls-4.2.
|
|
41
|
-
thds_adls-4.2.
|
|
42
|
-
thds_adls-4.2.
|
|
43
|
-
thds_adls-4.2.
|
|
41
|
+
thds_adls-4.2.20250902211626.dist-info/METADATA,sha256=UiTXMLzpZi6uvnbHbBJ_h4wOwG5cF5XvtgbK3SyWAMg,587
|
|
42
|
+
thds_adls-4.2.20250902211626.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
43
|
+
thds_adls-4.2.20250902211626.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
|
|
44
|
+
thds_adls-4.2.20250902211626.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
|
|
45
|
+
thds_adls-4.2.20250902211626.dist-info/RECORD,,
|
|
File without changes
|
{thds_adls-4.2.20250827225749.dist-info → thds_adls-4.2.20250902211626.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{thds_adls-4.2.20250827225749.dist-info → thds_adls-4.2.20250902211626.dist-info}/top_level.txt
RENAMED
|
File without changes
|