thds.adls 4.2.20250827225749__py3-none-any.whl → 4.2.20250902211626__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.adls might be problematic. Click here for more details.

thds/adls/__init__.py CHANGED
@@ -1,6 +1,17 @@
1
1
  from thds import core
2
2
 
3
- from . import abfss, defaults, etag, fqn, hashes, named_roots, source, source_tree, uri # noqa: F401
3
+ from . import ( # noqa: F401
4
+ abfss,
5
+ defaults,
6
+ etag,
7
+ fqn,
8
+ hashes,
9
+ list_fast,
10
+ named_roots,
11
+ source,
12
+ source_tree,
13
+ uri,
14
+ )
4
15
  from .cached import download_directory, download_to_cache, upload_through_cache # noqa: F401
5
16
  from .copy import copy_file, copy_files, wait_for_copy # noqa: F401
6
17
  from .errors import BlobNotFoundError # noqa: F401
thds/adls/_progress.py CHANGED
@@ -53,8 +53,7 @@ def _blobs(n: list) -> str:
53
53
 
54
54
 
55
55
  class _Reporter(ty.Protocol):
56
- def __call__(self, states: ty.List[ProgressState]):
57
- ...
56
+ def __call__(self, states: ty.List[ProgressState]): ...
58
57
 
59
58
 
60
59
  class DumbReporter:
thds/adls/abfss.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Translate ADLS URIs to ABFSS URIs (for use with Spark/Hadoop)."""
2
+
2
3
  from .fqn import AdlsFqn
3
4
 
4
5
  ABFSS_SCHEME = "abfss://"
thds/adls/copy.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Functions for copying blobs across remote locations."""
2
+
2
3
  import datetime
3
4
  import random
4
5
  import time
thds/adls/list_fast.py ADDED
@@ -0,0 +1,95 @@
1
+ """This module is roughly 10x as fast as ADLSFileSystem.get_directory_info.
2
+
3
+ Part of that is the parallelism, but part of it seems to be using the blob container
4
+ client instead of the file system client.
5
+ """
6
+
7
+ import typing as ty
8
+
9
+ from thds.core import parallel, thunks
10
+
11
+ from . import global_client
12
+ from .fqn import AdlsFqn
13
+ from .source_tree import BlobMeta, to_blob_meta, yield_blob_meta
14
+
15
+ R = ty.TypeVar("R")
16
+
17
+
18
def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R]:
    """Run zero-argument callables through `thds.core.parallel` and yield only their results.

    The callables are keyed with `parallel.create_keys`, executed via
    `parallel.yield_all`, and passed through `parallel.failfast`; the keys are
    dropped before yielding.

    NOTE(review): presumably `parallel.failfast` raises on the first failed
    task instead of yielding an error value - confirm in thds.core.parallel.
    """
    keyed_thunks = parallel.create_keys(thunks)
    for _key, result in parallel.failfast(parallel.yield_all(keyed_thunks)):
        yield result
20
+
21
+
22
def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
    """A fast way to find all blobs in a directory tree; we do this in parallel on
    subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).

    Does not maintain order.

    Does return directories; if you want to filter those out, you can use filter with is_dir.

    Args:
        fqn: the ADLS location (storage account, container, path) to list.
        layers: how many levels of direct children to fan out over in parallel.
            When this is <= 0, `fqn.path` is listed with a single
            `yield_blob_meta` call and no fan-out happens.

    Yields:
        One BlobMeta per entry produced by the underlying listings. Results are
        streamed: top-level files first, then each subdirectory's blobs as that
        subdirectory's listing is handed back.
    """
    if layers <= 0:
        # directly yield the blobs
        yield from yield_blob_meta(
            global_client.get_global_blob_container_client(fqn.sa, fqn.container),
            fqn.path,
        )
        return

    # This code seems more complex than it needs to be... Why is it like this?
    #
    # Well, it's an optimization. The underlying Azure API being used (i.e.,
    # call to list_blobs) limits the number of paths returned with each
    # invocation and each subsequent request must use a continuation token
    # received from the previous request to get the next "page" (it paginates serially).
    #
    # Running this sequentially for tens of thousands of blobs is slow if we just
    # iterate over the paths recursively. Therefore, we first list/get the
    # top-level subdirectories (not recursively), and then ask the API to list
    # the blobs of each subdirectory in parallel, greatly speeding things up.
    #
    # Because there may also be files directly in the top-level directory, we
    # gather those up and get `BlobProperties` for those individually (this is
    # what azure's SDK offers us as an option -_-. At least we can do each of
    # these gets in parallel...), then convert each of them to an adls.source_tree.BlobMeta.
    #
    # API reference: https://learn.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/path/list?view=rest-storageservices-datalakestoragegen2-2019-12-12

    # you cannot limit recursion depth using the container client, so to speed things up
    # we use the file system client to get the direct children, and then use the container client
    # to list within each final subdir
    subdirs, files = set(), set()  # direct children of the top-level directory
    for child_path_props in global_client.get_global_fs_client(fqn.sa, fqn.container).get_paths(
        path=fqn.path, recursive=False
    ):
        if child_path_props.is_directory:
            subdirs.add(child_path_props.name)
        else:
            files.add(child_path_props.name)

    blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)

    def _get_blob_meta(blob_name: str) -> BlobMeta:
        # One properties request per top-level file, converted to our BlobMeta type.
        return to_blob_meta(blob_container_client.get_blob_client(blob_name).get_blob_properties())

    # Top-level files first. Yielding straight from the parallel iterator lets
    # callers start consuming results as soon as they exist.
    yield from _failfast_parallel(thunks.thunking(_get_blob_meta)(file) for file in files)

    # Then the subdirectories. We iterate the parallel results lazily instead of
    # star-unpacking them into a tuple: unpacking would force every subdirectory
    # listing to finish (and sit in memory) before the first item is yielded.
    #
    # we use _list_blob_meta (defined below) rather than yield here because returning
    # an iterator across a thread boundary doesn't work
    for subdir_blob_metas in _failfast_parallel(
        thunks.thunking(_list_blob_meta)(AdlsFqn(fqn.sa, fqn.container, subdir), layers - 1)
        for subdir in subdirs
    ):
        yield from subdir_blob_metas
87
+
88
+
89
def is_dir(blob_meta: BlobMeta) -> bool:  # TODO move to blob_meta.py once it exists
    """Return True when the blob's metadata has "hdi_isfolder" set to "true".

    A missing "hdi_isfolder" key is treated as "false".
    """
    folder_flag = blob_meta.metadata.get("hdi_isfolder", "false")
    return folder_flag == "true"
91
+
92
+
93
def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
    """Only for use within multilayer_yield_blob_meta.

    Drains `multilayer_yield_blob_meta(fqn, layers)` into a concrete list, so
    the caller receives a fully built list rather than a lazy iterator.
    """
    return [*multilayer_yield_blob_meta(fqn, layers)]
thds/adls/source_tree.py CHANGED
@@ -8,6 +8,8 @@ from thds.core.source.tree import SourceTree
8
8
 
9
9
  from . import fqn, global_client, hashes, source, uri
10
10
 
11
+ # TODO refactor BlobMeta into its own module.
12
+
11
13
 
12
14
  @dataclass
13
15
  class BlobMeta:
@@ -0,0 +1,22 @@
1
+ import argparse
2
+
3
+ from thds.adls import list_fast, uri
4
+
5
+
6
def main():
    """Command-line entry point: print the path of every blob found at an ADLS URI.

    Takes one positional `uri` argument (parsed with `uri.parse_any`) and an
    optional `--verbose` / `-v` flag. In verbose mode each path is followed by
    the blob's size, hash and metadata, then an empty line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("uri", type=uri.parse_any, help="A fully qualified path to an ADLS location")
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    for blob in list_fast.multilayer_yield_blob_meta(args.uri):
        print(blob.path)
        if not args.verbose:
            continue
        # Verbose details, one per line, in the order: size, hash, metadata.
        for detail in (blob.size, blob.hash, blob.metadata):
            print(" ", detail)
        print()
19
+
20
+
21
+ if __name__ == "__main__":
22
+ main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: thds.adls
3
- Version: 4.2.20250827225749
3
+ Version: 4.2.20250902211626
4
4
  Summary: ADLS tools
5
5
  Author-email: Trilliant Health <info@trillianthealth.com>
6
6
  License: MIT
@@ -1,11 +1,11 @@
1
- thds/adls/__init__.py,sha256=PL0BRhiLhW_xY_2hhBgd8v3_NS1zpR4Kdd28zjNHBgo,1017
1
+ thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
2
2
  thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
3
- thds/adls/_progress.py,sha256=D6XIipzG_xwmxs_08LuiYFfThGqHTU2KiIyjNduiOFY,6656
3
+ thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
4
4
  thds/adls/_upload.py,sha256=mhTdWiQroaugYuwQg7R8CEgdfCYF4xvJthlsqO0jlnE,4692
5
- thds/adls/abfss.py,sha256=ZRJOLjDuXmS4bIbQAQpQxWWWeu74N9NKEKCNfXQek80,726
5
+ thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
6
6
  thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
7
7
  thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
8
- thds/adls/copy.py,sha256=jUWbGvTpb4B3yRGS0nhGSbDzqRPzUqYgH0z1lFRJB3k,6365
8
+ thds/adls/copy.py,sha256=BGtzroE46ZrL10eFsW_a2ng6W9dUgG2tlk1wk0UNeDI,6366
9
9
  thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
10
10
  thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
11
11
  thds/adls/download.py,sha256=LHxkv073T-pCmIRmiXIgYEXIEpEm_OIZKvNHp1e4_k4,19124
@@ -17,6 +17,7 @@ thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
17
17
  thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
18
18
  thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
19
19
  thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
20
+ thds/adls/list_fast.py,sha256=7jHnln4DMWYVLHhejj-fdWMBWflBiWfynegKxcUlNDY,4189
20
21
  thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
21
22
  thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
22
23
  thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,7 +25,7 @@ thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
24
25
  thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
25
26
  thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
26
27
  thds/adls/source.py,sha256=8HVMYuxDn1XYGwFFSBowMlvQ6r2Jm2CQlpu4h85JvsE,2559
27
- thds/adls/source_tree.py,sha256=FFcSJpbb7w5y6v1kzRQl0cUjjuSb34V8J2iHrrm9zDA,2542
28
+ thds/adls/source_tree.py,sha256=gl2JLjxAduo4cGQBb8LqBnmRHHk2wqIC5yt-sqkXOEo,2589
28
29
  thds/adls/upload.py,sha256=MRHK9Am-x5FKBPh1SXLTbPC1r0Xk0bGWNU8CcNuUMLo,6602
29
30
  thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
30
31
  thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
@@ -35,9 +36,10 @@ thds/adls/azcopy/system_resources.py,sha256=okgDEKAp0oWGQF7OKikbgJ9buBeiOgNaDYy-
35
36
  thds/adls/azcopy/upload.py,sha256=RQLDJzS6qsMM12t5bykWJWBXs0UrmImrEFnPMxX2UlM,2767
36
37
  thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1587
37
38
  thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
39
+ thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
38
40
  thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
39
- thds_adls-4.2.20250827225749.dist-info/METADATA,sha256=sRSt8S6xJPAVYqh9z4XCnVoGI9HB8WOhmuDZuhXiQrI,587
40
- thds_adls-4.2.20250827225749.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
41
- thds_adls-4.2.20250827225749.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
42
- thds_adls-4.2.20250827225749.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
43
- thds_adls-4.2.20250827225749.dist-info/RECORD,,
41
+ thds_adls-4.2.20250902211626.dist-info/METADATA,sha256=UiTXMLzpZi6uvnbHbBJ_h4wOwG5cF5XvtgbK3SyWAMg,587
42
+ thds_adls-4.2.20250902211626.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
43
+ thds_adls-4.2.20250902211626.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
44
+ thds_adls-4.2.20250902211626.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
45
+ thds_adls-4.2.20250902211626.dist-info/RECORD,,