thds.adls 4.4.20251020213233__py3-none-any.whl → 4.4.20251021205028__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.adls might be problematic. Click here for more details.

thds/adls/__init__.py CHANGED
@@ -2,6 +2,7 @@ from thds import core
2
2
 
3
3
  from . import ( # noqa: F401
4
4
  abfss,
5
+ blob_meta,
5
6
  defaults,
6
7
  etag,
7
8
  fqn,
thds/adls/blob_meta.py ADDED
@@ -0,0 +1,38 @@
1
+ import typing as ty
2
+ from dataclasses import dataclass
3
+
4
+ from azure.storage.blob import BlobProperties, ContainerClient
5
+
6
+ from thds.core import hashing
7
+
8
+ from . import hashes
9
+
10
+
11
+ @dataclass
12
+ class BlobMeta:
13
+ path: str
14
+ size: int
15
+ hash: ty.Optional[hashing.Hash]
16
+ metadata: dict[str, str]
17
+
18
+
19
+ def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
20
+ return BlobMeta(
21
+ blob_props.name,
22
+ blob_props.size,
23
+ next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
24
+ blob_props.metadata or {},
25
+ )
26
+
27
+
28
+ def is_dir(blob_meta: BlobMeta) -> bool:
29
+ return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
30
+
31
+
32
+ # https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
33
+ # https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
34
+ # https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
35
+ def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
36
+ for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
37
+ # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
38
+ yield to_blob_meta(blob_props)
thds/adls/list_fast.py CHANGED
@@ -6,11 +6,12 @@ client instead of the file system client.
6
6
 
7
7
  import typing as ty
8
8
 
9
- from thds.core import log, parallel, thunks
9
+ from thds.core import log, parallel, source, thunks
10
10
 
11
- from . import global_client
11
+ from . import blob_meta, global_client
12
+ from . import source as adls_source
12
13
  from .fqn import AdlsFqn
13
- from .source_tree import BlobMeta, to_blob_meta, yield_blob_meta
14
+ from .uri import UriIsh, parse_any
14
15
 
15
16
  R = ty.TypeVar("R")
16
17
 
@@ -27,7 +28,7 @@ def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R
27
28
  )
28
29
 
29
30
 
30
- def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
31
+ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[blob_meta.BlobMeta]:
31
32
  """A fast way to find all blobs in a directory tree; we do this in parallel on
32
33
  subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).
33
34
 
@@ -37,7 +38,7 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
37
38
  """
38
39
  if layers <= 0:
39
40
  # directly yield the blobs
40
- yield from yield_blob_meta(
41
+ yield from blob_meta.yield_blob_meta(
41
42
  global_client.get_global_blob_container_client(fqn.sa, fqn.container),
42
43
  fqn.path.rstrip("/") + "/",
43
44
  )
@@ -77,8 +78,10 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
77
78
 
78
79
  blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
79
80
 
80
- def _get_blob_meta(blob_name: str) -> BlobMeta:
81
- return to_blob_meta(blob_container_client.get_blob_client(blob_name).get_blob_properties())
81
+ def _get_blob_meta(blob_name: str) -> blob_meta.BlobMeta:
82
+ return blob_meta.to_blob_meta(
83
+ blob_container_client.get_blob_client(blob_name).get_blob_properties()
84
+ )
82
85
 
83
86
  for blob_meta_iter in (
84
87
  _failfast_parallel((thunks.thunking(_get_blob_meta)(file) for file in files)),
@@ -94,10 +97,22 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
94
97
  yield from blob_meta_iter
95
98
 
96
99
 
97
- def is_dir(blob_meta: BlobMeta) -> bool: # TODO move to blob_meta.py once it exists
98
- return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
99
-
100
-
101
- def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
100
+ def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[blob_meta.BlobMeta]:
102
101
  """Only for use within multi_layer_yield_blobs."""
103
102
  return list(multilayer_yield_blob_meta(fqn, layers))
103
+
104
+
105
+ def multilayer_yield_sources(
106
+ fqn_or_uri: UriIsh,
107
+ layers: int = 1,
108
+ filter_: ty.Callable[[blob_meta.BlobMeta], bool] = lambda _: True,
109
+ ) -> ty.Iterator[source.Source]:
110
+ """
111
+ if you want to list directories and files, use `multilayer_yield_blob_meta` instead
112
+ """
113
+ fqn = parse_any(fqn_or_uri)
114
+ root = fqn.root()
115
+ for blob in multilayer_yield_blob_meta(fqn, layers):
116
+ if not blob_meta.is_dir(blob) and filter_(blob):
117
+ # ^ a "dir" Source would not make sense
118
+ yield adls_source.from_adls(root / blob.path, hash=blob.hash, size=blob.size)
thds/adls/source_tree.py CHANGED
@@ -1,53 +1,6 @@
1
- import typing as ty
2
- from dataclasses import dataclass
3
-
4
- from azure.storage.blob import BlobProperties, ContainerClient
5
-
6
- from thds.core import hashing
7
1
  from thds.core.source.tree import SourceTree
8
2
 
9
- from . import fqn, global_client, hashes, source, uri
10
-
11
- # TODO refactor BlobMeta into its own module.
12
-
13
-
14
- @dataclass
15
- class BlobMeta:
16
- path: str
17
- size: int
18
- hash: ty.Optional[hashing.Hash]
19
- metadata: dict[str, str]
20
-
21
-
22
- def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
23
- return BlobMeta(
24
- blob_props.name,
25
- blob_props.size,
26
- next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
27
- blob_props.metadata or {},
28
- )
29
-
30
-
31
- def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
32
- for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
33
- # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
34
- yield to_blob_meta(blob_props)
35
-
36
-
37
- # https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
38
- # https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
39
- # https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
40
- def list_blob_meta(
41
- container_client: ContainerClient, root_dir: str, match_suffix: str = ""
42
- ) -> ty.List[BlobMeta]:
43
- """Gets the path (relative to the SA/container root), size, and _a_ hash (if available) of all blobs in a directory."""
44
- return [
45
- blob_meta
46
- for blob_meta in yield_blob_meta(container_client, root_dir)
47
- if blob_meta.size > 0
48
- # container client lists directories as blobs with size 0
49
- and blob_meta.path.endswith(match_suffix)
50
- ]
3
+ from . import fqn, list_fast, uri
51
4
 
52
5
 
53
6
  def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
@@ -56,12 +9,12 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
56
9
  """
57
10
  root_fqn = uri.parse_any(adls_path)
58
11
 
59
- container_client = global_client.get_global_blob_container_client(root_fqn.sa, root_fqn.container)
60
- container_root = root_fqn.root()
61
12
  return SourceTree(
62
- sources=[
63
- source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash, size=blob_meta.size)
64
- for blob_meta in list_blob_meta(container_client, root_fqn.path, match_suffix=match_suffix)
65
- ],
13
+ sources=sorted(
14
+ list_fast.multilayer_yield_sources(
15
+ root_fqn, layers=0, filter_=lambda blob: blob.path.endswith(match_suffix)
16
+ ),
17
+ key=lambda src: src.uri,
18
+ ),
66
19
  higher_logical_root=fqn.split(root_fqn)[-1],
67
20
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: thds.adls
3
- Version: 4.4.20251020213233
3
+ Version: 4.4.20251021205028
4
4
  Summary: ADLS tools
5
5
  Author-email: Trilliant Health <info@trillianthealth.com>
6
6
  License: MIT
@@ -1,8 +1,9 @@
1
- thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
1
+ thds/adls/__init__.py,sha256=CzNcmTdP5RHrD-9K1boDU4Z3pbJAvEAFfFhXGFwpEXw,1088
2
2
  thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
3
3
  thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
4
4
  thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
5
5
  thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
6
+ thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
6
7
  thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
7
8
  thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
8
9
  thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
@@ -17,7 +18,7 @@ thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
17
18
  thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
18
19
  thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
19
20
  thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
20
- thds/adls/list_fast.py,sha256=t6rIk0ub-YO7Uo--7ml2zx0_L-AYknCeALwA-JGi95s,4321
21
+ thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
21
22
  thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
22
23
  thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
23
24
  thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,7 +26,7 @@ thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
25
26
  thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
26
27
  thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
27
28
  thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
28
- thds/adls/source_tree.py,sha256=FqVXgvfYPiowrWhRsXBItjvB7t41JRI3sCFVAHxjwgI,2610
29
+ thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
29
30
  thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
30
31
  thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
31
32
  thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
@@ -38,8 +39,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
38
39
  thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
39
40
  thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
40
41
  thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
41
- thds_adls-4.4.20251020213233.dist-info/METADATA,sha256=xlrF_oJgsTBFz5lbfwG1Lu-LAlAveyCqkcMdw6fCr50,587
42
- thds_adls-4.4.20251020213233.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
43
- thds_adls-4.4.20251020213233.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
44
- thds_adls-4.4.20251020213233.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
45
- thds_adls-4.4.20251020213233.dist-info/RECORD,,
42
+ thds_adls-4.4.20251021205028.dist-info/METADATA,sha256=RR3YDafVvO9Yrrk6k3H6Cx5HUr0WHDYmf_xNy1V95NQ,587
43
+ thds_adls-4.4.20251021205028.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ thds_adls-4.4.20251021205028.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
45
+ thds_adls-4.4.20251021205028.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
46
+ thds_adls-4.4.20251021205028.dist-info/RECORD,,