thds.adls 4.3.20251014213630-py3-none-any.whl → 4.4.20251117191451-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thds/adls/__init__.py CHANGED
```diff
@@ -2,6 +2,7 @@ from thds import core
 
 from . import (  # noqa: F401
     abfss,
+    blob_meta,
     defaults,
     etag,
     fqn,
```
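
With `blob_meta` added to the package `__init__`, the new module becomes part of the public import surface. A minimal sketch of what that enables (names taken from the diff below):

```python
# Both import spellings now work, per the re-export above:
from thds.adls import blob_meta
from thds.adls.blob_meta import BlobMeta, is_dir, yield_blob_meta
```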
thds/adls/blob_meta.py ADDED
```diff
@@ -0,0 +1,38 @@
+import typing as ty
+from dataclasses import dataclass
+
+from azure.storage.blob import BlobProperties, ContainerClient
+
+from thds.core import hashing
+
+from . import hashes
+
+
+@dataclass
+class BlobMeta:
+    path: str
+    size: int
+    hash: ty.Optional[hashing.Hash]
+    metadata: dict[str, str]
+
+
+def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
+    return BlobMeta(
+        blob_props.name,
+        blob_props.size,
+        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
+        blob_props.metadata or {},
+    )
+
+
+def is_dir(blob_meta: BlobMeta) -> bool:
+    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
+
+
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
+def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
+    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
+        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
+        yield to_blob_meta(blob_props)
```
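
A hedged usage sketch for the new module: the account URL, container name, and prefix below are hypothetical, and real callers would more likely obtain a client via `thds.adls.global_client` rather than constructing one by hand.

```python
from azure.identity import DefaultAzureCredential
from azure.storage.blob import ContainerClient

from thds.adls import blob_meta

# Hypothetical account and container; credential setup may differ in practice.
client = ContainerClient(
    "https://myaccount.blob.core.windows.net",
    "mycontainer",
    credential=DefaultAzureCredential(),
)

for meta in blob_meta.yield_blob_meta(client, "some/prefix/"):
    if blob_meta.is_dir(meta):  # HNS directories carry hdi_isfolder=true metadata
        continue  # skip directory placeholder blobs
    print(meta.path, meta.size, meta.hash)
```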
thds/adls/list_fast.py CHANGED
```diff
@@ -6,20 +6,29 @@ client instead of the file system client.
 
 import typing as ty
 
-from thds.core import parallel, thunks
+from thds.core import log, parallel, source, thunks
 
-from . import global_client
+from . import blob_meta, global_client
+from . import source as adls_source
 from .fqn import AdlsFqn
-from .source_tree import BlobMeta, to_blob_meta, yield_blob_meta
+from .uri import UriIsh, parse_any
 
 R = ty.TypeVar("R")
 
 
+logger = log.getLogger(__name__)
+
+
 def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R]:
-    yield from (res for _, res in parallel.failfast(parallel.yield_all(parallel.create_keys(thunks))))
+    yield from (
+        res
+        for _, res in parallel.failfast(
+            parallel.yield_all(parallel.create_keys(thunks), progress_logger=logger.debug)
+        )
+    )
 
 
-def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
+def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[blob_meta.BlobMeta]:
     """A fast way to find all blobs in a directory tree; we do this in parallel on
     subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).
 
@@ -29,7 +38,7 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
     """
     if layers <= 0:
         # directly yield the blobs
-        yield from yield_blob_meta(
+        yield from blob_meta.yield_blob_meta(
             global_client.get_global_blob_container_client(fqn.sa, fqn.container),
             fqn.path.rstrip("/") + "/",
         )
@@ -69,8 +78,10 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
 
     blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
 
-    def _get_blob_meta(blob_name: str) -> BlobMeta:
-        return to_blob_meta(blob_container_client.get_blob_client(blob_name).get_blob_properties())
+    def _get_blob_meta(blob_name: str) -> blob_meta.BlobMeta:
+        return blob_meta.to_blob_meta(
+            blob_container_client.get_blob_client(blob_name).get_blob_properties()
+        )
 
     for blob_meta_iter in (
         _failfast_parallel((thunks.thunking(_get_blob_meta)(file) for file in files)),
@@ -86,10 +97,22 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
         yield from blob_meta_iter
 
 
-def is_dir(blob_meta: BlobMeta) -> bool:  # TODO move to blob_meta.py once it exists
-    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
-
-
-def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
+def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[blob_meta.BlobMeta]:
     """Only for use within multi_layer_yield_blobs."""
     return list(multilayer_yield_blob_meta(fqn, layers))
+
+
+def multilayer_yield_sources(
+    fqn_or_uri: UriIsh,
+    layers: int = 1,
+    filter_: ty.Callable[[blob_meta.BlobMeta], bool] = lambda _: True,
+) -> ty.Iterator[source.Source]:
+    """
+    if you want to list directories and files, use `multilayer_yield_blob_meta` instead
+    """
+    fqn = parse_any(fqn_or_uri)
+    root = fqn.root()
+    for blob in multilayer_yield_blob_meta(fqn, layers):
+        if not blob_meta.is_dir(blob) and filter_(blob):
+            # ^ a "dir" Source would not make sense
+            yield adls_source.from_adls(root / blob.path, hash=blob.hash, size=blob.size)
```
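
A sketch of the new `multilayer_yield_sources` entry point, inferred only from the signature above; the URI is hypothetical.

```python
from thds.adls import list_fast

# Yield a Source for every .parquet blob under a hypothetical directory,
# fanning the listing out across one layer of subdirectories in parallel.
parquet_sources = list(
    list_fast.multilayer_yield_sources(
        "adls://myaccount/mycontainer/some/dir",  # hypothetical URI
        layers=1,
        filter_=lambda blob: blob.path.endswith(".parquet"),
    )
)
for src in parquet_sources:
    print(src.uri)  # each Source carries the hash and size captured during listing
```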
thds/adls/source_tree.py CHANGED
```diff
@@ -1,53 +1,6 @@
-import typing as ty
-from dataclasses import dataclass
-
-from azure.storage.blob import BlobProperties, ContainerClient
-
-from thds.core import hashing
 from thds.core.source.tree import SourceTree
 
-from . import fqn, global_client, hashes, source, uri
-
-# TODO refactor BlobMeta into its own module.
-
-
-@dataclass
-class BlobMeta:
-    path: str
-    size: int
-    hash: ty.Optional[hashing.Hash]
-    metadata: dict[str, str]
-
-
-def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
-    return BlobMeta(
-        blob_props.name,
-        blob_props.size,
-        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
-        blob_props.metadata or {},
-    )
-
-
-def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
-    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
-        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
-        yield to_blob_meta(blob_props)
-
-
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
-def list_blob_meta(
-    container_client: ContainerClient, root_dir: str, match_suffix: str = ""
-) -> ty.List[BlobMeta]:
-    """Gets the path (relative to the SA/container root), size, and _a_ hash (if available) of all blobs in a directory."""
-    return [
-        blob_meta
-        for blob_meta in yield_blob_meta(container_client, root_dir)
-        if blob_meta.size > 0
-        # container client lists directories as blobs with size 0
-        and blob_meta.path.endswith(match_suffix)
-    ]
+from . import fqn, list_fast, uri
 
 
 def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
@@ -56,12 +9,12 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
     """
     root_fqn = uri.parse_any(adls_path)
 
-    container_client = global_client.get_global_blob_container_client(root_fqn.sa, root_fqn.container)
-    container_root = root_fqn.root()
     return SourceTree(
-        sources=[
-            source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash, size=blob_meta.size)
-            for blob_meta in list_blob_meta(container_client, root_fqn.path, match_suffix=match_suffix)
-        ],
+        sources=sorted(
+            list_fast.multilayer_yield_sources(
+                root_fqn, layers=0, filter_=lambda blob: blob.path.endswith(match_suffix)
+            ),
+            key=lambda src: src.uri,
+        ),
         higher_logical_root=fqn.split(root_fqn)[-1],
     )
```
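
The practical effect for `from_path` callers, sketched with a hypothetical root; note that sources are now sorted by URI, so the resulting `SourceTree` is deterministic across runs.

```python
from thds.adls import source_tree

# Hypothetical root; only blobs whose paths end in ".csv" are included.
tree = source_tree.from_path(
    "adls://myaccount/mycontainer/datasets/v1", match_suffix=".csv"
)
```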
thds_adls-4.4.20251117191451.dist-info/METADATA ADDED

````diff
@@ -0,0 +1,79 @@
+Metadata-Version: 2.4
+Name: thds.adls
+Version: 4.4.20251117191451
+Summary: ADLS tools
+Author-email: Trilliant Health <info@trillianthealth.com>
+License: MIT
+Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.1
+Requires-Dist: aiostream>=0.4.5
+Requires-Dist: azure-identity>=1.9
+Requires-Dist: azure-storage-file-datalake>=12.6
+Requires-Dist: blake3
+Requires-Dist: filelock>=3.0
+Requires-Dist: xxhash
+Requires-Dist: thds-core
+
+# thds.adls
+
+A high-performance Azure Data Lake Storage (ADLS Gen2) client for the THDS monorepo. It wraps the Azure
+SDK with hash-aware caching, azcopy acceleration, and shared client/credential plumbing so applications
+can transfer large blob datasets quickly and reliably.
+
+## Highlights
+
+- **Environment-aware paths first:** Almost every consumer starts by importing `fqn`, `AdlsFqn`, and
+  `defaults.env_root()` to build storage-account/container URIs that follow the current THDS environment.
+- **Cache-backed reads:** `download_to_cache` is the standard entry point for pulling blobs down with a
+  verified hash so local workflows, tests, and pipelines can operate on read-only copies.
+- **Bulk filesystem helpers:** `ADLSFileSystem` powers scripts and jobs that need to walk directories,
+  fetch batches of files, or mirror hive tables without re-implementing Azure SDK plumbing.
+- **Spark/Databricks bridges:** `abfss` and `uri` conversions keep analytics code agnostic to whether it
+  needs an `adls://`, `abfss://`, `https://`, or `dbfs://` view of the same path.
+- **Composable utilities:** Higher-level modules (cache, upload, copy, list) layer on top of those
+  imports so teams can opt into more advanced behavior without leaving the public API surface.
+
+## Key Modules
+
+| Component | Typical usage in the monorepo |
+| ------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
+| `fqn` | Parse, validate, and join ADLS paths; used when materializing model datasets and configuring pipelines. |
+| `AdlsFqn` | Strongly typed value passed between tasks and tests to represent a single blob or directory. |
+| `defaults` / `named_roots` | Resolve environment-specific storage roots (`defaults.env_root()`, `named_roots.require(...)`). |
+| `download_to_cache` (`cached` module) | Bring a blob down to the shared read-only cache before analytics, feature builds, or test fixtures run. |
+| `ADLSFileSystem` (`impl` module) | Fetch or list entire directory trees and integrate with caching inside scripts and notebooks. |
+| `abfss` | Translate `AdlsFqn` objects into `abfss://` URIs for Spark/Databricks jobs. |
+| `uri` | Normalize `adls://`, `abfss://`, `https://`, and `dbfs://` strings into `AdlsFqn` values (and vice versa). |
+| `global_client` / `shared_credential` | Shared, fork-safe Azure clients and credentials backing the public helpers above. |
+
+## Example Usage
+
+1. Use the caching helpers and Source integration:
+
+   ```python
+   from thds.adls import cached, upload, source
+
+   cache_path = cached.download_to_cache("adls://acct/container/path/to/file")
+   src = upload("adls://acct/container/path/out.parquet", cache_path)
+   verified = source.get_with_hash(src.uri)
+   ```
+
+1. For CLI usage, run (from repo root):
+
+   ```bash
+   uv run python -m thds.adls.tools.download adls://acct/container/path/file
+   ```
+
+## Operational Notes
+
+- **Hash metadata:** Uploads attach `hash_xxh3_128_b64` automatically when the bytes are known. Download
+  completion back-fills missing hashes when permissions allow.
+- **Locks and concurrency:** Large transfers acquire per-path file locks to keep azcopy instances
+  cooperative. Global HTTP connection pools default to 100 but are configurable via `thds.core.config`.
+- **Error handling:** `BlobNotFoundError` and other ADLS-specific exceptions translate into custom error
+  types to simplify retries and diagnostics.
+- **Extensibility:** Additional hash algorithms can be registered by importing dependent packages (e.g.,
+  `blake3`). Named roots can be populated dynamically via environment-specific modules
+  (`thds.adls._thds_defaults` hook).
````
thds_adls-4.3.20251014213630.dist-info/RECORD → thds_adls-4.4.20251117191451.dist-info/RECORD CHANGED

```diff
@@ -1,8 +1,9 @@
-thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
+thds/adls/__init__.py,sha256=CzNcmTdP5RHrD-9K1boDU4Z3pbJAvEAFfFhXGFwpEXw,1088
 thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
 thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
 thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
 thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
+thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
 thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
 thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
 thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
@@ -17,7 +18,7 @@ thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
 thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
 thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
 thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
-thds/adls/list_fast.py,sha256=yk0ydFiBa7U5JU3BCcIGCcrnS-J3yJaZbaZQ_Xj9xWU,4207
+thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
 thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
 thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
 thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,7 +26,7 @@ thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
 thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
 thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
 thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
-thds/adls/source_tree.py,sha256=FqVXgvfYPiowrWhRsXBItjvB7t41JRI3sCFVAHxjwgI,2610
+thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
 thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
 thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
 thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
@@ -38,8 +39,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
 thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
 thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
 thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
-thds_adls-4.3.20251014213630.dist-info/METADATA,sha256=oJ4p2IS6hLfeR1BEG7-hCKl8uvfGCifr82ZPJpjIYGY,587
-thds_adls-4.3.20251014213630.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-thds_adls-4.3.20251014213630.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
-thds_adls-4.3.20251014213630.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
-thds_adls-4.3.20251014213630.dist-info/RECORD,,
+thds_adls-4.4.20251117191451.dist-info/METADATA,sha256=phV7EH6lnptlnQYY5TSfyZZk0Wiv0wqZ6L6o7pcP4UM,4586
+thds_adls-4.4.20251117191451.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.4.20251117191451.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
+thds_adls-4.4.20251117191451.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.4.20251117191451.dist-info/RECORD,,
```
thds_adls-4.3.20251014213630.dist-info/METADATA DELETED

```diff
@@ -1,21 +0,0 @@
-Metadata-Version: 2.4
-Name: thds.adls
-Version: 4.3.20251014213630
-Summary: ADLS tools
-Author-email: Trilliant Health <info@trillianthealth.com>
-License: MIT
-Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.1
-Requires-Dist: aiostream>=0.4.5
-Requires-Dist: azure-identity>=1.9
-Requires-Dist: azure-storage-file-datalake>=12.6
-Requires-Dist: blake3
-Requires-Dist: filelock>=3.0
-Requires-Dist: xxhash
-Requires-Dist: thds-core
-
-# adls Library
-
-A port of `core.adls`.
```