thds.adls 4.2.20250926202021__py3-none-any.whl → 4.4.20251117191451__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thds/adls/__init__.py CHANGED
@@ -2,6 +2,7 @@ from thds import core
 
 from . import ( # noqa: F401
     abfss,
+    blob_meta,
     defaults,
     etag,
     fqn,
thds/adls/_upload.py CHANGED
@@ -38,17 +38,22 @@ def _try_default_hash(data: hashes.AnyStrSrc) -> ty.Optional[hashing.Hash]:
     return None
 
 
+UploadSrc = ty.Union[Path, bytes, ty.IO[bytes], ty.Iterable[bytes]]
+
+
+def upload_src_len(upload_src: UploadSrc, default: int = 0) -> int:
+    if isinstance(upload_src, Path) and upload_src.exists():
+        return upload_src.stat().st_size
+    try:
+        return len(upload_src) # type: ignore
+    except TypeError as te:
+        logger.debug(f"failed to get length? {repr(te)} for {upload_src!r}")
+        return default
+
+
 def _too_small_to_skip_upload(data: hashes.AnyStrSrc, min_size_for_remote_check: int) -> bool:
-    def _len() -> int:
-        if isinstance(data, Path) and data.exists():
-            return data.stat().st_size
-        try:
-            return len(data) # type: ignore
-        except TypeError as te:
-            logger.debug(f"failed to get length? {repr(te)} for {data}")
-            return min_size_for_remote_check + 1
-
-    return _len() < min_size_for_remote_check
+    len_ = upload_src_len(data) or min_size_for_remote_check + 1
+    return len_ < min_size_for_remote_check
 
 
 class UploadDecision(ty.NamedTuple):
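For orientation, here is a minimal standalone sketch of how the extracted `upload_src_len` helper behaves for the three flavors of `UploadSrc`; it mirrors the logic added above rather than importing the private `thds.adls._upload` module.

```python
import io
import typing as ty
from pathlib import Path

UploadSrc = ty.Union[Path, bytes, ty.IO[bytes], ty.Iterable[bytes]]


def upload_src_len(upload_src: UploadSrc, default: int = 0) -> int:
    # Paths are measured with stat(); sized objects (bytes, lists) with len();
    # unsized streams and generators fall back to `default`.
    if isinstance(upload_src, Path) and upload_src.exists():
        return upload_src.stat().st_size
    try:
        return len(upload_src)  # type: ignore
    except TypeError:
        return default


assert upload_src_len(b"abcde") == 5                            # bytes have a length
assert upload_src_len(io.BytesIO(b"abcde")) == 0                # streams do not
assert upload_src_len(Path("/no/such/file"), default=-1) == -1  # missing path -> default
```

`_too_small_to_skip_upload` then treats a result of 0 as "unknown length" and substitutes `min_size_for_remote_check + 1`, so sources of unknown length are never treated as too small.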
thds/adls/blob_meta.py ADDED
@@ -0,0 +1,38 @@
+import typing as ty
+from dataclasses import dataclass
+
+from azure.storage.blob import BlobProperties, ContainerClient
+
+from thds.core import hashing
+
+from . import hashes
+
+
+@dataclass
+class BlobMeta:
+    path: str
+    size: int
+    hash: ty.Optional[hashing.Hash]
+    metadata: dict[str, str]
+
+
+def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
+    return BlobMeta(
+        blob_props.name,
+        blob_props.size,
+        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
+        blob_props.metadata or {},
+    )
+
+
+def is_dir(blob_meta: BlobMeta) -> bool:
+    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
+
+
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
+def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
+    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
+        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
+        yield to_blob_meta(blob_props)
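An illustrative sketch of using the new module directly; the account, container, and prefix below are placeholders, and authenticating with `DefaultAzureCredential` is an assumption (the package itself normally goes through its `global_client` helpers).

```python
from azure.identity import DefaultAzureCredential
from azure.storage.blob import ContainerClient

from thds.adls import blob_meta

# Placeholder account/container/prefix, for illustration only.
client = ContainerClient(
    "https://myaccount.blob.core.windows.net",
    "mycontainer",
    credential=DefaultAzureCredential(),
)

for meta in blob_meta.yield_blob_meta(client, "datasets/2025/"):
    if blob_meta.is_dir(meta):
        # directory placeholder blobs carry the hdi_isfolder metadata flag
        continue
    print(meta.path, meta.size, meta.hash)
```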
thds/adls/download.py CHANGED
@@ -16,7 +16,7 @@ from thds.core.types import StrOrPath
 
 from . import azcopy, errors, etag, hashes
 from ._progress import report_download_progress
-from .download_lock import download_lock
+from .file_lock import file_lock
 from .fqn import AdlsFqn
 from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache
 
@@ -240,12 +240,12 @@ def _download_or_use_verified_cached_coroutine( # noqa: C901
     # No cache hit, so its time to prepare to download. if a cache was provided, we will
     # _put_ the resulting file in it.
 
-    file_lock = str(cache.path(fqn) if cache else local_path)
+    file_lock_str = str(cache.path(fqn) if cache else local_path)
     # create lockfile name from the (shared) cache path if present, otherwise the final
     # destination. Non-cache users may then still incur multiple downloads in parallel,
     # but if you wanted to coordinate then you should probably have been using the global
     # cache in the first place.
-    _dl_scope.enter(download_lock(file_lock))
+    _dl_scope.enter(file_lock(file_lock_str))
 
     # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
     if file_result := attempt_cache_hit():
thds/adls/download_lock.py → thds/adls/file_lock.py RENAMED
@@ -9,18 +9,18 @@ from thds.core import config, home, log
 
 from .md5 import hex_md5_str
 
-DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/download-locks", parse=Path)
+FILELOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/file-locks", parse=Path)
 _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
 _CLEAN_UP_LOCKFILES_EVERY = timedelta(hours=1).total_seconds()
 _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic() - _CLEAN_UP_LOCKFILES_EVERY
 logger = log.getLogger(__name__)
 
 
-def _clean_download_locks() -> int:
+def _clean_file_locks() -> int:
     deleted = 0
     deletion_threshold = time.time() - _CLEAN_UP_LOCKFILES_AFTER_TIME.total_seconds()
     try:
-        for f in DOWNLOAD_LOCKS_DIR().iterdir():
+        for f in FILELOCKS_DIR().rglob("*"):
             fstat = f.stat()
             if stat.S_ISREG(fstat.st_mode) and fstat.st_mtime < deletion_threshold:
                 f.unlink()
@@ -29,20 +29,20 @@ def _clean_download_locks() -> int:
         # this should be, hopefully, both very rare and completely inconsequential as to
         # program correctness. if you see this happen multiple times, you may have some
         # read-only files or something and want to manually clean up this directory.
-        logger.exception("Failed to clean download locks directory.")
+        logger.exception("Failed to clean file locks directory.")
     return deleted
 
 
-def _occasionally_clean_download_locks():
+def _occasionally_clean_file_locks():
     global _LAST_CLEANED_BY_THIS_PROCESS
     # do this about once an hour
     if time.monotonic() > _LAST_CLEANED_BY_THIS_PROCESS + _CLEAN_UP_LOCKFILES_EVERY:
         _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic()
         # minor race condition with other threads but it doesn't really matter.
-        _clean_download_locks()
+        _clean_file_locks()
 
 
-def download_lock(download_unique_str: str) -> FileLock:
+def file_lock(lock_unique_str: str, locktype: str = "download") -> FileLock:
     """Note that the lockfiles will never be deleted automatically.
     https://py-filelock.readthedocs.io/en/latest/api.html#filelock.BaseFileLock.release
 
@@ -50,7 +50,7 @@ def download_lock(download_unique_str: str) -> FileLock:
     https://stackoverflow.com/questions/58098634/why-does-the-python-filelock-library-delete-lockfiles-on-windows-but-not-unix
 
     This means local developers would have a whole bunch of zero-byte files in their
-    download locks directory. So, we take a slightly idiosyncratic approach to cleaning
+    file locks directory. So, we take a slightly idiosyncratic approach to cleaning
     this up: not wanting to run this code on every download, but also not wanting
     developers to see an infinitely-growing mess. Since parallel downloads will
     (generally) not constitute a correctness issue, the 'safest' time to clean it up will
@@ -58,11 +58,11 @@ def download_lock(download_unique_str: str) -> FileLock:
     we can get rid of old lockfiles after they've existed for more than 24 hours, since
     it's quite rare that a download would last that long.
     """
-    DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
-    _occasionally_clean_download_locks()
+    lock_type_dir = FILELOCKS_DIR() / locktype
+    lock_type_dir.mkdir(parents=True, exist_ok=True)
+    _occasionally_clean_file_locks()
     return FileLock(
-        DOWNLOAD_LOCKS_DIR()
-        / (download_unique_str.split("/")[-1][:50] + hex_md5_str(download_unique_str)),
+        lock_type_dir / (lock_unique_str.split("/")[-1][:50] + hex_md5_str(lock_unique_str)),
         # is_singleton=True,
         # critical for keeping this reentrant without passing the lock around.
         # see https://github.com/tox-dev/filelock/issues/315#issuecomment-2016797681
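A short sketch of the generalized helper, which now namespaces lockfiles by `locktype` under `~/.thds/adls/file-locks/`; the destination string below is a placeholder.

```python
from thds.adls.file_lock import file_lock

# The same lock machinery previously used only for downloads now also guards
# uploads; each locktype gets its own subdirectory under FILELOCKS_DIR, and stale
# lockfiles are still cleaned up opportunistically after 24 hours.
with file_lock("adls://myaccount/mycontainer/out/data.bin", locktype="upload"):
    ...  # only one process on this machine uploads to this destination at a time
```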
thds/adls/list_fast.py CHANGED
@@ -6,20 +6,29 @@ client instead of the file system client.
 
 import typing as ty
 
-from thds.core import parallel, thunks
+from thds.core import log, parallel, source, thunks
 
-from . import global_client
+from . import blob_meta, global_client
+from . import source as adls_source
 from .fqn import AdlsFqn
-from .source_tree import BlobMeta, to_blob_meta, yield_blob_meta
+from .uri import UriIsh, parse_any
 
 R = ty.TypeVar("R")
 
 
+logger = log.getLogger(__name__)
+
+
 def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R]:
-    yield from (res for _, res in parallel.failfast(parallel.yield_all(parallel.create_keys(thunks))))
+    yield from (
+        res
+        for _, res in parallel.failfast(
+            parallel.yield_all(parallel.create_keys(thunks), progress_logger=logger.debug)
+        )
+    )
 
 
-def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
+def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[blob_meta.BlobMeta]:
     """A fast way to find all blobs in a directory tree; we do this in parallel on
     subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).
 
@@ -29,9 +38,9 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
     """
     if layers <= 0:
         # directly yield the blobs
-        yield from yield_blob_meta(
+        yield from blob_meta.yield_blob_meta(
             global_client.get_global_blob_container_client(fqn.sa, fqn.container),
-            fqn.path,
+            fqn.path.rstrip("/") + "/",
         )
         return
 
@@ -69,8 +78,10 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
 
     blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
 
-    def _get_blob_meta(blob_name: str) -> BlobMeta:
-        return to_blob_meta(blob_container_client.get_blob_client(blob_name).get_blob_properties())
+    def _get_blob_meta(blob_name: str) -> blob_meta.BlobMeta:
+        return blob_meta.to_blob_meta(
+            blob_container_client.get_blob_client(blob_name).get_blob_properties()
+        )
 
     for blob_meta_iter in (
         _failfast_parallel((thunks.thunking(_get_blob_meta)(file) for file in files)),
@@ -86,10 +97,22 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
         yield from blob_meta_iter
 
 
-def is_dir(blob_meta: BlobMeta) -> bool: # TODO move to blob_meta.py once it exists
-    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
-
-
-def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
+def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[blob_meta.BlobMeta]:
     """Only for use within multi_layer_yield_blobs."""
     return list(multilayer_yield_blob_meta(fqn, layers))
+
+
+def multilayer_yield_sources(
+    fqn_or_uri: UriIsh,
+    layers: int = 1,
+    filter_: ty.Callable[[blob_meta.BlobMeta], bool] = lambda _: True,
+) -> ty.Iterator[source.Source]:
+    """
+    if you want to list directories and files, use `multilayer_yield_blob_meta` instead
+    """
+    fqn = parse_any(fqn_or_uri)
+    root = fqn.root()
+    for blob in multilayer_yield_blob_meta(fqn, layers):
+        if not blob_meta.is_dir(blob) and filter_(blob):
+            # ^ a "dir" Source would not make sense
+            yield adls_source.from_adls(root / blob.path, hash=blob.hash, size=blob.size)
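A sketch of the new `multilayer_yield_sources` entry point; the URI is a placeholder, and the filter keeps only parquet blobs.

```python
from thds.adls import list_fast

# Yields a thds.core Source (carrying hash and size when available) for every
# non-directory blob under the prefix, listing one layer of subdirectories in parallel.
for src in list_fast.multilayer_yield_sources(
    "adls://myaccount/mycontainer/datasets/2025",  # placeholder URI
    layers=1,
    filter_=lambda blob: blob.path.endswith(".parquet"),
):
    print(src.uri)
```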
thds/adls/source.py CHANGED
@@ -30,7 +30,9 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
 source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
 
 
-def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None) -> source.Source:
+def from_adls(
+    uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None, size: int = 0
+) -> source.Source:
     """Flexible, public interface to creating Sources from any ADLS-like reference.
 
     Does NOT automatically fetch a checksumming hash from the ADLS URI if it's not
@@ -40,7 +42,7 @@ from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None
     r_fqn = resolve_any(uri_or_fqn)
     if not r_fqn:
         raise ValueError(f"Could not resolve {uri_or_fqn} to an ADLS FQN")
-    return source.Source(str(r_fqn), hash)
+    return source.Source(str(r_fqn), hash, size)
 
 
 source.register_from_uri_handler(
@@ -55,13 +57,15 @@ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
     """
     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
     with blob_not_found_translation(fqn):
-        uri_hashes = hashes.extract_hashes_from_props(get_file_properties(fqn))
+        props = get_file_properties(fqn)
+        uri_hashes = hashes.extract_hashes_from_props(props)
     if not uri_hashes:
         raise ValueError(
             f"ADLS file {fqn} must have a hash to use this function. "
             "If you know the hash, use `from_adls` with the hash parameter."
         )
-    return from_adls(fqn, next(iter(uri_hashes.values())))
+    size = int(props.get("size")) or 0
+    return from_adls(fqn, next(iter(uri_hashes.values())), size)
 
 
 def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
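A sketch of the widened `from_adls` signature; the URI and byte count are illustrative.

```python
from thds.adls import source as adls_source

# The new `size` parameter is threaded through to the underlying thds.core Source
# alongside the (still optional) hash, so callers that already know the blob's
# length can record it without an extra round-trip for blob properties.
src = adls_source.from_adls(
    "adls://myaccount/mycontainer/data/file.bin",  # placeholder URI
    hash=None,
    size=1024,
)
print(src.uri)
```

`get_with_hash` now fills in the size the same way, reading it from the blob properties it already fetches to extract the hash.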
thds/adls/source_tree.py CHANGED
@@ -1,53 +1,6 @@
-import typing as ty
-from dataclasses import dataclass
-
-from azure.storage.blob import BlobProperties, ContainerClient
-
-from thds.core import hashing
 from thds.core.source.tree import SourceTree
 
-from . import fqn, global_client, hashes, source, uri
-
-# TODO refactor BlobMeta into its own module.
-
-
-@dataclass
-class BlobMeta:
-    path: str
-    size: int
-    hash: ty.Optional[hashing.Hash]
-    metadata: dict[str, str]
-
-
-def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
-    return BlobMeta(
-        blob_props.name,
-        blob_props.size,
-        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
-        blob_props.metadata or {},
-    )
-
-
-def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
-    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
-        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
-        yield to_blob_meta(blob_props)
-
-
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
-def list_blob_meta(
-    container_client: ContainerClient, root_dir: str, match_suffix: str = ""
-) -> ty.List[BlobMeta]:
-    """Gets the path (relative to the SA/container root), size, and _a_ hash (if available) of all blobs in a directory."""
-    return [
-        blob_meta
-        for blob_meta in yield_blob_meta(container_client, root_dir)
-        if blob_meta.size > 0
-        # container client lists directories as blobs with size 0
-        and blob_meta.path.endswith(match_suffix)
-    ]
+from . import fqn, list_fast, uri
 
 
 def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
@@ -56,12 +9,12 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
     """
     root_fqn = uri.parse_any(adls_path)
 
-    container_client = global_client.get_global_blob_container_client(root_fqn.sa, root_fqn.container)
-    container_root = root_fqn.root()
     return SourceTree(
-        sources=[
-            source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash)
-            for blob_meta in list_blob_meta(container_client, root_fqn.path, match_suffix=match_suffix)
-        ],
+        sources=sorted(
+            list_fast.multilayer_yield_sources(
+                root_fqn, layers=0, filter_=lambda blob: blob.path.endswith(match_suffix)
+            ),
+            key=lambda src: src.uri,
+        ),
         higher_logical_root=fqn.split(root_fqn)[-1],
     )
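A sketch of the simplified `from_path`, which now builds its sorted source list via `list_fast.multilayer_yield_sources`; the path is a placeholder.

```python
from thds.adls import source_tree

# Collects every .csv blob under the prefix into a SourceTree, sorted by URI;
# directory placeholder blobs are filtered out by multilayer_yield_sources.
tree = source_tree.from_path(
    "adls://myaccount/mycontainer/reference-data/",  # placeholder path
    match_suffix=".csv",
)
```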
thds/adls/upload.py CHANGED
@@ -15,8 +15,9 @@ from thds.core import files, fretry, link, log, scope, source, tmp
 
 from . import azcopy, hashes
 from ._progress import report_upload_progress
-from ._upload import upload_decision_and_metadata
+from ._upload import UploadSrc, upload_decision_and_metadata, upload_src_len
 from .conf import UPLOAD_FILE_MAX_CONCURRENCY
+from .file_lock import file_lock
 from .fqn import AdlsFqn
 from .global_client import get_global_blob_container_client
 from .ro_cache import Cache
@@ -25,9 +26,6 @@ logger = log.getLogger(__name__)
 _SLOW_CONNECTION_WORKAROUND = 14400 # seconds
 
 
-UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
-
-
 def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
     @scope.bound
     def _try_write_through() -> bool:
@@ -40,8 +38,8 @@ def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Op
         out = scope.enter(tmp.temppath_same_fs(local_cache_path))
         if hasattr(data, "read") and hasattr(data, "seek"):
             with open(out, "wb") as f:
-                f.write(data.read()) # type: ignore
-                data.seek(0) # type: ignore
+                f.write(data.read())
+                data.seek(0)
             link.link_or_copy(out, local_cache_path)
             return True
 
@@ -101,9 +99,12 @@ def upload(
     # we always use the original source file to upload, not the cached path,
     # because uploading from a shared location risks race conditions.
 
+    scope.enter(file_lock(str(dest_), locktype="upload"))
+
     blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
     blob_client = blob_container_client.get_blob_client(dest_.path)
     decision = upload_decision_and_metadata(blob_client.get_blob_properties, src) # type: ignore [arg-type]
+    n_bytes = upload_src_len(src, default=0)
 
     def source_from_meta() -> source.Source:
         best_hash = next(iter(hashes.extract_hashes_from_metadata(decision.metadata)), None)
@@ -111,17 +112,14 @@ def upload(
             assert best_hash, "A hash should always be calculable for a local path."
             return source.from_file(src, hash=best_hash, uri=str(dest_))
 
-        return source.from_uri(str(dest_), hash=best_hash)
+        return source.from_uri(str(dest_), hash=best_hash, size=n_bytes)
 
     if decision.upload_required:
         # set up some bookkeeping
-        n_bytes = None # if we pass 0 to upload_blob, it truncates the write now
        bytes_src: ty.Union[bytes, ty.IO, ty.Iterable[bytes]]
        if isinstance(src, Path):
-            n_bytes = src.stat().st_size
            bytes_src = scope.enter(open(src, "rb"))
        elif isinstance(src, bytes):
-            n_bytes = len(src)
            bytes_src = src
        else:
            bytes_src = src
@@ -129,7 +127,7 @@ def upload(
         if "metadata" in upload_data_kwargs:
             decision.metadata.update(upload_data_kwargs.pop("metadata"))
 
-        if azcopy.upload.should_use_azcopy(n_bytes or 0) and isinstance(src, Path):
+        if azcopy.upload.should_use_azcopy(n_bytes) and isinstance(src, Path):
             logger.info("Using azcopy to upload %s to %s", src, dest_)
             try:
                 azcopy.upload.run(
@@ -137,7 +135,7 @@ def upload(
                         src, dest_, content_type=content_type, metadata=decision.metadata, overwrite=True
                     ),
                     dest_,
-                    n_bytes or 0,
+                    n_bytes,
                 )
                 return source_from_meta()
 
@@ -155,9 +153,11 @@ def upload(
         # This is both faster, as well as simpler to reason about, and
         # in fact was the behavior I had been assuming all along...
         blob_client.upload_blob(
-            report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes or 0),
+            report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes),
             overwrite=True,
-            length=n_bytes,
+            length=(
+                n_bytes if n_bytes > 0 else None
+            ), # if we pass 0 to upload_blob, it truncates the write now
             content_settings=upload_content_settings,
             connection_timeout=_SLOW_CONNECTION_WORKAROUND,
             max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
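A usage sketch of `upload` after these changes, following the call shape shown in the README of the new package METADATA below (a string destination plus a local `Path` or raw byte source); the destination and filenames are placeholders.

```python
from pathlib import Path

from thds.adls.upload import upload

dest = "adls://myaccount/mycontainer/out/data.bin"  # placeholder destination

# Both calls acquire a per-destination "upload" file_lock before talking to ADLS.

# Path source: upload_src_len knows the size up front, so azcopy can be considered
# for large files and progress reporting gets an exact byte count.
result = upload(dest, Path("local-data.bin"))

# Iterator source: the length comes back as 0 ("unknown"), so upload_blob is called
# with length=None rather than a truncating length=0.
streamed = upload(dest, iter([b"part-1", b"part-2"]))

print(result.uri, streamed.uri)
```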
thds_adls-4.4.20251117191451.dist-info/METADATA ADDED
@@ -0,0 +1,79 @@
+Metadata-Version: 2.4
+Name: thds.adls
+Version: 4.4.20251117191451
+Summary: ADLS tools
+Author-email: Trilliant Health <info@trillianthealth.com>
+License: MIT
+Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.1
+Requires-Dist: aiostream>=0.4.5
+Requires-Dist: azure-identity>=1.9
+Requires-Dist: azure-storage-file-datalake>=12.6
+Requires-Dist: blake3
+Requires-Dist: filelock>=3.0
+Requires-Dist: xxhash
+Requires-Dist: thds-core
+
+# thds.adls
+
+A high-performance Azure Data Lake Storage (ADLS Gen2) client for the THDS monorepo. It wraps the Azure
+SDK with hash-aware caching, azcopy acceleration, and shared client/credential plumbing so applications
+can transfer large blob datasets quickly and reliably.
+
+## Highlights
+
+- **Environment-aware paths first:** Almost every consumer starts by importing `fqn`, `AdlsFqn`, and
+  `defaults.env_root()` to build storage-account/container URIs that follow the current THDS environment.
+- **Cache-backed reads:** `download_to_cache` is the standard entry point for pulling blobs down with a
+  verified hash so local workflows, tests, and pipelines can operate on read-only copies.
+- **Bulk filesystem helpers:** `ADLSFileSystem` powers scripts and jobs that need to walk directories,
+  fetch batches of files, or mirror hive tables without re-implementing Azure SDK plumbing.
+- **Spark/Databricks bridges:** `abfss` and `uri` conversions keep analytics code agnostic to whether it
+  needs an `adls://`, `abfss://`, `https://`, or `dbfs://` view of the same path.
+- **Composable utilities:** Higher-level modules (cache, upload, copy, list) layer on top of those
+  imports so teams can opt into more advanced behavior without leaving the public API surface.
+
+## Key Modules
+
+| Component | Typical usage in the monorepo |
+| ------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
+| `fqn` | Parse, validate, and join ADLS paths; used when materializing model datasets and configuring pipelines. |
+| `AdlsFqn` | Strongly typed value passed between tasks and tests to represent a single blob or directory. |
+| `defaults` / `named_roots` | Resolve environment-specific storage roots (`defaults.env_root()`, `named_roots.require(...)`). |
+| `download_to_cache` (`cached` module) | Bring a blob down to the shared read-only cache before analytics, feature builds, or test fixtures run. |
+| `ADLSFileSystem` (`impl` module) | Fetch or list entire directory trees and integrate with caching inside scripts and notebooks. |
+| `abfss` | Translate `AdlsFqn` objects into `abfss://` URIs for Spark/Databricks jobs. |
+| `uri` | Normalize `adls://`, `abfss://`, `https://`, and `dbfs://` strings into `AdlsFqn` values (and vice versa). |
+| `global_client` / `shared_credential` | Shared, fork-safe Azure clients and credentials backing the public helpers above. |
+
+## Example Usage
+
+1. Use the caching helpers and Source integration:
+
+```python
+from thds.adls import cached, upload, source
+
+cache_path = cached.download_to_cache("adls://acct/container/path/to/file")
+src = upload("adls://acct/container/path/out.parquet", cache_path)
+verified = source.get_with_hash(src.uri)
+```
+
+1. For CLI usage, run (from repo root):
+
+```bash
+uv run python -m thds.adls.tools.download adls://acct/container/path/file
+```
+
+## Operational Notes
+
+- **Hash metadata:** Uploads attach `hash_xxh3_128_b64` automatically when the bytes are known. Download
+  completion back-fills missing hashes when permissions allow.
+- **Locks and concurrency:** Large transfers acquire per-path file locks to keep azcopy instances
+  cooperative. Global HTTP connection pools default to 100 but are configurable via `thds.core.config`.
+- **Error handling:** `BlobNotFoundError` and other ADLS-specific exceptions translate into custom error
+  types to simplify retries and diagnostics.
+- **Extensibility:** Additional hash algorithms can be registered by importing dependent packages (e.g.,
+  `blake3`). Named roots can be populated dynamically via environment-specific modules
+  (`thds.adls._thds_defaults` hook).
thds_adls-4.2.20250926202021.dist-info/RECORD → thds_adls-4.4.20251117191451.dist-info/RECORD RENAMED
@@ -1,32 +1,33 @@
-thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
+thds/adls/__init__.py,sha256=CzNcmTdP5RHrD-9K1boDU4Z3pbJAvEAFfFhXGFwpEXw,1088
 thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
 thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
-thds/adls/_upload.py,sha256=mhTdWiQroaugYuwQg7R8CEgdfCYF4xvJthlsqO0jlnE,4692
+thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
 thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
+thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
 thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
 thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
 thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
 thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
 thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
-thds/adls/download.py,sha256=z31w4Yuz4CqmU0iectcXmSoM2QJb1mSp9tGs0GHEhtY,19146
-thds/adls/download_lock.py,sha256=tgT48l4C5_qmArGeq05gl7VlxT22dZBH2Xwxx0itE9o,3176
+thds/adls/download.py,sha256=IPg5nz_sGE7dX8DUQyWjG2D9z54PXLScap-pZzTUFTk,19142
 thds/adls/errors.py,sha256=6NMLHtVNsWBRDXaes9yzHj9cwKOD9t1dwL4BltdtjhU,1895
 thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
+thds/adls/file_lock.py,sha256=yLak5XDpnIYwfUNdpGFbIGG64uEs98-yVscNpJlqMxM,3176
 thds/adls/file_properties.py,sha256=dhRtbsMNOYfExkEiy76wrLfrJ6IMQeN1Z3LIxgKceqY,2042
 thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
 thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
 thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
 thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
-thds/adls/list_fast.py,sha256=7jHnln4DMWYVLHhejj-fdWMBWflBiWfynegKxcUlNDY,4189
+thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
 thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
 thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
 thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
 thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
 thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
-thds/adls/source.py,sha256=8HVMYuxDn1XYGwFFSBowMlvQ6r2Jm2CQlpu4h85JvsE,2559
-thds/adls/source_tree.py,sha256=gl2JLjxAduo4cGQBb8LqBnmRHHk2wqIC5yt-sqkXOEo,2589
-thds/adls/upload.py,sha256=MRHK9Am-x5FKBPh1SXLTbPC1r0Xk0bGWNU8CcNuUMLo,6602
+thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
+thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
+thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
 thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
 thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
 thds/adls/azcopy/download.py,sha256=FOtYyYh7ZXNWNdkj04yTV26lxcKOVj-YhS2p_EclYxA,6526
@@ -38,8 +39,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
 thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
 thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
 thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
-thds_adls-4.2.20250926202021.dist-info/METADATA,sha256=WTNhMwL8Xk0cJBvpuqUA4NjZDw55hC1ra4VbiJqFX7o,587
-thds_adls-4.2.20250926202021.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-thds_adls-4.2.20250926202021.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
-thds_adls-4.2.20250926202021.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
-thds_adls-4.2.20250926202021.dist-info/RECORD,,
+thds_adls-4.4.20251117191451.dist-info/METADATA,sha256=phV7EH6lnptlnQYY5TSfyZZk0Wiv0wqZ6L6o7pcP4UM,4586
+thds_adls-4.4.20251117191451.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.4.20251117191451.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
+thds_adls-4.4.20251117191451.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.4.20251117191451.dist-info/RECORD,,
thds_adls-4.2.20250926202021.dist-info/METADATA REMOVED
@@ -1,21 +0,0 @@
-Metadata-Version: 2.4
-Name: thds.adls
-Version: 4.2.20250926202021
-Summary: ADLS tools
-Author-email: Trilliant Health <info@trillianthealth.com>
-License: MIT
-Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.1
-Requires-Dist: aiostream>=0.4.5
-Requires-Dist: azure-identity>=1.9
-Requires-Dist: azure-storage-file-datalake>=12.6
-Requires-Dist: blake3
-Requires-Dist: filelock>=3.0
-Requires-Dist: xxhash
-Requires-Dist: thds-core
-
-# adls Library
-
-A port of `core.adls`.