thds.adls 3.0.20250116223841__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


thds/adls/__init__.py ADDED
@@ -0,0 +1,15 @@
+ from thds.core import meta
+
+ from . import abfss, defaults, etag, fqn, named_roots, resource, source, uri  # noqa: F401
+ from .cached_up_down import download_directory, download_to_cache, upload_through_cache  # noqa: F401
+ from .errors import BlobNotFoundError  # noqa: F401
+ from .fqn import *  # noqa: F401,F403
+ from .global_client import get_global_client, get_global_fs_client  # noqa: F401
+ from .impl import *  # noqa: F401,F403
+ from .ro_cache import Cache, global_cache  # noqa: F401
+ from .uri import UriIsh, parse_any, parse_uri, resolve_any, resolve_uri  # noqa: F401
+
+ __version__ = meta.get_version(__name__)
+ metadata = meta.read_metadata(__name__)
+ __basepackage__ = __name__
+ __commit__ = metadata.git_commit
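
A minimal sketch of the public surface re-exported above, assuming adls://{account}/{container}/{path} URI strings (the form used by the mappings in thds/adls/dbfs.py below); the account, container, and blob names are hypothetical:

    from thds import adls

    # Hypothetical URI; parse_any and get_global_fs_client are the same helpers
    # that cached_up_down.py uses internally.
    fqn = adls.parse_any("adls://myaccount/mycontainer/some/blob.txt")
    fs_client = adls.get_global_fs_client(fqn.sa, fqn.container)
    print(adls.__version__)
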
thds/adls/_progress.py ADDED
@@ -0,0 +1,193 @@
+ """An app-global progress reporter that attempts to reduce the number
+ of progress reports using either a time delay or by using fancy
+ progress bars.
+ """
+
+ import os
+ import typing as ty
+ from functools import reduce
+ from timeit import default_timer
+
+ from thds.core import log
+
+ logger = log.getLogger(__name__)
+ _1MB = 2**20
+ _UPDATE_INTERVAL_S = 5
+ _SUPPORTS_CR = not bool(os.getenv("CI"))
+ # CI does not support carriage returns.
+ # if we find other cases that don't, we can add them here.
+
+
+ class ProgressState(ty.NamedTuple):
+     start: float
+     total: int
+     n: int
+
+
+ def _dumb_report_progress(desc: str, state: ProgressState):
+     if not state.total:
+         logger.info(f"{desc} complete!")
+         return
+     if not state.n:
+         return  # don't report when nothing has happened yet.
+
+     start, total, n_bytes = state
+     pct = 100 * (n_bytes / total)
+     elapsed = default_timer() - start
+     rate_s = f" at {n_bytes/_1MB/elapsed:,.1f} MiB/s"
+     logger.info(f"{desc}: {n_bytes:,} / {total:,} bytes ({pct:.1f}%){rate_s} in {elapsed:.1f}s")
+
+
+ def _sum_ps(ps: ty.Iterable[ProgressState]) -> ProgressState:
+     return reduce(
+         lambda x, y: ProgressState(min(x.start, y.start), x.total + y.total, y.n + x.n),
+         ps,
+         ProgressState(default_timer(), 0, 0),
+     )
+
+
+ def _blobs(n: list) -> str:
+     if not n:
+         return ""
+     return f" {len(n)} blob" + ("" if len(n) == 1 else "s")
+
+
+ class _Reporter(ty.Protocol):
+     def __call__(self, states: ty.List[ProgressState]):
+         ...
+
+
+ class DumbReporter:
+     def __init__(self, desc: str):
+         self._desc = desc
+         self._started = default_timer()
+         self._last_reported = self._started
+
+     def __call__(self, states: ty.List[ProgressState]):
+         now = default_timer()
+         # two cases that require a report:
+         # 1. it's been a long enough time (update interval) since the last report.
+         if now - self._last_reported > _UPDATE_INTERVAL_S:
+             _dumb_report_progress(self._desc + f" {_blobs(states)}", _sum_ps(states))
+             self._last_reported = now
+         # 2. a download finished _and_ that specific download took longer overall than our update interval.
+         else:
+             for state in states:
+                 if (
+                     state.total
+                     and state.n >= state.total  # download finished
+                     and (now - state.start) > _UPDATE_INTERVAL_S  # and it took a while
+                 ):
+                     # report individually for each download that finished.
+                     _dumb_report_progress(self._desc + f" {_blobs([state])}", state)
+                     # notably, we do not delay the next 'standard' report because of downloads finishing.
+
+
+ class TqdmReporter:
+     """Falls back to DumbReporter if tqdm is not installed."""
+
+     def __init__(self, desc: str):
+         self._desc = desc
+         self._bar = None
+         self._dumb = DumbReporter(desc)
+
+     def __call__(self, states: ty.List[ProgressState]):
+         try:
+             from tqdm import tqdm  # type: ignore
+
+             bar = self._bar
+             state = _sum_ps(states)
+             if not bar and state.total > 0:
+                 bar = tqdm(
+                     total=state.total,
+                     delay=_UPDATE_INTERVAL_S,
+                     mininterval=_UPDATE_INTERVAL_S,
+                     initial=state.n,
+                     unit="byte",
+                     unit_scale=True,
+                 )  # type: ignore
+             if bar:
+                 # if there are zero active states (which is possible),
+                 # n and total will be zero after sum, and we don't
+                 # want to set zeros on an existing non-zero bar.
+                 bar.total = state.total or bar.total
+                 new_n = state.n or bar.n
+                 bar.update(new_n - bar.n)
+                 bar.desc = f"{self._desc}{_blobs(states)}"
+                 if _SUPPORTS_CR:
+                     bar.refresh()
+
+                 if bar.n >= bar.total:
+                     bar.close()
+                     bar = None
+
+             self._bar = bar
+         except ModuleNotFoundError:
+             self._dumb(states)
+
+
+ class Tracker:
+     def __init__(self, reporter: _Reporter):
+         self._progresses: ty.Dict[str, ProgressState] = dict()
+         self._reporter = reporter
+
+     def add(self, key: str, total: int) -> ty.Tuple["Tracker", str]:
+         if total < 0:
+             total = 0
+         self._progresses[key] = ProgressState(default_timer(), total, 0)
+         self._reporter(list(self._progresses.values()))
+         return self, key
+
+     def __call__(self, key: str, written: int):
+         assert written >= 0, f"cannot write negative bytes: {written}"
+         try:
+             start, total, n = self._progresses[key]
+             self._progresses[key] = ProgressState(start, total, n + written)
+             self._reporter(list(self._progresses.values()))
+             if self._progresses[key].n >= total:
+                 del self._progresses[key]
+         except KeyError:
+             self._reporter(list(self._progresses.values()))
+
+
+ _GLOBAL_DN_TRACKER = Tracker(TqdmReporter("thds.adls downloading"))
+ _GLOBAL_UP_TRACKER = Tracker(TqdmReporter("thds.adls uploading"))
+ T = ty.TypeVar("T", bound=ty.IO)
+
+
+ def _proxy_io(io_type: str, stream: T, key: str, total_len: int) -> T:
+     assert io_type in ("read", "write"), io_type
+
+     try:
+         old_io = getattr(stream, io_type)
+         total_len = total_len or len(stream)  # type: ignore
+     except (AttributeError, TypeError):
+         return stream
+
+     if io_type == "read":
+         tracker, _ = _GLOBAL_UP_TRACKER.add(key, total_len)
+     else:
+         tracker, _ = _GLOBAL_DN_TRACKER.add(key, total_len)
+
+     def io(data_or_len: ty.Union[bytes, int]):
+         r = old_io(data_or_len)
+         io_len = (
+             total_len
+             if data_or_len == -1
+             else (len(data_or_len) if isinstance(data_or_len, bytes) else data_or_len)
+         )
+         tracker(key, io_len)
+         return r
+
+     setattr(stream, io_type, io)
+     return stream
+
+
+ def report_download_progress(stream: T, key: str, total: int = 0) -> T:
+     if not total:  # if we don't know how big a download is, we can't report progress.
+         return stream
+     return _proxy_io("write", stream, key, total)
+
+
+ def report_upload_progress(stream: T, key: str, total: int = 0) -> T:
+     return _proxy_io("read", stream, key, total)
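
The two public helpers wrap a stream's read or write method and route byte counts through the module-global trackers. A minimal sketch of wrapping a download sink, assuming the sink allows its write attribute to be replaced (a plain io.BytesIO does); the key and size are hypothetical:

    import io

    from thds.adls._progress import report_download_progress

    sink = io.BytesIO()
    expected_size = 10 * 2**20  # a known total is required to report download progress

    wrapped = report_download_progress(sink, key="mycontainer/some/blob.bin", total=expected_size)
    wrapped.write(b"\x00" * 2**20)  # each write is reported to the global download tracker
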
thds/adls/_upload.py ADDED
@@ -0,0 +1,127 @@
+ """Just utilities for deciding whether or not to upload.
+
+ Not an officially-published API of the thds.adls library.
+ """
+ import typing as ty
+ from pathlib import Path
+
+ import azure.core.exceptions
+ from azure.storage.blob import ContentSettings
+
+ from thds.core import hostname, log
+
+ from .md5 import AnyStrSrc, try_md5
+
+ _SKIP_ALREADY_UPLOADED_CHECK_IF_MORE_THAN_BYTES = 2 * 2**20  # 2 MB is about right
+
+
+ logger = log.getLogger(__name__)
+
+
+ def _get_checksum_content_settings(data: AnyStrSrc) -> ty.Optional[ContentSettings]:
+     """Ideally, we calculate an MD5 sum for all data that we upload.
+
+     The only circumstance under which we cannot do this is if the
+     stream does not exist in its entirety before the upload begins.
+     """
+     md5 = try_md5(data)
+     if md5:
+         return ContentSettings(content_md5=md5)
+     return None
+
+
+ def _too_small_to_skip_upload(data: AnyStrSrc, min_size_for_remote_check: int) -> bool:
+     def _len() -> int:
+         if isinstance(data, Path) and data.exists():
+             return data.stat().st_size
+         try:
+             return len(data)  # type: ignore
+         except TypeError as te:
+             logger.debug(f"failed to get length? {repr(te)} for {data}")
+             return min_size_for_remote_check + 1
+
+     return _len() < min_size_for_remote_check
+
+
+ class UploadDecision(ty.NamedTuple):
+     upload_required: bool
+     content_settings: ty.Optional[ContentSettings]
+
+
+ class Properties(ty.Protocol):
+     name: str
+     content_settings: ContentSettings
+
+
+ def _co_content_settings_for_upload_unless_file_present_with_matching_checksum(
+     data: AnyStrSrc, min_size_for_remote_check: int
+ ) -> ty.Generator[bool, ty.Optional[Properties], UploadDecision]:
+     local_content_settings = _get_checksum_content_settings(data)
+     if not local_content_settings:
+         return UploadDecision(True, None)
+     if _too_small_to_skip_upload(data, min_size_for_remote_check):
+         logger.debug("Too small to bother with an early call - let's just upload...")
+         return UploadDecision(True, local_content_settings)
+     remote_properties = yield True
+     if not remote_properties:
+         logger.debug("No remote properties could be fetched so an upload is required")
+         return UploadDecision(True, local_content_settings)
+     if remote_properties.content_settings.content_md5 == local_content_settings.content_md5:
+         logger.info(f"Remote file {remote_properties.name} already exists and has matching checksum")
+         return UploadDecision(False, local_content_settings)
+     logger.debug("Remote file exists but MD5 does not match - upload required.")
+     return UploadDecision(True, local_content_settings)
+
+
+ doc = """
+ Returns False for upload_required if the file is large and the remote
+ exists and has a known, matching checksum.
+
+ Returns ContentSettings if an MD5 checksum can be calculated.
+ """
+
+
+ async def async_upload_decision_and_settings(
+     get_properties: ty.Callable[[], ty.Awaitable[Properties]],
+     data: AnyStrSrc,
+     min_size_for_remote_check: int = _SKIP_ALREADY_UPLOADED_CHECK_IF_MORE_THAN_BYTES,
+ ) -> UploadDecision:
+     try:
+         co = _co_content_settings_for_upload_unless_file_present_with_matching_checksum(
+             data, min_size_for_remote_check
+         )
+         while True:
+             co.send(None)
+             try:
+                 co.send(await get_properties())
+             except azure.core.exceptions.ResourceNotFoundError:
+                 co.send(None)
+     except StopIteration as stop:
+         return stop.value
+
+
+ def upload_decision_and_settings(
+     get_properties: ty.Callable[[], Properties],
+     data: AnyStrSrc,
+     min_size_for_remote_check: int = _SKIP_ALREADY_UPLOADED_CHECK_IF_MORE_THAN_BYTES,
+ ) -> UploadDecision:
+     try:
+         co = _co_content_settings_for_upload_unless_file_present_with_matching_checksum(
+             data, min_size_for_remote_check
+         )
+         while True:
+             co.send(None)
+             try:
+                 co.send(get_properties())
+             except azure.core.exceptions.ResourceNotFoundError:
+                 co.send(None)
+     except StopIteration as stop:
+         return stop.value
+
+
+ async_upload_decision_and_settings.__doc__ = doc
+ upload_decision_and_settings.__doc__ = doc
+
+
+ def metadata_for_upload() -> ty.Dict[str, str]:
+     return {"upload_wrapper_sw": "thds.adls", "upload_hostname": hostname.friendly()}
thds/adls/abfss.py ADDED
@@ -0,0 +1,24 @@
+ """Translate ADLS URIs to ABFSS URIs (for use with Spark/Hadoop)."""
+ from .fqn import AdlsFqn
+
+ ABFSS_SCHEME = "abfss://"
+
+
+ class NotAbfssUri(ValueError):
+     pass
+
+
+ def from_adls_fqn(fqn: AdlsFqn) -> str:
+     return f"{ABFSS_SCHEME}{fqn.container}@{fqn.sa}.dfs.core.windows.net/{fqn.path.lstrip('/')}"
+
+
+ def from_adls_uri(uri: str) -> str:
+     return from_adls_fqn(AdlsFqn.parse(uri))
+
+
+ def to_adls_fqn(abfss_uri: str) -> AdlsFqn:
+     if not abfss_uri.startswith(ABFSS_SCHEME):
+         raise NotAbfssUri(f"URI does not start with {ABFSS_SCHEME!r}: {abfss_uri!r}")
+     container, rest = abfss_uri[len(ABFSS_SCHEME) :].split("@", 1)
+     sa, path = rest.split(".dfs.core.windows.net/")
+     return AdlsFqn.of(sa, container, path)
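
A quick round-trip sketch; the storage account, container, and path are hypothetical, and AdlsFqn.of is assumed to take (storage_account, container, path), as to_adls_fqn above suggests:

    from thds.adls import abfss
    from thds.adls.fqn import AdlsFqn

    fqn = AdlsFqn.of("myaccount", "mycontainer", "raw/2025/data.parquet")
    spark_uri = abfss.from_adls_fqn(fqn)
    # -> "abfss://mycontainer@myaccount.dfs.core.windows.net/raw/2025/data.parquet"
    back = abfss.to_adls_fqn(spark_uri)  # should equal fqn for simple paths
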
thds/adls/cached_up_down.py ADDED
@@ -0,0 +1,48 @@
+ from pathlib import Path
+
+ from .download import download_or_use_verified
+ from .fqn import AdlsFqn
+ from .global_client import get_global_fs_client
+ from .impl import ADLSFileSystem
+ from .resource.up_down import AdlsHashedResource, upload
+ from .ro_cache import global_cache
+ from .uri import UriIsh, parse_any
+
+
+ def download_to_cache(fqn_or_uri: UriIsh, md5b64: str = "") -> Path:
+     """Downloads directly to the cache and returns a Path to the read-only file.
+
+     This will allow you to download a file 'into' the cache even if
+     you provide no MD5 and the remote file properties do not include
+     one. However, future attempts to reuse the cache will force a
+     re-download if no MD5 is available at that time.
+     """
+     fqn = parse_any(fqn_or_uri)
+     cache_path = global_cache().path(fqn)
+     download_or_use_verified(
+         get_global_fs_client(fqn.sa, fqn.container), fqn.path, cache_path, md5b64, cache=global_cache()
+     )
+     return cache_path
+
+
+ def upload_through_cache(dest: UriIsh, src_path: Path) -> AdlsHashedResource:
+     """Return an AdlsHashedResource, since by definition an upload through the cache must have a known checksum.
+
+     Uses the global client, which is pretty much always what you want.
+     """
+     assert src_path.is_file(), "src_path must be a file."
+     resource = upload(dest, src_path, write_through_cache=global_cache())
+     assert resource, "MD5 should always be calculable for a local path."
+     return resource
+
+
+ def download_directory(fqn: AdlsFqn) -> Path:
+     """Download a directory from an AdlsFqn.
+
+     If you know you only need to download a single file, use download_to_cache.
+     """
+     fs = ADLSFileSystem(fqn.sa, fqn.container)
+     cached_dir_root = global_cache().path(fqn)
+     fs.fetch_directory(fqn.path, cached_dir_root)
+     assert cached_dir_root.is_dir(), "Directory should have been downloaded to the cache."
+     return cached_dir_root
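
A minimal sketch of the cache round trip using the top-level re-exports; the destination URI and local file are hypothetical:

    from pathlib import Path

    from thds.adls import download_to_cache, upload_through_cache

    dest = "adls://myaccount/mycontainer/datasets/example.csv"
    resource = upload_through_cache(dest, Path("example.csv"))  # returns an AdlsHashedResource

    # A later download of the same blob can be verified against the cached copy.
    local_copy: Path = download_to_cache(dest)
    print(local_copy)
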
thds/adls/conf.py ADDED
@@ -0,0 +1,33 @@
+ """This is where fine-tuning environment variables are defined."""
+ from thds.core import config
+
+ # These defaults were tested to perform well (~200 MB/sec) on a 2 core
+ # machine on Kubernetes. Larger numbers did not do any better, but
+ # these numbers did roughly 4x as well as the defaults, which are
+ # concurrency=1 and chunk_get_size=32 MB.
+ #
+ # As always, your mileage may vary.
+ #
+ # For more info, see docs at
+ # https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blob-download-python#specify-data-transfer-options-on-download
+ #
+ # Also see
+ # azure.storage.filedatalake._shared.base_client.create_configuration
+ # for actual details...
+ #
+ DOWNLOAD_FILE_MAX_CONCURRENCY = config.item("download_file_max_concurrency", 4, parse=int)
+ MAX_CHUNK_GET_SIZE = config.item("max_chunk_get_size", 2**20 * 64, parse=int)  # 64MB
+ MAX_SINGLE_GET_SIZE = config.item(
+     "max_single_get_size", 2**20 * 64, parse=lambda i: max(MAX_CHUNK_GET_SIZE(), int(i))
+ )  # 64MB
+ MAX_SINGLE_PUT_SIZE = config.item(
+     "max_single_put_size", 2**20 * 64, parse=lambda i: max(MAX_CHUNK_GET_SIZE(), int(i))
+ )  # 64MB
+
+ # these are for upload
+ # these achieved 380 MB/sec on a 2 core machine on Kubernetes
+ MAX_BLOCK_SIZE = config.item("max_block_put_size", 2**20 * 64, parse=int)  # 64 MB
+ UPLOAD_FILE_MAX_CONCURRENCY = config.item("upload_file_max_concurrency", 10, parse=int)
+ UPLOAD_CHUNK_SIZE = config.item("upload_chunk_size", 2**20 * 100, parse=int)  # 100 MB
+
+ CONNECTION_TIMEOUT = config.item("connection_timeout", 2000, parse=int)  # seconds
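
These items appear to be zero-argument callables that return the current value (the parse= lambdas above call MAX_CHUNK_GET_SIZE() that way); a sketch of reading them, with thds.core.config's override mechanism left out since it is not shown in this diff:

    from thds.adls import conf

    print(conf.DOWNLOAD_FILE_MAX_CONCURRENCY())       # 4 by default
    print(conf.MAX_CHUNK_GET_SIZE() // 2**20, "MiB")  # 64 by default
    print(conf.CONNECTION_TIMEOUT(), "seconds")       # 2000 by default
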
thds/adls/dbfs.py ADDED
@@ -0,0 +1,60 @@
+ import typing as ty
+
+ from .fqn import AdlsFqn, AdlsRoot, join, parse_fqn
+
+ DBFS_SCHEME = "dbfs:/"
+
+ ADLS_TO_SPARK_MAPPING = {
+     "adls://uaapdatascience/data/": "/mnt/datascience/data/",
+     "adls://thdsdatasets/prod-datasets/": "/mnt/datascience/datasets/",
+     "adls://uaapdatascience/hive/": "/mnt/datascience/hive/",
+     "adls://thdsscratch/tmp/": "/mnt/datascience/scratch/",
+ }
+ ADLS_TO_DBFS_MAPPING = {k: join(DBFS_SCHEME, v) for k, v in ADLS_TO_SPARK_MAPPING.items()}
+ # Spark read/write implicitly adds a 'dbfs:/' prefix.
+ SPARK_TO_ADLS_MAPPING = {v: k for k, v in ADLS_TO_SPARK_MAPPING.items()}
+ DBFS_TO_ADLS_MAPPING = {join(DBFS_SCHEME, k): v for k, v in SPARK_TO_ADLS_MAPPING.items()}
+
+
+ def to_adls_root(root_uri: str) -> AdlsRoot:
+     try:
+         return AdlsRoot.parse(
+             DBFS_TO_ADLS_MAPPING[root_uri]
+             if root_uri.startswith(DBFS_SCHEME)
+             else SPARK_TO_ADLS_MAPPING[root_uri]
+         )
+     except KeyError:
+         raise ValueError(f"URI '{root_uri}' does not have a defined ADLS root!")
+
+
+ def to_adls_fqn(fully_qualified_name: str) -> AdlsFqn:
+     mapping = (
+         DBFS_TO_ADLS_MAPPING if fully_qualified_name.startswith(DBFS_SCHEME) else SPARK_TO_ADLS_MAPPING
+     )
+
+     try:
+         dbfs_root, adls_root = next(
+             ((k, v) for k, v in mapping.items() if fully_qualified_name.startswith(k))
+         )
+     except StopIteration:
+         raise ValueError(f"{fully_qualified_name} does not have a defined ADLS path!")
+
+     return parse_fqn(join(adls_root, fully_qualified_name.split(dbfs_root)[1]))
+
+
+ def to_uri(adls_path: ty.Union[AdlsRoot, AdlsFqn], spark: bool = True) -> str:
+     def get_root_uri(adls_root: AdlsRoot) -> str:
+         try:
+             return (
+                 ADLS_TO_SPARK_MAPPING[str(adls_root)] if spark else ADLS_TO_DBFS_MAPPING[str(adls_root)]
+             )
+         except KeyError:
+             raise ValueError(f"{str(adls_root)} does not have a corresponding dbfs root!")
+
+     if isinstance(adls_path, AdlsRoot):
+         return get_root_uri(adls_path)
+
+     try:
+         return join(get_root_uri(adls_path.root()), adls_path.path)
+     except ValueError:
+         raise ValueError(f"{str(adls_path)} does not have a corresponding dbfs path!")
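
A sketch of translating between the mount-style Spark paths and ADLS using only entries from ADLS_TO_SPARK_MAPPING above; the file path is hypothetical and the exact join/parse behavior lives in fqn.py, which is not part of this hunk:

    from thds.adls import dbfs

    fqn = dbfs.to_adls_fqn("/mnt/datascience/data/teams/metrics.parquet")
    # maps onto the adls://uaapdatascience/data/ root

    spark_path = dbfs.to_uri(fqn)             # back to "/mnt/datascience/data/..."
    dbfs_uri = dbfs.to_uri(fqn, spark=False)  # the dbfs:/-prefixed variant
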
thds/adls/defaults.py ADDED
@@ -0,0 +1,26 @@
+ """Prefer using named_containers for new code."""
+
+ from thds.core.env import Env
+
+ from . import fqn, named_roots
+
+ try:
+     import thds.adls._thds_defaults  # noqa: F401
+ except ImportError:
+     pass
+
+
+ def env_root(env: Env = "") -> fqn.AdlsRoot:
+     """In many cases, you may want to call this with no arguments
+     to default to using the THDS_ENV environment variable.
+     """
+     return named_roots.require(env)
+
+
+ def env_root_uri(env: Env = "") -> str:
+     return str(env_root(env))
+
+
+ def mops_root() -> str:
+     """Returns a URI corresponding to the location where mops materialization should be put."""
+     return str(named_roots.require("mops"))
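
A sketch of the helpers above, assuming a THDS_ENV-style environment and that the corresponding named roots are registered elsewhere (e.g. by the optional _thds_defaults import); "dev" is a hypothetical environment name:

    from thds.adls import defaults

    root = defaults.env_root()  # no argument: defaults to the THDS_ENV environment variable
    dev_uri = defaults.env_root_uri("dev")
    mops_uri = defaults.mops_root()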