thds.adls 4.1.20250701001205__py3-none-any.whl → 4.1.20250701190349__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/adls/__init__.py +10 -5
- thds/adls/_upload.py +54 -41
- thds/adls/azcopy/__init__.py +1 -1
- thds/adls/azcopy/download.py +66 -100
- thds/adls/azcopy/login.py +39 -0
- thds/adls/azcopy/progress.py +49 -0
- thds/adls/azcopy/system_resources.py +26 -0
- thds/adls/azcopy/upload.py +95 -0
- thds/adls/{cached_up_down.py → cached.py} +21 -16
- thds/adls/conf.py +1 -0
- thds/adls/download.py +129 -152
- thds/adls/download_lock.py +9 -2
- thds/adls/errors.py +10 -2
- thds/adls/file_properties.py +8 -0
- thds/adls/hashes.py +147 -0
- thds/adls/impl.py +3 -4
- thds/adls/md5.py +5 -52
- thds/adls/ro_cache.py +1 -2
- thds/adls/source.py +37 -34
- thds/adls/tools/download.py +3 -3
- thds/adls/tools/upload.py +3 -4
- thds/adls/upload.py +162 -0
- thds/adls/uri.py +6 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250701190349.dist-info}/METADATA +1 -1
- thds_adls-4.1.20250701190349.dist-info/RECORD +42 -0
- thds/adls/resource/__init__.py +0 -36
- thds/adls/resource/core.py +0 -77
- thds/adls/resource/file_pointers.py +0 -54
- thds/adls/resource/up_down.py +0 -242
- thds_adls-4.1.20250701001205.dist-info/RECORD +0 -40
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250701190349.dist-info}/WHEEL +0 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250701190349.dist-info}/entry_points.txt +0 -0
- {thds_adls-4.1.20250701001205.dist-info → thds_adls-4.1.20250701190349.dist-info}/top_level.txt +0 -0
thds/adls/__init__.py
CHANGED
@@ -1,16 +1,21 @@
-from thds
+from thds import core
 
-from . import abfss, defaults, etag, fqn,
-from .
+from . import abfss, defaults, etag, fqn, hashes, named_roots, source, source_tree, uri  # noqa: F401
+from .cached import download_directory, download_to_cache, upload_through_cache  # noqa: F401
 from .copy import copy_file, copy_files, wait_for_copy  # noqa: F401
 from .errors import BlobNotFoundError  # noqa: F401
 from .fqn import *  # noqa: F401,F403
 from .global_client import get_global_client, get_global_fs_client  # noqa: F401
 from .impl import *  # noqa: F401,F403
 from .ro_cache import Cache, global_cache  # noqa: F401
+from .upload import upload  # noqa: F401
 from .uri import UriIsh, parse_any, parse_uri, resolve_any, resolve_uri  # noqa: F401
 
-__version__ = meta.get_version(__name__)
-metadata = meta.read_metadata(__name__)
+__version__ = core.meta.get_version(__name__)
+metadata = core.meta.read_metadata(__name__)
 __basepackage__ = __name__
 __commit__ = metadata.git_commit
+
+hashes.register_hashes()
+# SPOOKY: without the above line, the hashing algorithms will not be registered with thds.core.hash_cache,
+# which will be bad for core.Source as well as uploads and downloads.
thds/adls/_upload.py
CHANGED
@@ -2,15 +2,16 @@
 
 Not an officially-published API of the thds.adls library.
 """
+
 import typing as ty
 from pathlib import Path
 
 import azure.core.exceptions
-from azure.storage.blob import ContentSettings
 
-from thds.core import hostname, log
+from thds.core import hash_cache, hashing, hostname, log
 
-from .
+from . import hashes
+from .file_properties import PropertiesP
 
 _SKIP_ALREADY_UPLOADED_CHECK_IF_MORE_THAN_BYTES = 2 * 2**20  # 2 MB is about right
 
@@ -18,19 +19,26 @@ _SKIP_ALREADY_UPLOADED_CHECK_IF_MORE_THAN_BYTES = 2 * 2**20  # 2 MB is about rig
 logger = log.getLogger(__name__)
 
 
-def
-    """Ideally, we calculate
+def _try_default_hash(data: hashes.AnyStrSrc) -> ty.Optional[hashing.Hash]:
+    """Ideally, we calculate a hash/checksum for all data that we upload.
 
     The only circumstances under which we cannot do this are if the
     stream does not exist in its entirety before the upload begins.
     """
-
-
-
+    hasher = hashes.default_hasher()
+    hbytes = None
+    if isinstance(data, Path):
+        hbytes = hash_cache.hash_file(data, hasher)
+    elif hashing.hash_anything(data, hasher):
+        hbytes = hasher.digest()
+
+    if hbytes:
+        return hashing.Hash(hasher.name.lower(), hbytes)
+
     return None
 
 
-def _too_small_to_skip_upload(data: AnyStrSrc, min_size_for_remote_check: int) -> bool:
+def _too_small_to_skip_upload(data: hashes.AnyStrSrc, min_size_for_remote_check: int) -> bool:
     def _len() -> int:
         if isinstance(data, Path) and data.exists():
             return data.stat().st_size
@@ -45,49 +53,58 @@ def _too_small_to_skip_upload(data: AnyStrSrc, min_size_for_remote_check: int) -
 
 class UploadDecision(ty.NamedTuple):
     upload_required: bool
-
+    metadata: ty.Dict[str, str]
 
 
-
-
-
+def metadata_for_upload() -> ty.Dict[str, str]:
+    return {"upload_wrapper_sw": "thds.adls", "upload_hostname": hostname.friendly()}
+
 
+def _co_upload_decision_unless_file_present_with_matching_checksum(
+    data: hashes.AnyStrSrc, min_size_for_remote_check: int
+) -> ty.Generator[bool, ty.Optional[PropertiesP], UploadDecision]:
+    local_hash = _try_default_hash(data)
+    if not local_hash:
+        return UploadDecision(True, metadata_for_upload())
 
-
-
-) -> ty.Generator[bool, ty.Optional[Properties], UploadDecision]:
-    local_content_settings = _get_checksum_content_settings(data)
-    if not local_content_settings:
-        return UploadDecision(True, None)
+    hash_meta = hashes.metadata_hash_dict(local_hash)
+    metadata = dict(metadata_for_upload(), **hash_meta)
     if _too_small_to_skip_upload(data, min_size_for_remote_check):
         logger.debug("Too small to bother with an early call - let's just upload...")
-        return UploadDecision(True,
+        return UploadDecision(True, metadata)
+
     remote_properties = yield True
     if not remote_properties:
         logger.debug("No remote properties could be fetched so an upload is required")
-        return UploadDecision(True,
-
-
-
-
-
+        return UploadDecision(True, metadata)
+
+    remote_hashes = hashes.extract_hashes_from_props(remote_properties)
+    for algo in remote_hashes:
+        mkey = hashes.metadata_hash_b64_key(algo)
+        if mkey in hash_meta and hashing.b64(remote_hashes[algo].bytes) == hash_meta[mkey]:
+            logger.info(f"Remote file {remote_properties.name} already exists and has matching checksum")
+            return UploadDecision(False, metadata)
+
+    print(remote_hashes, hash_meta)
+    logger.debug("Remote file exists but hash does not match - upload required.")
+    return UploadDecision(True, metadata)
 
 
 doc = """
 Returns False for upload_required if the file is large and the remote
 exists and has a known, matching checksum.
 
-Returns
+Returns a metadata dict that should be added to any upload.
 """
 
 
-async def
-    get_properties: ty.Callable[[], ty.Awaitable[
-    data: AnyStrSrc,
+async def async_upload_decision_and_metadata(
+    get_properties: ty.Callable[[], ty.Awaitable[PropertiesP]],
+    data: hashes.AnyStrSrc,
     min_size_for_remote_check: int = _SKIP_ALREADY_UPLOADED_CHECK_IF_MORE_THAN_BYTES,
 ) -> UploadDecision:
     try:
-        co =
+        co = _co_upload_decision_unless_file_present_with_matching_checksum(
             data, min_size_for_remote_check
         )
         while True:
@@ -100,13 +117,13 @@ async def async_upload_decision_and_settings(
         return stop.value
 
 
-def
-    get_properties: ty.Callable[[],
-    data: AnyStrSrc,
+def upload_decision_and_metadata(
+    get_properties: ty.Callable[[], PropertiesP],
+    data: hashes.AnyStrSrc,
     min_size_for_remote_check: int = _SKIP_ALREADY_UPLOADED_CHECK_IF_MORE_THAN_BYTES,
 ) -> UploadDecision:
     try:
-        co =
+        co = _co_upload_decision_unless_file_present_with_matching_checksum(
             data, min_size_for_remote_check
         )
         while True:
@@ -119,9 +136,5 @@ def upload_decision_and_settings(
         return stop.value
 
 
-
-
-
-
-def metadata_for_upload() -> ty.Dict[str, str]:
-    return {"upload_wrapper_sw": "thds.adls", "upload_hostname": hostname.friendly()}
+async_upload_decision_and_metadata.__doc__ = doc
+upload_decision_and_metadata.__doc__ = doc
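For orientation, a hypothetical caller of the renamed decision helper might look like the sketch below. Only the `upload_decision_and_metadata` signature and the `UploadDecision(upload_required, metadata)` shape come from the diff above; the `file_client` wiring and the `upload_data` call are assumptions based on the Azure Data Lake SDK.

from pathlib import Path

from thds.adls._upload import upload_decision_and_metadata


def put_if_needed(file_client, local: Path) -> None:
    # file_client is assumed to be an azure.storage.filedatalake.DataLakeFileClient
    decision = upload_decision_and_metadata(file_client.get_file_properties, local)
    if decision.upload_required:
        with open(local, "rb") as f:
            # the hash metadata from the decision is attached to the blob,
            # so a later upload of identical bytes can be skipped
            file_client.upload_data(f, overwrite=True, metadata=decision.metadata)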
thds/adls/azcopy/__init__.py
CHANGED
@@ -1 +1 @@
-from . import download  # noqa
+from . import download, upload  # noqa: F401
thds/adls/azcopy/download.py
CHANGED
@@ -7,161 +7,121 @@
 # very end of the download), so for local users who don't have huge bandwidth, it's likely
 # a better user experience to disable this globally.
 import asyncio
-import json
-import os
 import subprocess
 import typing as ty
-import
-from
+from contextlib import nullcontext
+from dataclasses import dataclass
 from pathlib import Path
 
 from azure.storage.filedatalake import DataLakeFileClient
 
-from thds.core import
+from thds.core import config, log
 
-from .. import
+from .. import conf
+from . import login, progress, system_resources
 
-DONT_USE_AZCOPY = config.item("dont_use", default=
-
-_AZCOPY_LOGIN_WORKLOAD_IDENTITY = "azcopy login --login-type workload".split()
-_AZCOPY_LOGIN_LOCAL_STATUS = "azcopy login status".split()
-# device login is an interactive process involving a web browser,
-# which is not acceptable for large scale automation.
-# So instead of logging in, we check to see if you _are_ logged in,
-# and if you are, we try using azcopy in the future.
+DONT_USE_AZCOPY = config.item("dont_use", default=False, parse=config.tobool)
+MIN_FILE_SIZE = config.item("min_file_size", default=20 * 10**6, parse=int)  # 20 MB
 
 logger = log.getLogger(__name__)
 
 
-
-
-
-    writer: ty.IO[bytes]
+@dataclass
+class DownloadRequest:
     temp_path: Path
+    size_bytes: int
 
 
-@
-
-
-    return False
-
-    try:
-        subprocess.run(_AZCOPY_LOGIN_WORKLOAD_IDENTITY, check=True, capture_output=True)
-        logger.info("Will use azcopy for downloads in this process...")
-        return True
-
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        pass
-    try:
-        subprocess.run(_AZCOPY_LOGIN_LOCAL_STATUS, check=True)
-        logger.info("Will use azcopy for downloads in this process...", dl=None)
-        return True
-    except FileNotFoundError:
-        logger.info("azcopy is not installed or not on your PATH, so we cannot speed up downloads")
-    except subprocess.CalledProcessError as cpe:
-        logger.warning(
-            "You are not logged in with azcopy, so we cannot speed up downloads."
-            f" Run `azcopy login` to fix this. Return code was {cpe.returncode}"
-        )
-    return False
-
-
-def _azcopy_download_command(dl_file_client: DataLakeFileClient, path: Path) -> ty.List[str]:
-    return ["azcopy", "copy", dl_file_client.url, str(path), "--output-type=json"]
-
+@dataclass
+class SdkDownloadRequest(DownloadRequest):
+    """Use one or the other, but not both, to write the results."""
 
-
-    TotalBytesEnumerated: str
-    TotalBytesTransferred: str
+    writer: ty.IO[bytes]
 
 
-
-
-    MessageContent: AzCopyMessage
+def _is_big_enough_for_azcopy(size_bytes: int) -> bool:
+    return size_bytes >= MIN_FILE_SIZE()
 
 
-def
-
-
-
-
+def should_use_azcopy(file_size_bytes: int) -> bool:
+    return (
+        _is_big_enough_for_azcopy(file_size_bytes)
+        and not DONT_USE_AZCOPY()
+        and login.good_azcopy_login()
     )
 
 
-
-
-
-
-    adls_uri = urllib.parse.unquote(str(uri.parse_uri(http_url)))
-
-    def track(line: str):
-        if not line:
-            return
-
-        try:
-            prog = _parse_azcopy_json_output(line)
-            if prog["MessageType"] == "Progress":
-                tracker(adls_uri, total_written=int(prog["MessageContent"]["TotalBytesTransferred"]))
-        except json.JSONDecodeError:
-            pass
-
-    yield track
-
-
-def _restrict_mem() -> dict:
-    return dict(os.environ, AZCOPY_BUFFER_GB="0.3")
+def _azcopy_download_command(dl_file_client: DataLakeFileClient, path: Path) -> ty.List[str]:
+    # turns out azcopy checks md5 by default - but we we do our own checking, sometimes with faster methods,
+    # and their checking _dramatically_ slows downloads on capable machines, so we disable it.
+    return ["azcopy", "copy", dl_file_client.url, str(path), "--output-type=json", "--check-md5=NoCheck"]
 
 
 def sync_fastpath(
     dl_file_client: DataLakeFileClient,
     download_request: DownloadRequest,
 ) -> None:
-    if
+    if not isinstance(download_request, SdkDownloadRequest):
+        logger.debug("Downloading %s using azcopy", dl_file_client.url)
         try:
             # Run the copy
             process = subprocess.Popen(
                 _azcopy_download_command(dl_file_client, download_request.temp_path),
                 stdout=subprocess.PIPE,
-                stderr=subprocess.
+                stderr=subprocess.STDOUT,
                 text=True,
-                env=
+                env=system_resources.restrict_usage(),
            )
             assert process.stdout
-            with
+            with progress.azcopy_tracker(dl_file_client.url, download_request.size_bytes) as track:
                 for line in process.stdout:
                     track(line)
+
+            process.wait()
+            if process.returncode != 0:
+                raise subprocess.SubprocessError(f"AzCopy failed with return code {process.returncode}")
+            assert (
+                download_request.temp_path.exists()
+            ), f"AzCopy did not create the file at {download_request.temp_path}"
             return  # success
 
-        except (subprocess.
+        except (subprocess.CalledProcessError, FileNotFoundError):
             logger.warning("Falling back to Python SDK for download")
 
-    dl_file_client.
-
-
-
+    logger.debug("Downloading %s using Python SDK", dl_file_client.url)
+    if hasattr(download_request, "writer"):
+        writer_cm = nullcontext(download_request.writer)
+    else:
+        writer_cm = open(download_request.temp_path, "wb")  # type: ignore[assignment]
+    with writer_cm as writer:
+        dl_file_client.download_file(
+            max_concurrency=conf.DOWNLOAD_FILE_MAX_CONCURRENCY(),
+            connection_timeout=conf.CONNECTION_TIMEOUT(),
+        ).readinto(writer)
 
 
 async def async_fastpath(
     dl_file_client: DataLakeFileClient,
     download_request: DownloadRequest,
 ) -> None:
-    # technically it would be 'better' to do this login in an async
+    # technically it would be 'better' to do this login in an async subprocess,
     # but it involves a lot of boilerplate, _and_ we have no nice way to cache
     # the value, which is going to be computed one per process and never again.
     # So we'll just block the async loop for a couple of seconds one time...
-    if
+    if not isinstance(download_request, SdkDownloadRequest):
+        logger.debug("Downloading %s using azcopy", dl_file_client.url)
         try:
             # Run the copy
             copy_proc = await asyncio.create_subprocess_exec(
                 *_azcopy_download_command(dl_file_client, download_request.temp_path),
                 stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.
-                env=
+                stderr=asyncio.subprocess.STDOUT,
+                env=system_resources.restrict_usage(),
             )
             assert copy_proc.stdout
 
             # Feed lines to the tracker asynchronously
-            with
+            with progress.azcopy_tracker(dl_file_client.url, download_request.size_bytes) as track:
                 while True:
                     line = await copy_proc.stdout.readline()
                     if not line:  # EOF
@@ -178,9 +138,15 @@ async def async_fastpath(
         except (subprocess.SubprocessError, FileNotFoundError):
             logger.warning("Falling back to Python SDK for download")
 
-
-
-
-
-
-
+    logger.debug("Downloading %s using Async Python SDK", dl_file_client.url)
+    if hasattr(download_request, "writer"):
+        writer_cm = nullcontext(download_request.writer)
+    else:
+        writer_cm = open(download_request.temp_path, "wb")  # type: ignore[assignment]
+    with writer_cm as writer:
+        reader = await dl_file_client.download_file(  # type: ignore[misc]
+            # TODO - check above type ignore
+            max_concurrency=conf.DOWNLOAD_FILE_MAX_CONCURRENCY(),
+            connection_timeout=conf.CONNECTION_TIMEOUT(),
+        )
+        await reader.readinto(writer)
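A hypothetical caller of the reworked download fast path, based only on the `DownloadRequest`/`SdkDownloadRequest` fields and the `should_use_azcopy`/`sync_fastpath` signatures shown above (the temp-file handling is an assumption):

from pathlib import Path

from thds.adls.azcopy import download as azcopy_download


def fetch(dl_file_client, size_bytes: int, dest: Path) -> None:
    tmp = dest.with_suffix(dest.suffix + ".tmp")
    if azcopy_download.should_use_azcopy(size_bytes):
        # a plain DownloadRequest lets sync_fastpath try azcopy first, falling back to the SDK
        azcopy_download.sync_fastpath(
            dl_file_client, azcopy_download.DownloadRequest(temp_path=tmp, size_bytes=size_bytes)
        )
    else:
        # an SdkDownloadRequest carries a writer and goes straight to the Python SDK path
        with open(tmp, "wb") as writer:
            azcopy_download.sync_fastpath(
                dl_file_client,
                azcopy_download.SdkDownloadRequest(temp_path=tmp, size_bytes=size_bytes, writer=writer),
            )
    tmp.replace(dest)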
thds/adls/azcopy/login.py
ADDED
@@ -0,0 +1,39 @@
+import subprocess
+
+from thds.core import cache, log, scope
+
+_AZCOPY_LOGIN_WORKLOAD_IDENTITY = "azcopy login --login-type workload".split()
+_AZCOPY_LOGIN_LOCAL_STATUS = "azcopy login status".split()
+# device login is an interactive process involving a web browser,
+# which is not acceptable for large scale automation.
+# So instead of logging in, we check to see if you _are_ logged in,
+# and if you are, we try using azcopy in the future.
+logger = log.getLogger(__name__)
+
+
+@cache.locking  # only run this once per process.
+@scope.bound
+def good_azcopy_login() -> bool:
+    scope.enter(log.logger_context(dl=None))
+    try:
+        subprocess.run(_AZCOPY_LOGIN_WORKLOAD_IDENTITY, check=True, capture_output=True)
+        logger.info("Azcopy login with workload identity, so we can use it for large file transfers")
+        return True
+
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        pass
+    try:
+        subprocess.run(_AZCOPY_LOGIN_LOCAL_STATUS, check=True)
+        logger.info("Azcopy login with local token, so we can use it for large file transfers")
+        return True
+
+    except FileNotFoundError:
+        logger.info(
+            "azcopy is not installed or not on your PATH, so we cannot speed up large file transfers"
+        )
+    except subprocess.CalledProcessError as cpe:
+        logger.warning(
+            "You are not logged in with azcopy, so we cannot speed up large file transfers."
+            f" Run `azcopy login` to fix this. Return code was {cpe.returncode}"
+        )
+    return False
thds/adls/azcopy/progress.py
ADDED
@@ -0,0 +1,49 @@
+import json
+import typing as ty
+import urllib.parse
+from contextlib import contextmanager
+
+from .. import _progress, uri
+
+
+class AzCopyMessage(ty.TypedDict):
+    TotalBytesEnumerated: str
+    TotalBytesTransferred: str
+
+
+class AzCopyJsonLine(ty.TypedDict):
+    MessageType: str
+    MessageContent: AzCopyMessage
+
+
+def _parse_azcopy_json_output(line: str) -> AzCopyJsonLine:
+    outer_msg = json.loads(line)
+    return AzCopyJsonLine(
+        MessageType=outer_msg["MessageType"],
+        MessageContent=json.loads(outer_msg["MessageContent"]),
+    )
+
+
+@contextmanager
+def azcopy_tracker(http_url: str, size_bytes: int) -> ty.Iterator[ty.Callable[[str], None]]:
+    """Context manager that tracks progress from AzCopy JSON lines. This works for both async and sync impls."""
+    tracker = _progress.get_global_download_tracker()
+    adls_uri = urllib.parse.unquote(str(uri.parse_uri(http_url)))
+    if size_bytes:
+        tracker.add(adls_uri, total=size_bytes)
+
+    def track(line: str):
+        if not size_bytes:
+            return  # no size, no progress
+
+        if not line:
+            return
+
+        try:
+            prog = _parse_azcopy_json_output(line)
+            if prog["MessageType"] == "Progress":
+                tracker(adls_uri, total_written=int(prog["MessageContent"]["TotalBytesTransferred"]))
+        except json.JSONDecodeError:
+            pass
+
+    yield track
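To make the nested JSON shape concrete: azcopy's --output-type=json lines carry MessageContent as a JSON-encoded string, which is why _parse_azcopy_json_output calls json.loads twice. A small illustrative round trip (the field values are invented; only the nesting matters):

line = (
    '{"MessageType": "Progress", '
    '"MessageContent": "{\\"TotalBytesEnumerated\\": \\"1048576\\", \\"TotalBytesTransferred\\": \\"524288\\"}"}'
)
parsed = _parse_azcopy_json_output(line)
assert parsed["MessageType"] == "Progress"
assert int(parsed["MessageContent"]["TotalBytesTransferred"]) == 524288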
thds/adls/azcopy/system_resources.py
ADDED
@@ -0,0 +1,26 @@
+import os
+from functools import lru_cache
+
+from thds.core import cpus, log
+
+logger = log.getLogger(__name__)
+
+
+@lru_cache
+def restrict_usage() -> dict:
+    num_cpus = cpus.available_cpu_count()
+
+    env = dict(os.environ)
+    if "AZCOPY_BUFFER_GB" not in os.environ:
+        likely_mem_gb_available = num_cpus * 4  # assume 4 GB per CPU core is available
+        # o3 suggested 15% of the total available memory...
+        env["AZCOPY_BUFFER_GB"] = str(likely_mem_gb_available * 0.15)
+    if "AZCOPY_CONCURRENCY" not in os.environ:
+        env["AZCOPY_CONCURRENCY"] = str(int(num_cpus * 2))
+
+    logger.info(
+        "AZCOPY_BUFFER_GB == %s and AZCOPY_CONCURRENCY == %s",
+        env["AZCOPY_BUFFER_GB"],
+        env["AZCOPY_CONCURRENCY"],
+    )
+    return env
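As a worked example of the defaults above: if cpus.available_cpu_count() reports 8 and neither variable is already set in the environment, restrict_usage() returns an env with AZCOPY_BUFFER_GB="4.8" (8 × 4 × 0.15) and AZCOPY_CONCURRENCY="16"; values already present in os.environ are left untouched.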
thds/adls/azcopy/upload.py
ADDED
@@ -0,0 +1,95 @@
+import subprocess
+import typing as ty
+from pathlib import Path
+
+from thds.core import config
+
+from .. import uri
+from . import login, progress, system_resources
+
+DONT_USE_AZCOPY = config.item("dont_use", default=False, parse=config.tobool)
+MIN_FILE_SIZE = config.item("min_file_size", default=20 * 10**6, parse=int)  # 20 MB
+
+
+def build_azcopy_upload_command(
+    source_path: Path,
+    dest: uri.UriIsh,
+    *,
+    content_type: str = "",
+    metadata: ty.Mapping[str, str] = dict(),  # noqa: B006
+    overwrite: bool = True,
+) -> list[str]:
+    """
+    Build azcopy upload command as a list of strings.
+
+    Args:
+        source_path: Path to local file to upload
+        dest_url: Full Azure blob URL (e.g., https://account.blob.core.windows.net/container/blob)
+        content_type: MIME content type
+        metadata: Mapping of metadata key-value pairs
+        overwrite: Whether to overwrite existing blob
+
+    Returns:
+        List of strings suitable for subprocess.run()
+    """
+
+    cmd = ["azcopy", "copy", str(source_path), uri.to_blob_windows_url(dest)]
+
+    if overwrite:
+        cmd.append("--overwrite=true")
+
+    if content_type:
+        cmd.append(f"--content-type={content_type}")
+
+    if metadata:
+        # Format metadata as key1=value1;key2=value2
+        metadata_str = ";".join(f"{k}={v}" for k, v in metadata.items())
+        cmd.append(f"--metadata={metadata_str}")
+
+    cmd.append("--output-type=json")  # for progress tracking
+
+    return cmd
+
+
+def _is_big_enough_for_azcopy(size_bytes: int) -> bool:
+    """
+    Determine if a file is big enough to warrant using azcopy for upload.
+
+    Args:
+        size_bytes: Size of the file in bytes
+
+    Returns:
+        True if the file is big enough, False otherwise
+    """
+    return size_bytes >= MIN_FILE_SIZE()
+
+
+def should_use_azcopy(file_size_bytes: int) -> bool:
+    return (
+        _is_big_enough_for_azcopy(file_size_bytes)
+        and not DONT_USE_AZCOPY()
+        and login.good_azcopy_login()
+    )
+
+
+def run(
+    cmd: ty.Sequence[str],
+    dest: uri.UriIsh,
+    size_bytes: int,
+) -> None:
+    # Run the copy
+    process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        env=system_resources.restrict_usage(),
+    )
+    assert process.stdout
+    with progress.azcopy_tracker(uri.to_blob_windows_url(dest), size_bytes) as track:
+        for line in process.stdout:
+            track(line)
+
+    process.wait()
+    if process.returncode != 0:
+        raise subprocess.SubprocessError(f"AzCopy failed with return code {process.returncode}")
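A hypothetical end-to-end use of the new upload helpers, using only the should_use_azcopy, build_azcopy_upload_command, and run signatures shown above (the destination-URI string and the SDK fallback are assumptions):

from pathlib import Path

from thds.adls.azcopy import upload as azcopy_upload


def push(local: Path, dest_uri: str) -> None:
    size = local.stat().st_size
    if azcopy_upload.should_use_azcopy(size):
        cmd = azcopy_upload.build_azcopy_upload_command(
            local,
            dest_uri,
            content_type="application/octet-stream",
            metadata={"upload_wrapper_sw": "thds.adls"},
        )
        azcopy_upload.run(cmd, dest_uri, size)
    else:
        ...  # fall back to the SDK-based path (thds/adls/upload.py, not shown in this section)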