divbase-cli 0.1.0.dev2__py3-none-any.whl → 0.1.0.dev3__py3-none-any.whl

This diff shows the content changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -0,0 +1,446 @@
+ """
+ Module responsible for downloading and uploading files via pre-signed URLs.
+
+ Grouped into 4 sections:
+ - Downloading singlepart files
+ - Downloading multipart files (based on size threshold)
+ - Uploading singlepart files
+ - Uploading multipart files (based on size threshold)
+
+ Size thresholds are defined in divbase_lib.divbase_constants; the caller decides which function to call.
+
+ Retry logic here uses the stamina library.
+
+ TODO: Consider adding progress bars
+ """
+
+ import logging
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ import httpx
+ import stamina
+
+ from divbase_cli.cli_exceptions import DivBaseAPIConnectionError, DivBaseAPIError
+ from divbase_cli.retries import retry_only_on_retryable_http_errors
+ from divbase_cli.user_auth import make_authenticated_request
+ from divbase_lib.api_schemas.s3 import (
+     AbortMultipartUploadRequest,
+     CompleteMultipartUploadRequest,
+     CompleteMultipartUploadResponse,
+     CreateMultipartUploadRequest,
+     CreateMultipartUploadResponse,
+     GetPresignedPartUrlsRequest,
+     PreSignedDownloadResponse,
+     PreSignedSinglePartUploadResponse,
+     PresignedUploadPartUrlResponse,
+     UploadedPart,
+ )
+ from divbase_lib.divbase_constants import (
+     MAX_S3_API_BATCH_SIZE,
+     S3_MULTIPART_CHUNK_SIZE,
+ )
+ from divbase_lib.exceptions import ChecksumVerificationError
+ from divbase_lib.s3_checksums import calculate_md5_checksum_for_chunk, verify_downloaded_checksum
+
+ logger = logging.getLogger(__name__)
+
+ # Used for multipart file transfers
+ MAX_CONCURRENCY = 10
+
+ MB = 1024 * 1024
+ # Downloads can use any chunk size, but checksum validation must use the matching chunk size.
+ # Uploads must use the same threshold and chunk size as divbase-api,
+ # so those are defined in the shared lib (divbase_lib.divbase_constants).
+ MULTIPART_DOWNLOAD_THRESHOLD = 32 * MB
+ DOWNLOAD_CHUNK_SIZE = 8 * MB
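+ # e.g. a 40 MB object is above MULTIPART_DOWNLOAD_THRESHOLD (32 MB),
+ # so it is downloaded as five 8 MB range requests.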
+
+
+ @dataclass
+ class SuccessfulDownload:
+     """Represents a successfully downloaded file."""
+
+     file_path: Path
+     object_name: str
+
+
+ @dataclass
+ class FailedDownload:
+     """Represents a failed download attempt."""
+
+     object_name: str
+     file_path: Path
+     exception: Exception
+
+
+ @dataclass
+ class DownloadOutcome:
+     """Outcome of attempting to download multiple files."""
+
+     successful: list[SuccessfulDownload]
+     failed: list[FailedDownload]
+
+
+ def download_multiple_pre_signed_urls(
+     pre_signed_urls: list[PreSignedDownloadResponse], verify_checksums: bool, download_dir: Path
+ ) -> tuple[list[SuccessfulDownload], list[FailedDownload]]:
+     """
+     Download files using pre-signed URLs.
+     Returns a tuple of both the successful and failed downloads.
+     """
+     successful_downloads, failed_downloads = [], []
+     with httpx.Client() as client:
+         for obj in pre_signed_urls:
+             output_file_path = download_dir / obj.name
+             object_name = obj.name
+             try:
+                 result = _download_single_pre_signed_url(
+                     httpx_client=client,
+                     pre_signed_url=obj.pre_signed_url,
+                     verify_checksums=verify_checksums,
+                     output_file_path=output_file_path,
+                     object_name=object_name,
+                 )
+             except httpx.HTTPError as err:
+                 output_file_path.unlink(missing_ok=True)  # Clean up possible partial file
+                 result = FailedDownload(object_name=object_name, file_path=output_file_path, exception=err)
+
+             if isinstance(result, SuccessfulDownload):
+                 successful_downloads.append(result)
+             else:
+                 failed_downloads.append(result)
+
+     return successful_downloads, failed_downloads
+
+
+ def _download_single_pre_signed_url(
+     httpx_client: httpx.Client, pre_signed_url: str, verify_checksums: bool, output_file_path: Path, object_name: str
+ ) -> SuccessfulDownload | FailedDownload:
+     """
+     Download a single file using a pre-signed URL.
+     If the file is large enough, we download in chunks.
+     Helper function, do not call directly from outside this module.
+     """
+     content_length, server_checksum = _get_content_length_and_checksum(
+         httpx_client=httpx_client, pre_signed_url=pre_signed_url
+     )
+
+     if content_length < MULTIPART_DOWNLOAD_THRESHOLD:
+         _perform_singlepart_download(
+             httpx_client=httpx_client,
+             pre_signed_url=pre_signed_url,
+             output_file_path=output_file_path,
+         )
+
+     else:
+         logger.info(f"Starting multipart download for large file '{object_name}' of size {content_length} bytes.")
+         _perform_multipart_download(httpx_client, pre_signed_url, output_file_path, content_length)
+
+     if verify_checksums:
+         try:
+             verify_downloaded_checksum(file_path=output_file_path, expected_checksum=server_checksum)
+         except ChecksumVerificationError as err:
+             output_file_path.unlink()
+             return FailedDownload(object_name=object_name, file_path=output_file_path, exception=err)
+
+     return SuccessfulDownload(file_path=output_file_path, object_name=object_name)
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _get_content_length_and_checksum(httpx_client: httpx.Client, pre_signed_url: str) -> tuple[int, str]:
+     """
+     "HEAD" a pre-signed download URL to get its content length and checksum.
+
+     As you can't HEAD a pre-signed GET URL, we do a GET with a Range header that requests only the first byte.
+     Otherwise we would have to be given a separate pre-signed HEAD URL to do this.
+     """
+     with httpx_client.stream("GET", pre_signed_url, headers={"Range": "bytes=0-0"}) as head_response:
+         head_response.raise_for_status()
+         content_range = head_response.headers["Content-Range"]
+         # format is "bytes 0-0/12345", the part after "/" is the total size
+         content_length = int(content_range.split("/")[-1])
+         server_checksum = head_response.headers["ETag"].strip('"')
+     return content_length, server_checksum
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _perform_singlepart_download(httpx_client: httpx.Client, pre_signed_url: str, output_file_path: Path) -> None:
+     """Used for objects smaller than the multipart download threshold."""
+     with httpx_client.stream("GET", pre_signed_url) as response:
+         response.raise_for_status()
+
+         with open(output_file_path, "wb") as file:
+             for chunk in response.iter_bytes(chunk_size=8192):
+                 file.write(chunk)
+
+
+ def _perform_multipart_download(
+     httpx_client: httpx.Client, pre_signed_url: str, output_file_path: Path, content_length: int
+ ) -> None:
+     """
+     Download a large file in multiple chunks using range requests.
+
+     As we write to the file concurrently, the file is first created with the correct size,
+     and then each chunk is written to the correct position in the file.
+     """
+     with open(output_file_path, "wb") as f:
+         f.seek(content_length - 1)
+         f.write(b"\0")
+     with ThreadPoolExecutor(max_workers=MAX_CONCURRENCY) as executor:
+         futures = []
+         for i in range(0, content_length, DOWNLOAD_CHUNK_SIZE):
+             start = i
+             end = min(i + DOWNLOAD_CHUNK_SIZE, content_length)
+             futures.append(
+                 executor.submit(
+                     _download_chunk,
+                     httpx_client,
+                     pre_signed_url,
+                     start,
+                     end,
+                     output_file_path,
+                 )
+             )
+
+         for future in futures:
+             future.result()
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _download_chunk(client: httpx.Client, url: str, start: int, end: int, output_file_path: Path) -> None:
+     """
+     Downloads a specific byte range of a file (aka chunk),
+     and writes it to the correct position in the file.
+     """
+     headers = {"Range": f"bytes={start}-{end - 1}"}  # HTTP byte ranges are inclusive
+     with client.stream("GET", url, headers=headers) as response:
+         response.raise_for_status()
+         with open(output_file_path, "rb+") as f:
+             f.seek(start)
+             for chunk in response.iter_bytes():
+                 f.write(chunk)
+
+
+ @dataclass
+ class SuccessfulUpload:
+     """Represents a successfully uploaded file."""
+
+     file_path: Path
+     object_name: str
+
+
+ @dataclass
+ class FailedUpload:
+     """Represents a failed upload attempt."""
+
+     object_name: str
+     file_path: Path
+     exception: Exception
+
+
+ @dataclass
+ class UploadOutcome:
+     """Outcome of attempting to upload multiple files."""
+
+     successful: list[SuccessfulUpload]
+     failed: list[FailedUpload]
+
+
+ def upload_multiple_singlepart_pre_signed_urls(
+     pre_signed_urls: list[PreSignedSinglePartUploadResponse], all_files: list[Path]
+ ) -> UploadOutcome:
+     """
+     Upload singlepart files using pre-signed PUT URLs.
+     Returns an UploadOutcome object containing the results of the upload attempts.
+     """
+     file_map = {file.name: file for file in all_files}
+
+     successful_uploads, failed_uploads = [], []
+     with httpx.Client() as client:
+         for obj in pre_signed_urls:
+             result = _upload_one_singlepart_pre_signed_url(
+                 httpx_client=client,
+                 pre_signed_url=obj.pre_signed_url,
+                 file_path=file_map[obj.name],
+                 object_name=obj.name,
+                 headers=obj.put_headers,
+             )
+
+             if isinstance(result, SuccessfulUpload):
+                 successful_uploads.append(result)
+             else:
+                 failed_uploads.append(result)
+
+     return UploadOutcome(successful=successful_uploads, failed=failed_uploads)
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _upload_one_singlepart_pre_signed_url(
+     httpx_client: httpx.Client,
+     pre_signed_url: str,
+     file_path: Path,
+     object_name: str,
+     headers: dict[str, str],
+ ) -> SuccessfulUpload | FailedUpload:
+     """
+     Upload one singlepart file to S3 using a pre-signed PUT URL.
+     Helper function, do not call directly from outside this module.
+     """
+     with open(file_path, "rb") as file:
+         try:
+             response = httpx_client.put(pre_signed_url, content=file, headers=headers)
+             response.raise_for_status()
+         except httpx.HTTPError as err:
+             return FailedUpload(object_name=object_name, file_path=file_path, exception=err)
+
+     return SuccessfulUpload(file_path=file_path, object_name=object_name)
+
+
+ ### multipart upload logic below ###
+
+
+ def perform_multipart_upload(
+     project_name: str,
+     divbase_base_url: str,
+     file_path: Path,
+     safe_mode: bool,
+ ) -> SuccessfulUpload | FailedUpload:
+     """
+     Manages the entire multipart upload process for a single file.
+     See docs/development/s3_transfers.md for a high-level overview of the process.
+     """
+     object_name = file_path.name
+     file_size = file_path.stat().st_size
+
+     # 1. Create multipart upload
+     create_request = CreateMultipartUploadRequest(name=object_name, content_length=file_size)
+     response = make_authenticated_request(
+         method="POST",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/s3/upload/multi-part/create?project_name={project_name}",
+         json=create_request.model_dump(),
+     )
+     object_data = CreateMultipartUploadResponse(**response.json())
+
+     # 2. Upload the parts in batches, as the DivBase server limits how many part URLs it returns at once
+     parts_to_request = list(range(1, object_data.number_of_parts + 1))
+     try:
+         uploaded_parts: list[UploadedPart] = []
+         for i in range(0, len(parts_to_request), MAX_S3_API_BATCH_SIZE):
+             part_batch_numbers = parts_to_request[i : i + MAX_S3_API_BATCH_SIZE]
+             part_urls = _get_part_urls(
+                 project_name=project_name,
+                 divbase_base_url=divbase_base_url,
+                 object_name=object_name,
+                 upload_id=object_data.upload_id,
+                 part_numbers=part_batch_numbers,
+                 file_path=file_path,
+                 safe_mode=safe_mode,
+             )
+             batch_uploads = _upload_parts(part_urls=part_urls, file_path=file_path)
+             uploaded_parts.extend(batch_uploads)
+
+         # 3. Complete multipart upload
+         complete_request_body = CompleteMultipartUploadRequest(
+             name=object_name,
+             upload_id=object_data.upload_id,
+             parts=uploaded_parts,
+         )
+         response = make_authenticated_request(
+             method="POST",
+             divbase_base_url=divbase_base_url,
+             api_route=f"v1/s3/upload/multi-part/complete?project_name={project_name}",
+             json=complete_request_body.model_dump(),
+         )
+         completed_upload = CompleteMultipartUploadResponse(**response.json())
+         return SuccessfulUpload(file_path=file_path, object_name=completed_upload.name)
+
+     # 4. If any unexpected error occurs, abort the multipart upload
+     # to avoid leaving incomplete uploads in S3
+     except Exception as e:
+         try:
+             abort_request = AbortMultipartUploadRequest(name=object_name, upload_id=object_data.upload_id)
+             make_authenticated_request(
+                 method="DELETE",
+                 divbase_base_url=divbase_base_url,
+                 api_route=f"v1/s3/upload/multi-part/abort?project_name={project_name}",
+                 json=abort_request.model_dump(),
+             )
+         except (DivBaseAPIConnectionError, DivBaseAPIError):
+             logger.error(f"Failed to abort multipart upload for object '{object_name}' after an upload error.")
+
+         return FailedUpload(object_name=object_name, file_path=file_path, exception=e)
+
+
+ def _get_part_urls(
+     project_name: str,
+     divbase_base_url: str,
+     object_name: str,
+     upload_id: str,
+     part_numbers: list[int],
+     file_path: Path,
+     safe_mode: bool,
+ ) -> list[PresignedUploadPartUrlResponse]:
+     """
+     Gets a batch of up to 100 pre-signed URLs from the DivBase server for uploading parts of a large file to S3.
+
+     Not responsible for uploading the parts, just getting the URLs.
+     """
+     md5_checksums = None
+     if safe_mode:
+         md5_checksums = []
+         for part_num in part_numbers:
+             checksum = calculate_md5_checksum_for_chunk(
+                 file_path=file_path,
+                 start_byte=(part_num - 1) * S3_MULTIPART_CHUNK_SIZE,
+                 chunk_size=S3_MULTIPART_CHUNK_SIZE,
+             )
+             md5_checksums.append(checksum)
+
+     request_body = GetPresignedPartUrlsRequest(
+         name=object_name,
+         upload_id=upload_id,
+         parts_range_start=part_numbers[0],
+         parts_range_end=part_numbers[-1],
+         md5_checksums=md5_checksums,
+     )
+     response = make_authenticated_request(
+         method="POST",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/s3/upload/multi-part/part-urls?project_name={project_name}",
+         json=request_body.model_dump(),
+     )
+     return [PresignedUploadPartUrlResponse(**item) for item in response.json()]
+
+
+ def _upload_parts(part_urls: list[PresignedUploadPartUrlResponse], file_path: Path) -> list[UploadedPart]:
+     """Uploads a batch of parts in parallel and returns their ETag info."""
+     completed_parts = []
+     with ThreadPoolExecutor(max_workers=MAX_CONCURRENCY) as executor:
+         future_to_part = {executor.submit(_upload_chunk, part=part, file_path=file_path): part for part in part_urls}
+         for future in as_completed(future_to_part):
+             part_number, etag = future.result()
+             completed_parts.append(UploadedPart(part_number=part_number, etag=etag))
+     return completed_parts
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _upload_chunk(part: PresignedUploadPartUrlResponse, file_path: Path) -> tuple[int, str]:
+     """Uploads a single chunk of a file to a pre-signed URL and returns its part number and ETag."""
+
+     start_byte = (part.part_number - 1) * S3_MULTIPART_CHUNK_SIZE
+     with open(file_path, "rb") as f:
+         f.seek(start_byte)
+         data_to_upload = f.read(S3_MULTIPART_CHUNK_SIZE)
+
+     with httpx.Client() as client:
+         response = client.put(
+             part.pre_signed_url,
+             content=data_to_upload,
+             headers=part.headers,
+             timeout=httpx.Timeout(5.0, write=30.0),
+         )
+         response.raise_for_status()
+         # ETag is returned with quotes, which must be stripped prior to comparison
+         etag = response.headers["ETag"].strip('"')
+     return part.part_number, etag
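
A minimal sketch of how a caller might drive this module, assuming the pre-signed URL responses have already been fetched from the DivBase API. The pre_signed list and the SINGLEPART_UPLOAD_LIMIT constant below are hypothetical stand-ins for illustration; the real upload threshold lives in divbase_lib.divbase_constants and the dispatch itself happens outside this module:

    from pathlib import Path

    SINGLEPART_UPLOAD_LIMIT = 32 * 1024 * 1024  # assumed value, for illustration only

    # pre_signed: list[PreSignedDownloadResponse], obtained from the DivBase API beforehand
    ok, failed = download_multiple_pre_signed_urls(
        pre_signed_urls=pre_signed, verify_checksums=True, download_dir=Path("downloads")
    )
    for failure in failed:
        print(f"could not download {failure.object_name}: {failure.exception}")

    # The caller decides between singlepart and multipart upload based on file size.
    for file_path in (Path("small.vcf"), Path("large.vcf.gz")):
        if file_path.stat().st_size >= SINGLEPART_UPLOAD_LIMIT:
            perform_multipart_upload(
                project_name="my-project",
                divbase_base_url="https://divbase.example.org",
                file_path=file_path,
                safe_mode=True,
            )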
@@ -0,0 +1,77 @@
+ """
+ Service layer for DivBase CLI project version operations.
+ """
+
+ from divbase_cli.user_auth import make_authenticated_request
+ from divbase_lib.api_schemas.project_versions import (
+     AddVersionRequest,
+     AddVersionResponse,
+     DeleteVersionRequest,
+     DeleteVersionResponse,
+     ProjectVersionDetailResponse,
+     ProjectVersionInfo,
+ )
+
+
+ def add_version_command(project_name: str, divbase_base_url: str, name: str, description: str) -> AddVersionResponse:
+     """Add a new version to the project versions table stored on the DivBase server."""
+     request_data = AddVersionRequest(name=name, description=description)
+
+     response = make_authenticated_request(
+         method="PATCH",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/project-versions/add?project_name={project_name}",
+         json=request_data.model_dump(),
+     )
+
+     return AddVersionResponse(**response.json())
+
+
+ def list_versions_command(project_name: str, include_deleted: bool, divbase_base_url: str) -> list[ProjectVersionInfo]:
+     """
+     List all versions in the project versions table stored on the DivBase server.
+     Returns a list of ProjectVersionInfo objects, one per version.
+     """
+     response = make_authenticated_request(
+         method="GET",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/project-versions/list?project_name={project_name}&include_deleted={str(include_deleted).lower()}",
+     )
+
+     return [ProjectVersionInfo(**version) for version in response.json()]
+
+
+ def get_version_details_command(
+     project_name: str, divbase_base_url: str, version_name: str
+ ) -> ProjectVersionDetailResponse:
+     """Get details about a specific project version, including all files and their version IDs at that version."""
+     response = make_authenticated_request(
+         method="GET",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/project-versions/version_details?project_name={project_name}&version_name={version_name}",
+     )
+
+     return ProjectVersionDetailResponse(**response.json())
+
+
+ def delete_version_command(project_name: str, divbase_base_url: str, version_name: str) -> DeleteVersionResponse:
+     """
+     Delete a version from the project versions table stored on the DivBase server.
+     This soft-deletes the version server-side; it will be permanently deleted after some grace period.
+     """
+     request_data = DeleteVersionRequest(version_name=version_name)
+
+     response = make_authenticated_request(
+         method="DELETE",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/project-versions/delete?project_name={project_name}",
+         json=request_data.model_dump(),
+     )
+
+     return DeleteVersionResponse(**response.json())
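
A minimal usage sketch for this service layer, assuming an authenticated session and a reachable DivBase server; the base URL and project name below are placeholders:

    base_url = "https://divbase.example.org"

    add_version_command(
        project_name="my-project",
        divbase_base_url=base_url,
        name="v1",
        description="First tagged state of the project",
    )
    for version in list_versions_command(
        project_name="my-project", include_deleted=False, divbase_base_url=base_url
    ):
        print(version.name)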