divbase-lib 0.1.0.dev1__py3-none-any.whl → 0.1.0.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- divbase_lib/__init__.py +1 -1
- divbase_lib/api_schemas/queries.py +1 -0
- divbase_lib/api_schemas/s3.py +169 -14
- divbase_lib/divbase_constants.py +45 -0
- divbase_lib/exceptions.py +4 -15
- divbase_lib/s3_checksums.py +67 -13
- {divbase_lib-0.1.0.dev1.dist-info → divbase_lib-0.1.0.dev3.dist-info}/METADATA +1 -1
- divbase_lib-0.1.0.dev3.dist-info/RECORD +14 -0
- divbase_lib-0.1.0.dev1.dist-info/RECORD +0 -13
- {divbase_lib-0.1.0.dev1.dist-info → divbase_lib-0.1.0.dev3.dist-info}/WHEEL +0 -0
divbase_lib/__init__.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.1.0.dev1"
+__version__ = "0.1.0.dev3"
divbase_lib/api_schemas/s3.py
CHANGED

@@ -1,12 +1,73 @@
 """
 Schemas for DivBase's S3 API routes.
+
+Pre-signed download URLs do not need to account for single vs multipart as this can be controlled by the client
+using the HTTP range header when downloading (so you only need 1 pre-signed URL per object for download).
+
+Pre-signed upload URLs need to account for single vs multipart uploads hence all the extra schemas below.
 """
 
+from datetime import datetime
+
 from pydantic import BaseModel, Field
 
+from divbase_lib.divbase_constants import S3_MULTIPART_CHUNK_SIZE
+
+MB = 1024 * 1024
+
+
+## list objects models ##
+class ListObjectsRequest(BaseModel):
+    """Request model for listing objects in an S3 bucket."""
+
+    prefix: str | None = Field(None, description="Optional prefix to filter objects by name.")
+    next_token: str | None = Field(
+        None, description="Token to continue listing files from the end of a previous request."
+    )
+
+
+class ObjectDetails(BaseModel):
+    """Details about a single object in an S3 bucket."""
+
+    name: str = Field(..., description="The name of the object in the bucket.")
+    size: int = Field(..., description="The size of the object in bytes.")
+    last_modified: datetime = Field(..., description="The date and time the object was last modified.")
+    etag: str = Field(..., description="The ETag of the object, which is the MD5 checksum.")
+
+
+class ListObjectsResponse(BaseModel):
+    """Response model for listing objects in an S3 bucket."""
+
+    objects: list[ObjectDetails] = Field(
+        ..., description="A list of objects in the bucket.", min_length=0, max_length=1000
+    )
+    next_token: str | None = Field(
+        None, description="Token for fetching the next page of results. If None, no more results."
+    )
+
 
+## file info models ##
+class ObjectVersionInfo(BaseModel):
+    """Detailed information about a single version of an S3 object."""
+
+    version_id: str = Field(..., description="The version ID of the object.")
+    last_modified: datetime = Field(..., description="The date and time the object version was last modified.")
+    size: int = Field(..., description="The size of the object in bytes.")
+    etag: str = Field(..., description="The ETag of the object, which is the MD5 checksum.")
+    is_latest: bool = Field(..., description="Indicates if this is the latest version of the object.")
+
+
+class ObjectInfoResponse(BaseModel):
+    """Response model for detailed information about all versions of a single object stored in S3."""
+
+    object_name: str = Field(..., description="The name of the object.")
+    is_currently_deleted: bool = Field(..., description="True if the latest version of the object is a delete marker.")
+    versions: list[ObjectVersionInfo] = Field(..., description="A list of all versions of the object.")
+
+
+## download models ##
 class DownloadObjectRequest(BaseModel):
-    """Request model to
+    """Request model to download a single object using a pre-signed URL."""
 
     name: str = Field(..., description="Name of the object to be downloaded")
     version_id: str | None = Field(..., description="Version ID of the object, None if latest version")
@@ -20,32 +81,126 @@ class PreSignedDownloadResponse(BaseModel):
     version_id: str | None = Field(..., description="Version ID of the object, None if latest version")
 
 
-
-
+### Single-part upload models ###
+class UploadSinglePartObjectRequest(BaseModel):
+    """Request model to upload a single object as a single part using a pre-signed URL."""
 
     name: str = Field(..., description="Name of the object to be uploaded")
     content_length: int = Field(..., description="Size of the file in bytes")
     md5_hash: str | None = Field(None, description="Optional MD5 hash of the object for integrity check")
 
 
-class
-    """Response model to upload a single object using the pre-signed URL using PUT."""
+class PreSignedSinglePartUploadResponse(BaseModel):
+    """Response model to upload a single object as a single part using the pre-signed URL using PUT."""
 
     name: str = Field(..., description="Name of the object to be uploaded")
     pre_signed_url: str = Field(..., description="Pre-signed URL to which the file should be uploaded")
     put_headers: dict[str, str] = Field(..., description="Headers to be included in the PUT request")
 
 
-
-
+### Multipart upload models ###
+class CreateMultipartUploadRequest(BaseModel):
+    """Request model to create a multipart upload using pre-signed URLs."""
+
+    name: str = Field(..., description="Name of the object to be uploaded")
+    content_length: int = Field(..., description="Size of the file in bytes")
+
+
+class CreateMultipartUploadResponse(BaseModel):
+    """Response model to create a multipart upload using pre-signed URLs."""
+
+    name: str = Field(..., description="Name of the object to be uploaded")
+    upload_id: str = Field(..., description="Upload ID for the multipart upload")
+    number_of_parts: int = Field(..., description="Total number of parts required for the upload", ge=1, le=10000)
+    part_size: int = Field(
+        S3_MULTIPART_CHUNK_SIZE, description="Size of each part in bytes (the last part may be smaller)."
+    )
+
+
+class GetPresignedPartUrlsRequest(BaseModel):
+    """
+    Request model to get pre-signed URLs for multiple parts of a presigned multipart upload.
+
+    You can request up to 100 parts at a time.
+    Part number indexing is 1-based (with max allowed range: 1 to 10000).
+    """
+
+    name: str = Field(..., description="Name of the object to be uploaded")
+    upload_id: str = Field(..., description="Upload ID for the multipart upload")
+    parts_range_start: int = Field(..., description="Starting part number", ge=1, le=10000)
+    parts_range_end: int = Field(..., description="Ending part number", ge=1, le=10000)
+    md5_checksums: list[str] | None = Field(
+        None, description="Optional list of MD5 checksums for each part to be uploaded"
+    )
+
+
+class PresignedUploadPartUrlResponse(BaseModel):
+    """Response model for a pre-signed URL for a single part of a multipart upload."""
+
+    part_number: int = Field(..., description="Part number", ge=1, le=10000)
+    pre_signed_url: str = Field(..., description="Pre-signed URL for uploading this part")
+    headers: dict[str, str] = Field(..., description="Headers to be included in the PUT request for this part")
+
+
+class UploadedPart(BaseModel):
+    """Model representing a part of an object that has been uploaded via multi-part upload."""
+
+    part_number: int = Field(..., description="Part number", ge=1, le=10000)
+    etag: str = Field(description="ETag returned by S3 after uploading the part")
+
+
+class CompleteMultipartUploadRequest(BaseModel):
+    """Request model to complete a multipart upload using pre-signed URLs."""
+
+    name: str = Field(..., description="Name of the object to be uploaded")
+    upload_id: str = Field(..., description="Upload ID for the multipart upload")
+    parts: list[UploadedPart] = Field(..., description="List of parts that have been uploaded")
+
+
+class CompleteMultipartUploadResponse(BaseModel):
+    """Response model to complete a multipart upload using pre-signed URLs."""
+
+    name: str = Field(..., description="Name of the object that was uploaded")
+    version_id: str = Field(..., description="Version ID of the uploaded object")
+    md5_hash: str = Field(..., description="MD5 hash of the uploaded object")
+
+
+class AbortMultipartUploadRequest(BaseModel):
+    """Request model to abort a multipart upload and clean up parts."""
+
+    name: str = Field(..., description="Name of the object being uploaded")
+    upload_id: str = Field(..., description="Upload ID for the multipart upload to be aborted")
+
+
+class AbortMultipartUploadResponse(BaseModel):
+    """Response model to abort a multipart upload."""
+
+    name: str = Field(..., description="Name of the object being uploaded")
+    upload_id: str = Field(..., description="Upload ID for the multipart upload that was aborted")
+
+
+class RestoreObjectsResponse(BaseModel):
+    """Response model for restoring soft-deleted objects in a bucket."""
 
-
-
+    restored: list[str] = Field(
+        ...,
+        description="List of object names that were successfully restored, this includes objects that were already live",
+    )
+    not_restored: list[str] = Field(
+        ...,
+        description=(
+            "List of object names that could not be processed.\n"
+            "This could be due to several reasons:\n"
+            "1. The object does not exist in the bucket (e.g., a typo in the name).\n"
+            "2. The object was hard-deleted and is unrecoverable.\n"
+            "3. An unexpected server error occurred during the restore attempt."
+        ),
+    )
 
 
-
-
+## checksum models ##
+class FileChecksumResponse(BaseModel):
+    """Response model for reporting a file's checksum in the bucket."""
 
-    object_name: str
-    md5_checksum: str
-    matching_object_name: str | None
+    object_name: str = Field(..., description="Name of the object in the bucket")
+    md5_checksum: str = Field(..., description="MD5 checksum of the object in the bucket")
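Read together, the multipart schemas describe a four-step client flow: create the upload (`CreateMultipartUploadRequest` → `CreateMultipartUploadResponse`), fetch pre-signed part URLs in batches of at most 100 (`GetPresignedPartUrlsRequest`), PUT each part to its URL, and complete the upload with the collected ETags (`CompleteMultipartUploadRequest`). A minimal client-side sketch of that flow follows; the base URL, route paths, auth header, and the assumption that the part-URL route returns a JSON list of `PresignedUploadPartUrlResponse` objects are illustrative only, not taken from the package.

```python
import requests

API = "https://divbase.example.org"  # hypothetical base URL
CHUNK = 32 * 1024 * 1024  # mirrors S3_MULTIPART_CHUNK_SIZE (32 MiB)


def multipart_upload(bucket: str, name: str, data: bytes, token: str) -> None:
    """Sketch of the create -> part URLs -> PUT parts -> complete flow."""
    headers = {"Authorization": f"Bearer {token}"}  # hypothetical auth scheme

    # 1. Create the multipart upload (CreateMultipartUploadRequest fields).
    created = requests.post(
        f"{API}/s3/{bucket}/multipart/create",  # hypothetical route
        json={"name": name, "content_length": len(data)},
        headers=headers,
    ).json()
    upload_id, n_parts = created["upload_id"], created["number_of_parts"]

    uploaded = []
    # 2. Fetch pre-signed part URLs in batches of <= 100 (MAX_S3_API_BATCH_SIZE).
    for start in range(1, n_parts + 1, 100):
        end = min(start + 99, n_parts)
        part_urls = requests.post(
            f"{API}/s3/{bucket}/multipart/part-urls",  # hypothetical route
            json={
                "name": name,
                "upload_id": upload_id,
                "parts_range_start": start,
                "parts_range_end": end,
            },
            headers=headers,
        ).json()

        # 3. PUT each 32 MiB slice; S3 answers with the part's ETag.
        for part in part_urls:
            i = part["part_number"]  # 1-based, per the schema
            body = data[(i - 1) * CHUNK : i * CHUNK]
            resp = requests.put(part["pre_signed_url"], data=body, headers=part["headers"])
            # S3 returns the ETag header quoted, so strip the quotes.
            uploaded.append({"part_number": i, "etag": resp.headers["ETag"].strip('"')})

    # 4. Complete the upload (CompleteMultipartUploadRequest fields).
    requests.post(
        f"{API}/s3/{bucket}/multipart/complete",  # hypothetical route
        json={"name": name, "upload_id": upload_id, "parts": uploaded},
        headers=headers,
    )
```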
divbase_lib/divbase_constants.py
ADDED

@@ -0,0 +1,45 @@
+"""
+Constants that both divbase-api and divbase-cli need to agree on.
+"""
+
+ONE_MiB = 1024 * 1024
+
+# When you download a file that has been uploaded in parts, you have
+# to know the part/chunk size used in order to correctly calculate the composite checksum
+S3_MULTIPART_CHUNK_SIZE = 32 * ONE_MiB
+
+# At what point you swap from single part to multipart upload to S3.
+# If server and client used the same threshold then makes life easier
+# when validating the checksums of files in s3 as single part and multipart uploads use different ETag formats.
+# (No benefit in constraining the download threshold, so not done here)
+S3_MULTIPART_UPLOAD_THRESHOLD = 96 * ONE_MiB
+
+# Max number of items that can be processed in a single API call to divbase-api's S3 routes
+# covers e.g pre-signed urls for upload/download, soft delete and checksum comparisons
+# client has to batch requests if exceeding this limit
+MAX_S3_API_BATCH_SIZE = 100
+
+# How long the pre-signed URLs divbase-api creates are valid for
+SINGLE_PART_UPLOAD_URL_EXPIRATION_SECONDS = 3600  # 1 hour
+MULTI_PART_UPLOAD_URL_EXPIRATION_SECONDS = 36000  # 10 hours
+DOWNLOAD_URL_EXPIRATION_SECONDS = 36000  # 10 hours
+
+# (Not used anywhere, just making it explicit)
+# This is limited by our fixing of the chunk size and S3's limit to the number of chunks allowed (10,000)
+# 320 GiB if using 32 MiB chunks
+LARGEST_FILE_UPLOADABLE_TO_DIVBASE_BYTES = 10_000 * S3_MULTIPART_CHUNK_SIZE
+
+# File types that DivBase supports
+# Whilst we can't realistically limit what file types a user actually uploads,
+# this is here to say what we know should work in DivBase.
+SUPPORTED_DIVBASE_FILE_TYPES = (".tsv", ".vcf.gz", ".csi", ".tbi")
+
+# Characters that are not allowed in file names uploaded to DivBase
+# This is to prevent issues when users try to filter/query files on DivBase using these characters
+# or when downloading files (e.g. ":" is used to specify file versions when downloading files
+UNSUPPORTED_CHARACTERS_IN_FILENAMES = (":", "*", "?", "<", ">", "|", "\\")
+
+# This prefix is used for all *.vcf.gz results files from a query job/task.
+# After the prefix comes the job id which is a rolling integer.
+# E.g. format: result_of_job_<job-id>.vcf.gz , where <job-id> = 1 and is auto-incremented for every new job.
+QUERY_RESULTS_FILE_PREFIX = "result_of_job_"
divbase_lib/exceptions.py
CHANGED
@@ -10,20 +10,6 @@ we ensure that when you manually raise a specific exception the error message lo
 from pathlib import Path
 
 
-class ObjectDoesNotExistError(FileNotFoundError):
-    """Raised when an S3 object/key does not exist in the bucket."""
-
-    def __init__(self, key: str, bucket_name: str):
-        error_message = f"The file/object '{key}' does not exist in the bucket '{bucket_name}'. "
-        super().__init__(error_message)
-        self.key = key
-        self.bucket = bucket_name
-        self.error_message = error_message
-
-    def __str__(self):
-        return self.error_message
-
-
 class BcftoolsEnvironmentError(Exception):
     """Raised when there's an issue with the execution environment (Docker, etc.)."""
 
@@ -128,5 +114,8 @@ class ChecksumVerificationError(Exception):
         self.expected_checksum = expected_checksum
         self.calculated_checksum = calculated_checksum
 
-        message =
+        message = (
+            f"Checksum verification failed. Expected: {expected_checksum}, Calculated: {calculated_checksum}"
+            f" The file has been deleted to avoid accidental use of a corrupted file."
+        )
         super().__init__(message)
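Since `ChecksumVerificationError` now carries both checksums as attributes and builds a complete message itself, callers can simply surface it. A hedged usage sketch (the file name and ETag below are placeholders):

```python
from pathlib import Path

from divbase_lib.exceptions import ChecksumVerificationError
from divbase_lib.s3_checksums import verify_downloaded_checksum

try:
    verify_downloaded_checksum(
        file_path=Path("downloaded.vcf.gz"),                   # placeholder local file
        expected_checksum="d41d8cd98f00b204e9800998ecf8427e",  # placeholder S3 ETag
    )
except ChecksumVerificationError as err:
    # Both sides of the mismatch are available as attributes.
    print(f"expected {err.expected_checksum}, got {err.calculated_checksum}")
```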
divbase_lib/s3_checksums.py
CHANGED
@@ -13,31 +13,59 @@ from enum import StrEnum
 from pathlib import Path
 from typing import Iterator
 
+from divbase_lib.divbase_constants import S3_MULTIPART_CHUNK_SIZE
 from divbase_lib.exceptions import ChecksumVerificationError
 
 logger = logging.getLogger(__name__)
 
 
-
+class MD5CheckSumFormat(StrEnum):
+    HEX = "hex"
+    BASE64 = "base64"
+
+
+def verify_downloaded_checksum(
+    file_path: Path,
+    expected_checksum: str,
+) -> None:
+    """
+    Verify a downloaded file against its S3's ETag.
+
+    For files uploaded as single part, this is just the MD5 checksum in hex format.
+    For files uploaded as multipart, this is a composite checksum of all the parts
+    """
+    if "-" in expected_checksum:
+        calculated_checksum = calculate_composite_md5_s3_etag(file_path)
+    else:
+        calculated_checksum = calculate_md5_checksum(file_path=file_path, output_format=MD5CheckSumFormat.HEX)
+
+    if calculated_checksum != expected_checksum:
+        raise ChecksumVerificationError(expected_checksum=expected_checksum, calculated_checksum=calculated_checksum)
+
+
+def _read_file_chunks(file_path: Path, chunk_size: int) -> Iterator[bytes]:
     """Helper function to read a file in 'chunk_size' sized chunks."""
 
     with file_path.open(mode="rb") as infile:
         yield from iter(lambda: infile.read(chunk_size), b"")
 
 
-
-
-
-
-
-def calculate_md5_checksum(file_path: Path, output_format: MD5CheckSumFormat) -> str:
+def calculate_md5_checksum(
+    file_path: Path, output_format: MD5CheckSumFormat, chunk_size: int = S3_MULTIPART_CHUNK_SIZE
+) -> str:
     """
     Calculate the MD5 checksum of a file.
     Returns the checksum in either hex-encoded (lowercase) or base64-encoded format.
+
+    Used for:
+    - BASE64: The "Content-MD5" header used in uploads to S3.
+    - HEX: Verifying downloaded files against S3's ETag for the file.
+
+    (only works for files which will be uploaded as single part - not composite/multipart uploads)
     """
     md5_hash = hashlib.md5()
 
-    for chunk in _read_file_chunks(file_path):
+    for chunk in _read_file_chunks(file_path=file_path, chunk_size=chunk_size):
         md5_hash.update(chunk)
 
     if output_format == MD5CheckSumFormat.HEX:
@@ -48,13 +76,39 @@ def calculate_md5_checksum(file_path: Path, output_format: MD5CheckSumFormat) ->
     raise ValueError(f"Unknown output format: {output_format}")
 
 
-def
+def calculate_md5_checksum_for_chunk(file_path: Path, start_byte: int, chunk_size: int) -> str:
+    """
+    Calculate the base64-encoded MD5 checksum for a specific chunk of a file.
+    S3 uses this checksum (Content-MD5 header) when uploading parts of a file.
+    """
+    md5_hash = hashlib.md5()
+    with file_path.open("rb") as f:
+        f.seek(start_byte)
+        chunk = f.read(chunk_size)
+        md5_hash.update(chunk)
+    return base64.b64encode(md5_hash.digest()).decode("utf-8")
+
+
+def calculate_composite_md5_s3_etag(
+    file_path: Path,
+    chunk_size: int = S3_MULTIPART_CHUNK_SIZE,
+) -> str:
     """
-
+    Calculate the composite ETag for a file that was uploaded via multipart upload to S3.
+    This is used to validate the downloaded file's integrity.
+
+    The process involves calculating the MD5 hash of each part, then combining these hashes to form a final ETag.
+    So the part size used here must match the part size used during upload.
     """
-
-
-
+    md5_digests = []
+    part_count = 0
+
+    for chunk in _read_file_chunks(file_path=file_path, chunk_size=chunk_size):
+        md5_digests.append(hashlib.md5(chunk).digest())
+        part_count += 1
+
+    composite_hash = hashlib.md5(b"".join(md5_digests))
+    return f"{composite_hash.hexdigest()}-{part_count}"
 
 
 def convert_checksum_hex_to_base64(hex_checksum: str) -> str:
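The composite-ETag rule implemented above is easiest to see on toy data: MD5 each part, concatenate the raw digests, MD5 that concatenation, and append `-<part count>`. A self-contained demonstration with 2-byte parts standing in for the real 32 MiB ones:

```python
import hashlib

data, part_size = b"abc", 2
# Split into fixed-size parts: [b"ab", b"c"], where the last part may be smaller.
parts = [data[i : i + part_size] for i in range(0, len(data), part_size)]

# MD5 each part, join the *raw* digests, MD5 the result, append "-<part count>".
digests = b"".join(hashlib.md5(p).digest() for p in parts)
etag = f"{hashlib.md5(digests).hexdigest()}-{len(parts)}"

print(etag)  # ends in "-2"; that "-" suffix is what verify_downloaded_checksum keys on
```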
divbase_lib-0.1.0.dev3.dist-info/RECORD
ADDED

@@ -0,0 +1,14 @@
+divbase_lib/__init__.py,sha256=jQHG8OW4TlfIzPKW4IrX9q58EfEf9pDfm2YgO0ydKaA,27
+divbase_lib/divbase_constants.py,sha256=kvY1_Plvwg5PIgUl_G5sTIhpMwnCBp_MwdICgHJ1ErM,2302
+divbase_lib/exceptions.py,sha256=qruN11zJEzPta_bF3wSzn81zx83X2RfNDVEAZfhMan0,4083
+divbase_lib/s3_checksums.py,sha256=pAaGDsxAoP916gRR_70frJXEl2GpWC7D_DVTsYfcPmg,4135
+divbase_lib/api_schemas/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+divbase_lib/api_schemas/auth.py,sha256=RmfoGoRID31r1ZA8O0XiC4Iy2d-gkEdE_75l8EiVxbY,1331
+divbase_lib/api_schemas/project_versions.py,sha256=trv9a_I8CIquCEJEnRXANIKA3Mboe339eu-q_rjaSJE,2353
+divbase_lib/api_schemas/queries.py,sha256=VOWzwegNdjf3HR1dqkfvKuT6qDDnqI-JOSMBVGJ-UuQ,1762
+divbase_lib/api_schemas/s3.py,sha256=ayREJRDMvgcSgSuSwkNX8F04pB-QtsqIXA4nEGa-9i8,9066
+divbase_lib/api_schemas/task_history.py,sha256=BwmnjJl8fvZftDfuE6txUeYR5dv5WYp8GAeamkifvjY,1414
+divbase_lib/api_schemas/vcf_dimensions.py,sha256=o3hKPs_BJMsP4ULikZsuBnDx8CJy9MC66FYahcuSIzg,1276
+divbase_lib-0.1.0.dev3.dist-info/METADATA,sha256=t3o7gpqN8RCjni6n9Bq0U_WYPOmtH_N01FEc4BM1p_k,1564
+divbase_lib-0.1.0.dev3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+divbase_lib-0.1.0.dev3.dist-info/RECORD,,
divbase_lib-0.1.0.dev1.dist-info/RECORD
REMOVED

@@ -1,13 +0,0 @@
-divbase_lib/__init__.py,sha256=7laO2T6HtGHGvqn4SNMbwiGjkxKXUl3tP2KtQ6BHPiA,27
-divbase_lib/exceptions.py,sha256=Ld9_EvV02BP2EudaXbWq5B5YneEedZ7lsKxH6ryk-lA,4442
-divbase_lib/s3_checksums.py,sha256=D_jQAYKpUQf8xFs3M65F_zV_sasQFBJUH6hRwXfN_GE,2175
-divbase_lib/api_schemas/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-divbase_lib/api_schemas/auth.py,sha256=RmfoGoRID31r1ZA8O0XiC4Iy2d-gkEdE_75l8EiVxbY,1331
-divbase_lib/api_schemas/project_versions.py,sha256=trv9a_I8CIquCEJEnRXANIKA3Mboe339eu-q_rjaSJE,2353
-divbase_lib/api_schemas/queries.py,sha256=bdJttYzZpgnaqg-5Z9BVTlpfutFdoo8EUayw6FSHm8o,1746
-divbase_lib/api_schemas/s3.py,sha256=leQRlwnyAiSAMv-4CHdgjT_iPGXkXcpVQBFkcUS-kbs,2006
-divbase_lib/api_schemas/task_history.py,sha256=BwmnjJl8fvZftDfuE6txUeYR5dv5WYp8GAeamkifvjY,1414
-divbase_lib/api_schemas/vcf_dimensions.py,sha256=o3hKPs_BJMsP4ULikZsuBnDx8CJy9MC66FYahcuSIzg,1276
-divbase_lib-0.1.0.dev1.dist-info/METADATA,sha256=VywPx9jbpq8c_k-bWg6cm6F78oZEdQLOcXSOW4nkO-w,1564
-divbase_lib-0.1.0.dev1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-divbase_lib-0.1.0.dev1.dist-info/RECORD,,
{divbase_lib-0.1.0.dev1.dist-info → divbase_lib-0.1.0.dev3.dist-info}/WHEEL
File without changes