divbase-lib 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0.dev0"
File without changes
@@ -0,0 +1,34 @@
1
+ """
2
+ Schemas for login + access and refresh tokens
3
+ """
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class CLILoginResponse(BaseModel):
    """Response model for API (aka divbase-cli) login endpoint.

    Returns the full token pair (access + refresh) together with their Unix
    expiry timestamps, plus the authenticated user's email. All fields are required.
    """

    access_token: str = Field(..., description="Bearer access token for authentication")
    access_token_expires_at: int = Field(..., description="Unix timestamp when the access token expires")
    refresh_token: str = Field(..., description="Bearer refresh token for obtaining new access tokens")
    refresh_token_expires_at: int = Field(..., description="Unix timestamp when the refresh token expires")
    email: str = Field(..., description="Email of the authenticated user")
16
+
17
+
18
class RefreshTokenRequest(BaseModel):
    """Request model for refresh token endpoint.

    Carries only the refresh token previously issued by the login endpoint.
    """

    refresh_token: str = Field(..., description="Bearer refresh token for obtaining a new access token")
22
+
23
+
24
class RefreshTokenResponse(BaseModel):
    """Response model for refresh token endpoint.

    Only a new access token is returned here; the refresh token itself is unchanged.
    """

    access_token: str = Field(..., description="Bearer access token for authentication")
    expires_at: int = Field(..., description="Unix timestamp when the access token expires")
29
+
30
+
31
class LogoutRequest(BaseModel):
    """Request model for logout endpoint.

    The supplied refresh token is revoked server-side as part of logout.
    """

    refresh_token: str = Field(..., description="Bearer refresh token to be revoked on logout")
@@ -0,0 +1,62 @@
1
+ """
2
+ Schemas for project versioning routes.
3
+
4
+ Project versions are the state of all files in a project's storage bucket at a given time point.
5
+ """
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
+ # Request Models
11
class CreateVersioningFileRequest(BaseModel):
    """Request model to create a project's versioning file with an initial version."""

    name: str = Field(..., description="Initial version name")
    description: str = Field(..., description="Initial version description")
14
+
15
+
16
class AddVersionRequest(BaseModel):
    """Request model to add a new version to a project."""

    name: str = Field(..., description="Name of the new version to add")
    # Unlike CreateVersioningFileRequest, the description is optional here (defaults to empty).
    description: str = Field("", description="Description of the new version")
19
+
20
+
21
class DeleteVersionRequest(BaseModel):
    """Request model to (soft-)delete a named project version."""

    version_name: str = Field(..., description="Name of the version to delete")
23
+
24
+
25
+ # Response Models
26
class ProjectBasicInfo(BaseModel):
    """Base model for describing a single project version, not for direct use in an endpoint.

    Subclassed by the version response models below, which add timestamps/flags/files.
    """

    name: str = Field(..., description="Version name")
    # Required field, but explicitly nullable: a version may have no description.
    description: str | None = Field(..., description="Version description")
31
+
32
+
33
class AddVersionResponse(ProjectBasicInfo):
    """Response model for adding a version.

    Extends ProjectBasicInfo (name, description) with the creation timestamp.
    """

    created_at: str = Field(..., description="ISO timestamp when version was created")
37
+
38
+
39
class ProjectVersionInfo(ProjectBasicInfo):
    """Basic information about a project version. You get a list of these when listing all versions in a project."""

    created_at: str = Field(..., description="ISO timestamp when version was created")
    is_deleted: bool = Field(..., description="Whether this version has been soft-deleted")
44
+
45
+
46
class ProjectVersionDetailResponse(ProjectBasicInfo):
    """Full information about a single project version, including the files at that version."""

    created_at: str = Field(..., description="ISO timestamp when version was created")
    is_deleted: bool = Field(..., description="Whether this version has been soft-deleted")
    # Keys are object/file names; values are the S3 version IDs pinned by this project version.
    files: dict[str, str] = Field(..., description="Mapping of file names to their version IDs")
52
+
53
+
54
class DeleteVersionResponse(BaseModel):
    """Response model for deleting a version."""

    name: str = Field(..., description="Name of the version that was deleted")
    # Defaults to False; deleting an already-soft-deleted version is reported, not treated as an error.
    already_deleted: bool = Field(
        False,
        description="Whether the version was already soft-deleted before this request",
    )
    date_deleted: str = Field(..., description="ISO timestamp of when the version was soft-deleted")
@@ -0,0 +1,64 @@
1
+ """
2
+ Schemas for query routes.
3
+ """
4
+
5
+ from typing import Any, Optional
6
+
7
+ from pydantic import BaseModel
8
+
9
+
10
+ # Request models
11
class SampleMetadataQueryRequest(BaseModel):
    """Request model for sample metadata query route."""

    # Filter expression applied to the metadata TSV (see SidecarQueryManager usage).
    tsv_filter: str
    # Name of the metadata TSV file in the project bucket to query against.
    metadata_tsv_name: str
16
+
17
+
18
class BcftoolsQueryRequest(BaseModel):
    """Request model for the bcftools query route."""

    tsv_filter: str
    metadata_tsv_name: str
    command: str  # TODO add field to describe that this is bcftools commands
24
+
25
+
26
+ # Models for task kwargs and task results. Reused in task history schemas too, hence pydantic models and not just dataclasses.
27
class SampleMetadataQueryKwargs(BaseModel):
    """Keyword arguments for sample metadata query task. Used to pass info to Celery task, and also for recording task history."""

    tsv_filter: str
    metadata_tsv_name: str
    # Project/bucket/user context so the task can locate data and attribute the run.
    bucket_name: str
    project_id: int
    project_name: str
    user_id: int
36
+
37
+
38
class BcftoolsQueryKwargs(BaseModel):
    """Keyword arguments for BCFtools query task. Used to pass info to Celery task, and also for recording task history."""

    tsv_filter: str
    # The bcftools command string to execute (see BcftoolsQueryRequest.command).
    command: str
    metadata_tsv_name: str
    # Project/bucket/user context so the task can locate data and attribute the run.
    bucket_name: str
    project_id: int
    project_name: str
    user_id: int
48
+
49
+
50
class SampleMetadataQueryTaskResult(BaseModel):
    """Metadata query task result details. Based on the return of tasks.sample_metadata_query."""

    # Rows (as dicts) from the metadata TSV matching the filter.
    sample_and_filename_subset: list[dict[str, Any]]
    unique_sample_ids: list[str]
    unique_filenames: list[str]
    # Human-readable summary of the query outcome.
    query_message: str
    status: Optional[str] = None
58
+
59
+
60
class BcftoolsQueryTaskResult(BaseModel):
    """BCFtools query task result details. Based on the return of tasks.bcftools_query."""

    # Name/path of the result VCF produced by the bcftools run.
    output_file: str
    status: Optional[str] = None
@@ -0,0 +1,51 @@
1
+ """
2
+ Schemas for DivBase's S3 API routes.
3
+ """
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class DownloadObjectRequest(BaseModel):
    """Request model to download a single object using a pre-signed URL."""

    name: str = Field(..., description="Name of the object to be downloaded")
    # Required field, but explicitly nullable: None means "latest version".
    version_id: str | None = Field(..., description="Version ID of the object, None if latest version")
13
+
14
+
15
class PreSignedDownloadResponse(BaseModel):
    """Response model to download a single object using the pre-signed URL and (optionally) version ID."""

    name: str = Field(..., description="Name of the object to be downloaded")
    pre_signed_url: str = Field(..., description="Pre-signed URL for downloading the object")
    version_id: str | None = Field(..., description="Version ID of the object, None if latest version")
21
+
22
+
23
class UploadObjectRequest(BaseModel):
    """Request model to upload a single object using a pre-signed URL."""

    name: str = Field(..., description="Name of the object to be uploaded")
    content_length: int = Field(..., description="Size of the file in bytes")
    md5_hash: str | None = Field(None, description="Optional MD5 hash of the object for integrity check")
29
+
30
+
31
class PreSignedUploadResponse(BaseModel):
    """Response model to upload a single object using the pre-signed URL using PUT."""

    name: str = Field(..., description="Name of the object to be uploaded")
    pre_signed_url: str = Field(..., description="Pre-signed URL to which the file should be uploaded")
    # Clients must send these headers verbatim or the pre-signed PUT will be rejected.
    put_headers: dict[str, str] = Field(..., description="Headers to be included in the PUT request")
37
+
38
+
39
class CheckFileExistsRequest(BaseModel):
    """Request model to check if a file already exists in the bucket (using the checksum)."""

    # Name of the object the client intends to upload.
    object_name: str
    # MD5 checksum of the local file, used for content-based matching.
    md5_checksum: str
44
+
45
+
46
class ExistingFileResponse(BaseModel):
    """Response model for reporting a file that already exists in the bucket (using its checksum)."""

    object_name: str
    md5_checksum: str
    # Name of the bucket object with the same checksum, or None if no match was found.
    matching_object_name: str | None
@@ -0,0 +1,50 @@
1
+ """
2
+ Schemas for task history routes.
3
+ """
4
+
5
+ from typing import Any, Optional, Union
6
+
7
+ from pydantic import BaseModel
8
+
9
+ from divbase_lib.api_schemas.queries import (
10
+ BcftoolsQueryKwargs,
11
+ BcftoolsQueryTaskResult,
12
+ SampleMetadataQueryKwargs,
13
+ SampleMetadataQueryTaskResult,
14
+ )
15
+ from divbase_lib.api_schemas.vcf_dimensions import DimensionUpdateKwargs, DimensionUpdateTaskResult
16
+
17
+
18
class TaskHistoryResult(BaseModel):
    """
    Task details as returned by queries to the SQLAlchemy+pg results backend.
    """

    id: int
    submitter_email: Optional[str] = None
    status: Optional[str] = None
    result: Optional[
        Union[
            dict[
                str, Any
            ],  # Note! This dict must come first here so that error results are preserved and not incorrectly inserted into the result models
            SampleMetadataQueryTaskResult,
            BcftoolsQueryTaskResult,
            DimensionUpdateTaskResult,
        ]
    ] = None
    date_done: Optional[str] = None
    name: Optional[str] = None
    args: Optional[str] = None
    # Kwargs are typed per task flavour (metadata query / bcftools query / dimension update).
    kwargs: Optional[
        Union[
            SampleMetadataQueryKwargs,
            BcftoolsQueryKwargs,
            DimensionUpdateKwargs,
        ]
    ] = None
    worker: Optional[str] = None
    # Timestamps are ISO strings as stored by the results backend — TODO confirm format.
    created_at: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    runtime: Optional[float] = None
@@ -0,0 +1,42 @@
1
+ """
2
+ Schemas for VCF dimensions routes.
3
+ """
4
+
5
+ from typing import Optional
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class DimensionUpdateKwargs(BaseModel):
    """Keyword arguments for dimension update task. Used to pass info to Celery task, and also for recording task history."""

    # Project/bucket/user context so the task can locate data and attribute the run.
    bucket_name: str
    project_id: int
    project_name: str
    user_id: int
17
+
18
+
19
class DimensionUpdateTaskResult(BaseModel):
    """Dimension update task result details. Based on the return of tasks.update_dimensions_index."""

    status: Optional[str] = None
    # NOTE(review): field names keep the non-PEP8 "VCF_" casing — presumably to match
    # the task's return payload keys; confirm before renaming.
    VCF_files_added: Optional[list[str]] = Field(
        None, description="VCF files that were added to dimensions index by this job"
    )
    VCF_files_skipped: Optional[list[str]] = Field(
        None, description="VCF files skipped by this job (previous DivBase-generated result VCFs)"
    )
    VCF_files_deleted: Optional[list[str]] = Field(
        None, description="VCF files that have been deleted from the project and thus have been dropped from the index"
    )
32
+
33
+
34
class DimensionsShowResult(BaseModel):
    """Result model for showing VCF dimensions for a project."""

    project_id: int
    project_name: str
    # Counts plus per-file detail dicts; dict schema is defined by the producing task.
    vcf_file_count: int
    vcf_files: list[dict]
    skipped_file_count: int
    skipped_files: list[dict]
@@ -0,0 +1,132 @@
1
+ """
2
+ Custom exceptions for DivBase packages.
3
+
4
+ These are raised by lower-level functions/methods which understand the context of the error.
5
+
6
+ Note: By adding the `__str__` method to each exception,
7
+ we ensure that when you manually raise a specific exception the error message looks good
8
+ """
9
+
10
+ from pathlib import Path
11
+
12
+
13
class ObjectDoesNotExistError(FileNotFoundError):
    """Raised when an S3 object/key does not exist in the bucket."""

    def __init__(self, key: str, bucket_name: str):
        # Record the lookup context before building the message.
        self.key = key
        self.bucket = bucket_name
        self.error_message = f"The file/object '{key}' does not exist in the bucket '{bucket_name}'. "
        super().__init__(self.error_message)

    def __str__(self):
        return self.error_message
25
+
26
+
27
class BcftoolsEnvironmentError(Exception):
    """Raised when there's an issue with the execution environment (Docker, etc.)."""

    def __init__(self, container_name: str):
        self.container_name = container_name
        # Trailing newline is part of the original message format and is preserved.
        self.error_message = (
            f"No running container found with name {container_name}. Ensure the Docker image is available.\n"
        )
        super().__init__(self.error_message)

    def __str__(self):
        return self.error_message
41
+
42
+
43
class BcftoolsCommandError(Exception):
    """Raised when a bcftools command fails to execute properly.

    Args:
        command: The bcftools command string that failed.
        error_details: Optional underlying exception that caused the failure.
    """

    def __init__(self, command: str, error_details: Exception | None = None):
        self.command = command
        self.error_details = error_details

        error_message = f"bcftools command failed: '{command}'"
        if error_details:
            error_message += f" with error details: {error_details}"

        super().__init__(error_message)

    def __str__(self):
        # Prefer the captured stderr of the underlying error (e.g. from a
        # subprocess failure) when present, as it is usually more informative.
        if hasattr(self.error_details, "stderr") and self.error_details.stderr:
            return f"bcftools command failed: '{self.command}' with error: {self.error_details.stderr}"
        return super().__str__()
60
+
61
+
62
class BcftoolsPipeEmptyCommandError(Exception):
    """Raised when an empty command is provided to the bcftools pipe."""

    def __init__(self):
        self.error_message = "Empty command provided. Please specify at least one valid bcftools command."
        super().__init__(self.error_message)

    def __str__(self):
        return self.error_message
72
+
73
+
74
class BcftoolsPipeUnsupportedCommandError(Exception):
    """Raised when a bcftools command unsupported by the BcftoolsQueryManager class is provided."""

    def __init__(self, command: str, position: int, valid_commands: list[str]):
        self.command = command
        self.position = position
        self.valid_commands = valid_commands

        supported = ", ".join(valid_commands)
        super().__init__(
            f"Unsupported bcftools command '{command}' at position {position}. "
            f"Only the following commands are supported: {supported}"
        )
87
+
88
+
89
+ class SidecarNoDataLoadedError(Exception):
90
+ """Raised when no data is loaded in SidecarQueryManager."""
91
+
92
+ def __init__(self, file_path: Path, submethod: str, error_details: str | None = None):
93
+ self.file_path = file_path
94
+ self.error_details = error_details
95
+
96
+ error_message = f"No data loaded from file '{file_path}', as raised in submethod '{submethod}'."
97
+ if error_details:
98
+ error_message += f"More details about the error: {error_details}"
99
+ super().__init__(error_message)
100
+ self.error_message = error_message
101
+
102
+ def __str__(self):
103
+ return self.error_message
104
+
105
+
106
class SidecarInvalidFilterError(Exception):
    """Raised when an invalid filter is provided to SidecarQueryManager."""
110
+
111
+
112
class SidecarColumnNotFoundError(Exception):
    """Raised when a requested column is not found in the query result."""
116
+
117
+
118
class NoVCFFilesFoundError(Exception):
    """Raised when no VCF files are found in the project bucket."""
122
+
123
+
124
class ChecksumVerificationError(Exception):
    """Raised when a calculated file's checksum does not match the expected value."""

    def __init__(self, expected_checksum: str, calculated_checksum: str):
        self.expected_checksum = expected_checksum
        self.calculated_checksum = calculated_checksum
        # Default Exception.__str__ renders this message; no custom __str__ needed.
        super().__init__(
            f"Checksum verification failed. Expected: {expected_checksum}, Calculated: {calculated_checksum}"
        )
@@ -0,0 +1,66 @@
1
+ """
2
+ Manages creation/validation of S3 object checksums for both file uploads and downloads.
3
+
4
+ We do not support multipart uploads/downloads at this time.
5
+ Single part uploads/downloads have a limit of 5GBs.
6
+ Docs: https://docs.netapp.com/us-en/storagegrid/s3/put-object.html
7
+ """
8
+
9
+ import base64
10
+ import hashlib
11
+ import logging
12
+ from enum import StrEnum
13
+ from pathlib import Path
14
+ from typing import Iterator
15
+
16
+ from divbase_lib.exceptions import ChecksumVerificationError
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def _read_file_chunks(file_path: Path, chunk_size: int = 8192) -> Iterator[bytes]:
    """Yield the contents of ``file_path`` in binary chunks of up to ``chunk_size`` bytes."""

    with file_path.open(mode="rb") as infile:
        # An empty read signals EOF and terminates the loop.
        while chunk := infile.read(chunk_size):
            yield chunk
26
+
27
+
28
class MD5CheckSumFormat(StrEnum):
    """Supported output encodings for an MD5 checksum (see calculate_md5_checksum)."""

    HEX = "hex"
    BASE64 = "base64"
31
+
32
+
33
+ def calculate_md5_checksum(file_path: Path, output_format: MD5CheckSumFormat) -> str:
34
+ """
35
+ Calculate the MD5 checksum of a file.
36
+ Returns the checksum in either hex-encoded (lowercase) or base64-encoded format.
37
+ """
38
+ md5_hash = hashlib.md5()
39
+
40
+ for chunk in _read_file_chunks(file_path):
41
+ md5_hash.update(chunk)
42
+
43
+ if output_format == MD5CheckSumFormat.HEX:
44
+ return md5_hash.hexdigest()
45
+ elif output_format == MD5CheckSumFormat.BASE64:
46
+ return base64.b64encode(md5_hash.digest()).decode("utf-8")
47
+ else:
48
+ raise ValueError(f"Unknown output format: {output_format}")
49
+
50
+
51
def verify_downloaded_checksum(file_path: Path, expected_checksum: str) -> None:
    """
    Verify a downloaded file against S3's ETag (MD5 checksum in hex format).

    Raises ChecksumVerificationError when the calculated hex digest differs.
    """
    actual_md5 = calculate_md5_checksum(file_path=file_path, output_format=MD5CheckSumFormat.HEX)
    if actual_md5 != expected_checksum:
        raise ChecksumVerificationError(expected_checksum=expected_checksum, calculated_checksum=actual_md5)
58
+
59
+
60
def convert_checksum_hex_to_base64(hex_checksum: str) -> str:
    """
    Convert a hex-encoded MD5 checksum to base64-encoded format.
    """
    # Decode the hex digest to raw bytes, then re-encode as base64 text.
    return base64.b64encode(bytes.fromhex(hex_checksum)).decode("utf-8")
@@ -0,0 +1,37 @@
1
+ Metadata-Version: 2.4
2
+ Name: divbase-lib
3
+ Version: 0.1.0.dev0
4
+ Summary: Library module for Divbase
5
+ Project-URL: Homepage, https://divbase.scilifelab.se
6
+ Project-URL: Documentation, https://scilifelabdatacentre.github.io/divbase
7
+ Project-URL: Repository, https://github.com/ScilifelabDataCentre/divbase
8
+ Project-URL: Issues, https://github.com/ScilifelabDataCentre/divbase/issues
9
+ Author-email: SciLifeLab Data Centre <datacentre@scilifelab.se>
10
+ License-Expression: MIT
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: boto3<2,>=1.42.27
22
+ Requires-Dist: pandas<3,>=2.3.3
23
+ Requires-Dist: pydantic<3,>=2.12.5
24
+ Requires-Dist: pyyaml<7,>=6.0.3
25
+ Description-Content-Type: text/markdown
26
+
27
+ # divbase-lib
28
+
29
+ This is the library package for Divbase. Code from this library is shared between the Divbase CLI and API packages.
30
+
31
+ ## Installation and usage
32
+
33
+ It is highly unlikely that you will need to use/install this package directly. Instead, it will be installed automatically by other packages (e.g. Divbase CLI) as a dependency.
34
+
35
+ ## Development
36
+
37
+ This package is developed in the [DivBase repository](https://github.com/ScilifelabDataCentre/divbase) by Scilifelab Data Centre.
@@ -0,0 +1,13 @@
1
+ divbase_lib/__init__.py,sha256=qbhMh4mg2JMoCFflKwIRvApU63RR8AxvYq1OT__8sgI,27
2
+ divbase_lib/exceptions.py,sha256=Ld9_EvV02BP2EudaXbWq5B5YneEedZ7lsKxH6ryk-lA,4442
3
+ divbase_lib/s3_checksums.py,sha256=D_jQAYKpUQf8xFs3M65F_zV_sasQFBJUH6hRwXfN_GE,2175
4
+ divbase_lib/api_schemas/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ divbase_lib/api_schemas/auth.py,sha256=RmfoGoRID31r1ZA8O0XiC4Iy2d-gkEdE_75l8EiVxbY,1331
6
+ divbase_lib/api_schemas/project_versions.py,sha256=trv9a_I8CIquCEJEnRXANIKA3Mboe339eu-q_rjaSJE,2353
7
+ divbase_lib/api_schemas/queries.py,sha256=bdJttYzZpgnaqg-5Z9BVTlpfutFdoo8EUayw6FSHm8o,1746
8
+ divbase_lib/api_schemas/s3.py,sha256=leQRlwnyAiSAMv-4CHdgjT_iPGXkXcpVQBFkcUS-kbs,2006
9
+ divbase_lib/api_schemas/task_history.py,sha256=BwmnjJl8fvZftDfuE6txUeYR5dv5WYp8GAeamkifvjY,1414
10
+ divbase_lib/api_schemas/vcf_dimensions.py,sha256=o3hKPs_BJMsP4ULikZsuBnDx8CJy9MC66FYahcuSIzg,1276
11
+ divbase_lib-0.1.0.dev0.dist-info/METADATA,sha256=4Nvx2Kg96Hz10aaAxBHZ-ahMxlg3-E8jUvr2FC3MLSQ,1564
12
+ divbase_lib-0.1.0.dev0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
13
+ divbase_lib-0.1.0.dev0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any