divbase-lib 0.1.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- divbase_lib-0.1.0.dev0/.gitignore +38 -0
- divbase_lib-0.1.0.dev0/PKG-INFO +37 -0
- divbase_lib-0.1.0.dev0/README.md +11 -0
- divbase_lib-0.1.0.dev0/pyproject.toml +41 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/__init__.py +1 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/api_schemas/__init__.py +0 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/api_schemas/auth.py +34 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/api_schemas/project_versions.py +62 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/api_schemas/queries.py +64 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/api_schemas/s3.py +51 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/api_schemas/task_history.py +50 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/api_schemas/vcf_dimensions.py +42 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/exceptions.py +132 -0
- divbase_lib-0.1.0.dev0/src/divbase_lib/s3_checksums.py +66 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# repo secrets
|
|
2
|
+
.env
|
|
3
|
+
.env*
|
|
4
|
+
|
|
5
|
+
# python stuff
|
|
6
|
+
.venv
|
|
7
|
+
.ruff_cache/
|
|
8
|
+
__pycache__/
|
|
9
|
+
*.py[cod]
|
|
10
|
+
*$py.class
|
|
11
|
+
|
|
12
|
+
.vscode/
|
|
13
|
+
|
|
14
|
+
# project specific files
|
|
15
|
+
/sample_metadata.tsv
|
|
16
|
+
sample_metadata_*.tsv
|
|
17
|
+
*.vcf
|
|
18
|
+
*.vcf.gz
|
|
19
|
+
*.vcf.gz.csi
|
|
20
|
+
!tests/fixtures/*.vcf.gz
|
|
21
|
+
tests/fixtures/temp*
|
|
22
|
+
tests/fixtures/merged*
|
|
23
|
+
|
|
24
|
+
# query job config files
|
|
25
|
+
bcftools_divbase_job_config.json
|
|
26
|
+
|
|
27
|
+
# benchmarking files
|
|
28
|
+
vcf_dimensions.tsv
|
|
29
|
+
mock*.tsv
|
|
30
|
+
task_records*.json
|
|
31
|
+
|
|
32
|
+
#MacOS artifacts
|
|
33
|
+
.DS_Store
|
|
34
|
+
|
|
35
|
+
# mkdocs build cache
|
|
36
|
+
.cache/
|
|
37
|
+
# pypi
|
|
38
|
+
dist/
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: divbase-lib
|
|
3
|
+
Version: 0.1.0.dev0
|
|
4
|
+
Summary: Library module for Divbase
|
|
5
|
+
Project-URL: Homepage, https://divbase.scilifelab.se
|
|
6
|
+
Project-URL: Documentation, https://scilifelabdatacentre.github.io/divbase
|
|
7
|
+
Project-URL: Repository, https://github.com/ScilifelabDataCentre/divbase
|
|
8
|
+
Project-URL: Issues, https://github.com/ScilifelabDataCentre/divbase/issues
|
|
9
|
+
Author-email: SciLifeLab Data Centre <datacentre@scilifelab.se>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: boto3<2,>=1.42.27
|
|
22
|
+
Requires-Dist: pandas<3,>=2.3.3
|
|
23
|
+
Requires-Dist: pydantic<3,>=2.12.5
|
|
24
|
+
Requires-Dist: pyyaml<7,>=6.0.3
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# divbase-lib
|
|
28
|
+
|
|
29
|
+
This is the library package for Divbase. Code from this library is shared between the Divbase CLI and API packages.
|
|
30
|
+
|
|
31
|
+
## Installation and usage
|
|
32
|
+
|
|
33
|
+
It is highly unlikely that you will need to use/install this package directly. Instead, it will be installed automatically by other packages (e.g. Divbase CLI) as a dependency.
|
|
34
|
+
|
|
35
|
+
## Development
|
|
36
|
+
|
|
37
|
+
This package is developed in the [DivBase repository](https://github.com/ScilifelabDataCentre/divbase) by Scilifelab Data Centre.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# divbase-lib
|
|
2
|
+
|
|
3
|
+
This is the library package for Divbase. Code from this library is shared between the Divbase CLI and API packages.
|
|
4
|
+
|
|
5
|
+
## Installation and usage
|
|
6
|
+
|
|
7
|
+
It is highly unlikely that you will need to use/install this package directly. Instead, it will be installed automatically by other packages (e.g. Divbase CLI) as a dependency.
|
|
8
|
+
|
|
9
|
+
## Development
|
|
10
|
+
|
|
11
|
+
This package is developed in the [DivBase repository](https://github.com/ScilifelabDataCentre/divbase) by Scilifelab Data Centre.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "divbase-lib"
|
|
3
|
+
description = "Library module for Divbase"
|
|
4
|
+
readme = "README.md"
|
|
5
|
+
license = "MIT"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "SciLifeLab Data Centre", email = "datacentre@scilifelab.se" },
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"pyyaml>=6.0.3,<7",
|
|
12
|
+
"boto3>=1.42.27,<2",
|
|
13
|
+
"pandas>=2.3.3,<3",
|
|
14
|
+
"pydantic>=2.12.5,<3"
|
|
15
|
+
]
|
|
16
|
+
dynamic = ["version"]
|
|
17
|
+
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 3 - Alpha",
|
|
20
|
+
"Environment :: Console",
|
|
21
|
+
"Intended Audience :: Science/Research",
|
|
22
|
+
"License :: OSI Approved :: MIT License",
|
|
23
|
+
"Operating System :: OS Independent",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Programming Language :: Python :: 3.14",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://divbase.scilifelab.se"
|
|
32
|
+
Documentation = "https://scilifelabdatacentre.github.io/divbase"
|
|
33
|
+
Repository = "https://github.com/ScilifelabDataCentre/divbase"
|
|
34
|
+
Issues = "https://github.com/ScilifelabDataCentre/divbase/issues"
|
|
35
|
+
|
|
36
|
+
[build-system]
|
|
37
|
+
requires = ["hatchling >= 1.26"]
|
|
38
|
+
build-backend = "hatchling.build"
|
|
39
|
+
|
|
40
|
+
[tool.hatch.version]
|
|
41
|
+
path = "src/divbase_lib/__init__.py"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0.dev0"
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schemas for login + access and refresh tokens
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class CLILoginResponse(BaseModel):
    """Response model for API (aka divbase-cli) login endpoint."""

    # Every field is required; expiry values are Unix timestamps.
    access_token: str = Field(description="Bearer access token for authentication")
    access_token_expires_at: int = Field(description="Unix timestamp when the access token expires")
    refresh_token: str = Field(description="Bearer refresh token for obtaining new access tokens")
    refresh_token_expires_at: int = Field(description="Unix timestamp when the refresh token expires")
    email: str = Field(description="Email of the authenticated user")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RefreshTokenRequest(BaseModel):
    """Request model for refresh token endpoint."""

    # Required: the refresh token that is exchanged for a new access token.
    refresh_token: str = Field(description="Bearer refresh token for obtaining a new access token")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RefreshTokenResponse(BaseModel):
    """Response model for refresh token endpoint."""

    # Both fields are required; `expires_at` is a Unix timestamp for the access token.
    access_token: str = Field(description="Bearer access token for authentication")
    expires_at: int = Field(description="Unix timestamp when the access token expires")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LogoutRequest(BaseModel):
    """Request model for logout endpoint."""

    # Required: the refresh token that should be revoked.
    refresh_token: str = Field(description="Bearer refresh token to be revoked on logout")
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schemas for project versioning routes.
|
|
3
|
+
|
|
4
|
+
Project versions are the state of all files in a project's storage bucket at a given time point.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Request Models
|
|
11
|
+
class CreateVersioningFileRequest(BaseModel):
    # Request payload carrying the name and description of the initial version.
    # Both fields are required (no defaults).
    name: str = Field(description="Initial version name")
    description: str = Field(description="Initial version description")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class AddVersionRequest(BaseModel):
    # Request payload for adding a new version; only `name` is required.
    name: str = Field(description="Name of the new version to add")
    # Defaults to an empty string when the client sends no description.
    description: str = Field(default="", description="Description of the new version")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DeleteVersionRequest(BaseModel):
    # Request payload identifying, by name, the version to delete (required).
    version_name: str = Field(description="Name of the version to delete")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Response Models
|
|
26
|
+
class ProjectBasicInfo(BaseModel):
    """Base model for describing a single project version, not for direct use in an endpoint."""

    name: str = Field(description="Version name")
    # Required but nullable: the key must be present, its value may be None.
    description: str | None = Field(description="Version description")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class AddVersionResponse(ProjectBasicInfo):
    """Response model for adding a version."""

    # Extends the inherited name/description pair with the creation time (ISO string).
    created_at: str = Field(description="ISO timestamp when version was created")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ProjectVersionInfo(ProjectBasicInfo):
    """Basic information about a project version. You get a list of these when listing all versions in a project."""

    # Added on top of the inherited name/description pair; both are required.
    created_at: str = Field(description="ISO timestamp when version was created")
    is_deleted: bool = Field(description="Whether this version has been soft-deleted")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ProjectVersionDetailResponse(ProjectBasicInfo):
    """Full information about a single project version, including the files at that version."""

    # Same metadata as the list view...
    created_at: str = Field(description="ISO timestamp when version was created")
    is_deleted: bool = Field(description="Whether this version has been soft-deleted")
    # ...plus the file-name -> version-ID mapping recorded for this version.
    files: dict[str, str] = Field(description="Mapping of file names to their version IDs")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class DeleteVersionResponse(BaseModel):
    """Response model for deleting a version."""

    name: str = Field(description="Name of the version that was deleted")
    # Optional flag; False unless the server reports the version was deleted earlier.
    already_deleted: bool = Field(
        default=False,
        description="Whether the version was already soft-deleted before this request",
    )
    date_deleted: str = Field(description="ISO timestamp of when the version was soft-deleted")
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schemas for query routes.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Request models
|
|
11
|
+
class SampleMetadataQueryRequest(BaseModel):
    """Request model for sample metadata query route."""

    # Both fields are required strings.
    # NOTE(review): presumably `tsv_filter` is the filter applied to the metadata TSV named by
    # `metadata_tsv_name` — confirm against the query task.
    tsv_filter: str
    metadata_tsv_name: str
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class BcftoolsQueryRequest(BaseModel):
    """Request model for bcftools query route."""

    # All three fields are required strings.
    tsv_filter: str
    metadata_tsv_name: str
    command: str  # TODO add field to describe that this is bcftools commands
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Models for task kwargs and task results. Reused in task history schemas too, hence pydantic models and not just dataclasses.
|
|
27
|
+
class SampleMetadataQueryKwargs(BaseModel):
    """Keyword arguments for sample metadata query task. Used to pass info to Celery task, and also for recording task history."""

    # Query parameters (mirroring the request model's fields).
    tsv_filter: str
    metadata_tsv_name: str

    # Project and user context for the task.
    bucket_name: str
    project_id: int
    project_name: str
    user_id: int
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class BcftoolsQueryKwargs(BaseModel):
    """Keyword arguments for BCFtools query task. Used to pass info to Celery task, and also for recording task history."""

    # Query parameters (mirroring the request model's fields).
    tsv_filter: str
    command: str
    metadata_tsv_name: str

    # Project and user context for the task.
    bucket_name: str
    project_id: int
    project_name: str
    user_id: int
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class SampleMetadataQueryTaskResult(BaseModel):
    """Metadata query task result details. Based on the return of tasks.sample_metadata_query."""

    # Required result payload.
    sample_and_filename_subset: list[dict[str, Any]]
    unique_sample_ids: list[str]
    unique_filenames: list[str]
    query_message: str

    # Optional; None when no status is supplied.
    status: Optional[str] = None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class BcftoolsQueryTaskResult(BaseModel):
    """BCFtools query task result details. Based on the return of tasks.bcftools_query."""

    # Required: output file reported by the task.
    output_file: str

    # Optional; None when no status is supplied.
    status: Optional[str] = None
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schemas for DivBase's S3 API routes.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DownloadObjectRequest(BaseModel):
    """Request model to download a single object using a pre-signed URL."""

    name: str = Field(..., description="Name of the object to be downloaded")
    # Required but nullable: clients must send the key, using None to mean "latest version".
    version_id: str | None = Field(..., description="Version ID of the object, None if latest version")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PreSignedDownloadResponse(BaseModel):
    """Response model to download a single object using the pre-signed URL and (optionally) version ID."""

    name: str = Field(description="Name of the object to be downloaded")
    pre_signed_url: str = Field(description="Pre-signed URL for downloading the object")
    # Required but nullable; None stands for the latest version.
    version_id: str | None = Field(description="Version ID of the object, None if latest version")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class UploadObjectRequest(BaseModel):
    """Request model to upload a single object using a pre-signed URL."""

    name: str = Field(description="Name of the object to be uploaded")
    content_length: int = Field(description="Size of the file in bytes")
    # Optional; omitted or None when the client supplies no checksum.
    md5_hash: str | None = Field(default=None, description="Optional MD5 hash of the object for integrity check")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class PreSignedUploadResponse(BaseModel):
    """Response model to upload a single object using the pre-signed URL using PUT."""

    # All fields are required.
    name: str = Field(description="Name of the object to be uploaded")
    pre_signed_url: str = Field(description="Pre-signed URL to which the file should be uploaded")
    put_headers: dict[str, str] = Field(description="Headers to be included in the PUT request")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class CheckFileExistsRequest(BaseModel):
    """Request model to check if a file already exists in the bucket (using the checksum)"""

    # Both fields are required strings.
    # NOTE(review): the encoding expected for `md5_checksum` (hex vs base64) is not visible here — confirm.
    object_name: str
    md5_checksum: str
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ExistingFileResponse(BaseModel):
    """Response model for reporting a file that already exists in the bucket (using its checksum)"""

    object_name: str
    md5_checksum: str
    # Required but nullable (no default): the key must be present, its value may be None.
    matching_object_name: str | None
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schemas for task history routes.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any, Optional, Union
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from divbase_lib.api_schemas.queries import (
|
|
10
|
+
BcftoolsQueryKwargs,
|
|
11
|
+
BcftoolsQueryTaskResult,
|
|
12
|
+
SampleMetadataQueryKwargs,
|
|
13
|
+
SampleMetadataQueryTaskResult,
|
|
14
|
+
)
|
|
15
|
+
from divbase_lib.api_schemas.vcf_dimensions import DimensionUpdateKwargs, DimensionUpdateTaskResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TaskHistoryResult(BaseModel):
    """
    Task details as returned by queries to the SQLAlchemy+pg results backend.
    """

    # Only `id` is required; every other field defaults to None.
    id: int
    submitter_email: Optional[str] = None
    status: Optional[str] = None

    # Note! The plain dict must come first in this union so that error results are preserved
    # and not incorrectly inserted into the result models.
    result: Optional[
        Union[
            dict[str, Any],
            SampleMetadataQueryTaskResult,
            BcftoolsQueryTaskResult,
            DimensionUpdateTaskResult,
        ]
    ] = None

    date_done: Optional[str] = None
    name: Optional[str] = None
    args: Optional[str] = None

    # Keyword arguments the task was submitted with, parsed into the matching kwargs model.
    kwargs: Optional[
        Union[
            SampleMetadataQueryKwargs,
            BcftoolsQueryKwargs,
            DimensionUpdateKwargs,
        ]
    ] = None

    worker: Optional[str] = None
    created_at: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = None
    runtime: Optional[float] = None
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schemas for VCF dimensions routes.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DimensionUpdateKwargs(BaseModel):
    """Keyword arguments for dimension update task. Used to pass info to Celery task, and also for recording task history."""

    # Project and user context for the task; all fields are required.
    bucket_name: str
    project_id: int
    project_name: str
    user_id: int
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DimensionUpdateTaskResult(BaseModel):
    """Dimension update task result details. Based on the return of tasks.update_dimensions_index."""

    # Every field is optional and defaults to None.
    status: Optional[str] = None
    VCF_files_added: Optional[list[str]] = Field(
        default=None,
        description="VCF files that were added to dimensions index by this job",
    )
    VCF_files_skipped: Optional[list[str]] = Field(
        default=None,
        description="VCF files skipped by this job (previous DivBase-generated result VCFs)",
    )
    VCF_files_deleted: Optional[list[str]] = Field(
        default=None,
        description="VCF files that have been deleted from the project and thus have been dropped from the index",
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DimensionsShowResult(BaseModel):
    """Result model for showing VCF dimensions for a project."""

    # Project the dimensions belong to.
    project_id: int
    project_name: str

    # Indexed VCF files. NOTE(review): the per-file dict schema is not declared here — confirm against the producer.
    vcf_file_count: int
    vcf_files: list[dict]

    # Skipped files, in the same untyped-dict form.
    skipped_file_count: int
    skipped_files: list[dict]
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Custom exceptions for DivBase packages.
|
|
3
|
+
|
|
4
|
+
These are raised by lower-level functions/methods which understand the context of the error.
|
|
5
|
+
|
|
6
|
+
Note: By adding the `__str__` method to each exception,
|
|
7
|
+
we ensure that when you manually raise a specific exception the error message looks good
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ObjectDoesNotExistError(FileNotFoundError):
    """Raised when an S3 object/key does not exist in the bucket.

    Attributes:
        key: The object key that was looked up.
        bucket: Name of the bucket that was searched.
        error_message: The formatted, human-readable message.
    """

    def __init__(self, key: str, bucket_name: str):
        error_message = f"The file/object '{key}' does not exist in the bucket '{bucket_name}'. "
        super().__init__(error_message)
        self.key = key
        self.bucket = bucket_name
        self.error_message = error_message

    def __reduce__(self):
        # The default exception reduce replays `self.args` (just the formatted message) into
        # `__init__`, which needs two arguments, so unpickling/copying would raise TypeError.
        # Rebuild from the original constructor arguments instead.
        return (type(self), (self.key, self.bucket))

    def __str__(self) -> str:
        return self.error_message
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BcftoolsEnvironmentError(Exception):
    """Raised when there's an issue with the execution environment (Docker, etc.).

    Attributes:
        container_name: Name of the container that could not be found running.
        error_message: The formatted, human-readable message.
    """

    def __init__(self, container_name: str):
        self.container_name = container_name
        error_message = (
            f"No running container found with name {self.container_name}. Ensure the Docker image is available.\n"
        )
        super().__init__(error_message)

        self.error_message = error_message

    def __reduce__(self):
        # Rebuild from the constructor argument. The default reduce would feed the formatted
        # message back in as `container_name`, leaving a doubly wrapped message in `args`.
        return (type(self), (self.container_name,))

    def __str__(self) -> str:
        return self.error_message
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class BcftoolsCommandError(Exception):
|
|
44
|
+
"""Raised when a bcftools command fails to execute properly."""
|
|
45
|
+
|
|
46
|
+
def __init__(self, command: str, error_details: Exception = None):
|
|
47
|
+
self.command = command
|
|
48
|
+
self.error_details = error_details
|
|
49
|
+
|
|
50
|
+
error_message = f"bcftools command failed: '{command}'"
|
|
51
|
+
if error_details:
|
|
52
|
+
error_message += f" with error details: {error_details}"
|
|
53
|
+
|
|
54
|
+
super().__init__(error_message)
|
|
55
|
+
|
|
56
|
+
def __str__(self):
|
|
57
|
+
if hasattr(self.error_details, "stderr") and self.error_details.stderr:
|
|
58
|
+
return f"bcftools command failed: '{self.command}' with error: {self.error_details.stderr}"
|
|
59
|
+
return super().__str__()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class BcftoolsPipeEmptyCommandError(Exception):
    """Raised when an empty command is provided to the bcftools pipe.

    Attributes:
        error_message: The fixed, human-readable message.
    """

    def __init__(self):
        error_message = "Empty command provided. Please specify at least one valid bcftools command."
        super().__init__(error_message)
        self.error_message = error_message

    def __reduce__(self):
        # `__init__` takes no arguments, so the default reduce (which replays `self.args`)
        # would raise TypeError on unpickling/copying. Rebuild with no arguments.
        return (type(self), ())

    def __str__(self) -> str:
        return self.error_message
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class BcftoolsPipeUnsupportedCommandError(Exception):
    """Raised when a bcftools command unsupported by the BcftoolsQueryManager class is provided.

    Attributes:
        command: The unsupported command.
        position: Position of the command, as reported by the caller.
        valid_commands: The commands that are supported.
    """

    def __init__(self, command: str, position: int, valid_commands: list[str]):
        self.command = command
        self.position = position
        self.valid_commands = valid_commands

        message = (
            f"Unsupported bcftools command '{command}' at position {position}. "
            f"Only the following commands are supported: {', '.join(valid_commands)}"
        )
        super().__init__(message)

    def __reduce__(self):
        # The default reduce replays only the formatted message into a three-argument
        # `__init__`, which raises TypeError on unpickling/copying.
        return (type(self), (self.command, self.position, self.valid_commands))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class SidecarNoDataLoadedError(Exception):
|
|
90
|
+
"""Raised when no data is loaded in SidecarQueryManager."""
|
|
91
|
+
|
|
92
|
+
def __init__(self, file_path: Path, submethod: str, error_details: str | None = None):
|
|
93
|
+
self.file_path = file_path
|
|
94
|
+
self.error_details = error_details
|
|
95
|
+
|
|
96
|
+
error_message = f"No data loaded from file '{file_path}', as raised in submethod '{submethod}'."
|
|
97
|
+
if error_details:
|
|
98
|
+
error_message += f"More details about the error: {error_details}"
|
|
99
|
+
super().__init__(error_message)
|
|
100
|
+
self.error_message = error_message
|
|
101
|
+
|
|
102
|
+
def __str__(self):
|
|
103
|
+
return self.error_message
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class SidecarInvalidFilterError(Exception):
    """Signals that SidecarQueryManager was given an invalid filter."""
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class SidecarColumnNotFoundError(Exception):
    """Signals that a requested column is missing from the query result."""
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class NoVCFFilesFoundError(Exception):
    """Signals that the project bucket contains no VCF files."""
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class ChecksumVerificationError(Exception):
    """Raised when a calculated file's checksum does not match the expected value.

    Attributes:
        expected_checksum: The checksum the file was expected to have.
        calculated_checksum: The checksum actually calculated for the file.
    """

    def __init__(self, expected_checksum: str, calculated_checksum: str):
        self.expected_checksum = expected_checksum
        self.calculated_checksum = calculated_checksum

        message = f"Checksum verification failed. Expected: {expected_checksum}, Calculated: {calculated_checksum}"
        super().__init__(message)

    def __reduce__(self):
        # The default reduce replays only the formatted message into a two-argument
        # `__init__`, which raises TypeError on unpickling/copying.
        return (type(self), (self.expected_checksum, self.calculated_checksum))
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Manages creation/validation of S3 object checksums for both file uploads and downloads.
|
|
3
|
+
|
|
4
|
+
We do not support multipart uploads/downloads at this time.
|
|
5
|
+
Single-part uploads/downloads are limited to 5 GB.
|
|
6
|
+
Docs: https://docs.netapp.com/us-en/storagegrid/s3/put-object.html
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import hashlib
|
|
11
|
+
import logging
|
|
12
|
+
from enum import StrEnum
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Iterator
|
|
15
|
+
|
|
16
|
+
from divbase_lib.exceptions import ChecksumVerificationError
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _read_file_chunks(file_path: Path, chunk_size: int = 8192) -> Iterator[bytes]:
    """Yield the bytes of ``file_path`` in successive pieces of at most ``chunk_size`` bytes."""
    with file_path.open(mode="rb") as handle:
        # An empty read marks end-of-file and ends the generator.
        while piece := handle.read(chunk_size):
            yield piece
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MD5CheckSumFormat(StrEnum):
|
|
29
|
+
HEX = "hex"
|
|
30
|
+
BASE64 = "base64"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def calculate_md5_checksum(file_path: Path, output_format: MD5CheckSumFormat) -> str:
    """
    Calculate the MD5 checksum of a file.

    The file is hashed in chunks, so it is never loaded into memory as a whole.

    Args:
        file_path: Path of the file to hash.
        output_format: Encoding of the returned checksum.

    Returns:
        The checksum in either hex-encoded (lowercase) or base64-encoded format.

    Raises:
        ValueError: If ``output_format`` is not a known ``MD5CheckSumFormat`` member.
    """
    # MD5 serves here purely as an integrity checksum, not as a security primitive. Saying so
    # keeps hashlib.md5 usable on FIPS-restricted Python builds, which reject it otherwise.
    md5_hash = hashlib.md5(usedforsecurity=False)

    for chunk in _read_file_chunks(file_path):
        md5_hash.update(chunk)

    if output_format == MD5CheckSumFormat.HEX:
        return md5_hash.hexdigest()
    if output_format == MD5CheckSumFormat.BASE64:
        return base64.b64encode(md5_hash.digest()).decode("utf-8")
    raise ValueError(f"Unknown output format: {output_format}")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def verify_downloaded_checksum(file_path: Path, expected_checksum: str) -> None:
    """
    Verify a downloaded file against S3's ETag (MD5 checksum in hex format).

    Args:
        file_path: Path of the downloaded file.
        expected_checksum: The expected hex MD5, e.g. the object's ETag. Surrounding whitespace
            or double quotes and upper-case hex digits are tolerated.

    Raises:
        ChecksumVerificationError: If the calculated checksum differs from the expected one.
    """
    calculated_md5 = calculate_md5_checksum(file_path=file_path, output_format=MD5CheckSumFormat.HEX)

    # S3 hands ETags back wrapped in double quotes, and hex is case-insensitive, whereas
    # hexdigest() is bare lowercase. Normalise the expected value so equal digests compare equal.
    normalised_expected = expected_checksum.strip().strip('"').lower()

    if calculated_md5 != normalised_expected:
        # Report the caller's original value so the error matches what they passed in.
        raise ChecksumVerificationError(expected_checksum=expected_checksum, calculated_checksum=calculated_md5)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def convert_checksum_hex_to_base64(hex_checksum: str) -> str:
    """
    Re-encode a hex MD5 checksum as base64.

    The hex string is decoded to its raw digest bytes, which are then base64-encoded and
    returned as text. ``bytes.fromhex`` raises ``ValueError`` for input that is not valid hex.
    """
    return base64.b64encode(bytes.fromhex(hex_checksum)).decode("utf-8")
|