gentroutils 1.5.0__py3-none-any.whl → 1.6.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gentroutils/__init__.py +8 -43
- gentroutils/errors.py +39 -0
- gentroutils/io/path/__init__.py +6 -0
- gentroutils/io/path/ftp.py +48 -0
- gentroutils/io/path/gcs.py +45 -0
- gentroutils/io/transfer/__init__.py +6 -0
- gentroutils/io/transfer/ftp_to_gcs.py +49 -0
- gentroutils/io/transfer/model.py +36 -0
- gentroutils/io/transfer/polars_to_gcs.py +20 -0
- gentroutils/parsers/__init__.py +1 -0
- gentroutils/parsers/curation.py +168 -0
- gentroutils/tasks/__init__.py +90 -0
- gentroutils/tasks/crawl.py +156 -0
- gentroutils/tasks/curation.py +110 -0
- gentroutils/tasks/fetch.py +141 -0
- gentroutils/transfer.py +81 -0
- gentroutils-1.6.0.dev1.dist-info/METADATA +274 -0
- gentroutils-1.6.0.dev1.dist-info/RECORD +22 -0
- gentroutils-1.6.0.dev1.dist-info/entry_points.txt +2 -0
- {gentroutils-1.5.0.dist-info → gentroutils-1.6.0.dev1.dist-info}/licenses/LICENSE +1 -1
- gentroutils/commands/__init__.py +0 -11
- gentroutils/commands/update_gwas_curation_metadata.py +0 -287
- gentroutils/commands/utils.py +0 -152
- gentroutils/commands/validate_gwas_curation.py +0 -165
- gentroutils-1.5.0.dist-info/METADATA +0 -135
- gentroutils-1.5.0.dist-info/RECORD +0 -11
- gentroutils-1.5.0.dist-info/entry_points.txt +0 -2
- {gentroutils-1.5.0.dist-info → gentroutils-1.6.0.dev1.dist-info}/WHEEL +0 -0
gentroutils/__init__.py
CHANGED
|
@@ -1,46 +1,11 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Main entry point for the gentroutils package."""
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from otter import Runner
|
|
4
4
|
|
|
5
|
-
import logging
|
|
6
|
-
import time
|
|
7
5
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
)
|
|
15
|
-
from gentroutils.commands.utils import set_log_file, set_log_lvl, teardown_cli
|
|
16
|
-
|
|
17
|
-
logger = logging.getLogger("gentroutils")
|
|
18
|
-
logger.setLevel(logging.DEBUG)
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@click.group()
|
|
22
|
-
@click.option("-d", "--dry-run", is_flag=True, default=False)
|
|
23
|
-
@click.option(
|
|
24
|
-
"-v",
|
|
25
|
-
count=True,
|
|
26
|
-
default=0,
|
|
27
|
-
callback=set_log_lvl,
|
|
28
|
-
help="Increase verbosity of the logging. Can be used multiple times. The default log level is ERROR, -v is INFO, -vv is DEBUG",
|
|
29
|
-
)
|
|
30
|
-
@click.option("-q", "--log-file", callback=set_log_file, required=False)
|
|
31
|
-
@click.pass_context
|
|
32
|
-
def cli(ctx: click.Context, **kwargs: dict[str, str]) -> None:
|
|
33
|
-
r"""Gentroutils Command Line Interface."""
|
|
34
|
-
ascii_art = pyfiglet.Figlet(font="serifcap").renderText("Gentroutils")
|
|
35
|
-
click.echo(click.style(ascii_art, fg="blue"))
|
|
36
|
-
ctx.max_content_width = 200
|
|
37
|
-
ctx.ensure_object(dict)
|
|
38
|
-
ctx.obj["dry_run"] = kwargs["dry_run"]
|
|
39
|
-
ctx.obj["execution_start"] = time.time()
|
|
40
|
-
ctx.call_on_close(lambda: teardown_cli(ctx))
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
cli.add_command(update_gwas_curation_metadata_command)
|
|
44
|
-
cli.add_command(validate_gwas_curation)
|
|
45
|
-
|
|
46
|
-
__all__ = ["cli"]
|
|
6
|
+
def main() -> None:
    """Main function to start the gentroutils otter runner."""
    # Build the runner, boot it, point it at the task package, then hand over control.
    gentroutils_runner = Runner("gentroutils")
    gentroutils_runner.start()
    gentroutils_runner.register_tasks("gentroutils.tasks")
    gentroutils_runner.run()
|
gentroutils/errors.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Gentroutils exceptions module."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class GentroutilsErrorMessage(Enum):
    """Catalogue of message templates raised via `GentroutilsError`.

    Each value is a `str.format` template; the placeholders are filled in by
    `GentroutilsError.__init__` from its keyword arguments.
    """

    # Path / URL validation errors.
    UNSUPPORTED_URL_SCHEME = "Unsupported URL scheme: {scheme}"
    BUCKET_NAME_MISSING = "Bucket name is missing in the URL: {url}"
    FILE_NAME_MISSING = "File name is missing in the URL: {url}"
    GCS_CLIENT_INITIALIZATION_FAILED = "Failed to initialize Google Cloud Storage client: {error}"
    FTP_SERVER_MISSING = "FTP server is missing in the URL: {url}"
    # Transfer errors.
    INVALID_TRANSFERABLE_OBJECTS = "Invalid transferable objects provided. Expected FTPtoGCPTransferableObject instances."
    # Curation / release errors.
    DOWNLOAD_STUDIES_EMPTY = "List of downloaded studies from GWAS Catalog release is empty: {path}"
    PREVIOUS_CURATION_EMPTY = "Previous curation data is empty: {path}"
    FAILED_TO_FETCH = "Failed to fetch the release information from the {uri}"
    MISSING_RELEASE_DATE_TEMPLATE = "The destination must contain a template for the release date, e.g. some/path/{release_date}/file.txt."
    EMPTY_TRANSFERABLE_OBJECTS = "Transferable objects list cannot be empty."
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class GentroutilsError(Exception):
    """Base class for the gentroutils exceptions."""

    def __init__(self, message: GentroutilsErrorMessage, **kwargs: str) -> None:
        """Initialize the GentroutilsError exception.

        Args:
            message (GentroutilsErrorMessage): The error message template to render.
            **kwargs (str): Values substituted into the template's placeholders.
        """
        # Render the template eagerly so str(exc) carries the final message.
        rendered = message.value.format(**kwargs)
        super().__init__(rendered)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
__all__ = ["GentroutilsError", "GentroutilsErrorMessage"]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""FTP path representation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FTPPath:
    """A class to represent the location of a file on an FTP server.

    Parses an ``ftp://`` URI into its server, directory and filename parts,
    validating each component on construction.
    """

    # Supported URL schemes
    SUPPORTED_SCHEMES = ["ftp"]

    def __init__(self, uri: str) -> None:
        """Initialize the FTPPath object.

        Args:
            uri (str): The path to object in ftp server.

        Raises:
            GentroutilsError: If the URL scheme is not supported or if the server or filename is missing.
        """
        self.uri = uri
        # NOTE: The urlparse matches to following tuple
        # ('scheme', 'netloc', 'path', 'params', 'query', 'fragment')
        parsed_url = urlparse(uri)

        if parsed_url.scheme not in self.SUPPORTED_SCHEMES:
            raise GentroutilsError(GentroutilsErrorMessage.UNSUPPORTED_URL_SCHEME, scheme=parsed_url.scheme)

        self.server = parsed_url.netloc
        if not self.server:
            raise GentroutilsError(GentroutilsErrorMessage.FTP_SERVER_MISSING, url=uri)

        self.filename = parsed_url.path.split("/")[-1]
        if not self.filename:
            raise GentroutilsError(GentroutilsErrorMessage.FILE_NAME_MISSING, url=uri)
        # Directory portion of the path: everything before the final component.
        self.base_dir = "/".join(parsed_url.path.split("/")[:-1])

    def __repr__(self) -> str:
        """Return the string representation of the FTPPath object.

        Returns:
            str: The original URI this FTPPath was built from.
        """
        return self.uri
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Google Cloud Storage (GCS) path representation."""
|
|
2
|
+
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class GCSPath:
    """Representation of a Google Cloud Storage object location.

    Parses a ``gs://`` URI into bucket and object components, validating
    each part on construction.
    """

    # URL schemes this path type accepts.
    SUPPORTED_SCHEMES = ["gs"]

    def __init__(self, uri: str) -> None:
        """Initialize the GCSPath object.

        Args:
            uri (str): The path to the cloud storage object.

        Raises:
            GentroutilsError: If the URL scheme is not supported or if the bucket or object is missing.
        """
        self.uri = uri
        # urlparse yields ('scheme', 'netloc', 'path', 'params', 'query', 'fragment');
        # the bucket lands in netloc and the blob name in path.
        parts = urlparse(uri)

        if parts.scheme not in self.SUPPORTED_SCHEMES:
            raise GentroutilsError(GentroutilsErrorMessage.UNSUPPORTED_URL_SCHEME, scheme=parts.scheme)

        self.bucket = parts.netloc
        if not self.bucket:
            raise GentroutilsError(GentroutilsErrorMessage.BUCKET_NAME_MISSING, url=uri)

        # Trim both leading and trailing slashes to get the bare object name.
        self.object = parts.path.strip("/")
        if not self.object:
            raise GentroutilsError(GentroutilsErrorMessage.FILE_NAME_MISSING, url=uri)

    def __repr__(self) -> str:
        """Return the string representation of the GCSPath object.

        Returns:
            str: The original URI this GCSPath was built from.
        """
        return self.uri
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"""Module for handling transfer operations in gentroutils."""
|
|
2
|
+
|
|
3
|
+
from gentroutils.io.transfer.ftp_to_gcs import FTPtoGCPTransferableObject
|
|
4
|
+
from gentroutils.io.transfer.polars_to_gcs import PolarsDataFrameToGCSTransferableObject
|
|
5
|
+
|
|
6
|
+
__all__ = ["FTPtoGCPTransferableObject", "PolarsDataFrameToGCSTransferableObject"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Transfer files from FTP to Google Cloud Storage (GCS)."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import re
|
|
5
|
+
from typing import Annotated
|
|
6
|
+
|
|
7
|
+
import aioftp
|
|
8
|
+
from google.cloud import storage # type: ignore[attr-defined]
|
|
9
|
+
from loguru import logger
|
|
10
|
+
from pydantic import AfterValidator
|
|
11
|
+
|
|
12
|
+
from gentroutils.io.path import FTPPath, GCSPath
|
|
13
|
+
from gentroutils.io.transfer.model import TransferableObject
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class FTPtoGCPTransferableObject(TransferableObject):
    """A class to represent an object that can be transferred from FTP to GCP."""

    # Validators round-trip the raw strings through the path classes so malformed
    # FTP/GCS URIs are rejected at model-construction time.
    source: Annotated[str, AfterValidator(lambda x: str(FTPPath(x)))]
    destination: Annotated[str, AfterValidator(lambda x: str(GCSPath(x)))]

    async def transfer(self) -> None:
        """Transfer files from FTP to GCP.

        This function fetches the data for the file provided in the local FTP path, collects the
        data asynchronously to buffer, and uploads it to the provided GCP bucket blob.
        """
        logger.info(f"Attempting to transfer data from {self.source} to {self.destination}.")
        gcs_obj = GCSPath(self.destination)
        ftp_obj = FTPPath(self.source)

        async with aioftp.Client.context(ftp_obj.server, user="anonymous", password="anonymous") as ftp:  # noqa: S106
            bucket = storage.Client().bucket(gcs_obj.bucket)
            blob = bucket.blob(gcs_obj.object)
            logger.info(f"Changing directory to {ftp_obj.base_dir}.")
            await ftp.change_directory(ftp_obj.base_dir)
            pwd = await ftp.get_current_directory()
            # Surface the YYYY/MM/DD release directory suffix in the logs when present.
            dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(pwd))
            if dir_match:
                logger.info(f"Found release date!: {dir_match.group('release_date')}")
            # Stream the remote file into an in-memory buffer block by block.
            buffer = io.BytesIO()
            stream = await ftp.download_stream(ftp_obj.filename)
            async with stream:
                async for block in stream.iter_by_block():
                    buffer.write(block)
            buffer.seek(0)
            content = buffer.getvalue().decode("utf-8")
            buffer.close()
            # BUG FIX: previously uploaded "".join(content) — a no-op that rebuilt the
            # string one character at a time; upload the decoded string directly.
            blob.upload_from_string(content)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Base implementation for transferable objects in gentroutils."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TransferableObject(BaseModel):
    """Base class for transferable objects in gentroutils.

    Each transferable object should implement the `transfer` method to define how the object is transferred.
    Also each object should have the:

    - `source`: The source location of the object.
    - `destination`: The destination location where the object will be transferred.
    """

    source: Any
    destination: Any

    def __repr__(self) -> str:
        """Return a string representation of the transferable object."""
        cls_name = self.__class__.__name__
        return f"{cls_name}(source={self.source}, destination={self.destination})"

    def __str__(self) -> str:
        """Return a string representation of the transferable object."""
        # Delegate to __repr__ so both representations stay in sync.
        return repr(self)

    def transfer(self):
        """Transfer the object to the destination."""
        raise NotImplementedError("Implement in derivative class.")

    class Config:
        """Configuration that ensures that the derivative classes can have arbitrary types."""

        arbitrary_types_allowed = True
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Module for transferring Polars DataFrames to Google Cloud Storage (GCS)."""
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
from loguru import logger
|
|
5
|
+
|
|
6
|
+
from gentroutils.io.transfer.model import TransferableObject
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PolarsDataFrameToGCSTransferableObject(TransferableObject):
    """A TransferableObject for transferring Polars DataFrames to Google Cloud Storage."""

    # In-memory dataframe to be written out.
    source: pl.DataFrame
    # Target path; the class name suggests a GCS URI — not enforced here.
    destination: str

    async def transfer(self) -> None:
        """Transfer the Polars DataFrame to the specified GCS destination."""
        # Convert Polars DataFrame to CSV and upload to GCS
        logger.info(f"Transferring Polars DataFrame to {self.destination}.")
        # NOTE(review): the destination is handed straight to write_csv; for a gs://
        # path this relies on polars' cloud-path support — confirm at runtime.
        self.source.write_csv(self.destination)
        logger.info(f"Uploading DataFrame to {self.destination}")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Module for parsing and handling data in gentroutils."""
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Module to handle the curation of GWAS Catalog data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from loguru import logger
|
|
9
|
+
|
|
10
|
+
from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CurationSchema(StrEnum):
|
|
14
|
+
"""Enum to define the schema for curation tasks."""
|
|
15
|
+
|
|
16
|
+
STUDY_ID = "studyId"
|
|
17
|
+
"""The unique identifier for a study."""
|
|
18
|
+
STUDY_TYPE = "studyType"
|
|
19
|
+
"""The type of study (e.g., gwas)."""
|
|
20
|
+
ANALYSIS_FLAG = "analysisFlag"
|
|
21
|
+
"""Flag indicating the type of analysis performed."""
|
|
22
|
+
QUALITY_CONTROL = "qualityControl"
|
|
23
|
+
"""Quality control status of the study."""
|
|
24
|
+
IS_CURATED = "isCurated"
|
|
25
|
+
"""Flag indicating whether the study has been curated."""
|
|
26
|
+
PUBMED_ID = "pubmedId"
|
|
27
|
+
"""The PubMed identifier for the study."""
|
|
28
|
+
PUBLICATION_TITLE = "publicationTitle"
|
|
29
|
+
"""The title of the publication associated with the study."""
|
|
30
|
+
TRAIT_FROM_SOURCE = "traitFromSource"
|
|
31
|
+
"""The trait as reported in the source data."""
|
|
32
|
+
|
|
33
|
+
@classmethod
|
|
34
|
+
def columns(cls) -> list[str]:
|
|
35
|
+
"""Get the list of columns defined in the schema."""
|
|
36
|
+
return [member.value for member in cls]
|
|
37
|
+
|
|
38
|
+
@classmethod
|
|
39
|
+
def extended_columns(cls) -> list[str]:
|
|
40
|
+
"""Get the list of columns defined in the schema, including additional metadata."""
|
|
41
|
+
return [*cls.columns(), "status"]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DownloadStudiesSchema(StrEnum):
|
|
45
|
+
"""Enum to define the columns for the download studies task."""
|
|
46
|
+
|
|
47
|
+
STUDY_ID = "studyId"
|
|
48
|
+
"""The unique identifier for a study."""
|
|
49
|
+
TRAIT_FROM_SOURCE = "traitFromSource"
|
|
50
|
+
"""The trait as reported in the source data."""
|
|
51
|
+
PUBMED_ID = "pubmedId"
|
|
52
|
+
"""The PubMed identifier for the study."""
|
|
53
|
+
PUBLICATION_TITLE = "publicationTitle"
|
|
54
|
+
"""The title of the publication associated with the study."""
|
|
55
|
+
|
|
56
|
+
@classmethod
|
|
57
|
+
def mapping(cls) -> dict[str, str]:
|
|
58
|
+
"""Get the mapping of columns to their respective names."""
|
|
59
|
+
return {
|
|
60
|
+
"STUDY ACCESSION": cls.STUDY_ID,
|
|
61
|
+
"STUDY": cls.PUBLICATION_TITLE,
|
|
62
|
+
"PUBMED ID": cls.PUBMED_ID,
|
|
63
|
+
"DISEASE/TRAIT": cls.TRAIT_FROM_SOURCE,
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
@classmethod
|
|
67
|
+
def columns(cls) -> list[str]:
|
|
68
|
+
"""Get the list of columns defined in the schema."""
|
|
69
|
+
return [member.value for member in cls]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class CuratedStudyStatus(StrEnum):
|
|
73
|
+
"""Enum to define the status of a curated study."""
|
|
74
|
+
|
|
75
|
+
REMOVED = "removed"
|
|
76
|
+
"""The study has been removed from the GWAS Catalog."""
|
|
77
|
+
NEW = "new"
|
|
78
|
+
"""The study is new in the GWAS Catalog."""
|
|
79
|
+
CURATED = "curated"
|
|
80
|
+
"""The study has been curated and is still in the GWAS Catalog."""
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class GWASCatalogCuration:
    """Class to handle the curation of GWAS Catalog data."""

    def __init__(self, previous_curation: pl.DataFrame, studies: pl.DataFrame):
        """Initialize the GWASCatalogCuration with previous curation and studies data.

        Args:
            previous_curation (pl.DataFrame): Prior curation table following `CurationSchema`.
            studies (pl.DataFrame): Current release studies following `DownloadStudiesSchema` names.
        """
        logger.debug("Initializing GWASCatalogCuration with previous curation and studies data.")
        self.previous_curation = previous_curation
        logger.debug("Previous curation data loaded with shape: {}", previous_curation.shape)
        self.studies = studies
        logger.debug("Studies data loaded with shape: {}", studies.shape)

    @classmethod
    def from_prev_curation(cls, previous_curation_path: str, download_studies_path: str) -> GWASCatalogCuration:
        """Create a GWASCatalogCuration instance from previous curation and studies.

        Args:
            previous_curation_path (str): Path to the previous curation TSV file.
            download_studies_path (str): Path to the GWAS Catalog download-studies TSV file.

        Returns:
            GWASCatalogCuration: Instance built from both input files.

        Raises:
            GentroutilsError: If either input file parses to an empty dataframe.
        """
        previous_curation_df = pl.read_csv(
            previous_curation_path,
            separator="\t",
            has_header=True,
            columns=CurationSchema.columns(),
        )
        if previous_curation_df.is_empty():
            raise GentroutilsError(GentroutilsErrorMessage.PREVIOUS_CURATION_EMPTY, path=previous_curation_path)
        studies_df = pl.read_csv(
            download_studies_path,
            separator="\t",
            # NOTE(review): backtick quote char presumably sidesteps quotes inside
            # publication titles — confirm against the download file format.
            quote_char="`",
            has_header=True,
            columns=list(DownloadStudiesSchema.mapping().keys()),
        )
        if studies_df.is_empty():
            raise GentroutilsError(GentroutilsErrorMessage.DOWNLOAD_STUDIES_EMPTY, path=download_studies_path)
        # Rename raw download headers to the internal schema column names.
        studies_df = studies_df.rename(mapping=DownloadStudiesSchema.mapping())
        return cls(previous_curation_df, studies_df)

    @property
    def result(self) -> pl.DataFrame:
        """Curate the GWAS Catalog data.

        Returns:
            pl.DataFrame: One row per study (current or previously curated) with a
            `status` column of `CuratedStudyStatus` values.
        """
        # Studies that are curated but were removed from the GWAS Catalog
        removed_studies = self.previous_curation.join(self.studies, on=CurationSchema.STUDY_ID, how="anti").select(
            CurationSchema.STUDY_ID, pl.lit(CuratedStudyStatus.REMOVED).alias("status")
        )
        logger.debug("Removed studies identified: {}", removed_studies.shape[0])

        # studies that are curated and still in the GWAS Catalog
        curated_studies = self.previous_curation.join(self.studies, on=CurationSchema.STUDY_ID, how="inner").select(
            CurationSchema.STUDY_ID, pl.lit(CuratedStudyStatus.CURATED).alias("status")
        )
        logger.debug("Curated studies identified: {}", curated_studies.shape[0])

        # Combine all previous studies with updated status information.
        prev_studies = pl.concat([removed_studies, curated_studies], how="vertical")
        logger.debug("Previous studies after combining removed and curated: {}", prev_studies.shape[0])

        # Bring back the information from the previous curation
        prev_studies = (
            prev_studies.join(self.previous_curation, on=CurationSchema.STUDY_ID, how="full", coalesce=True)
            .with_columns(pl.coalesce(CurationSchema.IS_CURATED, pl.lit(False)).alias(CurationSchema.IS_CURATED))
            .select(*CurationSchema.extended_columns())
        )
        logger.debug("Previous studies after bringing previous curation data: {}", prev_studies.shape[0])

        assert all(prev_studies.select(CurationSchema.STUDY_ID).is_unique()), "Study IDs must be unique after merging."

        # Studies that are new in the GWAS Catalog
        new_studies = self.studies.join(self.previous_curation, on=CurationSchema.STUDY_ID, how="anti").select(
            CurationSchema.STUDY_ID,
            pl.lit(None).alias(CurationSchema.STUDY_TYPE),
            pl.lit(None).alias(CurationSchema.ANALYSIS_FLAG),
            pl.lit(None).alias(CurationSchema.QUALITY_CONTROL),
            pl.lit(False).alias(CurationSchema.IS_CURATED),
            CurationSchema.PUBMED_ID,
            CurationSchema.PUBLICATION_TITLE,
            CurationSchema.TRAIT_FROM_SOURCE,
            pl.lit(CuratedStudyStatus.NEW).alias("status"),
        )
        logger.debug("New studies identified: {}", new_studies.shape[0])

        # Union of new studies and previously curated studies
        all_studies = pl.concat([prev_studies, new_studies], how="vertical")
        logger.debug("All studies after combining new and previous: {}", all_studies.shape[0])

        # Ensure the contract on the output dataframe
        assert all(all_studies.select(CurationSchema.STUDY_ID).is_unique()), "Study IDs must be unique after merging."
        # BUG FIX: the original asserted all_studies.shape[0] == all_studies.shape[0]
        # (a tautology). The output must contain every current study plus every
        # removed one: |all| = |studies| + |removed| when study IDs are unique.
        assert all_studies.shape[0] == self.studies.shape[0] + removed_studies.shape[0], (
            "The number of studies must match after merging."
        )

        return all_studies.select(CurationSchema.extended_columns())
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Gentroutils otter tasks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import date
|
|
8
|
+
|
|
9
|
+
import aiohttp
|
|
10
|
+
from loguru import logger
|
|
11
|
+
from pydantic import AliasPath, BaseModel, Field
|
|
12
|
+
|
|
13
|
+
from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _requires_release_date_template(path: str) -> str:
    """Ensure that the destination path contains a template for the release date.

    Args:
        path (str): Destination path expected to contain a ``{release_date}`` placeholder.

    Returns:
        str: The unchanged path, when the placeholder is present.

    Raises:
        GentroutilsError: If the placeholder is absent.
    """
    placeholder = "{release_date}"
    if placeholder in path:
        return path
    # Pass the placeholder itself so the message template renders it literally.
    raise GentroutilsError(GentroutilsErrorMessage.MISSING_RELEASE_DATE_TEMPLATE, release_date=placeholder)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class TemplateDestination:
    """A destination that can be formatted with a release date."""

    destination: str
    """The destination path that can be formatted with a release date."""
    is_substituted: bool = False
    """Whether the destination template has already been substituted."""

    def format(self, substitutions: dict[str, str]) -> TemplateDestination:
        """Format the destination with the given substitutions.

        This method returns a new TemplateDestination object (not a copy of the current one) with the formatted destination.

        Args:
            substitutions (dict[str, str]): Placeholder-name to value mapping.

        Returns:
            TemplateDestination: New instance with placeholders filled and `is_substituted` set.
        """
        filled = self.destination.format(**substitutions)
        return TemplateDestination(filled, True)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class GwasCatalogReleaseInfo(BaseModel):
    """Model to hold GWAS Catalog release information.

    Field aliases map the GWAS Catalog stats endpoint's JSON keys to readable names.
    """

    release_date: date = Field(validation_alias=AliasPath("date"))
    """Release date of the GWAS Catalog."""

    number_of_associations: int = Field(validation_alias=AliasPath("associations"))
    """Number of associations in the GWAS Catalog."""

    number_of_studies: int = Field(validation_alias=AliasPath("studies"))
    """Number of studies in the GWAS Catalog."""

    number_of_sumstats: int = Field(validation_alias=AliasPath("sumstats"))
    """Number of summary statistics in the GWAS Catalog."""

    number_of_snps: int = Field(validation_alias=AliasPath("snps"))
    """Number of SNPs in the GWAS Catalog."""

    ensembl_build: str = Field(validation_alias=AliasPath("ensemblbuild"))
    """Ensembl version used in the GWAS Catalog."""

    dbsnp_build: str = Field(validation_alias=AliasPath("dbsnpbuild"))
    """dbSNP version used in the GWAS Catalog."""

    efo_version: str = Field(validation_alias=AliasPath("efoversion"))
    """EFO version used in the GWAS Catalog."""

    gene_build: str = Field(validation_alias=AliasPath("genebuild"))
    """Gene build version used in the GWAS Catalog."""

    def strfmt(self, format: str = "%Y%m%d") -> str:
        """Return the release date rendered with the given strftime format."""
        return self.release_date.strftime(format)

    @staticmethod
    async def _get_release_info(uri: str) -> GwasCatalogReleaseInfo:
        """Get the release information from the specified URI.

        Raises:
            aiohttp.ClientError: On connection failure or non-2xx responses.
        """
        headers = {"Accept": "application/json"}
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(uri) as response:
                # ROBUSTNESS FIX: fail fast on HTTP errors so from_uri's ClientError
                # handler fires, instead of JSON-decoding an error page.
                response.raise_for_status()
                release_info = await response.json()
                return GwasCatalogReleaseInfo(**release_info)

    @classmethod
    def from_uri(cls, uri: str) -> GwasCatalogReleaseInfo:
        """Fetch the release information from the specified URI.

        Raises:
            GentroutilsError: If the release information cannot be fetched.
        """
        try:
            return asyncio.run(cls._get_release_info(uri))
        except aiohttp.ClientError as e:
            logger.error(f"Error fetching release info: {e}")
            # BUG FIX: chain the originating client error for debuggability.
            raise GentroutilsError(GentroutilsErrorMessage.FAILED_TO_FETCH, uri=uri) from e
|