gentroutils 1.5.0__py3-none-any.whl → 1.6.0.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
+ """Module to handle the crawling of GWAS Catalog release information."""
2
+
3
+ import tempfile
4
+ from functools import cached_property
5
+ from pathlib import Path
6
+ from typing import Annotated, Any, Self
7
+
8
+ from loguru import logger
9
+ from otter.storage import get_remote_storage
10
+ from otter.task.model import Spec, Task, TaskContext
11
+ from otter.task.task_reporter import report
12
+ from pydantic import AfterValidator, computed_field
13
+
14
+ from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, _requires_release_date_template
15
+
16
+
17
class CrawlSpec(Spec):
    """Configuration fields for the release crawler task.

    The `CrawlSpec` defines the parameters needed to crawl the GWAS Catalog release information.
    It includes the `stats_uri` that provides the release statistics and the `destination_template`
    that describes where the release information will be stored.

    Examples:
    ---
    >>> cs = CrawlSpec(
    ...     name="crawl gwas catalog release information",
    ...     stats_uri="https://www.ebi.ac.uk/gwas/api/search/stats",
    ...     destination_template="gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json",
    ...     promote=True
    ... )
    >>> cs.name
    'crawl gwas catalog release information'
    >>> cs.stats_uri
    'https://www.ebi.ac.uk/gwas/api/search/stats'
    >>> cs.destination_template
    'gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json'
    >>> cs.promote
    True
    >>> rs = GwasCatalogReleaseInfo(
    ...     date="2023-10-01",
    ...     associations=1000,
    ...     studies=200,
    ...     sumstats=300,
    ...     snps=400,
    ...     ensemblbuild="114.0",
    ...     dbsnpbuild="1.0.0",
    ...     efoversion="1.0.0",
    ...     genebuild="GRCh38",
    ... )
    >>> cs.substituted_destinations(rs)
    ['gs://gwas_catalog_inputs/gentroutils/20231001/stats.json', 'gs://gwas_catalog_inputs/gentroutils/latest/stats.json']


    ### Example configuration for the crawl task in a YAML file.
    .. code-block:: yaml

        steps:
            - crawl gwas catalog release information:
                destination_template: gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json
                stats_uri: https://www.ebi.ac.uk/gwas/api/search/stats
                promote: true
    """

    name: str = "crawl gwas catalog release information"
    """The name of the crawl task."""

    stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
    """The URI to crawl the release statistics information from."""

    destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
    """The destination path to save the release information.
    This path should always be a template string that includes `{release_date}`.
    For example, `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`.

    The `release_date` will be substituted with the actual release date or `latest` literal from the stats_uri endpoint.
    """

    promote: bool = True
    """Whether to promote the release information as the latest release.

    Given the destination: `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`

    * If set to `False` the task will upload the release information
        only to the specified destination with `release_date` substituted by the value from the stats_uri endpoint.
        Resulting in following destinations:
        * `gs://gwas_catalog_inputs/gentroutils/20231001/stats.json`

    * If set to `True`, the task will also upload the release information
        to the destination with `release_date` substituted to `latest` literal, effectively
        promoting the release as the latest release.
    """

    @computed_field  # type: ignore[prop-decorator]
    @cached_property
    def destinations(self) -> list[TemplateDestination]:
        """Get the destination templates where the release information will be saved.

        Returns:
            list[TemplateDestination]: The destinations, in upload order.

        Depending on the `promote` flag this property will return:
            * If `promote` is `False`, a single entry holding the raw `destination_template`
                (flagged as not yet substituted).
            * If `promote` is `True`, two entries:
                1. The raw `destination_template` (flagged as not yet substituted).
                2. The template with `{release_date}` replaced by `latest` (flagged as substituted).
        """
        targets = [TemplateDestination(self.destination_template, False)]
        if self.promote:
            latest = self.destination_template.format(release_date="latest")
            targets.append(TemplateDestination(latest, True))
        return targets

    def substituted_destinations(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
        """Resolve every destination template into a concrete destination path.

        Args:
            release_info (GwasCatalogReleaseInfo): Release information whose date, formatted as `%Y%m%d`,
                fills the `{release_date}` placeholder.

        Returns:
            list[str]: One path per entry of `destinations`; entries already flagged as substituted
                are returned unchanged.
        """
        substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
        return [
            d.destination if d.is_substituted else d.format(substitutions).destination for d in self.destinations
        ]

    def model_post_init(self, __context: Any) -> None:
        """Method to ensure the scratchpad is set to ignore missing replacements."""
        self.scratchpad_ignore_missing = True
124
+
125
+
126
class Crawl(Task):
    """Task to crawl the GWAS Catalog release information."""

    def __init__(self, spec: CrawlSpec, context: TaskContext) -> None:
        """Initialize the Crawl task with the given specification and context."""
        super().__init__(spec, context)
        self.spec: CrawlSpec

    def _write_release_info(self, release_info: GwasCatalogReleaseInfo) -> Self:
        """Serialize the release information to JSON and upload it to every destination.

        The JSON document is staged in a file inside a temporary directory rather than in a
        `NamedTemporaryFile` that is re-opened by name: opening a named temporary file a second
        time while it is still open is platform dependent (it is not possible on Windows).
        The file is written as UTF-8 so the result does not depend on the locale encoding.

        Args:
            release_info (GwasCatalogReleaseInfo): The release information to persist.

        Returns:
            Self: The task instance.
        """
        with tempfile.TemporaryDirectory() as staging_dir:
            source = Path(staging_dir) / "release_info.json"
            logger.info(f"Writing release information to {source}")
            source.write_text(release_info.model_dump_json(indent=2, by_alias=False), encoding="utf-8")
            destinations = self.spec.substituted_destinations(release_info)
            logger.info(f"Destinations for release information: {destinations}")
            # Uploads must happen while the temporary directory still exists.
            for destination in destinations:
                storage = get_remote_storage(destination)
                storage.upload(source, destination)
                logger.info(f"Release information written to {destination}")
        return self

    @report
    def run(self) -> Self:
        """Crawl the release information and write it to the configured destinations."""
        logger.info(f"Crawling release information from {self.spec.stats_uri}")
        release_info = GwasCatalogReleaseInfo.from_uri(self.spec.stats_uri)
        logger.info("Crawling completed successfully.")
        self._write_release_info(release_info)
        logger.info("Writing release information completed successfully.")
        return self
@@ -0,0 +1,110 @@
1
+ """Module to handle the business logic for the GWAS Catalog curation task."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import date
6
+ from functools import cached_property
7
+ from typing import Annotated, Any, Self
8
+
9
+ from otter.task.model import Spec, Task, TaskContext
10
+ from otter.task.task_reporter import report
11
+ from pydantic import AfterValidator, computed_field
12
+
13
+ from gentroutils.io.transfer.polars_to_gcs import PolarsDataFrameToGCSTransferableObject
14
+ from gentroutils.parsers.curation import GWASCatalogCuration
15
+ from gentroutils.tasks import TemplateDestination, _requires_release_date_template
16
+ from gentroutils.transfer import TransferManager
17
+
18
+
19
class CurationSpec(Spec):
    """Configuration fields for the curation task.

    A `CurationSpec` carries the inputs of the GWAS Catalog curation step: the path to the
    `previous_curation` table, the path to the `studies` table, and the `destination_template`
    the curation output is uploaded to.

    Examples:
    ---
    >>> cs = CurationSpec(
    ...     name="curate gwas catalog data",
    ...     previous_curation="gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv",
    ...     studies="gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv",
    ...     destination_template="gs://gwas_catalog_inputs/{release_date}/pending/curation.tsv"
    ... )
    >>> cs.name
    'curate gwas catalog data'
    >>> cs.previous_curation
    'gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv'
    >>> cs.studies
    'gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv'
    >>> cs.destination_template
    'gs://gwas_catalog_inputs/{release_date}/pending/curation.tsv'
    """

    name: str = "curate gwas catalog data"
    """The name of the curation task."""

    previous_curation: str
    """The path to the previous curation data."""

    studies: str
    """The path to the studies data."""

    destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
    """The destination path for the curation data."""

    promote: bool = False
    """Whether to promote the curation data to the latest version."""

    @computed_field  # type: ignore[prop-decorator]
    @cached_property
    def destinations(self) -> list[TemplateDestination]:
        """Get the destination templates where the curation data will be saved.

        Returns:
            list[TemplateDestination]: The destinations, in upload order.

        Depending on the `promote` flag this property will return:
            * If `promote` is `False`, only the raw `destination_template`.
            * If `promote` is `True`, the raw `destination_template` followed by the same
                template formatted with `release_date` set to `latest`.
        """
        template = TemplateDestination(self.destination_template, False)
        if not self.promote:
            return [template]
        return [template, template.format({"release_date": "latest"})]

    def substituted_destinations(self, release_date: date) -> list[str]:
        """Resolve every destination template into a concrete destination path.

        Args:
            release_date (date): Date that fills the `{release_date}` placeholder, formatted as `%Y%m%d`.

        Returns:
            list[str]: One path per entry of `destinations`; entries already flagged as substituted
                are returned unchanged.
        """
        substitutions = {"release_date": release_date.strftime("%Y%m%d")}
        return [
            d.destination if d.is_substituted else d.format(substitutions).destination for d in self.destinations
        ]

    def model_post_init(self, __context: Any) -> None:
        """Method to ensure the scratchpad is set to ignore missing replacements."""
        self.scratchpad_ignore_missing = True
89
+
90
+
91
class Curation(Task):
    """Task for curating GWAS Catalog data."""

    def __init__(self, spec: CurationSpec, context: TaskContext) -> None:
        """Initialize the Curation task with the given specification and context."""
        super().__init__(spec, context)
        self.spec: CurationSpec

    @report
    def run(self) -> Self:
        """Build the curation table and upload it to every configured destination.

        The `{release_date}` placeholder of the destinations is filled with today's date.

        Returns:
            Self: The task instance.
        """
        targets = self.spec.substituted_destinations(date.today())
        curation = GWASCatalogCuration.from_prev_curation(self.spec.previous_curation, self.spec.studies)
        # One transferable object per destination, all sharing the same curation result.
        TransferManager().transfer(
            [PolarsDataFrameToGCSTransferableObject(source=curation.result, destination=target) for target in targets]
        )
        return self
@@ -0,0 +1,141 @@
1
+ """Module to handle the fetching of GWAS Catalog release files."""
2
+
3
+ from functools import cached_property
4
+ from typing import Annotated, Any, Self
5
+
6
+ from loguru import logger
7
+ from otter.task.model import Spec, Task, TaskContext
8
+ from otter.task.task_reporter import report
9
+ from pydantic import AfterValidator, computed_field
10
+
11
+ from gentroutils.io.transfer import FTPtoGCPTransferableObject
12
+ from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, _requires_release_date_template
13
+ from gentroutils.transfer import TransferManager
14
+
15
+ MAX_CONCURRENT_CONNECTIONS = 10
16
+
17
+
18
class FetchSpec(Spec):
    """Configuration fields for the fetch task.

    The task downloads a single file based on the `source_template` and uploads it to the `destination_template`.

    The `FetchSpec` defines the parameters needed to fetch the GWAS Catalog release files.
    These should be files that reside in the `https://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/` directory.

    To make sure that we download the latest release and persist the release date,
    we need to make a single request to the `stats_uri` endpoint, which returns the latest release date.
    (We are not using the `https://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/` endpoint, but rather
    the endpoint with the `https://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/` format to
    download the files.)


    Examples:
    ---
    >>> fs = FetchSpec(
    ...     name="fetch associations",
    ...     stats_uri="https://www.ebi.ac.uk/gwas/api/search/stats",
    ...     source_template="ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-associations_ontology-annotated.tsv",
    ...     destination_template="gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv",
    ...     promote=True
    ... )
    >>> fs.name
    'fetch associations'
    >>> fs.stats_uri
    'https://www.ebi.ac.uk/gwas/api/search/stats'
    >>> fs.source_template
    'ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-associations_ontology-annotated.tsv'
    >>> fs.destination_template
    'gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv'
    >>> fs.promote
    True
    """

    name: str = "fetch gwas catalog data"
    """The name of the task."""

    stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
    """The URI to crawl the release statistics information from."""

    source_template: Annotated[str, AfterValidator(_requires_release_date_template)]
    """The template URI of the file to download."""

    destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
    """The template URI to upload the file to."""

    promote: bool = False
    """Whether to promote the fetched file as the latest release.

    Given the destination: `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`

    * If set to `False` the task will upload the fetched file
        only to the specified destination with `release_date` substituted by the value from the stats_uri endpoint.
        Resulting in following destinations:
        * `gs://gwas_catalog_inputs/gentroutils/20231001/stats.json`

    * If set to `True`, the task will also upload the fetched file
        to the destination with `release_date` substituted to `latest` literal, effectively
        promoting the release as the latest release.
    """

    @computed_field  # type: ignore[prop-decorator]
    @cached_property
    def destinations(self) -> list[TemplateDestination]:
        """Get the destination templates where the fetched file will be saved.

        Returns:
            list[TemplateDestination]: The destinations, in upload order.

        Depending on the `promote` flag this property will return:
            * If `promote` is `False`, only the raw `destination_template`.
            * If `promote` is `True`, the raw `destination_template` followed by the same
                template formatted with `release_date` set to `latest`.
        """
        d1 = TemplateDestination(self.destination_template, False)
        if self.promote:
            d2 = d1.format({"release_date": "latest"})
            return [d1, d2]
        return [d1]

    def substituted_destinations(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
        """Resolve every destination template into a concrete destination path.

        Args:
            release_info (GwasCatalogReleaseInfo): Release information whose date, formatted as `%Y%m%d`,
                fills the `{release_date}` placeholder.

        Returns:
            list[str]: One path per entry of `destinations`; entries already flagged as substituted
                are returned unchanged.
        """
        substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
        return [
            d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
        ]

    def substituted_sources(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
        """Resolve the source template into one concrete source URI per destination.

        Args:
            release_info (GwasCatalogReleaseInfo): Release information whose date, formatted as `%Y/%m/%d`,
                fills the `{release_date}` placeholder of `source_template`.

        Returns:
            list[str]: The same resolved source URI repeated once for every entry of `destinations`.
        """
        source = self.source_template.format(release_date=release_info.strfmt("%Y/%m/%d"))
        # Derive the repeat count from `destinations` instead of hard-coding it, so the two
        # lists always have equal length when they are paired up.
        return [source] * len(self.destinations)

    def model_post_init(self, __context: Any) -> None:
        """Method to ensure the scratchpad is set to ignore missing replacements."""
        self.scratchpad_ignore_missing = True
118
+
119
+
120
class Fetch(Task):
    """Task to fetch files from the GWAS Catalog release directory file."""

    def __init__(self, spec: FetchSpec, context: TaskContext) -> None:
        """Initialize the Fetch task with the given specification and context."""
        super().__init__(spec, context)
        self.spec: FetchSpec

    @report
    def run(self) -> Self:
        """Resolve the source and destination URIs for the current release and transfer the file.

        Returns:
            Self: The task instance.
        """
        spec = self.spec
        logger.info(f"Fetching file from {spec.source_template}")
        release_info = GwasCatalogReleaseInfo.from_uri(spec.stats_uri)
        logger.info(f"Release information: {release_info}")
        destinations = spec.substituted_destinations(release_info)
        sources = spec.substituted_sources(release_info)
        # strict=True makes a source/destination count mismatch fail loudly.
        pairs = zip(sources, destinations, strict=True)
        transferable_objects = [
            FTPtoGCPTransferableObject(source=source, destination=destination) for source, destination in pairs
        ]
        logger.info(f"Transferable objects: {transferable_objects}")
        TransferManager().transfer(transferable_objects)
        logger.success("File transferred successfully.")
        return self
@@ -0,0 +1,81 @@
1
+ """Transfer module."""
2
+
3
+ import asyncio
4
+ from collections.abc import Sequence
5
+ from typing import cast
6
+
7
+ import tqdm
8
+ from loguru import logger
9
+
10
+ from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
11
+ from gentroutils.io.transfer import FTPtoGCPTransferableObject, PolarsDataFrameToGCSTransferableObject
12
+ from gentroutils.io.transfer.model import TransferableObject
13
+
14
+
15
+ class TransferManager:
16
+ """Manager class for handling the transfer of various transferable objects.
17
+
18
+ This class provides static methods to `transfer` to move files or objects.
19
+ Currently it supports:
20
+
21
+ - FTP to Google Cloud Storage (GCP) transfers using `FTPtoGCPTransferableObject`.
22
+ - Polars DataFrame to GCS transfers using `PolarsDataFrameToGCSTransferableObject`.
23
+
24
+ """
25
+
26
+ @staticmethod
27
+ async def transfer_ftp_to_gcp(transferable_objects: Sequence[FTPtoGCPTransferableObject]) -> None:
28
+ """Update GWAS Catalog metadata directly to cloud bucket.
29
+
30
+ This method transfers files from FTP to Google Cloud Storage (GCS) using the provided
31
+ FTPtoGCPTransferableObject instances.
32
+ It fetches the data for the file provided in the local FTP path, collects the
33
+ data asynchronously to buffer, and uploads it to the provided GCP bucket blob.
34
+
35
+ Args:
36
+ transferable_objects (Sequence[FTPtoGCPTransferableObject]): A sequence of FTPtoGCPTransferableObject instances.
37
+
38
+ """
39
+ # we always want to have the logs from this command uploaded to the target bucket
40
+ transfer_tasks = [asyncio.create_task(x.transfer()) for x in transferable_objects]
41
+ for f in tqdm.tqdm(asyncio.as_completed(transfer_tasks), total=len(transfer_tasks), desc="Downloading"):
42
+ await f
43
+ logger.info("gwas_curation_update step completed.")
44
+
45
+ @staticmethod
46
+ async def transfer_polars_to_gcs(transferable_objects: Sequence[PolarsDataFrameToGCSTransferableObject]) -> None:
47
+ """Transfer Polars DataFrames to Google Cloud Storage.
48
+
49
+ This method transfers Polars DataFrames to GCS using the provided
50
+ PolarsDataFrameToGCSTransferableObject instances.
51
+
52
+ Args:
53
+ transferable_objects (Sequence[PolarsDataFrameToGCSTransferableObject]): A sequence of PolarsDataFrameToGCSTransferableObject instances.
54
+
55
+ """
56
+ transfer_tasks = [asyncio.create_task(x.transfer()) for x in transferable_objects]
57
+ for f in tqdm.tqdm(asyncio.as_completed(transfer_tasks), total=len(transfer_tasks), desc="Uploading"):
58
+ await f
59
+ logger.info("Polars DataFrame transfer to GCS completed.")
60
+
61
+ def transfer(self, transferable_objects: Sequence[TransferableObject]) -> None:
62
+ """Transfer method that handles different types of transferable objects.
63
+
64
+ Main method to manage the transfer of various transferable objects.
65
+
66
+ Args:
67
+ transferable_objects (Sequence[TransferableObject]): A sequence of TransferableObject instances.
68
+
69
+ Raises:
70
+ GentroutilsError: If the list of transferable objects is empty or if the objects are not instances of the expected types.
71
+ """
72
+ if not transferable_objects:
73
+ raise GentroutilsError(GentroutilsErrorMessage.EMPTY_TRANSFERABLE_OBJECTS)
74
+ elif all(isinstance(c, FTPtoGCPTransferableObject) for c in transferable_objects):
75
+ ftp_objects = cast(Sequence[FTPtoGCPTransferableObject], transferable_objects)
76
+ asyncio.run(self.transfer_ftp_to_gcp(ftp_objects))
77
+ elif all(isinstance(c, PolarsDataFrameToGCSTransferableObject) for c in transferable_objects):
78
+ polars_objects = cast(Sequence[PolarsDataFrameToGCSTransferableObject], transferable_objects)
79
+ asyncio.run(self.transfer_polars_to_gcs(polars_objects))
80
+ else:
81
+ raise GentroutilsError(GentroutilsErrorMessage.INVALID_TRANSFERABLE_OBJECTS)