gentroutils 1.5.0__py3-none-any.whl → 1.6.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gentroutils/__init__.py +8 -43
- gentroutils/errors.py +39 -0
- gentroutils/io/path/__init__.py +6 -0
- gentroutils/io/path/ftp.py +48 -0
- gentroutils/io/path/gcs.py +45 -0
- gentroutils/io/transfer/__init__.py +6 -0
- gentroutils/io/transfer/ftp_to_gcs.py +49 -0
- gentroutils/io/transfer/model.py +36 -0
- gentroutils/io/transfer/polars_to_gcs.py +20 -0
- gentroutils/parsers/__init__.py +1 -0
- gentroutils/parsers/curation.py +168 -0
- gentroutils/tasks/__init__.py +90 -0
- gentroutils/tasks/crawl.py +156 -0
- gentroutils/tasks/curation.py +110 -0
- gentroutils/tasks/fetch.py +141 -0
- gentroutils/transfer.py +81 -0
- gentroutils-1.6.0.dev1.dist-info/METADATA +274 -0
- gentroutils-1.6.0.dev1.dist-info/RECORD +22 -0
- gentroutils-1.6.0.dev1.dist-info/entry_points.txt +2 -0
- {gentroutils-1.5.0.dist-info → gentroutils-1.6.0.dev1.dist-info}/licenses/LICENSE +1 -1
- gentroutils/commands/__init__.py +0 -11
- gentroutils/commands/update_gwas_curation_metadata.py +0 -287
- gentroutils/commands/utils.py +0 -152
- gentroutils/commands/validate_gwas_curation.py +0 -165
- gentroutils-1.5.0.dist-info/METADATA +0 -135
- gentroutils-1.5.0.dist-info/RECORD +0 -11
- gentroutils-1.5.0.dist-info/entry_points.txt +0 -2
- {gentroutils-1.5.0.dist-info → gentroutils-1.6.0.dev1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Module to handle the crawling of GWAS Catalog release information."""
|
|
2
|
+
|
|
3
|
+
import tempfile
|
|
4
|
+
from functools import cached_property
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Annotated, Any, Self
|
|
7
|
+
|
|
8
|
+
from loguru import logger
|
|
9
|
+
from otter.storage import get_remote_storage
|
|
10
|
+
from otter.task.model import Spec, Task, TaskContext
|
|
11
|
+
from otter.task.task_reporter import report
|
|
12
|
+
from pydantic import AfterValidator, computed_field
|
|
13
|
+
|
|
14
|
+
from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, _requires_release_date_template
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CrawlSpec(Spec):
    """Configuration fields for the release crawler task.

    The `CrawlSpec` defines the parameters needed to crawl the GWAS Catalog release information.
    It includes the `stats_uri` that provides the release statistics and the `destination_template`
    that describes where the release information will be stored.

    Examples:
    ---
    >>> cs = CrawlSpec(
    ...     name="crawl gwas catalog release information",
    ...     stats_uri="https://www.ebi.ac.uk/gwas/api/search/stats",
    ...     destination_template="gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json",
    ...     promote=True
    ... )
    >>> cs.name
    'crawl gwas catalog release information'
    >>> cs.stats_uri
    'https://www.ebi.ac.uk/gwas/api/search/stats'
    >>> cs.destination_template
    'gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json'
    >>> cs.promote
    True
    >>> rs = GwasCatalogReleaseInfo(
    ...     date="2023-10-01",
    ...     associations=1000,
    ...     studies=200,
    ...     sumstats=300,
    ...     snps=400,
    ...     ensemblbuild="114.0",
    ...     dbsnpbuild="1.0.0",
    ...     efoversion="1.0.0",
    ...     genebuild="GRCh38",
    ... )
    >>> cs.substituted_destinations(rs)
    ['gs://gwas_catalog_inputs/gentroutils/20231001/stats.json', 'gs://gwas_catalog_inputs/gentroutils/latest/stats.json']


    ### Example configuration for the crawl task in a YAML file.
    .. code-block:: yaml

        steps:
            - crawl gwas catalog release information:
                destination_template: gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json
                stats_uri: https://www.ebi.ac.uk/gwas/api/search/stats
                promote: true
    """

    name: str = "crawl gwas catalog release information"
    """The name of the crawl task."""

    stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
    """The URI to crawl the release statistics information from."""

    destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
    """The destination path to save the release information.
    This path should always be a template string that includes `{release_date}`.
    For example, `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`.

    The `release_date` will be substituted with the actual release date or `latest` literal from the stats_uri endpoint.
    """

    promote: bool = True
    """Whether to promote the release information as the latest release.

    Given the destination: `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`

    * If set to `False` the task will upload the release information
        only to the specified destination with `release_date` substituted by the value from the stats_uri endpoint.
        Resulting in following destinations:
        * `gs://gwas_catalog_inputs/gentroutils/20231001/stats.json`

    * If set to `True`, the task will also upload the release information
        to the destination with `release_date` substituted to `latest` literal, effectively
        promoting the release as the latest release.
    """

    @computed_field  # type: ignore[prop-decorator]
    @cached_property
    def destinations(self) -> list[TemplateDestination]:
        """Get the list of destination templates where the release information will be saved.

        Returns:
            list[TemplateDestination]: The destinations derived from `destination_template`.

        Depending on the `promote` flag this property will return:
        * If `promote` is `False`, a single entry holding the raw template
            (the `{release_date}` placeholder is still present).
        * If `promote` is `True`, two entries:
            1. The raw template, as above.
            2. The template with `{release_date}` already replaced by the `latest` literal.
        """
        template = self.destination_template
        # The dated entry keeps its placeholder; it is filled in later by `substituted_destinations`.
        dated = TemplateDestination(template, False)
        if not self.promote:
            return [dated]
        # The promoted entry is substituted eagerly, hence it is constructed with `True`.
        promoted = TemplateDestination(template.format(release_date="latest"), True)
        return [dated, promoted]

    def substituted_destinations(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
        """Resolve every destination to a concrete path for the given release.

        Entries of `destinations` that are not yet substituted get `{release_date}` replaced by the
        release date formatted as `%Y%m%d`; entries that are already substituted are returned unchanged.

        Args:
            release_info (GwasCatalogReleaseInfo): Release information that provides the release date.

        Returns:
            list[str]: The destination paths, in the same order as `destinations`.
        """
        substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
        resolved: list[str] = []
        for template in self.destinations:
            if template.is_substituted:
                resolved.append(template.destination)
            else:
                resolved.append(template.format(substitutions).destination)
        return resolved

    def model_post_init(self, __context: Any) -> None:
        """Method to ensure the scratchpad is set to ignore missing replacements."""
        self.scratchpad_ignore_missing = True
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class Crawl(Task):
    """Task to crawl the GWAS Catalog release information."""

    def __init__(self, spec: CrawlSpec, context: TaskContext) -> None:
        """Initialize the Crawl task with the given specification and context."""
        super().__init__(spec, context)
        self.spec: CrawlSpec

    def _write_release_info(self, release_info: GwasCatalogReleaseInfo) -> Self:
        """Serialize the release information to JSON and upload it to every destination.

        The JSON is staged in a temporary file, which is then uploaded to each path returned by
        `CrawlSpec.substituted_destinations`.

        Args:
            release_info (GwasCatalogReleaseInfo): The release information to persist.

        Returns:
            Self: The task instance.
        """
        # Write through the temporary file's own handle (text mode, explicit UTF-8) instead of
        # re-opening it by name with the locale-default encoding.
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as source:
            logger.info(f"Writing release information to {source.name}")
            source.write(release_info.model_dump_json(indent=2, by_alias=False))
            # Flush so the uploads below, which read the file by path, see the full payload.
            source.flush()
            staged = Path(source.name)
            destinations = self.spec.substituted_destinations(release_info)
            logger.info(f"Destinations for release information: {destinations}")
            # Uploads must happen inside the `with` block: the temporary file is removed on exit.
            for destination in destinations:
                storage = get_remote_storage(destination)
                storage.upload(staged, destination)
                logger.info(f"Release information written to {destination}")
        return self

    @report
    def run(self) -> Self:
        """Crawl the release information from `stats_uri` and write it to the destinations.

        Returns:
            Self: The task instance.
        """
        logger.info(f"Crawling release information from {self.spec.stats_uri}")
        release_info = GwasCatalogReleaseInfo.from_uri(self.spec.stats_uri)
        logger.info("Crawling completed successfully.")
        self._write_release_info(release_info)
        logger.info("Writing release information completed successfully.")
        return self
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Module to handle the business logic for the GWAS Catalog curation task."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import date
|
|
6
|
+
from functools import cached_property
|
|
7
|
+
from typing import Annotated, Any, Self
|
|
8
|
+
|
|
9
|
+
from otter.task.model import Spec, Task, TaskContext
|
|
10
|
+
from otter.task.task_reporter import report
|
|
11
|
+
from pydantic import AfterValidator, computed_field
|
|
12
|
+
|
|
13
|
+
from gentroutils.io.transfer.polars_to_gcs import PolarsDataFrameToGCSTransferableObject
|
|
14
|
+
from gentroutils.parsers.curation import GWASCatalogCuration
|
|
15
|
+
from gentroutils.tasks import TemplateDestination, _requires_release_date_template
|
|
16
|
+
from gentroutils.transfer import TransferManager
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CurationSpec(Spec):
    """Configuration fields for the curation task.

    The `CurationSpec` defines the parameters needed to curate GWAS Catalog data.
    It includes the `previous_curation`, which is the path to the previous curation data,
    and the `studies`, which is the path to the studies data.

    Examples:
    ---
    >>> cs = CurationSpec(
    ...     name="curate gwas catalog data",
    ...     previous_curation="gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv",
    ...     studies="gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv",
    ...     destination_template="gs://gwas_catalog_inputs/{release_date}/pending/curation.tsv"
    ... )
    >>> cs.name
    'curate gwas catalog data'
    >>> cs.previous_curation
    'gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv'
    >>> cs.studies
    'gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv'
    >>> cs.destination_template
    'gs://gwas_catalog_inputs/{release_date}/pending/curation.tsv'
    """

    name: str = "curate gwas catalog data"
    """The name of the curation task."""

    previous_curation: str
    """The path to the previous curation data."""

    studies: str
    """The path to the studies data."""

    destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
    """The destination path for the curation data."""

    promote: bool = False
    """Whether to promote the curation data to the latest version."""

    @computed_field  # type: ignore[prop-decorator]
    @cached_property
    def destinations(self) -> list[TemplateDestination]:
        """Get the list of destination templates where the curation data will be saved.

        Returns:
            list[TemplateDestination]: The destinations derived from `destination_template`.

        Depending on the `promote` flag this property will return:
        * If `promote` is `False`, a single entry built from the raw template.
        * If `promote` is `True`, two entries:
            1. The entry built from the raw template.
            2. That entry formatted with `release_date` set to `latest`.
        """
        dated = TemplateDestination(self.destination_template, False)
        if not self.promote:
            return [dated]
        promoted = dated.format({"release_date": "latest"})
        return [dated, promoted]

    def substituted_destinations(self, release_date: date) -> list[str]:
        """Resolve every destination to a concrete path for the given date.

        Entries of `destinations` that are not yet substituted are formatted with the date
        rendered as `%Y%m%d`; entries that are already substituted are returned unchanged.

        Args:
            release_date (date): The date to substitute for `release_date`.

        Returns:
            list[str]: The destination paths, in the same order as `destinations`.
        """
        substitutions = {"release_date": release_date.strftime("%Y%m%d")}
        resolved = []
        for template in self.destinations:
            if template.is_substituted:
                resolved.append(template.destination)
            else:
                resolved.append(template.format(substitutions).destination)
        return resolved

    def model_post_init(self, __context: Any) -> None:
        """Switch the scratchpad to ignore missing replacements once the model is initialised."""
        self.scratchpad_ignore_missing = True
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class Curation(Task):
    """Task for curating GWAS Catalog data."""

    def __init__(self, spec: CurationSpec, context: TaskContext) -> None:
        """Set up the Curation task from its specification and context."""
        super().__init__(spec, context)
        self.spec: CurationSpec

    @report
    def run(self) -> Self:
        """Build the curation table and transfer it to every destination.

        Today's date is used as the release date when resolving the destinations.

        Returns:
            Self: The task instance.
        """
        spec = self.spec
        targets = spec.substituted_destinations(date.today())
        curated = GWASCatalogCuration.from_prev_curation(spec.previous_curation, spec.studies)
        uploads = []
        for target in targets:
            uploads.append(PolarsDataFrameToGCSTransferableObject(source=curated.result, destination=target))
        TransferManager().transfer(uploads)

        return self
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Module to handle the fetching of GWAS Catalog release files."""
|
|
2
|
+
|
|
3
|
+
from functools import cached_property
|
|
4
|
+
from typing import Annotated, Any, Self
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
7
|
+
from otter.task.model import Spec, Task, TaskContext
|
|
8
|
+
from otter.task.task_reporter import report
|
|
9
|
+
from pydantic import AfterValidator, computed_field
|
|
10
|
+
|
|
11
|
+
from gentroutils.io.transfer import FTPtoGCPTransferableObject
|
|
12
|
+
from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, _requires_release_date_template
|
|
13
|
+
from gentroutils.transfer import TransferManager
|
|
14
|
+
|
|
15
|
+
# Presumably an upper bound on simultaneous connections for fetch transfers.
# NOTE(review): not referenced in the visible part of this module - confirm it is still used elsewhere.
MAX_CONCURRENT_CONNECTIONS = 10
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FetchSpec(Spec):
    """Configuration fields for the fetch task.

    The task downloads a single file based on the `source_template` and uploads it to the `destination_template`.

    The `FetchSpec` defines the parameters needed to fetch the GWAS Catalog release files.
    These should be files that reside in the `https://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/` directory.

    To make sure that we download the latest release and persist the release date,
    we need to make a single request to the `stats_uri` endpoint, which returns the latest release date.
    (We are not using the `https://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/` endpoint, but rather
    the endpoint with the `https://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/` format to
    download the files.)


    Examples:
    ---
    >>> fs = FetchSpec(
    ...     name="fetch associations",
    ...     stats_uri="https://www.ebi.ac.uk/gwas/api/search/stats",
    ...     source_template="ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-associations_ontology-annotated.tsv",
    ...     destination_template="gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv",
    ...     promote=True
    ... )
    >>> fs.name
    'fetch associations'
    >>> fs.stats_uri
    'https://www.ebi.ac.uk/gwas/api/search/stats'
    >>> fs.source_template
    'ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-associations_ontology-annotated.tsv'
    >>> fs.destination_template
    'gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv'
    >>> fs.promote
    True
    """

    name: str = "fetch gwas catalog data"
    """The name of the task."""

    stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
    """The URI to crawl the release statistics information from."""

    source_template: Annotated[str, AfterValidator(_requires_release_date_template)]
    """The template URI of the file to download."""

    destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
    """The template URI to upload the file to."""

    promote: bool = False
    """Whether to promote the release information as the latest release.

    Given the destination: `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`

    * If set to `False` the task will upload the release information
        only to the specified destination with `release_date` substituted by the value from the stats_uri endpoint.
        Resulting in following destinations:
        * `gs://gwas_catalog_inputs/gentroutils/20231001/stats.json`

    * If set to `True`, the task will also upload the release information
        to the destination with `release_date` substituted to `latest` literal, effectively
        promoting the release as the latest release.
    """

    @computed_field  # type: ignore[prop-decorator]
    @cached_property
    def destinations(self) -> list[TemplateDestination]:
        """Get the list of destination templates where the fetched file will be saved.

        Returns:
            list[TemplateDestination]: The destinations derived from `destination_template`.

        Depending on the `promote` flag this property will return:
        * If `promote` is `False`, a single entry built from the raw template.
        * If `promote` is `True`, two entries:
            1. The entry built from the raw template.
            2. That entry formatted with `release_date` set to `latest`.
        """
        dated = TemplateDestination(self.destination_template, False)
        if not self.promote:
            return [dated]
        promoted = dated.format({"release_date": "latest"})
        return [dated, promoted]

    def substituted_destinations(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
        """Resolve every destination to a concrete path for the given release.

        Entries of `destinations` that are not yet substituted are formatted with the release date
        rendered as `%Y%m%d`; entries that are already substituted are returned unchanged.

        Args:
            release_info (GwasCatalogReleaseInfo): Release information that provides the release date.

        Returns:
            list[str]: The destination paths, in the same order as `destinations`.
        """
        substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
        resolved: list[str] = []
        for template in self.destinations:
            if template.is_substituted:
                resolved.append(template.destination)
            else:
                resolved.append(template.format(substitutions).destination)
        return resolved

    def substituted_sources(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
        """Resolve the source template, once per destination.

        The `{release_date}` placeholder of `source_template` is replaced by the release date
        rendered as `%Y/%m/%d`. The resolved source is repeated so the result has the same
        length as `destinations`, allowing both lists to be paired one-to-one.

        Args:
            release_info (GwasCatalogReleaseInfo): Release information that provides the release date.

        Returns:
            list[str]: The resolved source URI, repeated once per destination.
        """
        source = self.source_template.format(release_date=release_info.strfmt("%Y/%m/%d"))
        # Derive the count from `destinations` rather than re-encoding the promote rule here,
        # so the two lists cannot fall out of step.
        return [source] * len(self.destinations)

    def model_post_init(self, __context: Any) -> None:
        """Method to ensure the scratchpad is set to ignore missing replacements."""
        self.scratchpad_ignore_missing = True
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class Fetch(Task):
    """Task to fetch files from the GWAS Catalog release directory file."""

    def __init__(self, spec: FetchSpec, context: TaskContext) -> None:
        """Set up the Fetch task from its specification and context."""
        super().__init__(spec, context)
        self.spec: FetchSpec

    @report
    def run(self) -> Self:
        """Resolve sources and destinations for the current release and transfer the file.

        Returns:
            Self: The task instance.
        """
        spec = self.spec
        logger.info(f"Fetching file from {spec.source_template}")
        release_info = GwasCatalogReleaseInfo.from_uri(spec.stats_uri)
        logger.info(f"Release information: {release_info}")
        destinations = spec.substituted_destinations(release_info)
        sources = spec.substituted_sources(release_info)
        # `strict=True` makes a length mismatch between sources and destinations an error.
        transferable_objects = []
        for origin, target in zip(sources, destinations, strict=True):
            transferable_objects.append(FTPtoGCPTransferableObject(source=origin, destination=target))
        logger.info(f"Transferable objects: {transferable_objects}")
        TransferManager().transfer(transferable_objects)
        logger.success("File transferred successfully.")
        return self
|
gentroutils/transfer.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Transfer module."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
from typing import cast
|
|
6
|
+
|
|
7
|
+
import tqdm
|
|
8
|
+
from loguru import logger
|
|
9
|
+
|
|
10
|
+
from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
|
|
11
|
+
from gentroutils.io.transfer import FTPtoGCPTransferableObject, PolarsDataFrameToGCSTransferableObject
|
|
12
|
+
from gentroutils.io.transfer.model import TransferableObject
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TransferManager:
    """Manager class for handling the transfer of various transferable objects.

    The `transfer` method is the entry point: it inspects the objects it is given and runs the
    matching coroutine. Currently it supports:

    - FTP to Google Cloud Storage (GCS) transfers using `FTPtoGCPTransferableObject`.
    - Polars DataFrame to GCS transfers using `PolarsDataFrameToGCSTransferableObject`.

    """

    @staticmethod
    async def _await_all(transferable_objects: Sequence[TransferableObject], desc: str) -> None:
        """Schedule `transfer()` of every object as a task and await them as they complete.

        Args:
            transferable_objects (Sequence[TransferableObject]): Objects whose `transfer()` coroutine is run.
            desc (str): Label shown on the progress bar.
        """
        transfer_tasks = [asyncio.create_task(x.transfer()) for x in transferable_objects]
        for finished in tqdm.tqdm(asyncio.as_completed(transfer_tasks), total=len(transfer_tasks), desc=desc):
            await finished

    @staticmethod
    async def transfer_ftp_to_gcp(transferable_objects: Sequence[FTPtoGCPTransferableObject]) -> None:
        """Transfer files from FTP to Google Cloud Storage.

        Every object's `transfer()` coroutine is scheduled as a task, and the tasks are awaited
        as they complete while a progress bar is displayed.

        Args:
            transferable_objects (Sequence[FTPtoGCPTransferableObject]): A sequence of FTPtoGCPTransferableObject instances.

        """
        await TransferManager._await_all(transferable_objects, "Downloading")
        logger.info("FTP to GCS transfer completed.")

    @staticmethod
    async def transfer_polars_to_gcs(transferable_objects: Sequence[PolarsDataFrameToGCSTransferableObject]) -> None:
        """Transfer Polars DataFrames to Google Cloud Storage.

        This method transfers Polars DataFrames to GCS using the provided
        PolarsDataFrameToGCSTransferableObject instances.

        Args:
            transferable_objects (Sequence[PolarsDataFrameToGCSTransferableObject]): A sequence of PolarsDataFrameToGCSTransferableObject instances.

        """
        await TransferManager._await_all(transferable_objects, "Uploading")
        logger.info("Polars DataFrame transfer to GCS completed.")

    def transfer(self, transferable_objects: Sequence[TransferableObject]) -> None:
        """Transfer method that handles different types of transferable objects.

        Main method to manage the transfer of various transferable objects. All objects must be
        of the same supported type; the matching coroutine is run to completion with `asyncio.run`.

        Args:
            transferable_objects (Sequence[TransferableObject]): A sequence of TransferableObject instances.

        Raises:
            GentroutilsError: If the list of transferable objects is empty or if the objects are not instances of the expected types.
        """
        if not transferable_objects:
            raise GentroutilsError(GentroutilsErrorMessage.EMPTY_TRANSFERABLE_OBJECTS)
        elif all(isinstance(c, FTPtoGCPTransferableObject) for c in transferable_objects):
            ftp_objects = cast(Sequence[FTPtoGCPTransferableObject], transferable_objects)
            asyncio.run(self.transfer_ftp_to_gcp(ftp_objects))
        elif all(isinstance(c, PolarsDataFrameToGCSTransferableObject) for c in transferable_objects):
            polars_objects = cast(Sequence[PolarsDataFrameToGCSTransferableObject], transferable_objects)
            asyncio.run(self.transfer_polars_to_gcs(polars_objects))
        else:
            # Mixed or unsupported object types cannot be dispatched to a single coroutine.
            raise GentroutilsError(GentroutilsErrorMessage.INVALID_TRANSFERABLE_OBJECTS)
|