gentroutils 1.5.0__py3-none-any.whl → 1.6.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gentroutils/__init__.py +8 -43
- gentroutils/errors.py +39 -0
- gentroutils/io/path/__init__.py +6 -0
- gentroutils/io/path/ftp.py +48 -0
- gentroutils/io/path/gcs.py +45 -0
- gentroutils/io/transfer/__init__.py +6 -0
- gentroutils/io/transfer/ftp_to_gcs.py +49 -0
- gentroutils/io/transfer/model.py +36 -0
- gentroutils/io/transfer/polars_to_gcs.py +20 -0
- gentroutils/parsers/__init__.py +1 -0
- gentroutils/parsers/curation.py +168 -0
- gentroutils/tasks/__init__.py +90 -0
- gentroutils/tasks/crawl.py +156 -0
- gentroutils/tasks/curation.py +110 -0
- gentroutils/tasks/fetch.py +141 -0
- gentroutils/transfer.py +81 -0
- gentroutils-1.6.0.dev1.dist-info/METADATA +274 -0
- gentroutils-1.6.0.dev1.dist-info/RECORD +22 -0
- gentroutils-1.6.0.dev1.dist-info/entry_points.txt +2 -0
- {gentroutils-1.5.0.dist-info → gentroutils-1.6.0.dev1.dist-info}/licenses/LICENSE +1 -1
- gentroutils/commands/__init__.py +0 -11
- gentroutils/commands/update_gwas_curation_metadata.py +0 -287
- gentroutils/commands/utils.py +0 -152
- gentroutils/commands/validate_gwas_curation.py +0 -165
- gentroutils-1.5.0.dist-info/METADATA +0 -135
- gentroutils-1.5.0.dist-info/RECORD +0 -11
- gentroutils-1.5.0.dist-info/entry_points.txt +0 -2
- {gentroutils-1.5.0.dist-info → gentroutils-1.6.0.dev1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gentroutils
|
|
3
|
+
Version: 1.6.0.dev1
|
|
4
|
+
Summary: Open Targets python genetics utility CLI tools
|
|
5
|
+
Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Healthcare Industry
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Operating System :: Unix
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
15
|
+
Requires-Python: >=3.13
|
|
16
|
+
Requires-Dist: aioftp>=0.25.1
|
|
17
|
+
Requires-Dist: aiohttp>=3.11.18
|
|
18
|
+
Requires-Dist: google-cloud-storage>=3.1.1
|
|
19
|
+
Requires-Dist: loguru>=0.7.3
|
|
20
|
+
Requires-Dist: opentargets-otter>=25.0.2
|
|
21
|
+
Requires-Dist: polars>=1.31.0
|
|
22
|
+
Requires-Dist: pydantic>=2.10.6
|
|
23
|
+
Requires-Dist: tqdm>=4.67.1
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# gentroutils
|
|
27
|
+
|
|
28
|
+
[](https://github.com/opentargets/gentroutils/actions/workflows/pr.yaml)
|
|
29
|
+

|
|
30
|
+
[](https://github.com/opentargets/gentroutils/actions/workflows/release.yaml)
|
|
31
|
+
|
|
32
|
+
Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
pip install gentroutils
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Available commands
|
|
41
|
+
|
|
42
|
+
To see all available commands after installation run
|
|
43
|
+
|
|
44
|
+
```{bash}
|
|
45
|
+
gentroutils --help
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
To run a single step run
|
|
51
|
+
```{bash}
|
|
52
|
+
uv run gentroutils -s gwas_catalog_release # After cloning the repository
|
|
53
|
+
gentroutils -s gwas_catalog_release -c otter_config.yaml # When installed by pip
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
The `gentroutils` repository uses the [otter](https://github.com/opentargets/otter) framework to build the set of tasks to run. The current implementation of tasks can be found in the `config.yaml` file in the root of the repository. To run gentroutils installed via `pip` you need to define the otter config that looks like the `config.yaml` file.
|
|
57
|
+
|
|
58
|
+
<details>
|
|
59
|
+
<summary>Example config</summary>
|
|
60
|
+
|
|
61
|
+
For the top level fields refer to the [otter documentation](https://opentargets.github.io/otter/otter.config.html)
|
|
62
|
+
|
|
63
|
+
```yaml
|
|
64
|
+
---
|
|
65
|
+
work_path: ./work
|
|
66
|
+
log_level: DEBUG
|
|
67
|
+
scratchpad:
|
|
68
|
+
steps:
|
|
69
|
+
gwas_catalog_release:
|
|
70
|
+
- name: crawl release metadata
|
|
71
|
+
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
72
|
+
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json"
|
|
73
|
+
promote: "true"
|
|
74
|
+
- name: fetch associations
|
|
75
|
+
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
76
|
+
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-associations_ontology-annotated.tsv"
|
|
77
|
+
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv"
|
|
78
|
+
promote: true
|
|
79
|
+
- name: fetch studies
|
|
80
|
+
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
81
|
+
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt"
|
|
82
|
+
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_download_studies.tsv"
|
|
83
|
+
promote: true
|
|
84
|
+
- name: fetch ancestries
|
|
85
|
+
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
86
|
+
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt"
|
|
87
|
+
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv"
|
|
88
|
+
promote: true
|
|
89
|
+
- name: curation study
|
|
90
|
+
requires:
|
|
91
|
+
- fetch studies
|
|
92
|
+
previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
|
|
93
|
+
studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
|
|
94
|
+
destination_template: ./work/curation_{release_date}.tsv
|
|
95
|
+
promote: true
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
The config above defines the steps that are run in parallel by the `otter` framework.
|
|
99
|
+
|
|
100
|
+
</details>
|
|
101
|
+
|
|
102
|
+
### Available tasks
|
|
103
|
+
|
|
104
|
+
The list of tasks (defined in the `config.yaml` file) that can be run are:
|
|
105
|
+
|
|
106
|
+
#### Crawl release metadata
|
|
107
|
+
|
|
108
|
+
```yaml
|
|
109
|
+
- name: crawl release metadata
|
|
110
|
+
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
111
|
+
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json"
|
|
112
|
+
promote: "true"
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
|
|
116
|
+
|
|
117
|
+
> [!NOTE]
|
|
118
|
+
> **Task parameters**
|
|
119
|
+
>
|
|
120
|
+
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
121
|
+
> - The `destination_template` is where the metadata will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. By default it searches for the release directly in the stats_uri json output.
|
|
122
|
+
> - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/gentroutils/latest/stats.json` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
### Fetch associations
|
|
127
|
+
|
|
128
|
+
```yaml
|
|
129
|
+
- name: fetch associations
|
|
130
|
+
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
131
|
+
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-associations_ontology-annotated.tsv"
|
|
132
|
+
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv"
|
|
133
|
+
promote: true
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
|
|
137
|
+
|
|
138
|
+
> [!NOTE]
|
|
139
|
+
> **Task parameters**
|
|
140
|
+
>
|
|
141
|
+
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
142
|
+
> - The `source_template` is the URL of the GWAS Catalog associations file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
143
|
+
> - The `destination_template` is where the associations file will be saved, and it also uses the `{release_date}` placeholder. The release date is fetched from the `stats_uri` endpoint.
|
|
144
|
+
> - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_associations_ontology_annotated.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
### Fetch studies
|
|
149
|
+
|
|
150
|
+
```yaml
|
|
151
|
+
- name: fetch studies
|
|
152
|
+
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
153
|
+
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt"
|
|
154
|
+
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_download_studies.tsv"
|
|
155
|
+
promote: true
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
|
|
159
|
+
|
|
160
|
+
> [!NOTE]
|
|
161
|
+
> **Task parameters**
|
|
162
|
+
>
|
|
163
|
+
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
164
|
+
> - The `source_template` is the URL of the GWAS Catalog studies file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
165
|
+
> - The `destination_template` is where the studies file will be saved, and it also uses the `{release_date}` placeholder. The release date is fetched from the `stats_uri` endpoint.
|
|
166
|
+
> - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
### Fetch ancestries
|
|
171
|
+
|
|
172
|
+
```yaml
|
|
173
|
+
- name: fetch ancestries
|
|
174
|
+
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
175
|
+
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt"
|
|
176
|
+
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv"
|
|
177
|
+
promote: true
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
|
|
181
|
+
|
|
182
|
+
> [!NOTE]
|
|
183
|
+
> **Task parameters**
|
|
184
|
+
>
|
|
185
|
+
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
186
|
+
> - The `source_template` is the URL of the GWAS Catalog ancestries file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
187
|
+
> - The `destination_template` is where the ancestries file will be saved, and it also uses the `{release_date}` placeholder. The release date is fetched from the `stats_uri` endpoint.
|
|
188
|
+
> - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_ancestries.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
### Curation
|
|
193
|
+
|
|
194
|
+
```yaml
|
|
195
|
+
- name: curation study
|
|
196
|
+
requires:
|
|
197
|
+
- fetch studies
|
|
198
|
+
previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
|
|
199
|
+
studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
|
|
200
|
+
destination_template: gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv
|
|
201
|
+
promote: true
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
This task is used to build the GWAS Catalog curation file that is later used as a template for manual curation. It requires the `fetch studies` task to be completed before it can run. This is due to the fact that the curation file is built based on the list of studies fetched from the `download studies` file.
|
|
205
|
+
|
|
206
|
+
> [!NOTE]
|
|
207
|
+
> **Task parameters**
|
|
208
|
+
>
|
|
209
|
+
> - The `requires` field specifies that this task depends on the `fetch studies` task, meaning it will only run after the studies have been fetched.
|
|
210
|
+
> - The `previous_curation` field is used to specify the path to the previous curation file. This is used to build the new curation file based on the previous one.
|
|
211
|
+
> - The `studies` field is the path to the studies file that was fetched in the `fetch studies` task. This file is used to build the curation file.
|
|
212
|
+
> - The `destination_template` is where the curation file will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
213
|
+
> - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## Curation process
|
|
218
|
+
|
|
219
|
+
The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses an R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task automates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
|
|
220
|
+
|
|
221
|
+
The automated process includes:
|
|
222
|
+
|
|
223
|
+
1. Reading the `download studies` file with the list of studies that are currently coming from the latest GWAS Catalog release.
|
|
224
|
+
2. Reading `previous curation` file that contains the list of the curated studies from the previous release.
|
|
225
|
+
3. Comparing the two datasets with following logic:
|
|
226
|
+
- In case the study is present in the `previous curation` and `download studies`, the study is marked as `curated`
|
|
227
|
+
* In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `new`
|
|
228
|
+
* In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
|
|
229
|
+
4. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
|
|
230
|
+
5. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
|
|
231
|
+
6. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Contribute
|
|
236
|
+
|
|
237
|
+
To be able to contribute to the project you need to set it up. This project
|
|
238
|
+
runs on:
|
|
239
|
+
|
|
240
|
+
- [x] python 3.13
|
|
241
|
+
- [x] uv (dependency manager)
|
|
242
|
+
|
|
243
|
+
To set up the project run
|
|
244
|
+
|
|
245
|
+
```{bash}
|
|
246
|
+
make dev
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
The command will install the above dependencies (initial requirements are curl and bash) if not present and
|
|
250
|
+
install all python dependencies listed in `pyproject.toml`. Finally the command will install `pre-commit` hooks
|
|
251
|
+
required to be run before the commit is created.
|
|
252
|
+
|
|
253
|
+
The project has additional `dev` dependencies that include the list of packages used for testing purposes.
|
|
254
|
+
All of the `dev` dependencies are automatically installed by `uv`.
|
|
255
|
+
|
|
256
|
+
To see all available dev commands
|
|
257
|
+
|
|
258
|
+
Run the following command to see all available dev commands
|
|
259
|
+
|
|
260
|
+
```{bash}
|
|
261
|
+
make help
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
### Manual testing of CLI module
|
|
265
|
+
|
|
266
|
+
To check CLI execution manually you need to run
|
|
267
|
+
|
|
268
|
+
```{bash}
|
|
269
|
+
uv run gentroutils
|
|
270
|
+
```
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
This software was developed as part of the Open Targets project. For more
|
|
274
|
+
information please see: http://www.opentargets.org
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
gentroutils/__init__.py,sha256=jPZFw8osF4Ih87CAdLjaaMptfJUyqoSzLa6eNAwsuqA,281
|
|
2
|
+
gentroutils/errors.py,sha256=VhsQe0KMc7MMbuzZxTwi2gATs-zOnnGmpl_H_m4lwNg,1664
|
|
3
|
+
gentroutils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
gentroutils/transfer.py,sha256=qycMPWQi8RRvZZUAsRXmDdxE1crnpZwCuc3d5q4w89M,3889
|
|
5
|
+
gentroutils/io/path/__init__.py,sha256=oBNiw3xr84QAFejVgRIW_PttJ2ZQHZV2uYtIklXlecw,184
|
|
6
|
+
gentroutils/io/path/ftp.py,sha256=kus0GS9Bcm3IzooL8duu7fdtMDr0-Sm7LCtpiUd1Kxs,1585
|
|
7
|
+
gentroutils/io/path/gcs.py,sha256=itEEyUBQLqCBV-F7JF7ZJHUn5OBObmTCsrEfib_V7p8,1514
|
|
8
|
+
gentroutils/io/transfer/__init__.py,sha256=2KasKyCMTpnUvU2SVzZds7ggYBS5ReXn8q3K_TE0YYA,310
|
|
9
|
+
gentroutils/io/transfer/ftp_to_gcs.py,sha256=SGIpxTaQoiIeW2p9JHIMckNWhFpz1Ne4YqwoiYqdWdo,2107
|
|
10
|
+
gentroutils/io/transfer/model.py,sha256=-REhDC8nIT_RoOlRwHpvNfqN_khOm7OnlAi8-U7qX1k,1198
|
|
11
|
+
gentroutils/io/transfer/polars_to_gcs.py,sha256=l1vWDLVw5OuAjPRWEvA69w5V5cz36ARGqCOA9R9iecM,762
|
|
12
|
+
gentroutils/parsers/__init__.py,sha256=HtgvopQ3Xx_cjC2lA3Tp81Rd5-k4CUJGJu0GD7W9r0o,59
|
|
13
|
+
gentroutils/parsers/curation.py,sha256=34GvUTD9SmUpk_CEMoqJbpDyBVW3kKOyMLMiZ4nc9e0,7357
|
|
14
|
+
gentroutils/tasks/__init__.py,sha256=dN9Od2_I504AZjLwBixj04M0h6dmmxGRV7FMwHHEdzM,3537
|
|
15
|
+
gentroutils/tasks/crawl.py,sha256=njCveYWvJz6CHWQfjbciGp57yZcsKuFiZqVWR57_XeU,6838
|
|
16
|
+
gentroutils/tasks/curation.py,sha256=cJZQmaD-44rej-8K6dUW6IlzArZt-YcxtVVTZdKh-Fk,4539
|
|
17
|
+
gentroutils/tasks/fetch.py,sha256=jwqIQ49P--63X1EyKXIUf2iBtv3QRsuNgQWEBwlvgCk,6444
|
|
18
|
+
gentroutils-1.6.0.dev1.dist-info/METADATA,sha256=hD71G_Cz5eFNYbWUXBgaE6hFGwHEx4pHOb4WUXmJd6A,14941
|
|
19
|
+
gentroutils-1.6.0.dev1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
20
|
+
gentroutils-1.6.0.dev1.dist-info/entry_points.txt,sha256=KjODdAGrWKebI3ogqs9r6snAJ_DtLGpm1jREbZ6OXGs,49
|
|
21
|
+
gentroutils-1.6.0.dev1.dist-info/licenses/LICENSE,sha256=8rMKYP7K5vL-KA2WmdBkUBz4iFRveUOWUMAFP8uc3P0,10945
|
|
22
|
+
gentroutils-1.6.0.dev1.dist-info/RECORD,,
|
|
@@ -186,7 +186,7 @@ APPENDIX: How to apply the Apache License to your work.
|
|
|
186
186
|
same "printed page" as the copyright notice for easier
|
|
187
187
|
identification within third-party archives.
|
|
188
188
|
|
|
189
|
-
Copyright
|
|
189
|
+
Copyright 2025 [name of copyright owner]
|
|
190
190
|
|
|
191
191
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
192
192
|
you may not use this file except in compliance with the License.
|
gentroutils/commands/__init__.py
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
"""CLI submodules for gentroutils package."""
|
|
2
|
-
|
|
3
|
-
from gentroutils.commands.update_gwas_curation_metadata import (
|
|
4
|
-
update_gwas_curation_metadata_command,
|
|
5
|
-
)
|
|
6
|
-
from gentroutils.commands.validate_gwas_curation import validate_gwas_curation
|
|
7
|
-
|
|
8
|
-
__all__ = [
|
|
9
|
-
"update_gwas_curation_metadata_command",
|
|
10
|
-
"validate_gwas_curation",
|
|
11
|
-
]
|
|
@@ -1,287 +0,0 @@
|
|
|
1
|
-
"""Update gwas catalog metadata."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import asyncio
|
|
6
|
-
import io
|
|
7
|
-
import logging
|
|
8
|
-
import re
|
|
9
|
-
import sys
|
|
10
|
-
from ftplib import FTP
|
|
11
|
-
from urllib.parse import ParseResult, urlparse
|
|
12
|
-
|
|
13
|
-
import click
|
|
14
|
-
import requests
|
|
15
|
-
from google.cloud import storage
|
|
16
|
-
|
|
17
|
-
from gentroutils.commands.utils import coro
|
|
18
|
-
|
|
19
|
-
logger = logging.getLogger("gentroutils")
|
|
20
|
-
MAX_CONCURRENT_CONNECTIONS = 10
|
|
21
|
-
CURATED_INPUTS = (
|
|
22
|
-
(
|
|
23
|
-
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv",
|
|
24
|
-
"gs://gwas_catalog_inputs/gwas_catalog_associations_ontology_annotated.tsv",
|
|
25
|
-
),
|
|
26
|
-
(
|
|
27
|
-
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt",
|
|
28
|
-
"gs://gwas_catalog_inputs/gwas_catalog_download_studies.tsv",
|
|
29
|
-
),
|
|
30
|
-
(
|
|
31
|
-
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt",
|
|
32
|
-
"gs://gwas_catalog_inputs/gwas_catalog_download_ancestries.tsv",
|
|
33
|
-
),
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@click.command(name="update-gwas-curation-metadata")
|
|
38
|
-
@click.option(
|
|
39
|
-
"--file-to-transfer",
|
|
40
|
-
"-f",
|
|
41
|
-
metavar="<ftp_file|http(s) file> <gcp_file>",
|
|
42
|
-
type=(str, str),
|
|
43
|
-
multiple=True,
|
|
44
|
-
default=CURATED_INPUTS,
|
|
45
|
-
)
|
|
46
|
-
@click.option(
|
|
47
|
-
"--gwas-catalog-release-info-url",
|
|
48
|
-
"-g",
|
|
49
|
-
metavar="<url>",
|
|
50
|
-
default="https://www.ebi.ac.uk/gwas/api/search/stats",
|
|
51
|
-
type=click.STRING,
|
|
52
|
-
)
|
|
53
|
-
@click.pass_context
|
|
54
|
-
@coro
|
|
55
|
-
async def update_gwas_curation_metadata_command(
|
|
56
|
-
ctx: click.Context,
|
|
57
|
-
file_to_transfer: list[tuple[str, str]],
|
|
58
|
-
gwas_catalog_release_info_url: str,
|
|
59
|
-
) -> None:
|
|
60
|
-
"""Update GWAS Catalog metadata directly to cloud bucket.
|
|
61
|
-
|
|
62
|
-
\b
|
|
63
|
-
This is the script to fetch the latest GWAS Catalog data files that include:
|
|
64
|
-
- [x] gwas-catalog-associations_ontology-annotated.tsv - list of associations with ontology annotations by GWAS Catalog
|
|
65
|
-
- [x] gwas-catalog-download-studies-v1.0.3.1.txt - list of published studies by GWAS Catalog
|
|
66
|
-
- [x] gwas-catalog-download-ancestries-v1.0.3.1.txt - list of published studies by GWAS Catalog
|
|
67
|
-
|
|
68
|
-
\b
|
|
69
|
-
By default all GWAS Catalog data files are uploaded from GWAS Catalog FTP server to Open Targets GCP bucket.
|
|
70
|
-
The script also captures the latest release metadata from GWAS Catalog release info url.
|
|
71
|
-
One can overwrite this script to sync data files from FTP or HTTP(s) to GCP bucket. The example usage is as follows:
|
|
72
|
-
|
|
73
|
-
\b
|
|
74
|
-
gentroutils --log-file gs://gwas_catalog_data/curated_inputs/log.txt update-gwas-curation-metadata \\
|
|
75
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv gs://gwas_catalog_data/gwas_catalog_associations_ontology_annotated.tsv \\
|
|
76
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt gs://gwas_catalog_inputs/gwas_catalog_download_studies.tsv \\
|
|
77
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt gs://gwas_catalog_inputs/gwas_catalog_download_ancestries.tsv \\
|
|
78
|
-
-g https://www.ebi.ac.uk/gwas/api/search/stats
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
To preserve the logs from this command, you can specify the log file path using `--log-file` option. The log file can point to local or GCP path.
|
|
82
|
-
Currently only FTP and HTTP(s) protocols are supported for input and GCP protocol is supported for output.
|
|
83
|
-
"""
|
|
84
|
-
# we always want to have the logs from this command uploaded to the target bucket
|
|
85
|
-
logger.debug("Running gwas_curation_update step.")
|
|
86
|
-
dry_run = ctx.obj["dry_run"]
|
|
87
|
-
if len(file_to_transfer) > MAX_CONCURRENT_CONNECTIONS:
|
|
88
|
-
logger.error(
|
|
89
|
-
"File transfer limit exceeded! Max %s connections allowed.",
|
|
90
|
-
MAX_CONCURRENT_CONNECTIONS,
|
|
91
|
-
)
|
|
92
|
-
sys.exit(1)
|
|
93
|
-
uri_map = [{"input": urlparse(ftp_file), "output": urlparse(gcp_file)} for ftp_file, gcp_file in file_to_transfer]
|
|
94
|
-
transfer_tasks = generate_transfer_tasks(uri_map, dry_run)
|
|
95
|
-
|
|
96
|
-
# capture latest release metadata
|
|
97
|
-
with requests.get(gwas_catalog_release_info_url) as response:
|
|
98
|
-
if not response.ok:
|
|
99
|
-
logger.error("Failed to fetch release info.")
|
|
100
|
-
sys.exit(1)
|
|
101
|
-
release_info = response.json()
|
|
102
|
-
for key, value in release_info.items():
|
|
103
|
-
logger.debug("%s: %s", key, value)
|
|
104
|
-
|
|
105
|
-
efo_version = release_info.get("efoversion")
|
|
106
|
-
logger.info("Diseases were mapped to %s EFO release.", efo_version)
|
|
107
|
-
logger.info("EFO version: %s", efo_version)
|
|
108
|
-
ensembl_build = release_info.get("ensemblbuild")
|
|
109
|
-
logger.info("Genes were mapped to v%s Ensembl release.", ensembl_build)
|
|
110
|
-
|
|
111
|
-
results = await asyncio.gather(*transfer_tasks)
|
|
112
|
-
if not dry_run:
|
|
113
|
-
logger.info("Transferred %s files.", len(results))
|
|
114
|
-
logger.info("gwas_curation_update step completed.")
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def generate_transfer_tasks(uri_map: list[dict[str, ParseResult]], dry_run: bool) -> list[asyncio.Task[None]]:
|
|
118
|
-
"""Generate transfer tasks.
|
|
119
|
-
|
|
120
|
-
Args:
|
|
121
|
-
uri_map (list[dict[str, ParseResult]]): list of transferable tasks, each should have `input` and `output` keys.
|
|
122
|
-
dry_run (bool): dry run flag.
|
|
123
|
-
|
|
124
|
-
Returns:
|
|
125
|
-
list[asyncio.Task[None]]: list of asyncio tasks.
|
|
126
|
-
"""
|
|
127
|
-
ftp_transfer_list = []
|
|
128
|
-
http_transfer_list = []
|
|
129
|
-
for uri in uri_map:
|
|
130
|
-
if uri["input"].scheme != "ftp" and not uri["input"].scheme.startswith("http"):
|
|
131
|
-
logger.error("Only FTP and HTTP(s) protocols is supported at input.")
|
|
132
|
-
sys.exit(1)
|
|
133
|
-
if uri["output"].scheme != "gs":
|
|
134
|
-
logger.error("Only GCP protocol is supported at output.")
|
|
135
|
-
sys.exit(1)
|
|
136
|
-
in_server = uri["input"].netloc
|
|
137
|
-
out_server = uri["output"].netloc
|
|
138
|
-
in_prefix = "/".join(uri["input"].path.strip("/").split("/")[:-1])
|
|
139
|
-
in_file = uri["input"].path.strip("/").split("/")[-1]
|
|
140
|
-
out_prefix = "/".join(uri["output"].path[1:-1].split("/")[:-1])
|
|
141
|
-
out_bucket = uri["output"].path.split("/")[-1]
|
|
142
|
-
if uri["input"].scheme == "ftp":
|
|
143
|
-
ftp_transfer_list.append(
|
|
144
|
-
{
|
|
145
|
-
"ftp_server": in_server,
|
|
146
|
-
"ftp_prefix": in_prefix,
|
|
147
|
-
"ftp_filename": in_file,
|
|
148
|
-
"gcp_bucket": out_server,
|
|
149
|
-
"gcp_prefix": out_prefix,
|
|
150
|
-
"gcp_filename": out_bucket,
|
|
151
|
-
}
|
|
152
|
-
)
|
|
153
|
-
if uri["input"].scheme.startswith("http"):
|
|
154
|
-
http_transfer_list.append(
|
|
155
|
-
{
|
|
156
|
-
"http_url": uri["input"].geturl(),
|
|
157
|
-
"gcp_bucket": out_server,
|
|
158
|
-
"gcp_prefix": out_prefix,
|
|
159
|
-
"gcp_filename": out_bucket,
|
|
160
|
-
}
|
|
161
|
-
)
|
|
162
|
-
transfer_tasks = []
|
|
163
|
-
for transfer_obj in ftp_transfer_list:
|
|
164
|
-
transfer_tasks.append(
|
|
165
|
-
asyncio.create_task(
|
|
166
|
-
sync_from_ftp_to_gcp(
|
|
167
|
-
transfer_obj["ftp_server"],
|
|
168
|
-
transfer_obj["ftp_prefix"],
|
|
169
|
-
transfer_obj["ftp_filename"],
|
|
170
|
-
transfer_obj["gcp_bucket"],
|
|
171
|
-
transfer_obj["gcp_prefix"],
|
|
172
|
-
transfer_obj["gcp_filename"],
|
|
173
|
-
dry_run=dry_run,
|
|
174
|
-
)
|
|
175
|
-
)
|
|
176
|
-
)
|
|
177
|
-
|
|
178
|
-
for transfer_obj in http_transfer_list:
|
|
179
|
-
transfer_tasks.append(
|
|
180
|
-
asyncio.create_task(
|
|
181
|
-
sync_from_http_to_gcp(
|
|
182
|
-
transfer_obj["http_url"],
|
|
183
|
-
transfer_obj["gcp_bucket"],
|
|
184
|
-
transfer_obj["gcp_prefix"],
|
|
185
|
-
transfer_obj["gcp_filename"],
|
|
186
|
-
dry_run=dry_run,
|
|
187
|
-
)
|
|
188
|
-
)
|
|
189
|
-
)
|
|
190
|
-
|
|
191
|
-
return transfer_tasks
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
async def sync_from_http_to_gcp(url: str, gcp_bucket: str, gcp_prefix: str, gcp_file: str, *, dry_run: bool = True) -> None:
|
|
195
|
-
"""Sync file from HTTP and upload to GCP.
|
|
196
|
-
|
|
197
|
-
This function fetches the data from the provided HTTP URL and uploads the content
|
|
198
|
-
directly to provided GCP bucket blob.
|
|
199
|
-
|
|
200
|
-
Args:
|
|
201
|
-
url (str): HTTP URL to fetch the data.
|
|
202
|
-
gcp_bucket (str): GCP bucket name.
|
|
203
|
-
gcp_prefix (str): GCP prefix.
|
|
204
|
-
gcp_file (str): GCP file name.
|
|
205
|
-
dry_run (bool, optional): Dry run flag. Defaults to True.
|
|
206
|
-
"""
|
|
207
|
-
if dry_run:
|
|
208
|
-
logger.info(
|
|
209
|
-
"Attempting to transfer data from %s to gs://%s/%s/%s.",
|
|
210
|
-
url,
|
|
211
|
-
gcp_bucket,
|
|
212
|
-
gcp_prefix,
|
|
213
|
-
gcp_file,
|
|
214
|
-
)
|
|
215
|
-
return
|
|
216
|
-
logger.info("Retriving data from: %s.", url)
|
|
217
|
-
response = requests.get(url)
|
|
218
|
-
if not response.ok:
|
|
219
|
-
logger.error("Failed to fetch data from %s.", url)
|
|
220
|
-
return
|
|
221
|
-
|
|
222
|
-
content = response.content
|
|
223
|
-
bucket = storage.Client().bucket(gcp_bucket)
|
|
224
|
-
gcp_path = f"{gcp_prefix}/{gcp_file}" if gcp_prefix else gcp_file
|
|
225
|
-
|
|
226
|
-
blob = bucket.blob(gcp_path)
|
|
227
|
-
logger.info("Uploading the data to: gs://%s/%s.", gcp_bucket, gcp_path)
|
|
228
|
-
blob.upload_from_string(content.decode("utf-8"))
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
async def sync_from_ftp_to_gcp(
|
|
232
|
-
ftp_server: str,
|
|
233
|
-
ftp_prefix: str,
|
|
234
|
-
ftp_file: str,
|
|
235
|
-
gcp_bucket: str,
|
|
236
|
-
gcp_prefix: str,
|
|
237
|
-
gcp_file: str,
|
|
238
|
-
*,
|
|
239
|
-
dry_run: bool = True,
|
|
240
|
-
) -> None:
|
|
241
|
-
"""Fetch files from FTP and upload to GCP.
|
|
242
|
-
|
|
243
|
-
This function fetches the data from the provided FTP server and uploads the content directly
|
|
244
|
-
to the provided GCP bucket blob.
|
|
245
|
-
|
|
246
|
-
Args:
|
|
247
|
-
ftp_server (str): FTP server.
|
|
248
|
-
ftp_prefix (str): FTP prefix.
|
|
249
|
-
ftp_file (str): FTP file name.
|
|
250
|
-
gcp_bucket (str): GCP bucket name.
|
|
251
|
-
gcp_prefix (str): GCP prefix.
|
|
252
|
-
gcp_file (str): GCP file name.
|
|
253
|
-
dry_run (bool, optional): Dry run flag. Defaults to True.
|
|
254
|
-
|
|
255
|
-
"""
|
|
256
|
-
if dry_run:
|
|
257
|
-
logger.info(
|
|
258
|
-
"Attempting to transfer data from ftp://%s/%s/%s to gs://%s/%s/%s.",
|
|
259
|
-
ftp_server,
|
|
260
|
-
ftp_prefix,
|
|
261
|
-
ftp_file,
|
|
262
|
-
gcp_bucket,
|
|
263
|
-
gcp_prefix,
|
|
264
|
-
gcp_file,
|
|
265
|
-
)
|
|
266
|
-
return
|
|
267
|
-
with FTP() as ftp:
|
|
268
|
-
ftp.connect(ftp_server)
|
|
269
|
-
ftp.login()
|
|
270
|
-
bucket = storage.Client().bucket(gcp_bucket)
|
|
271
|
-
gcp_path = f"{gcp_prefix}/{gcp_file}" if gcp_prefix else gcp_file
|
|
272
|
-
blob = bucket.blob(gcp_path)
|
|
273
|
-
logger.info("Changing directory to %s.", ftp_prefix)
|
|
274
|
-
ftp.cwd(ftp_prefix)
|
|
275
|
-
dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", ftp.pwd())
|
|
276
|
-
if dir_match:
|
|
277
|
-
logger.info("Found release date!: %s", dir_match.group("release_date"))
|
|
278
|
-
buffer = io.BytesIO()
|
|
279
|
-
logger.info("Retrieving data from: ftp://%s/%s/%s.", ftp_server, ftp_prefix, ftp_file)
|
|
280
|
-
ftp.retrbinary(f"RETR {ftp_file}", lambda x: buffer.write(x))
|
|
281
|
-
content = buffer.getvalue().decode("utf-8")
|
|
282
|
-
buffer.close()
|
|
283
|
-
logger.info("Uploading data to: gs://%s/%s.", gcp_bucket, gcp_path)
|
|
284
|
-
blob.upload_from_string("".join(content))
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
__all__ = ["update_gwas_curation_metadata_command"]
|