gentroutils 3.1.0__tar.gz → 4.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gentroutils-3.1.0 → gentroutils-4.0.0}/CHANGELOG.md +66 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/PKG-INFO +16 -12
- {gentroutils-3.1.0 → gentroutils-4.0.0}/README.md +13 -9
- gentroutils-4.0.0/config.yaml +41 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/pyproject.toml +3 -4
- gentroutils-4.0.0/src/gentroutils/io/transfer/ftp_to_gcs.py +143 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/polars_to_gcs.py +1 -1
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/parsers/curation.py +88 -8
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/tasks/curation.py +9 -1
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/transfer/test_ftp_to_gcs.py +52 -1
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/transfer/test_polars_to_gcs.py +8 -6
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/parsers/test_curation.py +128 -26
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/tasks/test_crawl_task.py +2 -2
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/tasks/test_curation_task.py +11 -4
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/tasks/test_fetch_task.py +17 -13
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/test_transfer.py +21 -15
- gentroutils-4.0.0/uv.lock +2344 -0
- gentroutils-3.1.0/config.yaml +0 -40
- gentroutils-3.1.0/src/gentroutils/io/transfer/ftp_to_gcs.py +0 -61
- gentroutils-3.1.0/uv.lock +0 -2132
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/build.yaml +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/labeler.yaml +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/pr.yaml +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/release.yaml +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/release_pr.yaml +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/tag.yaml +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.gitignore +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.pre-commit-config.yaml +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.vscode/extensions.json +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/.vscode/settings.json +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/Dockerfile +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/LICENSE +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/Makefile +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/commitlint.config.js +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/conftest.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/docs/00_prepare_tables_for_curation.R +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/docs/gwas_catalog_curation.md +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/setup.sh +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/__init__.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/errors.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/path/__init__.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/path/ftp.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/path/gcs.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/__init__.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/model.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/parsers/__init__.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/py.typed +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/tasks/__init__.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/tasks/crawl.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/tasks/fetch.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/transfer.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/ftp/test/databases/gwas/summary_statistics/harmonised_list.txt +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/gsutil_list.txt +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/correct_curation.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_analysisFlag_type.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_analysisFlag_value.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_columns_curation.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_publicationTitle_type.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_pubmedId_type.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyId_type.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyId_value.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyType_type.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyType_value.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_traitFromSource_type.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/non_unique_studyId.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/null_value_in_studyId.tsv +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/test.h.tsv.gz +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/conftest.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/path/conftest.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/path/test_ftp.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/path/test_gcs.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/transfer/conftest.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/transfer/test_model.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/parsers/conftest.py +0 -0
- {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/tasks/conftest.py +0 -0
{gentroutils-3.1.0 → gentroutils-4.0.0}/CHANGELOG.md

@@ -1,6 +1,72 @@
 # CHANGELOG
 
 
+## v4.0.0 (2026-02-03)
+
+
+## v4.0.0-dev.1 (2026-02-03)
+
+### Features
+
+- Updete dependencies
+  ([`b6af4d2`](https://github.com/opentargets/gentroutils/commit/b6af4d28605e7c687f5ec15cae7187c64e834cb0))
+
+
+## v3.2.0 (2026-02-03)
+
+### Chores
+
+- Update uv lock
+  ([`6f13fc0`](https://github.com/opentargets/gentroutils/commit/6f13fc0055ee9a49a215166d3cccb31747602a4f))
+
+
+## v3.2.0-dev.2 (2026-02-03)
+
+### Bug Fixes
+
+- Output tsv file instead of csv
+  ([`aff71b1`](https://github.com/opentargets/gentroutils/commit/aff71b16b6c4d273cc851050a793d3798bae27ac))
+
+- Test
+  ([`f9dd890`](https://github.com/opentargets/gentroutils/commit/f9dd890efc32ab969fbcd14eb0da14e40678e8fb))
+
+- Test for curation
+  ([`b853358`](https://github.com/opentargets/gentroutils/commit/b85335815d7a22745c61404f81b612a14cce06d5))
+
+- Test for curation
+  ([`22138ab`](https://github.com/opentargets/gentroutils/commit/22138ab31f7551a4b161f6f1885b7975d57a0ac7))
+
+### Chores
+
+- Cleanup
+  ([`68a3f66`](https://github.com/opentargets/gentroutils/commit/68a3f6607a4a1b61441c1369f2a9d3b4babec30c))
+
+- Fix glob pattern
+  ([`404b8ca`](https://github.com/opentargets/gentroutils/commit/404b8ca71b95764529ebb3df7c39881a0a12ff5e))
+
+- Handle mutliple sumstat files
+  ([`1fc8902`](https://github.com/opentargets/gentroutils/commit/1fc8902171a8f6edac407b790c3bcbe691792f96))
+
+- Update
+  ([`e69575b`](https://github.com/opentargets/gentroutils/commit/e69575b5a6c802b78b959314b84348d7969eeaeb))
+
+- Update readme
+  ([`12f274c`](https://github.com/opentargets/gentroutils/commit/12f274c5158b3986ba2511791fc2289b24d9aa40))
+
+
+## v3.2.0-dev.1 (2025-11-05)
+
+### Chores
+
+- Uncomment config
+  ([`30c4d68`](https://github.com/opentargets/gentroutils/commit/30c4d68e79a35d2c5c83cd17a15f63906ef834d6))
+
+### Features
+
+- **associations**: Allow zip file transfer from ftp
+  ([`662a635`](https://github.com/opentargets/gentroutils/commit/662a63593cd5f340a768974041461cc65e1566b9))
+
+
 ## v3.1.0 (2025-09-02)
 
 ### Chores
{gentroutils-3.1.0 → gentroutils-4.0.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gentroutils
-Version:
+Version: 4.0.0
 Summary: Open Targets python genetics utility CLI tools
 Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
 License-Expression: Apache-2.0
@@ -12,13 +12,13 @@ Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: Unix
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
-Requires-Python:
+Requires-Python: <=3.13,>3.11
 Requires-Dist: aioftp>=0.25.1
 Requires-Dist: aiohttp>=3.11.18
 Requires-Dist: gcsfs>=2025.7.0
 Requires-Dist: google-cloud-storage>=3.1.1
 Requires-Dist: loguru>=0.7.3
-Requires-Dist: opentargets-otter>=25.0.
+Requires-Dist: opentargets-otter>=25.0.15
 Requires-Dist: polars[fsspec]>=1.31.0
 Requires-Dist: pydantic>=2.10.6
 Requires-Dist: tqdm>=4.67.1
@@ -99,6 +99,7 @@ steps:
       previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
       studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
       destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
+      summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
       promote: true
 ```
 
@@ -164,7 +165,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
 
 This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
 
-> [!NOTE]
+> [!NOTE]
 > **Task parameters**
 >
 > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -186,7 +187,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
 
 This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
 
-> [!NOTE]
+> [!NOTE]
 > **Task parameters**
 >
 > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -205,6 +206,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
       previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
       studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
       destination_template: gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv
+      summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
       promote: true
 ```
 
@@ -218,24 +220,26 @@ This task is used to build the GWAS Catalog curation file that is later used as
 > - The `studies` field is the path to the studies file that was fetched in the `fetch studies` task. This file is used to build the curation file.
 > - The `destination_template` is where the curation file will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
 > - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
+> The `summary_statistics_glob` field is used to specify the glob pattern to list all synced summary statistics files from GCS. This is used to identify which studies have summary statistics available.
 
 ---
 
 ## Curation process
 
-The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task
+The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task automates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
 
 The automated process includes:
 
 1. Reading `download studies` file with the list of studies that are currently comming from the latest GWAS Catalog release.
 2. Reading `previous curation` file that contains the list of the curated studies from the previous release.
-3.
+3. Listing all synced summary statistics files from the `summary_statistics_glob` parameter to identify which studies have summary statistics available. Note that this can be more then the list of studies in the `download studies` file as syncing also involves the unpublished studies.
+4. Comparing the three datasets with following logic:
 - In case the study is present in the `previous curation` and `download studies`, the study is marked as `curated`
-
-
-
-
-
+- In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `to_curate` or `has_no_sumstats` depending on the presence of summary statistics files
+- In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
+5. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
+6. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
+7. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
 
 ---
 
{gentroutils-3.1.0 → gentroutils-4.0.0}/README.md

@@ -73,6 +73,7 @@ steps:
       previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
       studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
       destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
+      summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
       promote: true
 ```
 
@@ -138,7 +139,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
 
 This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
 
-> [!NOTE]
+> [!NOTE]
 > **Task parameters**
 >
 > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -160,7 +161,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
 
 This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
 
-> [!NOTE]
+> [!NOTE]
 > **Task parameters**
 >
 > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -179,6 +180,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
       previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
       studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
       destination_template: gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv
+      summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
       promote: true
 ```
 
@@ -192,24 +194,26 @@ This task is used to build the GWAS Catalog curation file that is later used as
 > - The `studies` field is the path to the studies file that was fetched in the `fetch studies` task. This file is used to build the curation file.
 > - The `destination_template` is where the curation file will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
 > - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
+> The `summary_statistics_glob` field is used to specify the glob pattern to list all synced summary statistics files from GCS. This is used to identify which studies have summary statistics available.
 
 ---
 
 ## Curation process
 
-The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task
+The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task automates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
 
 The automated process includes:
 
 1. Reading `download studies` file with the list of studies that are currently comming from the latest GWAS Catalog release.
 2. Reading `previous curation` file that contains the list of the curated studies from the previous release.
-3.
+3. Listing all synced summary statistics files from the `summary_statistics_glob` parameter to identify which studies have summary statistics available. Note that this can be more then the list of studies in the `download studies` file as syncing also involves the unpublished studies.
+4. Comparing the three datasets with following logic:
 - In case the study is present in the `previous curation` and `download studies`, the study is marked as `curated`
-
-
-
-
-
+- In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `to_curate` or `has_no_sumstats` depending on the presence of summary statistics files
+- In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
+5. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
+6. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
+7. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
 
 ---
 
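The comparison logic described in the README change above maps directly onto polars joins. Below is a minimal sketch of the three-way comparison with toy frames, using the status values from the new `CuratedStudyStatus` enum; this is an illustration, not the package's exact code.

```python
import polars as pl

previous = pl.DataFrame({"studyId": ["GCST1", "GCST2"]})              # previous curation
downloaded = pl.DataFrame({"studyId": ["GCST2", "GCST3", "GCST4"]})   # download studies
synced = pl.DataFrame({"studyId": ["GCST3"], "isSynced": [True]})     # synced sumstats

# Present in both previous curation and download studies -> curated.
curated = downloaded.join(previous, on="studyId", how="semi").with_columns(
    pl.lit("curated").alias("status")
)
# Present in download studies only -> to_curate, or no_summary_statistics
# when no synced summary statistics file exists for the study.
new = (
    downloaded.join(previous, on="studyId", how="anti")
    .join(synced, on="studyId", how="left")
    .with_columns(
        pl.when(pl.col("isSynced").is_null())
        .then(pl.lit("no_summary_statistics"))
        .otherwise(pl.lit("to_curate"))
        .alias("status")
    )
    .drop("isSynced")
)
# Present in previous curation only -> removed.
removed = previous.join(downloaded, on="studyId", how="anti").with_columns(
    pl.lit("removed").alias("status")
)

# GCST2 curated, GCST3 to_curate, GCST4 no_summary_statistics, GCST1 removed.
print(pl.concat([curated, new, removed]))
```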
gentroutils-4.0.0/config.yaml

@@ -0,0 +1,41 @@
+---
+work_path: ./work
+log_level: DEBUG
+scratchpad:
+  gc_stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
+  gc_bucket: "gs://gwas_catalog_inputs"
+  gc_ftp: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases"
+
+steps:
+  gwas_catalog_release:
+    - name: crawl release metadata
+      stats_uri: ${gc_stats_uri}
+      destination_template: '${gc_bucket}/gentroutils/{release_date}/stats.json'
+      promote: true
+
+    - name: fetch studies
+      stats_uri: ${gc_stats_uri}
+      source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt'
+      destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_studies.tsv'
+      promote: true
+
+    - name: fetch ancestries
+      stats_uri: ${gc_stats_uri}
+      source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt'
+      destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv'
+      promote: true
+
+    - name: fetch associations
+      stats_uri: ${gc_stats_uri}
+      source_template: '${gc_ftp}/{release_date}/gwas-catalog-associations_ontology-annotated-full.zip'
+      destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv'
+      promote: true
+
+    - name: curation study
+      requires:
+        - fetch studies
+      previous_curation: '${gc_bucket}/curation/latest/curated/GWAS_Catalog_study_curation.tsv'
+      studies: '${gc_bucket}/gentroutils/latest/gwas_catalog_download_studies.tsv'
+      summary_statistics_glob: '${gc_bucket}/raw_summary_statistics/**.h.tsv.gz'
+      destination_template: '${gc_bucket}/curation/{release_date}/raw/GWAS_Catalog_study_curation.tsv'
+      promote: true
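Two placeholder styles appear in this config: `${...}` references resolve against the `scratchpad` block, while `{release_date}` is filled per run once the release date is known from the stats endpoint. The actual resolution is handled by the opentargets-otter framework; a rough sketch of the effect, assuming plain string substitution:

```python
# Rough sketch of the two-stage placeholder resolution assumed above
# (plain string substitution; the real mechanism belongs to otter).
scratchpad = {
    "gc_bucket": "gs://gwas_catalog_inputs",
    "gc_stats_uri": "https://www.ebi.ac.uk/gwas/api/search/stats",
}
template = "${gc_bucket}/gentroutils/{release_date}/stats.json"

for key, value in scratchpad.items():  # stage 1: expand ${...} variables
    template = template.replace("${" + key + "}", value)

resolved = template.format(release_date="2026/02/03")  # stage 2: per-run value
print(resolved)  # gs://gwas_catalog_inputs/gentroutils/2026/02/03/stats.json
```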
{gentroutils-3.1.0 → gentroutils-4.0.0}/pyproject.toml

@@ -1,7 +1,7 @@
 [project]
 authors = [{ name = "Szymon Szyszkowski", email = "ss60@sanger.ac.uk" }]
 name = "gentroutils"
-version = "
+version = "4.0.0"
 description = "Open Targets python genetics utility CLI tools"
 dependencies = [
     "aiohttp>=3.11.18",
@@ -10,12 +10,12 @@ dependencies = [
     "pydantic>=2.10.6",
     "loguru>=0.7.3",
     "tqdm>=4.67.1",
-    "opentargets-otter>=25.0.
+    "opentargets-otter>=25.0.15",
     "google-cloud-storage>=3.1.1",
     "gcsfs>=2025.7.0",
 ]
 readme = "README.md"
-requires-python = "
+requires-python = ">3.11,<=3.13"
 license = "Apache-2.0"
 classifiers = [
     "Development Status :: 3 - Alpha",
@@ -50,7 +50,6 @@ dev = [
     "gcloud-storage-emulator>=0.5.0",
     "types-requests>=2.32.0.20240712",
     "pyftpdlib>=2.0.1",
-    "python-semantic-release>=9.19.1",
     "pandas-stubs>=2.2.3.250308",
     "ipython>=8.36.0",
     "pytest-asyncio>=1.1.0",
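The `requires-python` field is now pinned to `>3.11,<=3.13`, matching the single `Programming Language :: Python :: 3.13` classifier. The constraint is a standard PEP 440 specifier set, which can be sanity-checked with the `packaging` library (illustrative only):

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">3.11,<=3.13")
print("3.12" in spec)  # True
print("3.13" in spec)  # True, allowed by <=3.13
print("3.10" in spec)  # False, excluded by >3.11
```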
gentroutils-4.0.0/src/gentroutils/io/transfer/ftp_to_gcs.py

@@ -0,0 +1,143 @@
+"""Transfer files from FTP to Google Cloud Storage (GCS)."""
+
+import asyncio
+import io
+import re
+from typing import Annotated
+
+import aioftp
+from google.cloud import storage
+from loguru import logger
+from pydantic import AfterValidator
+
+from gentroutils.io.path import FTPPath, GCSPath
+from gentroutils.io.transfer.model import TransferableObject
+
+
+class FTPtoGCPTransferableObject(TransferableObject):
+    """A class to represent an object that can be transferred from FTP to GCP."""
+
+    source: Annotated[str, AfterValidator(lambda x: str(FTPPath(x)))]
+    destination: Annotated[str, AfterValidator(lambda x: str(GCSPath(x)))]
+
+    async def transfer(self) -> None:
+        """Transfer files from FTP to GCP.
+
+        This function fetches the data for the file provided in the local FTP path, collects the
+        data asynchronously to buffer, and uploads it to the provided GCP bucket blob.
+
+        Implements retry logic with exponential backoff for handling transient network errors.
+        """
+        max_retries = 3
+        retry_delay = 1  # Initial delay in seconds
+
+        for attempt in range(max_retries):
+            try:
+                await self._perform_transfer()
+                return  # Success, exit the retry loop
+            except (ConnectionResetError, OSError, aioftp.errors.AIOFTPException) as e:
+                if attempt < max_retries - 1:
+                    wait_time = retry_delay * (2**attempt)  # Exponential backoff
+                    logger.warning(
+                        f"Transfer attempt {attempt + 1}/{max_retries} failed for {self.source}: {e}. "
+                        f"Retrying in {wait_time}s..."
+                    )
+                    await asyncio.sleep(wait_time)
+                else:
+                    logger.error(f"Transfer failed after {max_retries} attempts for {self.source}: {e}")
+                    raise
+            except Exception as e:
+                # For non-retryable exceptions, log and raise immediately
+                logger.error(f"Non-retryable error during transfer from {self.source} to {self.destination}: {e}")
+                raise
+
+    async def _perform_transfer(self) -> None:
+        """Perform the actual transfer operation.
+
+        This is separated from the transfer method to allow for retry logic.
+        """
+        logger.info(f"Attempting to transfer data from {self.source} to {self.destination}.")
+        gcs_obj = GCSPath(self.destination)
+        ftp_obj = FTPPath(self.source)
+
+        async with aioftp.Client.context(ftp_obj.server, user="anonymous", password="anonymous") as ftp:  # noqa: S106
+            bucket = storage.Client().bucket(gcs_obj.bucket)
+            blob = bucket.blob(gcs_obj.object)
+            logger.info(f"Searching for the release date in the provided ftp path: {ftp_obj.base_dir}.")
+            dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(ftp_obj.base_dir))
+
+            if dir_match:
+                logger.info(f"Found release date to search in the ftp {dir_match.group('release_date')}.")
+                release_date = dir_match.group("release_date")
+                try:
+                    logger.debug(f"We are in the directory: {await ftp.get_current_directory()}")
+                    logger.debug(f"Changing directory to: {ftp_obj.base_dir}")
+                    await ftp.change_directory(ftp_obj.base_dir)
+                    logger.success(f"Successfully changed directory to: {ftp_obj.base_dir}")
+                except aioftp.StatusCodeError as e:
+                    logger.warning(f"Failed to change directory to {ftp_obj.base_dir}: {e}")
+                    logger.warning(f"Probably the release date {release_date} is out of sync with the api endpoint.")
+                    try:
+                        logger.warning("Attempting to load the `latest` release.")
+                        ftp_obj = FTPPath(self.source.replace(release_date, "latest"))
+                        await ftp.change_directory(ftp_obj.base_dir)
+                        logger.success(f"Successfully changed directory to: {ftp_obj.base_dir}")
+
+                    except aioftp.StatusCodeError as e:
+                        logger.error(f"Failed to find the latest release under {ftp_obj}")
+                        raise
+
+                logger.debug("Creating in-memory buffer to store downloaded data.")
+                buffer = io.BytesIO()
+                logger.debug(f"Downloading data from FTP path: {ftp_obj.filename}")
+                stream = await ftp.download_stream(ftp_obj.filename)
+                logger.info("Successfully connected to the FTP stream, beginning data transfer to buffer.")
+                async with stream:
+                    async for block in stream.iter_by_block():
+                        buffer.write(block)
+                buffer.seek(0)
+                if ftp_obj.filename.endswith(".zip"):
+                    logger.info("Uploading zipped content to GCS blob.")
+                    logger.info("Unzipping content before upload.")
+                    content = unzip_buffer(buffer)
+                    blob.upload_from_string(content)
+                else:
+                    content = buffer.getvalue()
+                    buffer.close()
+                    blob.upload_from_string(content)
+
+            else:
+                logger.error(f"Failed to extract release date from the provided ftp path: {ftp_obj.base_dir}.")
+                raise ValueError("Release date could not be extracted from the FTP path.")
+
+
+def unzip_buffer(buffer: io.BytesIO) -> bytes:
+    """Unzip a BytesIO buffer and return a dictionary of file names to their content.
+
+    Args:
+        buffer (io.BytesIO): The in-memory buffer containing zipped data.
+
+    Returns:
+        bytes: The unzipped content of the single file.
+
+    Raises:
+        ValueError: If multiple files are found in the zipped buffer or if no files are found.
+    """
+    import zipfile
+
+    unzipped_files: dict[str, bytes] = {}
+    with zipfile.ZipFile(buffer) as z:
+        for file_info in z.infolist():
+            with z.open(file_info) as unzipped_file:
+                unzipped_files[file_info.filename] = unzipped_file.read()
+
+    if len(unzipped_files) == 0:
+        logger.error("No files were found in the zipped buffer.")
+        raise ValueError("No files were found in the zipped buffer.")
+    if len(unzipped_files) != 1:
+        logger.error("Multiple files were found in the zipped buffer.")
+        raise ValueError("Multiple files were found in the zipped buffer.")
+    keys = list(unzipped_files.keys())
+    logger.info(f"Unzipped file: {keys[0]} with size {len(unzipped_files[keys[0]])} bytes.")
+
+    return unzipped_files[keys[0]]
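The new `transfer` method wraps `_perform_transfer` in three attempts with exponential backoff: a failed attempt waits 1 s, then 2 s, then the last failure is re-raised. The same pattern, reduced to a standalone coroutine for clarity (a sketch, independent of the FTP and GCS specifics):

```python
import asyncio


async def with_backoff(operation, max_retries: int = 3, retry_delay: float = 1.0):
    """Retry an async operation with exponential backoff (1 s, 2 s, ... between attempts)."""
    for attempt in range(max_retries):
        try:
            return await operation()
        except (ConnectionResetError, OSError) as exc:
            if attempt == max_retries - 1:
                raise  # out of attempts, surface the last error
            wait_time = retry_delay * (2**attempt)
            print(f"Attempt {attempt + 1} failed ({exc}); retrying in {wait_time}s")
            await asyncio.sleep(wait_time)

# usage: asyncio.run(with_backoff(some_async_operation))
```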
{gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/polars_to_gcs.py

@@ -16,5 +16,5 @@ class PolarsDataFrameToGCSTransferableObject(TransferableObject):
         """Transfer the Polars DataFrame to the specified GCS destination."""
         # Convert Polars DataFrame to CSV and upload to GCS
         logger.info(f"Transferring Polars DataFrame to {self.destination}.")
-        self.source.write_csv(self.destination)
+        self.source.write_csv(self.destination, separator="\t", include_header=True)
         logger.info(f"Uploading DataFrame to {self.destination}")
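This one-line change is what the `Output tsv file instead of csv` changelog entry refers to: polars' `write_csv` writes whatever `separator` it is given regardless of the file extension, and `include_header=True` is spelled out here even though it is the default. For example:

```python
import polars as pl

df = pl.DataFrame({"studyId": ["GCST1"], "status": ["curated"]})
# The separator argument, not the .tsv extension, makes the output tab-separated.
df.write_csv("example.tsv", separator="\t", include_header=True)
```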
{gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/parsers/curation.py

@@ -5,6 +5,7 @@ from __future__ import annotations
 from enum import StrEnum
 
 import polars as pl
+from google.cloud.storage import Client
 from loguru import logger
 
 from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
@@ -69,31 +70,102 @@ class DownloadStudiesSchema(StrEnum):
         return [member.value for member in cls]
 
 
+class SyncedSummaryStatisticsSchema(StrEnum):
+    """Enum to define the columns for synced summary statistics."""
+
+    FILE_PATH = "filePath"
+    """The GCS file path of the summary statistics file."""
+    SYNCED = "isSynced"
+    """Flag indicating whether the file has been synced."""
+    STUDY_ID = "studyId"
+    """The unique identifier for a study."""
+
+    @classmethod
+    def columns(cls) -> list[str]:
+        """Get the list of columns defined in the schema."""
+        return [member.value for member in cls]
+
+
 class CuratedStudyStatus(StrEnum):
     """Enum to define the status of a curated study."""
 
     REMOVED = "removed"
     """The study has been removed from the GWAS Catalog."""
-
-    """The study is new
+    TO_CURATE = "to_curate"
+    """The study is new and needs to be curated."""
     CURATED = "curated"
     """The study has been curated and is still in the GWAS Catalog."""
+    NO_SUMSTATS = "no_summary_statistics"
+    """The study has no associated summary statistics."""
+
+
+class GCSSummaryStatisticsFileCrawler:
+    """Class to crawl GCS for summary statistics files."""
+
+    def __init__(self, gcs_glob: str):
+        """Initialize the GCSSummaryStatisticsFileCrawler with a GCS glob pattern."""
+        self.gcs_glob = gcs_glob
+        logger.debug("Initialized GCSSummaryStatisticsFileCrawler with glob: {}", gcs_glob)
+
+    def _fetch_paths(self) -> list[str]:
+        """Fetch file paths from GCS based on the glob pattern."""
+        # Implementation to fetch file paths from GCS
+        c = Client()
+        bucket_name = self.gcs_glob.split("/")[2]
+        prefix = "/".join(self.gcs_glob.split("/")[3:-1])
+        suffix = self.gcs_glob.split("/")[-1].replace("*", "")
+        logger.debug("Crawling GCS bucket: {}, prefix: {}, suffix: {}", bucket_name, prefix, suffix)
+        bucket = c.bucket(bucket_name)
+        blobs = bucket.list_blobs(prefix=prefix)
+        return [f"gs://{bucket_name}/{blob.name}" for blob in blobs if blob.name.endswith(suffix)]
+
+    def crawl(self) -> pl.DataFrame:
+        """Crawl GCS and return a DataFrame of summary statistics files."""
+        # Implementation to crawl GCS and return a DataFrame
+        file_paths = self._fetch_paths()
+        logger.debug("Found {} summary statistics files.", len(file_paths))
+        data = pl.DataFrame({
+            SyncedSummaryStatisticsSchema.FILE_PATH: file_paths,
+            SyncedSummaryStatisticsSchema.SYNCED: [True] * len(file_paths),
+        }).with_columns(
+            pl.col(SyncedSummaryStatisticsSchema.FILE_PATH)
+            .str.extract(r"\/(GCST\d+)\/", 1)
+            .alias(SyncedSummaryStatisticsSchema.STUDY_ID)
+        )
+        # Post check to find if there are any studies with multiple files.
+        multi_files = data.group_by(SyncedSummaryStatisticsSchema.STUDY_ID).len().filter(pl.col("len") > 1)
+        if not multi_files.is_empty():
+            logger.warning("Studies with multiple summary statistics files found: {}", multi_files)
+            logger.warning("DataFrame shape before deduplication: {}", data.shape)
+            logger.warning("Synced data preview:\n{}", data.head())
+            data = data.unique(subset=SyncedSummaryStatisticsSchema.STUDY_ID)
+            logger.warning("Synced data after deduplication:\n{}", data.shape)
+        return data
 
 
 class GWASCatalogCuration:
     """Class to handle the curation of GWAS Catalog data."""
 
-    def __init__(self, previous_curation: pl.DataFrame, studies: pl.DataFrame):
+    def __init__(self, previous_curation: pl.DataFrame, studies: pl.DataFrame, synced: pl.DataFrame):
         """Initialize the GWASCatalogCuration with previous curation and studies data."""
         logger.debug("Initializing GWASCatalogCuration with previous curation and studies data.")
         self.previous_curation = previous_curation
         logger.debug("Previous curation data loaded with shape: {}", previous_curation.shape)
         self.studies = studies
         logger.debug("Studies data loaded with shape: {}", studies.shape)
+        self.synced = synced
+        logger.debug("Synced summary statistics data loaded with shape: {}", synced.shape)
 
     @classmethod
-    def from_prev_curation(
+    def from_prev_curation(
+        cls,
+        previous_curation_path: str,
+        download_studies_path: str,
+        summary_statistics_glob: str,
+    ) -> GWASCatalogCuration:
         """Create a GWASCatalogCuration instance from previous curation and studies."""
+        crawled_summary_statistics = GCSSummaryStatisticsFileCrawler(summary_statistics_glob).crawl()
+
         previous_curation_df = pl.read_csv(
             previous_curation_path,
             separator="\t",
@@ -112,7 +184,7 @@ class GWASCatalogCuration:
         if studies_df.is_empty():
             raise GentroutilsError(GentroutilsErrorMessage.DOWNLOAD_STUDIES_EMPTY, path=download_studies_path)
         studies_df = studies_df.rename(mapping=DownloadStudiesSchema.mapping())
-        return cls(previous_curation_df, studies_df)
+        return cls(previous_curation_df, studies_df, crawled_summary_statistics)
 
     @property
     def result(self) -> pl.DataFrame:
@@ -144,7 +216,11 @@
         assert all(prev_studies.select(CurationSchema.STUDY_ID).is_unique()), "Study IDs must be unique after merging."
 
         # Studies that are new in the GWAS Catalog
-        new_studies = self.studies.join(self.previous_curation, on=CurationSchema.STUDY_ID, how="anti")
+        new_studies = self.studies.join(self.previous_curation, on=CurationSchema.STUDY_ID, how="anti")
+        # Annotate new studies with info if they have summary statistics synced to the GCS bucket
+        new_studies_annotated = new_studies.join(self.synced, on=CurationSchema.STUDY_ID, how="left")
+        # Assign status NO_SUMSTATS to new studies without synced summary statistics (left join to drop info about already curated studies)
+        new_studies_annotated = new_studies_annotated.select(
             CurationSchema.STUDY_ID,
             pl.lit(None).alias(CurationSchema.STUDY_TYPE),
             pl.lit(None).alias(CurationSchema.ANALYSIS_FLAG),
@@ -153,12 +229,16 @@
             CurationSchema.PUBMED_ID,
             CurationSchema.PUBLICATION_TITLE,
             CurationSchema.TRAIT_FROM_SOURCE,
-            pl.
+            pl.when(pl.col(SyncedSummaryStatisticsSchema.SYNCED).is_null())
+            .then(pl.lit(CuratedStudyStatus.NO_SUMSTATS))
+            .otherwise(pl.lit(CuratedStudyStatus.TO_CURATE))
+            .alias("status"),
         )
         logger.debug("New studies identified: {}", new_studies.shape[0])
 
         # Union of new studies and previously curated studies
-        all_studies = pl.concat([prev_studies,
+        all_studies = pl.concat([prev_studies, new_studies_annotated], how="vertical")
+
         logger.debug("All studies after combining new and previous: {}", all_studies.shape[0])
 
         # Ensure the contract on the output dataframe