gentroutils 3.1.0.tar.gz → 4.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {gentroutils-3.1.0 → gentroutils-4.0.0}/CHANGELOG.md +66 -0
  2. {gentroutils-3.1.0 → gentroutils-4.0.0}/PKG-INFO +16 -12
  3. {gentroutils-3.1.0 → gentroutils-4.0.0}/README.md +13 -9
  4. gentroutils-4.0.0/config.yaml +41 -0
  5. {gentroutils-3.1.0 → gentroutils-4.0.0}/pyproject.toml +3 -4
  6. gentroutils-4.0.0/src/gentroutils/io/transfer/ftp_to_gcs.py +143 -0
  7. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/polars_to_gcs.py +1 -1
  8. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/parsers/curation.py +88 -8
  9. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/tasks/curation.py +9 -1
  10. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/transfer/test_ftp_to_gcs.py +52 -1
  11. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/transfer/test_polars_to_gcs.py +8 -6
  12. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/parsers/test_curation.py +128 -26
  13. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/tasks/test_crawl_task.py +2 -2
  14. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/tasks/test_curation_task.py +11 -4
  15. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/tasks/test_fetch_task.py +17 -13
  16. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/test_transfer.py +21 -15
  17. gentroutils-4.0.0/uv.lock +2344 -0
  18. gentroutils-3.1.0/config.yaml +0 -40
  19. gentroutils-3.1.0/src/gentroutils/io/transfer/ftp_to_gcs.py +0 -61
  20. gentroutils-3.1.0/uv.lock +0 -2132
  21. {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/build.yaml +0 -0
  22. {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/labeler.yaml +0 -0
  23. {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/pr.yaml +0 -0
  24. {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/release.yaml +0 -0
  25. {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/release_pr.yaml +0 -0
  26. {gentroutils-3.1.0 → gentroutils-4.0.0}/.github/workflows/tag.yaml +0 -0
  27. {gentroutils-3.1.0 → gentroutils-4.0.0}/.gitignore +0 -0
  28. {gentroutils-3.1.0 → gentroutils-4.0.0}/.pre-commit-config.yaml +0 -0
  29. {gentroutils-3.1.0 → gentroutils-4.0.0}/.vscode/extensions.json +0 -0
  30. {gentroutils-3.1.0 → gentroutils-4.0.0}/.vscode/settings.json +0 -0
  31. {gentroutils-3.1.0 → gentroutils-4.0.0}/Dockerfile +0 -0
  32. {gentroutils-3.1.0 → gentroutils-4.0.0}/LICENSE +0 -0
  33. {gentroutils-3.1.0 → gentroutils-4.0.0}/Makefile +0 -0
  34. {gentroutils-3.1.0 → gentroutils-4.0.0}/commitlint.config.js +0 -0
  35. {gentroutils-3.1.0 → gentroutils-4.0.0}/conftest.py +0 -0
  36. {gentroutils-3.1.0 → gentroutils-4.0.0}/docs/00_prepare_tables_for_curation.R +0 -0
  37. {gentroutils-3.1.0 → gentroutils-4.0.0}/docs/gwas_catalog_curation.md +0 -0
  38. {gentroutils-3.1.0 → gentroutils-4.0.0}/setup.sh +0 -0
  39. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/__init__.py +0 -0
  40. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/errors.py +0 -0
  41. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/path/__init__.py +0 -0
  42. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/path/ftp.py +0 -0
  43. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/path/gcs.py +0 -0
  44. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/__init__.py +0 -0
  45. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/model.py +0 -0
  46. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/parsers/__init__.py +0 -0
  47. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/py.typed +0 -0
  48. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/tasks/__init__.py +0 -0
  49. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/tasks/crawl.py +0 -0
  50. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/tasks/fetch.py +0 -0
  51. {gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/transfer.py +0 -0
  52. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/ftp/test/databases/gwas/summary_statistics/harmonised_list.txt +0 -0
  53. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/gsutil_list.txt +0 -0
  54. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/correct_curation.tsv +0 -0
  55. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_analysisFlag_type.tsv +0 -0
  56. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_analysisFlag_value.tsv +0 -0
  57. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_columns_curation.tsv +0 -0
  58. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_publicationTitle_type.tsv +0 -0
  59. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_pubmedId_type.tsv +0 -0
  60. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyId_type.tsv +0 -0
  61. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyId_value.tsv +0 -0
  62. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyType_type.tsv +0 -0
  63. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyType_value.tsv +0 -0
  64. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_traitFromSource_type.tsv +0 -0
  65. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/non_unique_studyId.tsv +0 -0
  66. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/manual_curation/null_value_in_studyId.tsv +0 -0
  67. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/data/test.h.tsv.gz +0 -0
  68. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/conftest.py +0 -0
  69. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/path/conftest.py +0 -0
  70. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/path/test_ftp.py +0 -0
  71. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/path/test_gcs.py +0 -0
  72. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/transfer/conftest.py +0 -0
  73. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/io/transfer/test_model.py +0 -0
  74. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/parsers/conftest.py +0 -0
  75. {gentroutils-3.1.0 → gentroutils-4.0.0}/tests/tasks/conftest.py +0 -0
{gentroutils-3.1.0 → gentroutils-4.0.0}/CHANGELOG.md
@@ -1,6 +1,72 @@
  # CHANGELOG


+ ## v4.0.0 (2026-02-03)
+
+
+ ## v4.0.0-dev.1 (2026-02-03)
+
+ ### Features
+
+ - Updete dependencies
+ ([`b6af4d2`](https://github.com/opentargets/gentroutils/commit/b6af4d28605e7c687f5ec15cae7187c64e834cb0))
+
+
+ ## v3.2.0 (2026-02-03)
+
+ ### Chores
+
+ - Update uv lock
+ ([`6f13fc0`](https://github.com/opentargets/gentroutils/commit/6f13fc0055ee9a49a215166d3cccb31747602a4f))
+
+
+ ## v3.2.0-dev.2 (2026-02-03)
+
+ ### Bug Fixes
+
+ - Output tsv file instead of csv
+ ([`aff71b1`](https://github.com/opentargets/gentroutils/commit/aff71b16b6c4d273cc851050a793d3798bae27ac))
+
+ - Test
+ ([`f9dd890`](https://github.com/opentargets/gentroutils/commit/f9dd890efc32ab969fbcd14eb0da14e40678e8fb))
+
+ - Test for curation
+ ([`b853358`](https://github.com/opentargets/gentroutils/commit/b85335815d7a22745c61404f81b612a14cce06d5))
+
+ - Test for curation
+ ([`22138ab`](https://github.com/opentargets/gentroutils/commit/22138ab31f7551a4b161f6f1885b7975d57a0ac7))
+
+ ### Chores
+
+ - Cleanup
+ ([`68a3f66`](https://github.com/opentargets/gentroutils/commit/68a3f6607a4a1b61441c1369f2a9d3b4babec30c))
+
+ - Fix glob pattern
+ ([`404b8ca`](https://github.com/opentargets/gentroutils/commit/404b8ca71b95764529ebb3df7c39881a0a12ff5e))
+
+ - Handle mutliple sumstat files
+ ([`1fc8902`](https://github.com/opentargets/gentroutils/commit/1fc8902171a8f6edac407b790c3bcbe691792f96))
+
+ - Update
+ ([`e69575b`](https://github.com/opentargets/gentroutils/commit/e69575b5a6c802b78b959314b84348d7969eeaeb))
+
+ - Update readme
+ ([`12f274c`](https://github.com/opentargets/gentroutils/commit/12f274c5158b3986ba2511791fc2289b24d9aa40))
+
+
+ ## v3.2.0-dev.1 (2025-11-05)
+
+ ### Chores
+
+ - Uncomment config
+ ([`30c4d68`](https://github.com/opentargets/gentroutils/commit/30c4d68e79a35d2c5c83cd17a15f63906ef834d6))
+
+ ### Features
+
+ - **associations**: Allow zip file transfer from ftp
+ ([`662a635`](https://github.com/opentargets/gentroutils/commit/662a63593cd5f340a768974041461cc65e1566b9))
+
+
  ## v3.1.0 (2025-09-02)

  ### Chores
{gentroutils-3.1.0 → gentroutils-4.0.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: gentroutils
- Version: 3.1.0
+ Version: 4.0.0
  Summary: Open Targets python genetics utility CLI tools
  Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
  License-Expression: Apache-2.0
@@ -12,13 +12,13 @@ Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Operating System :: Unix
  Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
- Requires-Python: >=3.13
+ Requires-Python: <=3.13,>3.11
  Requires-Dist: aioftp>=0.25.1
  Requires-Dist: aiohttp>=3.11.18
  Requires-Dist: gcsfs>=2025.7.0
  Requires-Dist: google-cloud-storage>=3.1.1
  Requires-Dist: loguru>=0.7.3
- Requires-Dist: opentargets-otter>=25.0.2
+ Requires-Dist: opentargets-otter>=25.0.15
  Requires-Dist: polars[fsspec]>=1.31.0
  Requires-Dist: pydantic>=2.10.6
  Requires-Dist: tqdm>=4.67.1
@@ -99,6 +99,7 @@ steps:
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
  destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
  promote: true
  ```

@@ -164,7 +165,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv

  This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.

- > [!NOTE]
+ > [!NOTE]
  > **Task parameters**
  >
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -186,7 +187,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an

  This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.

- > [!NOTE]
+ > [!NOTE]
  > **Task parameters**
  >
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -205,6 +206,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
  destination_template: gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
  promote: true
  ```

@@ -218,24 +220,26 @@ This task is used to build the GWAS Catalog curation file that is later used as
  > - The `studies` field is the path to the studies file that was fetched in the `fetch studies` task. This file is used to build the curation file.
  > - The `destination_template` is where the curation file will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
  > - The `promote` field is set to `true`, which means the output will be promoted to the latest release, meaning that the file will be saved under `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
+ > - The `summary_statistics_glob` field specifies the glob pattern used to list all synced summary statistics files from GCS. This is used to identify which studies have summary statistics available.

  ---

  ## Curation process

- The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task autommates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
+ The base of the curation process for GWAS Catalog data is defined in [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses an R script to prepare the data for curation, after which the data is curated manually. The `curation` task automates the preparation of the data for curation and provides a template for the manual curation. The manual curation step is still required, but the data preparation is automated.

  The automated process includes:

  1. Reading `download studies` file with the list of studies that are currently coming from the latest GWAS Catalog release.
  2. Reading `previous curation` file that contains the list of the curated studies from the previous release.
- 3. Comparing the two datasets with following logic:
+ 3. Listing all synced summary statistics files matching the `summary_statistics_glob` pattern to identify which studies have summary statistics available. Note that this list can contain more studies than the `download studies` file, as syncing also includes unpublished studies.
+ 4. Comparing the three datasets with the following logic:
   - In case the study is present in the `previous curation` and `download studies`, the study is marked as `curated`
- * In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `new`
- * In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
- 4. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
- 5. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
- 6. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
+ - In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `to_curate` or `no_summary_statistics`, depending on whether summary statistics files are present
+ - In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
+ 5. The output of the curation process is a file that contains the list of studies with their status (`curated`, `to_curate`, `no_summary_statistics`, `removed`) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration, i.e. under the `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
+ 6. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
+ 7. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. This step is not automated and requires manual intervention. The output of the manual curation should then be saved to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` files. These files are then used by the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).

  ---

{gentroutils-3.1.0 → gentroutils-4.0.0}/README.md
@@ -73,6 +73,7 @@ steps:
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
  destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
  promote: true
  ```

@@ -138,7 +139,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv

  This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.

- > [!NOTE]
+ > [!NOTE]
  > **Task parameters**
  >
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -160,7 +161,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an

  This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.

- > [!NOTE]
+ > [!NOTE]
  > **Task parameters**
  >
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -179,6 +180,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
  destination_template: gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
  promote: true
  ```

@@ -192,24 +194,26 @@ This task is used to build the GWAS Catalog curation file that is later used as
  > - The `studies` field is the path to the studies file that was fetched in the `fetch studies` task. This file is used to build the curation file.
  > - The `destination_template` is where the curation file will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
  > - The `promote` field is set to `true`, which means the output will be promoted to the latest release, meaning that the file will be saved under `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
+ > - The `summary_statistics_glob` field specifies the glob pattern used to list all synced summary statistics files from GCS. This is used to identify which studies have summary statistics available.

  ---

  ## Curation process

- The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task autommates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
+ The base of the curation process for GWAS Catalog data is defined in [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses an R script to prepare the data for curation, after which the data is curated manually. The `curation` task automates the preparation of the data for curation and provides a template for the manual curation. The manual curation step is still required, but the data preparation is automated.

  The automated process includes:

  1. Reading `download studies` file with the list of studies that are currently coming from the latest GWAS Catalog release.
  2. Reading `previous curation` file that contains the list of the curated studies from the previous release.
- 3. Comparing the two datasets with following logic:
+ 3. Listing all synced summary statistics files matching the `summary_statistics_glob` pattern to identify which studies have summary statistics available. Note that this list can contain more studies than the `download studies` file, as syncing also includes unpublished studies.
+ 4. Comparing the three datasets with the following logic:
   - In case the study is present in the `previous curation` and `download studies`, the study is marked as `curated`
- * In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `new`
- * In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
- 4. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
- 5. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
- 6. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
+ - In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `to_curate` or `no_summary_statistics`, depending on whether summary statistics files are present
+ - In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
+ 5. The output of the curation process is a file that contains the list of studies with their status (`curated`, `to_curate`, `no_summary_statistics`, `removed`) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration, i.e. under the `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
+ 6. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
+ 7. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. This step is not automated and requires manual intervention. The output of the manual curation should then be saved to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` files. These files are then used by the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).

  ---
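The status assignment described in the curation-process list above comes down to joins over the three inputs. As an editor-added illustration (not part of the package), here is a minimal polars sketch with hypothetical toy frames standing in for the real files; the status strings are the `CuratedStudyStatus` values from `src/gentroutils/parsers/curation.py`, shown later in this diff:

```python
import polars as pl

# Hypothetical toy inputs mirroring the three datasets of the curation process.
previous_curation = pl.DataFrame({"studyId": ["GCST1", "GCST2"]})
download_studies = pl.DataFrame({"studyId": ["GCST1", "GCST3", "GCST4"]})
synced = pl.DataFrame({"studyId": ["GCST3"], "isSynced": [True]})

# In both previous curation and download studies -> curated.
curated = previous_curation.join(download_studies, on="studyId", how="semi").with_columns(
    pl.lit("curated").alias("status")
)
# Only in previous curation -> removed.
removed = previous_curation.join(download_studies, on="studyId", how="anti").with_columns(
    pl.lit("removed").alias("status")
)
# Only in download studies -> to_curate when a synced summary statistics
# file exists for the study, no_summary_statistics otherwise.
new = (
    download_studies.join(previous_curation, on="studyId", how="anti")
    .join(synced, on="studyId", how="left")
    .with_columns(
        pl.when(pl.col("isSynced").is_null())
        .then(pl.lit("no_summary_statistics"))
        .otherwise(pl.lit("to_curate"))
        .alias("status")
    )
    .drop("isSynced")
)

# GCST1 curated, GCST2 removed, GCST3 to_curate, GCST4 no_summary_statistics.
print(pl.concat([curated, removed, new]))
```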
gentroutils-4.0.0/config.yaml
@@ -0,0 +1,41 @@
+ ---
+ work_path: ./work
+ log_level: DEBUG
+ scratchpad:
+   gc_stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
+   gc_bucket: "gs://gwas_catalog_inputs"
+   gc_ftp: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases"
+
+ steps:
+   gwas_catalog_release:
+     - name: crawl release metadata
+       stats_uri: ${gc_stats_uri}
+       destination_template: '${gc_bucket}/gentroutils/{release_date}/stats.json'
+       promote: true
+
+     - name: fetch studies
+       stats_uri: ${gc_stats_uri}
+       source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt'
+       destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_studies.tsv'
+       promote: true
+
+     - name: fetch ancestries
+       stats_uri: ${gc_stats_uri}
+       source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt'
+       destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv'
+       promote: true
+
+     - name: fetch associations
+       stats_uri: ${gc_stats_uri}
+       source_template: '${gc_ftp}/{release_date}/gwas-catalog-associations_ontology-annotated-full.zip'
+       destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv'
+       promote: true
+
+     - name: curation study
+       requires:
+         - fetch studies
+       previous_curation: '${gc_bucket}/curation/latest/curated/GWAS_Catalog_study_curation.tsv'
+       studies: '${gc_bucket}/gentroutils/latest/gwas_catalog_download_studies.tsv'
+       summary_statistics_glob: '${gc_bucket}/raw_summary_statistics/**.h.tsv.gz'
+       destination_template: '${gc_bucket}/curation/{release_date}/raw/GWAS_Catalog_study_curation.tsv'
+       promote: true
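For illustration, this is how a `destination_template` from the step above might resolve once the `${...}` scratchpad variables and the `{release_date}` placeholder are filled in. The two-stage substitution below is an editor-added sketch of the behaviour, not the otter framework's actual API, and the release date value is hypothetical:

```python
from string import Template

# Hypothetical scratchpad value and release date; the real values come from
# config.yaml and the GWAS Catalog stats endpoint respectively.
scratchpad = {"gc_bucket": "gs://gwas_catalog_inputs"}
release_date = "2026/02/03"

template = "${gc_bucket}/curation/{release_date}/raw/GWAS_Catalog_study_curation.tsv"
# Stage 1: expand the ${...} scratchpad variables; stage 2: fill {release_date}.
resolved = Template(template).substitute(scratchpad).format(release_date=release_date)
print(resolved)
# gs://gwas_catalog_inputs/curation/2026/02/03/raw/GWAS_Catalog_study_curation.tsv
```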
{gentroutils-3.1.0 → gentroutils-4.0.0}/pyproject.toml
@@ -1,7 +1,7 @@
  [project]
  authors = [{ name = "Szymon Szyszkowski", email = "ss60@sanger.ac.uk" }]
  name = "gentroutils"
- version = "3.1.0"
+ version = "4.0.0"
  description = "Open Targets python genetics utility CLI tools"
  dependencies = [
      "aiohttp>=3.11.18",
@@ -10,12 +10,12 @@ dependencies = [
      "pydantic>=2.10.6",
      "loguru>=0.7.3",
      "tqdm>=4.67.1",
-     "opentargets-otter>=25.0.2",
+     "opentargets-otter>=25.0.15",
      "google-cloud-storage>=3.1.1",
      "gcsfs>=2025.7.0",
  ]
  readme = "README.md"
- requires-python = ">=3.13"
+ requires-python = ">3.11,<=3.13"
  license = "Apache-2.0"
  classifiers = [
      "Development Status :: 3 - Alpha",
@@ -50,7 +50,6 @@ dev = [
      "gcloud-storage-emulator>=0.5.0",
      "types-requests>=2.32.0.20240712",
      "pyftpdlib>=2.0.1",
-     "python-semantic-release>=9.19.1",
      "pandas-stubs>=2.2.3.250308",
      "ipython>=8.36.0",
      "pytest-asyncio>=1.1.0",
gentroutils-4.0.0/src/gentroutils/io/transfer/ftp_to_gcs.py
@@ -0,0 +1,143 @@
+ """Transfer files from FTP to Google Cloud Storage (GCS)."""
+
+ import asyncio
+ import io
+ import re
+ from typing import Annotated
+
+ import aioftp
+ from google.cloud import storage
+ from loguru import logger
+ from pydantic import AfterValidator
+
+ from gentroutils.io.path import FTPPath, GCSPath
+ from gentroutils.io.transfer.model import TransferableObject
+
+
+ class FTPtoGCPTransferableObject(TransferableObject):
+     """A class to represent an object that can be transferred from FTP to GCP."""
+
+     source: Annotated[str, AfterValidator(lambda x: str(FTPPath(x)))]
+     destination: Annotated[str, AfterValidator(lambda x: str(GCSPath(x)))]
+
+     async def transfer(self) -> None:
+         """Transfer files from FTP to GCP.
+
+         This function fetches the data for the file provided in the local FTP path, collects the
+         data asynchronously to a buffer, and uploads it to the provided GCP bucket blob.
+
+         Implements retry logic with exponential backoff for handling transient network errors.
+         """
+         max_retries = 3
+         retry_delay = 1  # Initial delay in seconds
+
+         for attempt in range(max_retries):
+             try:
+                 await self._perform_transfer()
+                 return  # Success, exit the retry loop
+             except (ConnectionResetError, OSError, aioftp.errors.AIOFTPException) as e:
+                 if attempt < max_retries - 1:
+                     wait_time = retry_delay * (2**attempt)  # Exponential backoff
+                     logger.warning(
+                         f"Transfer attempt {attempt + 1}/{max_retries} failed for {self.source}: {e}. "
+                         f"Retrying in {wait_time}s..."
+                     )
+                     await asyncio.sleep(wait_time)
+                 else:
+                     logger.error(f"Transfer failed after {max_retries} attempts for {self.source}: {e}")
+                     raise
+             except Exception as e:
+                 # For non-retryable exceptions, log and raise immediately
+                 logger.error(f"Non-retryable error during transfer from {self.source} to {self.destination}: {e}")
+                 raise
+
+     async def _perform_transfer(self) -> None:
+         """Perform the actual transfer operation.
+
+         This is separated from the transfer method to allow for retry logic.
+         """
+         logger.info(f"Attempting to transfer data from {self.source} to {self.destination}.")
+         gcs_obj = GCSPath(self.destination)
+         ftp_obj = FTPPath(self.source)
+
+         async with aioftp.Client.context(ftp_obj.server, user="anonymous", password="anonymous") as ftp:  # noqa: S106
+             bucket = storage.Client().bucket(gcs_obj.bucket)
+             blob = bucket.blob(gcs_obj.object)
+             logger.info(f"Searching for the release date in the provided ftp path: {ftp_obj.base_dir}.")
+             dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(ftp_obj.base_dir))
+
+             if dir_match:
+                 logger.info(f"Found release date to search in the ftp {dir_match.group('release_date')}.")
+                 release_date = dir_match.group("release_date")
+                 try:
+                     logger.debug(f"We are in the directory: {await ftp.get_current_directory()}")
+                     logger.debug(f"Changing directory to: {ftp_obj.base_dir}")
+                     await ftp.change_directory(ftp_obj.base_dir)
+                     logger.success(f"Successfully changed directory to: {ftp_obj.base_dir}")
+                 except aioftp.StatusCodeError as e:
+                     logger.warning(f"Failed to change directory to {ftp_obj.base_dir}: {e}")
+                     logger.warning(f"Probably the release date {release_date} is out of sync with the api endpoint.")
+                     try:
+                         logger.warning("Attempting to load the `latest` release.")
+                         ftp_obj = FTPPath(self.source.replace(release_date, "latest"))
+                         await ftp.change_directory(ftp_obj.base_dir)
+                         logger.success(f"Successfully changed directory to: {ftp_obj.base_dir}")
+
+                     except aioftp.StatusCodeError:
+                         logger.error(f"Failed to find the latest release under {ftp_obj}")
+                         raise
+
+                 logger.debug("Creating in-memory buffer to store downloaded data.")
+                 buffer = io.BytesIO()
+                 logger.debug(f"Downloading data from FTP path: {ftp_obj.filename}")
+                 stream = await ftp.download_stream(ftp_obj.filename)
+                 logger.info("Successfully connected to the FTP stream, beginning data transfer to buffer.")
+                 async with stream:
+                     async for block in stream.iter_by_block():
+                         buffer.write(block)
+                 buffer.seek(0)
+                 if ftp_obj.filename.endswith(".zip"):
+                     logger.info("Uploading zipped content to GCS blob.")
+                     logger.info("Unzipping content before upload.")
+                     content = unzip_buffer(buffer)
+                     blob.upload_from_string(content)
+                 else:
+                     content = buffer.getvalue()
+                     buffer.close()
+                     blob.upload_from_string(content)
+
+             else:
+                 logger.error(f"Failed to extract release date from the provided ftp path: {ftp_obj.base_dir}.")
+                 raise ValueError("Release date could not be extracted from the FTP path.")
+
+
+ def unzip_buffer(buffer: io.BytesIO) -> bytes:
+     """Unzip a BytesIO buffer and return the content of the single file it contains.
+
+     Args:
+         buffer (io.BytesIO): The in-memory buffer containing zipped data.
+
+     Returns:
+         bytes: The unzipped content of the single file.
+
+     Raises:
+         ValueError: If multiple files are found in the zipped buffer or if no files are found.
+     """
+     import zipfile
+
+     unzipped_files: dict[str, bytes] = {}
+     with zipfile.ZipFile(buffer) as z:
+         for file_info in z.infolist():
+             with z.open(file_info) as unzipped_file:
+                 unzipped_files[file_info.filename] = unzipped_file.read()
+
+     if len(unzipped_files) == 0:
+         logger.error("No files were found in the zipped buffer.")
+         raise ValueError("No files were found in the zipped buffer.")
+     if len(unzipped_files) != 1:
+         logger.error("Multiple files were found in the zipped buffer.")
+         raise ValueError("Multiple files were found in the zipped buffer.")
+     keys = list(unzipped_files.keys())
+     logger.info(f"Unzipped file: {keys[0]} with size {len(unzipped_files[keys[0]])} bytes.")
+
+     return unzipped_files[keys[0]]
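A usage sketch for the class above (editor-added, not part of the package): `transfer()` is a coroutine, so a caller drives it with asyncio; on transient FTP errors it retries with exponential backoff (1 s, 2 s, 4 s) before re-raising. The URIs are hypothetical, and GCS credentials are assumed to come from the environment:

```python
import asyncio

from gentroutils.io.transfer.ftp_to_gcs import FTPtoGCPTransferableObject


async def main() -> None:
    # Hypothetical source and destination; the pydantic validators normalise
    # them into FTPPath/GCSPath form on construction.
    obj = FTPtoGCPTransferableObject(
        source="ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2026/02/03/gwas-catalog-download-studies-v1.0.3.1.txt",
        destination="gs://gwas_catalog_inputs/gentroutils/2026/02/03/gwas_catalog_download_studies.tsv",
    )
    await obj.transfer()


asyncio.run(main())
```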
{gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/polars_to_gcs.py
@@ -16,5 +16,5 @@ class PolarsDataFrameToGCSTransferableObject(TransferableObject):
          """Transfer the Polars DataFrame to the specified GCS destination."""
          # Convert Polars DataFrame to CSV and upload to GCS
          logger.info(f"Transferring Polars DataFrame to {self.destination}.")
-         self.source.write_csv(self.destination)
+         self.source.write_csv(self.destination, separator="\t", include_header=True)
          logger.info(f"Uploading DataFrame to {self.destination}")
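The one-line change above is what the v3.2.0-dev.2 changelog entry "Output tsv file instead of csv" refers to: polars' CSV writer becomes a TSV writer once the separator is overridden. A minimal standalone equivalent, with a hypothetical frame and output path:

```python
import polars as pl

df = pl.DataFrame({"studyId": ["GCST90000001"], "status": ["curated"]})
# Same call as in the transfer object: tab separator plus an explicit header row.
df.write_csv("curation.tsv", separator="\t", include_header=True)
```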
{gentroutils-3.1.0 → gentroutils-4.0.0}/src/gentroutils/parsers/curation.py
@@ -5,6 +5,7 @@ from __future__ import annotations
  from enum import StrEnum

  import polars as pl
+ from google.cloud.storage import Client
  from loguru import logger

  from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
@@ -69,31 +70,102 @@ class DownloadStudiesSchema(StrEnum):
          return [member.value for member in cls]


+ class SyncedSummaryStatisticsSchema(StrEnum):
+     """Enum to define the columns for synced summary statistics."""
+
+     FILE_PATH = "filePath"
+     """The GCS file path of the summary statistics file."""
+     SYNCED = "isSynced"
+     """Flag indicating whether the file has been synced."""
+     STUDY_ID = "studyId"
+     """The unique identifier for a study."""
+
+     @classmethod
+     def columns(cls) -> list[str]:
+         """Get the list of columns defined in the schema."""
+         return [member.value for member in cls]
+
+
  class CuratedStudyStatus(StrEnum):
      """Enum to define the status of a curated study."""

      REMOVED = "removed"
      """The study has been removed from the GWAS Catalog."""
-     NEW = "new"
-     """The study is new in the GWAS Catalog."""
+     TO_CURATE = "to_curate"
+     """The study is new and needs to be curated."""
      CURATED = "curated"
      """The study has been curated and is still in the GWAS Catalog."""
+     NO_SUMSTATS = "no_summary_statistics"
+     """The study has no associated summary statistics."""
+
+
+ class GCSSummaryStatisticsFileCrawler:
+     """Class to crawl GCS for summary statistics files."""
+
+     def __init__(self, gcs_glob: str):
+         """Initialize the GCSSummaryStatisticsFileCrawler with a GCS glob pattern."""
+         self.gcs_glob = gcs_glob
+         logger.debug("Initialized GCSSummaryStatisticsFileCrawler with glob: {}", gcs_glob)
+
+     def _fetch_paths(self) -> list[str]:
+         """Fetch file paths from GCS based on the glob pattern."""
+         c = Client()
+         bucket_name = self.gcs_glob.split("/")[2]
+         prefix = "/".join(self.gcs_glob.split("/")[3:-1])
+         suffix = self.gcs_glob.split("/")[-1].replace("*", "")
+         logger.debug("Crawling GCS bucket: {}, prefix: {}, suffix: {}", bucket_name, prefix, suffix)
+         bucket = c.bucket(bucket_name)
+         blobs = bucket.list_blobs(prefix=prefix)
+         return [f"gs://{bucket_name}/{blob.name}" for blob in blobs if blob.name.endswith(suffix)]
+
+     def crawl(self) -> pl.DataFrame:
+         """Crawl GCS and return a DataFrame of summary statistics files."""
+         file_paths = self._fetch_paths()
+         logger.debug("Found {} summary statistics files.", len(file_paths))
+         data = pl.DataFrame({
+             SyncedSummaryStatisticsSchema.FILE_PATH: file_paths,
+             SyncedSummaryStatisticsSchema.SYNCED: [True] * len(file_paths),
+         }).with_columns(
+             pl.col(SyncedSummaryStatisticsSchema.FILE_PATH)
+             .str.extract(r"\/(GCST\d+)\/", 1)
+             .alias(SyncedSummaryStatisticsSchema.STUDY_ID)
+         )
+         # Post check to find if there are any studies with multiple files.
+         multi_files = data.group_by(SyncedSummaryStatisticsSchema.STUDY_ID).len().filter(pl.col("len") > 1)
+         if not multi_files.is_empty():
+             logger.warning("Studies with multiple summary statistics files found: {}", multi_files)
+             logger.warning("DataFrame shape before deduplication: {}", data.shape)
+             logger.warning("Synced data preview:\n{}", data.head())
+             data = data.unique(subset=SyncedSummaryStatisticsSchema.STUDY_ID)
+             logger.warning("Synced data after deduplication:\n{}", data.shape)
+         return data


  class GWASCatalogCuration:
      """Class to handle the curation of GWAS Catalog data."""

-     def __init__(self, previous_curation: pl.DataFrame, studies: pl.DataFrame):
+     def __init__(self, previous_curation: pl.DataFrame, studies: pl.DataFrame, synced: pl.DataFrame):
          """Initialize the GWASCatalogCuration with previous curation and studies data."""
          logger.debug("Initializing GWASCatalogCuration with previous curation and studies data.")
          self.previous_curation = previous_curation
          logger.debug("Previous curation data loaded with shape: {}", previous_curation.shape)
          self.studies = studies
          logger.debug("Studies data loaded with shape: {}", studies.shape)
+         self.synced = synced
+         logger.debug("Synced summary statistics data loaded with shape: {}", synced.shape)

      @classmethod
-     def from_prev_curation(cls, previous_curation_path: str, download_studies_path: str) -> GWASCatalogCuration:
+     def from_prev_curation(
+         cls,
+         previous_curation_path: str,
+         download_studies_path: str,
+         summary_statistics_glob: str,
+     ) -> GWASCatalogCuration:
          """Create a GWASCatalogCuration instance from previous curation and studies."""
+         crawled_summary_statistics = GCSSummaryStatisticsFileCrawler(summary_statistics_glob).crawl()
+
          previous_curation_df = pl.read_csv(
              previous_curation_path,
              separator="\t",
@@ -112,7 +184,7 @@ class GWASCatalogCuration:
          if studies_df.is_empty():
              raise GentroutilsError(GentroutilsErrorMessage.DOWNLOAD_STUDIES_EMPTY, path=download_studies_path)
          studies_df = studies_df.rename(mapping=DownloadStudiesSchema.mapping())
-         return cls(previous_curation_df, studies_df)
+         return cls(previous_curation_df, studies_df, crawled_summary_statistics)

      @property
      def result(self) -> pl.DataFrame:
@@ -144,7 +216,11 @@ class GWASCatalogCuration:
          assert all(prev_studies.select(CurationSchema.STUDY_ID).is_unique()), "Study IDs must be unique after merging."

          # Studies that are new in the GWAS Catalog
-         new_studies = self.studies.join(self.previous_curation, on=CurationSchema.STUDY_ID, how="anti").select(
+         new_studies = self.studies.join(self.previous_curation, on=CurationSchema.STUDY_ID, how="anti")
+         # Annotate new studies with whether they have summary statistics synced to the GCS bucket
+         # (the anti join above already dropped previously curated studies).
+         new_studies_annotated = new_studies.join(self.synced, on=CurationSchema.STUDY_ID, how="left")
+         # Assign status NO_SUMSTATS to new studies without synced summary statistics.
+         new_studies_annotated = new_studies_annotated.select(
              CurationSchema.STUDY_ID,
              pl.lit(None).alias(CurationSchema.STUDY_TYPE),
              pl.lit(None).alias(CurationSchema.ANALYSIS_FLAG),
@@ -153,12 +229,16 @@ class GWASCatalogCuration:
              CurationSchema.PUBMED_ID,
              CurationSchema.PUBLICATION_TITLE,
              CurationSchema.TRAIT_FROM_SOURCE,
-             pl.lit(CuratedStudyStatus.NEW).alias("status"),
+             pl.when(pl.col(SyncedSummaryStatisticsSchema.SYNCED).is_null())
+             .then(pl.lit(CuratedStudyStatus.NO_SUMSTATS))
+             .otherwise(pl.lit(CuratedStudyStatus.TO_CURATE))
+             .alias("status"),
          )
          logger.debug("New studies identified: {}", new_studies.shape[0])

          # Union of new studies and previously curated studies
-         all_studies = pl.concat([prev_studies, new_studies], how="vertical")
+         all_studies = pl.concat([prev_studies, new_studies_annotated], how="vertical")
+
          logger.debug("All studies after combining new and previous: {}", all_studies.shape[0])

          # Ensure the contract on the output dataframe
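A closing note on the crawler added above: `_fetch_paths` does not use a real glob engine. It splits the pattern on `/`, takes index 2 as the bucket, the middle segments as the `list_blobs` prefix, and the final segment with `*` stripped as a suffix filter. Traced on the pattern from `config.yaml` (pure string handling, no GCS call):

```python
# Editor-added trace of the string handling in
# GCSSummaryStatisticsFileCrawler._fetch_paths.
gcs_glob = "gs://gwas_catalog_inputs/raw_summary_statistics/**.h.tsv.gz"

bucket_name = gcs_glob.split("/")[2]               # 'gwas_catalog_inputs'
prefix = "/".join(gcs_glob.split("/")[3:-1])       # 'raw_summary_statistics'
suffix = gcs_glob.split("/")[-1].replace("*", "")  # '.h.tsv.gz'

print(bucket_name, prefix, suffix)
```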