gentroutils 3.0.0__tar.gz → 4.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. {gentroutils-3.0.0 → gentroutils-4.0.0}/.github/workflows/build.yaml +7 -4
  2. {gentroutils-3.0.0 → gentroutils-4.0.0}/.github/workflows/tag.yaml +6 -2
  3. {gentroutils-3.0.0 → gentroutils-4.0.0}/CHANGELOG.md +240 -0
  4. gentroutils-4.0.0/Dockerfile +29 -0
  5. {gentroutils-3.0.0 → gentroutils-4.0.0}/Makefile +4 -1
  6. {gentroutils-3.0.0 → gentroutils-4.0.0}/PKG-INFO +24 -16
  7. {gentroutils-3.0.0 → gentroutils-4.0.0}/README.md +20 -12
  8. gentroutils-4.0.0/config.yaml +41 -0
  9. {gentroutils-3.0.0 → gentroutils-4.0.0}/pyproject.toml +4 -5
  10. gentroutils-4.0.0/src/gentroutils/io/transfer/ftp_to_gcs.py +143 -0
  11. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/polars_to_gcs.py +1 -1
  12. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/parsers/curation.py +88 -8
  13. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/tasks/curation.py +9 -1
  14. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/io/transfer/test_ftp_to_gcs.py +52 -1
  15. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/io/transfer/test_polars_to_gcs.py +8 -6
  16. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/parsers/test_curation.py +128 -26
  17. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/tasks/test_crawl_task.py +2 -2
  18. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/tasks/test_curation_task.py +11 -4
  19. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/tasks/test_fetch_task.py +17 -13
  20. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/test_transfer.py +21 -15
  21. gentroutils-4.0.0/uv.lock +2344 -0
  22. gentroutils-3.0.0/Dockerfile +0 -16
  23. gentroutils-3.0.0/config.yaml +0 -40
  24. gentroutils-3.0.0/src/gentroutils/io/transfer/ftp_to_gcs.py +0 -61
  25. gentroutils-3.0.0/uv.lock +0 -2132
  26. {gentroutils-3.0.0 → gentroutils-4.0.0}/.github/workflows/labeler.yaml +0 -0
  27. {gentroutils-3.0.0 → gentroutils-4.0.0}/.github/workflows/pr.yaml +0 -0
  28. {gentroutils-3.0.0 → gentroutils-4.0.0}/.github/workflows/release.yaml +0 -0
  29. {gentroutils-3.0.0 → gentroutils-4.0.0}/.github/workflows/release_pr.yaml +0 -0
  30. {gentroutils-3.0.0 → gentroutils-4.0.0}/.gitignore +0 -0
  31. {gentroutils-3.0.0 → gentroutils-4.0.0}/.pre-commit-config.yaml +0 -0
  32. {gentroutils-3.0.0 → gentroutils-4.0.0}/.vscode/extensions.json +0 -0
  33. {gentroutils-3.0.0 → gentroutils-4.0.0}/.vscode/settings.json +0 -0
  34. {gentroutils-3.0.0 → gentroutils-4.0.0}/LICENSE +0 -0
  35. {gentroutils-3.0.0 → gentroutils-4.0.0}/commitlint.config.js +0 -0
  36. {gentroutils-3.0.0 → gentroutils-4.0.0}/conftest.py +0 -0
  37. {gentroutils-3.0.0 → gentroutils-4.0.0}/docs/00_prepare_tables_for_curation.R +0 -0
  38. {gentroutils-3.0.0 → gentroutils-4.0.0}/docs/gwas_catalog_curation.md +0 -0
  39. {gentroutils-3.0.0 → gentroutils-4.0.0}/setup.sh +0 -0
  40. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/__init__.py +0 -0
  41. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/errors.py +0 -0
  42. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/io/path/__init__.py +0 -0
  43. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/io/path/ftp.py +0 -0
  44. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/io/path/gcs.py +0 -0
  45. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/__init__.py +0 -0
  46. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/io/transfer/model.py +0 -0
  47. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/parsers/__init__.py +0 -0
  48. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/py.typed +0 -0
  49. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/tasks/__init__.py +0 -0
  50. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/tasks/crawl.py +0 -0
  51. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/tasks/fetch.py +0 -0
  52. {gentroutils-3.0.0 → gentroutils-4.0.0}/src/gentroutils/transfer.py +0 -0
  53. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/ftp/test/databases/gwas/summary_statistics/harmonised_list.txt +0 -0
  54. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/gsutil_list.txt +0 -0
  55. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/correct_curation.tsv +0 -0
  56. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_analysisFlag_type.tsv +0 -0
  57. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_analysisFlag_value.tsv +0 -0
  58. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_columns_curation.tsv +0 -0
  59. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_publicationTitle_type.tsv +0 -0
  60. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_pubmedId_type.tsv +0 -0
  61. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyId_type.tsv +0 -0
  62. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyId_value.tsv +0 -0
  63. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyType_type.tsv +0 -0
  64. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_studyType_value.tsv +0 -0
  65. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/incorrect_traitFromSource_type.tsv +0 -0
  66. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/non_unique_studyId.tsv +0 -0
  67. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/manual_curation/null_value_in_studyId.tsv +0 -0
  68. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/data/test.h.tsv.gz +0 -0
  69. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/io/conftest.py +0 -0
  70. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/io/path/conftest.py +0 -0
  71. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/io/path/test_ftp.py +0 -0
  72. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/io/path/test_gcs.py +0 -0
  73. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/io/transfer/conftest.py +0 -0
  74. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/io/transfer/test_model.py +0 -0
  75. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/parsers/conftest.py +0 -0
  76. {gentroutils-3.0.0 → gentroutils-4.0.0}/tests/tasks/conftest.py +0 -0
@@ -8,11 +8,12 @@ env:
8
8
  GCP_PROJECT_ID: "open-targets-genetics-dev"
9
9
  GCP_REGION: "europe-west1"
10
10
  TAG: "${{ github.ref_name }}"
11
- REPO: "${{ github.event.repository_name }}"
11
+ REPO: "${{ github.event.repository.name }}"
12
12
 
13
13
 
14
14
  jobs:
15
15
  push-to-ghcr-and-gar:
16
+ if: startsWith(github.ref, 'refs/tags/')
16
17
  name: Build docker image and push to GHCR and GAR
17
18
  runs-on: ubuntu-22.04
18
19
 
@@ -54,7 +55,9 @@ jobs:
54
55
  name: Authenticate to Google Cloud
55
56
  uses: google-github-actions/auth@v2
56
57
  with:
58
+ token_format: access_token
57
59
  project_id: ${{ env.GCP_PROJECT_ID }}
60
+ service_account: github-actions@open-targets-genetics-dev.iam.gserviceaccount.com
58
61
  workload_identity_provider: projects/234703259993/locations/global/workloadIdentityPools/github/providers/opentargets
59
62
  access_token_lifetime: 300s
60
63
 
@@ -75,13 +78,13 @@ jobs:
75
78
  tags: |
76
79
  ghcr.io/${{ github.repository }}:latest
77
80
  ghcr.io/${{ github.repository }}:${{ env.TAG }}
78
- ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}/${{ env.REPO }}:latest
79
- ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}:${{ env.TAG }}
81
+ ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/opentargets/${{ env.REPO }}:latest
82
+ ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/opentargets/${{ env.REPO }}:${{ env.TAG }}
80
83
 
81
84
  - id: generate-attestations
82
85
  name: Generate artifact attestation
83
86
  uses: actions/attest-build-provenance@v1
84
87
  with:
85
- subject-name: ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}/${{ env.REPO }}
88
+ subject-name: ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/opentargets/${{ env.REPO }}
86
89
  subject-digest: ${{ steps.push.outputs.digest }}
87
90
  push-to-registry: true
@@ -4,12 +4,16 @@ on:
4
4
  branches:
5
5
  - dev
6
6
  - main
7
+ tags-ignore: # Prevent running the action on the tag
8
+ - '*'
7
9
 
8
10
 
9
11
  jobs:
10
12
  create-tag:
11
- # NOTE: only trigger the workflow when the commit is not from the GitHubActions bot (prevent self-triggering)
12
- if: github.event.commits[0].author.name != 'github-actions[botg]'
13
+ # NOTE: trigger the workflow only when a human user pushed to the branch (prevent self-triggering)
14
+ if: |
15
+ !contains(fromJSON('["github-actions[bot]", "semantic-release"]'), github.actor) &&
16
+ github.event.head_commit.author.name != 'semantic-release'
13
17
  runs-on: ubuntu-latest
14
18
  concurrency: release
15
19
  environment: DEV
@@ -1,6 +1,246 @@
1
1
  # CHANGELOG
2
2
 
3
3
 
4
+ ## v4.0.0 (2026-02-03)
5
+
6
+
7
+ ## v4.0.0-dev.1 (2026-02-03)
8
+
9
+ ### Features
10
+
11
+ - Update dependencies
12
+ ([`b6af4d2`](https://github.com/opentargets/gentroutils/commit/b6af4d28605e7c687f5ec15cae7187c64e834cb0))
13
+
14
+
15
+ ## v3.2.0 (2026-02-03)
16
+
17
+ ### Chores
18
+
19
+ - Update uv lock
20
+ ([`6f13fc0`](https://github.com/opentargets/gentroutils/commit/6f13fc0055ee9a49a215166d3cccb31747602a4f))
21
+
22
+
23
+ ## v3.2.0-dev.2 (2026-02-03)
24
+
25
+ ### Bug Fixes
26
+
27
+ - Output tsv file instead of csv
28
+ ([`aff71b1`](https://github.com/opentargets/gentroutils/commit/aff71b16b6c4d273cc851050a793d3798bae27ac))
29
+
30
+ - Test
31
+ ([`f9dd890`](https://github.com/opentargets/gentroutils/commit/f9dd890efc32ab969fbcd14eb0da14e40678e8fb))
32
+
33
+ - Test for curation
34
+ ([`b853358`](https://github.com/opentargets/gentroutils/commit/b85335815d7a22745c61404f81b612a14cce06d5))
35
+
36
+ - Test for curation
37
+ ([`22138ab`](https://github.com/opentargets/gentroutils/commit/22138ab31f7551a4b161f6f1885b7975d57a0ac7))
38
+
39
+ ### Chores
40
+
41
+ - Cleanup
42
+ ([`68a3f66`](https://github.com/opentargets/gentroutils/commit/68a3f6607a4a1b61441c1369f2a9d3b4babec30c))
43
+
44
+ - Fix glob pattern
45
+ ([`404b8ca`](https://github.com/opentargets/gentroutils/commit/404b8ca71b95764529ebb3df7c39881a0a12ff5e))
46
+
47
+ - Handle multiple sumstat files
48
+ ([`1fc8902`](https://github.com/opentargets/gentroutils/commit/1fc8902171a8f6edac407b790c3bcbe691792f96))
49
+
50
+ - Update
51
+ ([`e69575b`](https://github.com/opentargets/gentroutils/commit/e69575b5a6c802b78b959314b84348d7969eeaeb))
52
+
53
+ - Update readme
54
+ ([`12f274c`](https://github.com/opentargets/gentroutils/commit/12f274c5158b3986ba2511791fc2289b24d9aa40))
55
+
56
+
57
+ ## v3.2.0-dev.1 (2025-11-05)
58
+
59
+ ### Chores
60
+
61
+ - Uncomment config
62
+ ([`30c4d68`](https://github.com/opentargets/gentroutils/commit/30c4d68e79a35d2c5c83cd17a15f63906ef834d6))
63
+
64
+ ### Features
65
+
66
+ - **associations**: Allow zip file transfer from ftp
67
+ ([`662a635`](https://github.com/opentargets/gentroutils/commit/662a63593cd5f340a768974041461cc65e1566b9))
68
+
69
+
70
+ ## v3.1.0 (2025-09-02)
71
+
72
+ ### Chores
73
+
74
+ - Trigger release process ([#36](https://github.com/opentargets/gentroutils/pull/36),
75
+ [`a90fdc7`](https://github.com/opentargets/gentroutils/commit/a90fdc7cd26dcb1263590f93f79bff6ccc867868))
76
+
77
+ * fix: update auth
78
+
79
+ * 3.0.1-dev.1
80
+
81
+ Automatically generated by python-semantic-release
82
+
83
+ * ci: add service account to impersonate
84
+
85
+ * 3.0.1-dev.2
86
+
87
+ * ci: prevent running create-tag on tag
88
+
89
+ * 3.0.1-dev.3
90
+
91
+ * ci: prevent running create-tag by semantic-release
92
+
93
+ * fix: workflow file
94
+
95
+ * 3.0.1-dev.4
96
+
97
+ * chore: update readme
98
+
99
+ * ci: run artifact build only from tag
100
+
101
+ * 3.0.1-dev.5
102
+
103
+ * ci: prevent tag action to run after semvar
104
+
105
+ * 3.0.1-dev.6
106
+
107
+ * build: remove obscured gcs scope from polars
108
+
109
+ * feat: rebuild docker image
110
+
111
+ * feat: add docker build command
112
+
113
+ * 3.1.0-dev.1
114
+
115
+ * ci: fix container name
116
+
117
+ * 3.1.0-dev.2
118
+
119
+ * ci: change image name structure for gcs
120
+
121
+ * 3.1.0-dev.3
122
+
123
+ * ci: update path to attestations
124
+
125
+ * 3.1.0-dev.4
126
+
127
+ * 3.1.0-dev.5
128
+
129
+ ---------
130
+
131
+ Co-authored-by: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com>
132
+
133
+ Co-authored-by: semantic-release <semantic-release>
134
+
135
+ Co-authored-by: project-defiant <szymonszyszkowski@gmail.com>
136
+
137
+
138
+ ## v3.1.0-dev.5 (2025-08-29)
139
+
140
+ ### Continuous Integration
141
+
142
+ - Update path to attestations
143
+ ([`febea43`](https://github.com/opentargets/gentroutils/commit/febea4366fe94de09b82154d92b90baadfb08871))
144
+
145
+
146
+ ## v3.1.0-dev.4 (2025-08-29)
147
+
148
+ ### Continuous Integration
149
+
150
+ - Update path to attestations
151
+ ([`55a80f1`](https://github.com/opentargets/gentroutils/commit/55a80f11838d6569f4919b730e291be665f33dad))
152
+
153
+
154
+ ## v3.1.0-dev.3 (2025-08-29)
155
+
156
+ ### Continuous Integration
157
+
158
+ - Change image name structure for gcs
159
+ ([`88dbbbd`](https://github.com/opentargets/gentroutils/commit/88dbbbd7637bf7c9ec8c405a9811ad9e29416d48))
160
+
161
+
162
+ ## v3.1.0-dev.2 (2025-08-29)
163
+
164
+ ### Continuous Integration
165
+
166
+ - Fix container name
167
+ ([`9c64ae1`](https://github.com/opentargets/gentroutils/commit/9c64ae1e4d43b1861625ef673df8edd7b5127b48))
168
+
169
+
170
+ ## v3.1.0-dev.1 (2025-08-29)
171
+
172
+ ### Build System
173
+
174
+ - Remove obscured gcs scope from polars
175
+ ([`895bfed`](https://github.com/opentargets/gentroutils/commit/895bfed2486c6ca5123cc6908760020d832638ec))
176
+
177
+ ### Features
178
+
179
+ - Add docker build command
180
+ ([`8f42913`](https://github.com/opentargets/gentroutils/commit/8f42913c6c41b160a28ca230cb801dd2da0bebf1))
181
+
182
+ - Rebuild docker image
183
+ ([`94f2a49`](https://github.com/opentargets/gentroutils/commit/94f2a49ccb25a8c9eb9238cd56ea75024e26ff50))
184
+
185
+
186
+ ## v3.0.1-dev.6 (2025-08-29)
187
+
188
+ ### Continuous Integration
189
+
190
+ - Prevent tag action to run after semvar
191
+ ([`847e36b`](https://github.com/opentargets/gentroutils/commit/847e36ba4a5848116c2f6311849a1960dd55b34c))
192
+
193
+
194
+ ## v3.0.1-dev.5 (2025-08-29)
195
+
196
+ ### Chores
197
+
198
+ - Update readme
199
+ ([`9e75c35`](https://github.com/opentargets/gentroutils/commit/9e75c35a4ed6113dae76ed9b0a67762fbcd882e3))
200
+
201
+ ### Continuous Integration
202
+
203
+ - Run artifact build only from tag
204
+ ([`4906b38`](https://github.com/opentargets/gentroutils/commit/4906b38c49d51a7b8a6a28cb12638ecc9a6fdc5e))
205
+
206
+
207
+ ## v3.0.1-dev.4 (2025-08-29)
208
+
209
+ ### Bug Fixes
210
+
211
+ - Workflow file
212
+ ([`f840d55`](https://github.com/opentargets/gentroutils/commit/f840d555bbf4a8415c71b56b34d72393af8c8ebf))
213
+
214
+ ### Continuous Integration
215
+
216
+ - Prevent running create-tag by semantic-release
217
+ ([`57fb068`](https://github.com/opentargets/gentroutils/commit/57fb068a21bf86d048376ab0f4678694abeb2e71))
218
+
219
+
220
+ ## v3.0.1-dev.3 (2025-08-29)
221
+
222
+ ### Continuous Integration
223
+
224
+ - Prevent running create-tag on tag
225
+ ([`963f657`](https://github.com/opentargets/gentroutils/commit/963f657221f2f08f20f1b20a371ad884d584bf0a))
226
+
227
+
228
+ ## v3.0.1-dev.2 (2025-08-29)
229
+
230
+ ### Continuous Integration
231
+
232
+ - Add service account to impersonate
233
+ ([`05d5eb1`](https://github.com/opentargets/gentroutils/commit/05d5eb133efc9a5e3103397ff33233b543c1d2e2))
234
+
235
+
236
+ ## v3.0.1-dev.1 (2025-08-28)
237
+
238
+ ### Bug Fixes
239
+
240
+ - Update auth
241
+ ([`a95b566`](https://github.com/opentargets/gentroutils/commit/a95b566a164d31ad36383fb99d7c88a5aec27b70))
242
+
243
+
4
244
  ## v3.0.0 (2025-08-28)
5
245
 
6
246
  ### Bug Fixes
@@ -0,0 +1,29 @@
1
+ # Description: Dockerfile for the gentroutils package
2
+ #
3
+ # To run locally, you must have a credentials file for GCP. Assuming you do,
4
+ # you can run the following command:
5
+ #
6
+ # docker run -v /path/to/credentials.json:/app/credentials.json \
7
+ # -e GOOGLE_APPLICATION_CREDENTIALS=/app/credentials.json \
8
+ # gentroutils -s gwas_catalog_release
9
+ # By default the image uses the `config.yaml` file provided in the repository.
10
+ FROM rust:slim-trixie AS rust-builder
11
+ FROM python:3.13.7-slim-trixie
12
+
13
+ # Copy Rustc and Cargo from the rust-builder stage
14
+ # These are needed to install polars without compiling rust from source
15
+ COPY --from=rust-builder /usr/local/cargo/bin/rustc /usr/local/bin/rustc
16
+ COPY --from=rust-builder /usr/local/cargo/bin/cargo /usr/local/bin/cargo
17
+
18
+ # Copy Python source
19
+ COPY src /app/src
20
+ COPY pyproject.toml /app/pyproject.toml
21
+ COPY uv.lock /app/uv.lock
22
+ COPY README.md /app/README.md
23
+ COPY config.yaml /app/config.yaml
24
+
25
+ # Build the executable
26
+ WORKDIR /app
27
+ RUN python -m pip install .
28
+
29
+ ENTRYPOINT [ "gentroutils", "-c", "/app/config.yaml" ]
@@ -1,6 +1,6 @@
1
1
  SHELL := /bin/bash
2
2
  .PHONY: $(shell sed -n -e '/^$$/ { n ; /^[^ .\#][^ ]*:/ { s/:.*$$// ; p ; } ; }' $(MAKEFILE_LIST))
3
- VERSION := $$(grep '^version' pyproject.toml | sed 's%version = "\(.*\)"%\1%')
3
+ VERSION := $$(grep '^version' pyproject.toml | head -1 | sed 's%version = "\(.*\)"%\1%')
4
4
  APP_NAME := $$(grep '^name' pyproject.toml | head -1 | sed 's%name = "\(.*\)"%\1%')
5
5
 
6
6
  .DEFAULT_GOAL := help
@@ -41,3 +41,6 @@ check: lint format test type-check dep-check ## run all checks
41
41
 
42
42
  help: ## This is help
43
43
  @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
44
+
45
+ build-docker: ## build docker image
46
+ docker build -t $(APP_NAME):$(VERSION) --no-cache -f Dockerfile .
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gentroutils
3
- Version: 3.0.0
3
+ Version: 4.0.0
4
4
  Summary: Open Targets python genetics utility CLI tools
5
5
  Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
6
6
  License-Expression: Apache-2.0
@@ -12,14 +12,14 @@ Classifier: License :: OSI Approved :: Apache Software License
12
12
  Classifier: Operating System :: Unix
13
13
  Classifier: Programming Language :: Python :: 3.13
14
14
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
- Requires-Python: >=3.13
15
+ Requires-Python: <=3.13,>3.11
16
16
  Requires-Dist: aioftp>=0.25.1
17
17
  Requires-Dist: aiohttp>=3.11.18
18
18
  Requires-Dist: gcsfs>=2025.7.0
19
19
  Requires-Dist: google-cloud-storage>=3.1.1
20
20
  Requires-Dist: loguru>=0.7.3
21
- Requires-Dist: opentargets-otter>=25.0.2
22
- Requires-Dist: polars[fsspec,gcs]>=1.31.0
21
+ Requires-Dist: opentargets-otter>=25.0.15
22
+ Requires-Dist: polars[fsspec]>=1.31.0
23
23
  Requires-Dist: pydantic>=2.10.6
24
24
  Requires-Dist: tqdm>=4.67.1
25
25
  Description-Content-Type: text/markdown
@@ -34,7 +34,7 @@ Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
34
34
 
35
35
  ## Installation
36
36
 
37
- ```
37
+ ```{bash}
38
38
  pip install gentroutils
39
39
  ```
40
40
 
@@ -99,6 +99,7 @@ steps:
99
99
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
100
100
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
101
101
  destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
102
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
102
103
  promote: true
103
104
  ```
104
105
 
@@ -121,7 +122,8 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
121
122
 
122
123
  This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
123
124
 
124
- > [!NOTE] > **Task parameters**
125
+ > [!NOTE]
126
+ > **Task parameters**
125
127
  >
126
128
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
127
129
  > - The `destination_template` is where the metadata will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. By default it searches for the release directly in the stats_uri json output.
@@ -141,7 +143,8 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
141
143
 
142
144
  This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
143
145
 
144
- > [!NOTE] > **Task parameters**
146
+ > [!NOTE]
147
+ > **Task parameters**
145
148
  >
146
149
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
147
150
  > - The `source_template` is the URL of the GWAS Catalog associations file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -162,7 +165,8 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
162
165
 
163
166
  This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
164
167
 
165
- > [!NOTE] > **Task parameters**
168
+ > [!NOTE]
169
+ > **Task parameters**
166
170
  >
167
171
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
168
172
  > - The `source_template` is the URL of the GWAS Catalog studies file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -183,7 +187,8 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
183
187
 
184
188
  This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
185
189
 
186
- > [!NOTE] > **Task parameters**
190
+ > [!NOTE]
191
+ > **Task parameters**
187
192
  >
188
193
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
189
194
  > - The `source_template` is the URL of the GWAS Catalog ancestries file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -201,6 +206,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
201
206
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
202
207
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
203
208
  destination_template: gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv
209
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
204
210
  promote: true
205
211
  ```
206
212
 
@@ -214,24 +220,26 @@ This task is used to build the GWAS Catalog curation file that is later used as
214
220
  > - The `studies` field is the path to the studies file that was fetched in the `fetch studies` task. This file is used to build the curation file.
215
221
  > - The `destination_template` is where the curation file will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
216
222
  > - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
223
+ > - The `summary_statistics_glob` field is used to specify the glob pattern to list all synced summary statistics files from GCS. This is used to identify which studies have summary statistics available.
217
224
 
218
225
  ---
219
226
 
220
227
  ## Curation process
221
228
 
222
- The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task autommates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
229
+ The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task automates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
223
230
 
224
231
  The automated process includes:
225
232
 
226
233
  1. Reading `download studies` file with the list of studies that are currently coming from the latest GWAS Catalog release.
227
234
  2. Reading `previous curation` file that contains the list of the curated studies from the previous release.
228
- 3. Comparing the two datasets with following logic:
235
+ 3. Listing all synced summary statistics files from the `summary_statistics_glob` parameter to identify which studies have summary statistics available. Note that this can be more than the list of studies in the `download studies` file as syncing also involves the unpublished studies.
236
+ 4. Comparing the three datasets with the following logic:
229
237
  - In case the study is present in the `previous curation` and `download studies`, the study is marked as `curated`
230
- * In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `new`
231
- * In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
232
- 4. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
233
- 5. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
234
- 6. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
238
+ - In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `to_curate` or `has_no_sumstats` depending on the presence of summary statistics files
239
+ - In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
240
+ 5. The output of the curation process is a file that contains the list of studies with their status (curated, to_curate, has_no_sumstats, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
241
+ 6. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
242
+ 7. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
235
243
 
236
244
  ---
237
245
 
@@ -8,7 +8,7 @@ Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
8
8
 
9
9
  ## Installation
10
10
 
11
- ```
11
+ ```{bash}
12
12
  pip install gentroutils
13
13
  ```
14
14
 
@@ -73,6 +73,7 @@ steps:
73
73
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
74
74
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
75
75
  destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
76
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
76
77
  promote: true
77
78
  ```
78
79
 
@@ -95,7 +96,8 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
95
96
 
96
97
  This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
97
98
 
98
- > [!NOTE] > **Task parameters**
99
+ > [!NOTE]
100
+ > **Task parameters**
99
101
  >
100
102
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
101
103
  > - The `destination_template` is where the metadata will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. By default it searches for the release directly in the stats_uri json output.
@@ -115,7 +117,8 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
115
117
 
116
118
  This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
117
119
 
118
- > [!NOTE] > **Task parameters**
120
+ > [!NOTE]
121
+ > **Task parameters**
119
122
  >
120
123
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
121
124
  > - The `source_template` is the URL of the GWAS Catalog associations file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -136,7 +139,8 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
136
139
 
137
140
  This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
138
141
 
139
- > [!NOTE] > **Task parameters**
142
+ > [!NOTE]
143
+ > **Task parameters**
140
144
  >
141
145
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
142
146
  > - The `source_template` is the URL of the GWAS Catalog studies file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -157,7 +161,8 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
157
161
 
158
162
  This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
159
163
 
160
- > [!NOTE] > **Task parameters**
164
+ > [!NOTE]
165
+ > **Task parameters**
161
166
  >
162
167
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
163
168
  > - The `source_template` is the URL of the GWAS Catalog ancestries file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -175,6 +180,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
175
180
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
176
181
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
177
182
  destination_template: gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv
183
+ summary_statistics_glob: gs://gwas_catalog_inputs/raw_summary_statistics/*.h.tsv.gz
178
184
  promote: true
179
185
  ```
180
186
 
@@ -188,24 +194,26 @@ This task is used to build the GWAS Catalog curation file that is later used as
188
194
  > - The `studies` field is the path to the studies file that was fetched in the `fetch studies` task. This file is used to build the curation file.
189
195
  > - The `destination_template` is where the curation file will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
190
196
  > - The `promote` field is set to `true`, which means the output will be promoted to the latest release. Meaning that the file will be saved under `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` after the task is completed. If the `promote` field is set to `false`, the file will not be promoted and will be saved under the specified path with the release date.
197
+ > - The `summary_statistics_glob` field is used to specify the glob pattern to list all synced summary statistics files from GCS. This is used to identify which studies have summary statistics available.
191
198
 
192
199
  ---
193
200
 
194
201
  ## Curation process
195
202
 
196
- The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task autommates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
203
+ The base of the curation process for GWAS Catalog data is defined in the [docs/gwas_catalog_curation.md](docs/gwas_catalog_curation.md). The original solution uses R script to prepare the data for curation and then manually curates the data. The solution proposed in the `curation` task automates the preparation of the data for curation and provides a template for manual curation. The manual curation process is still required, but the data preparation is automated.
197
204
 
198
205
  The automated process includes:
199
206
 
200
207
  1. Reading the `download studies` file with the list of studies that are currently coming from the latest GWAS Catalog release.
201
208
  2. Reading `previous curation` file that contains the list of the curated studies from the previous release.
202
- 3. Comparing the two datasets with following logic:
209
+ 3. Listing all synced summary statistics files from the `summary_statistics_glob` parameter to identify which studies have summary statistics available. Note that this list can contain more studies than the `download studies` file, as syncing also involves the unpublished studies.
210
+ 4. Comparing the three datasets with the following logic:
203
211
  - In case the study is present in the `previous curation` and `download studies`, the study is marked as `curated`
204
- * In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `new`
205
- * In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
206
- 4. The output of the curation process is a file that contains the list of studies with their status (curated, new, removed) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
207
- 5. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
208
- 6. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should be saved then to the `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv` file. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
212
+ - In case the study is present in the `download studies` but not in the `previous curation`, the study is marked as `to_curate` or `has_no_sumstats` depending on the presence of summary statistics files
213
+ - In case the study is present in the `previous curation` but not in the `download studies`, the study is marked as `removed`
214
+ 5. The output of the curation process is a file that contains the list of studies with their status (`curated`, `to_curate`, `has_no_sumstats`, `removed`) and the fields that are required for manual curation. The output file is saved to the `destination_template` path specified in the task configuration. The file is saved under `gs://gwas_catalog_inputs/curation/{release_date}/raw/gwas_catalog_study_curation.tsv` path.
215
+ 6. The output file is then promoted to the latest release path `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` so that it can be used for manual curation.
216
+ 7. The manual curation process is then performed on the `gs://gwas_catalog_inputs/curation/latest/raw/gwas_catalog_study_curation.tsv` file. The manual curation process is not automated and requires manual intervention. The output from the manual curation process should then be saved to both `gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv` and `gs://gwas_catalog_inputs/curation/{release_date}/curated/GWAS_Catalog_study_curation.tsv`. This file is then used for the [Open Targets Staging Dags](https://github.com/opentargets/orchestration).
209
217
 
210
218
  ---
211
219
 
@@ -0,0 +1,41 @@
1
+ ---
2
+ work_path: ./work
3
+ log_level: DEBUG
4
+ scratchpad:
5
+ gc_stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
6
+ gc_bucket: "gs://gwas_catalog_inputs"
7
+ gc_ftp: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases"
8
+
9
+ steps:
10
+ gwas_catalog_release:
11
+ - name: crawl release metadata
12
+ stats_uri: ${gc_stats_uri}
13
+ destination_template: '${gc_bucket}/gentroutils/{release_date}/stats.json'
14
+ promote: true
15
+
16
+ - name: fetch studies
17
+ stats_uri: ${gc_stats_uri}
18
+ source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt'
19
+ destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_studies.tsv'
20
+ promote: true
21
+
22
+ - name: fetch ancestries
23
+ stats_uri: ${gc_stats_uri}
24
+ source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt'
25
+ destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv'
26
+ promote: true
27
+
28
+ - name: fetch associations
29
+ stats_uri: ${gc_stats_uri}
30
+ source_template: '${gc_ftp}/{release_date}/gwas-catalog-associations_ontology-annotated-full.zip'
31
+ destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv'
32
+ promote: true
33
+
34
+ - name: curation study
35
+ requires:
36
+ - fetch studies
37
+ previous_curation: '${gc_bucket}/curation/latest/curated/GWAS_Catalog_study_curation.tsv'
38
+ studies: '${gc_bucket}/gentroutils/latest/gwas_catalog_download_studies.tsv'
39
+ summary_statistics_glob: '${gc_bucket}/raw_summary_statistics/**.h.tsv.gz'
40
+ destination_template: '${gc_bucket}/curation/{release_date}/raw/GWAS_Catalog_study_curation.tsv'
41
+ promote: true
@@ -1,21 +1,21 @@
1
1
  [project]
2
2
  authors = [{ name = "Szymon Szyszkowski", email = "ss60@sanger.ac.uk" }]
3
3
  name = "gentroutils"
4
- version = "3.0.0"
4
+ version = "4.0.0"
5
5
  description = "Open Targets python genetics utility CLI tools"
6
6
  dependencies = [
7
7
  "aiohttp>=3.11.18",
8
8
  "aioftp>=0.25.1",
9
- "polars[fsspec,gcs]>=1.31.0",
9
+ "polars[fsspec]>=1.31.0",
10
10
  "pydantic>=2.10.6",
11
11
  "loguru>=0.7.3",
12
12
  "tqdm>=4.67.1",
13
- "opentargets-otter>=25.0.2",
13
+ "opentargets-otter>=25.0.15",
14
14
  "google-cloud-storage>=3.1.1",
15
15
  "gcsfs>=2025.7.0",
16
16
  ]
17
17
  readme = "README.md"
18
- requires-python = ">=3.13"
18
+ requires-python = ">3.11,<=3.13"
19
19
  license = "Apache-2.0"
20
20
  classifiers = [
21
21
  "Development Status :: 3 - Alpha",
@@ -50,7 +50,6 @@ dev = [
50
50
  "gcloud-storage-emulator>=0.5.0",
51
51
  "types-requests>=2.32.0.20240712",
52
52
  "pyftpdlib>=2.0.1",
53
- "python-semantic-release>=9.19.1",
54
53
  "pandas-stubs>=2.2.3.250308",
55
54
  "ipython>=8.36.0",
56
55
  "pytest-asyncio>=1.1.0",