gentroutils 2.0.0__tar.gz → 3.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. gentroutils-3.1.0/.github/workflows/build.yaml +90 -0
  2. {gentroutils-2.0.0 → gentroutils-3.1.0}/.github/workflows/tag.yaml +6 -2
  3. {gentroutils-2.0.0 → gentroutils-3.1.0}/CHANGELOG.md +213 -0
  4. gentroutils-3.1.0/Dockerfile +29 -0
  5. {gentroutils-2.0.0 → gentroutils-3.1.0}/Makefile +4 -1
  6. {gentroutils-2.0.0 → gentroutils-3.1.0}/PKG-INFO +15 -7
  7. {gentroutils-2.0.0 → gentroutils-3.1.0}/README.md +12 -5
  8. gentroutils-3.1.0/config.yaml +40 -0
  9. {gentroutils-2.0.0 → gentroutils-3.1.0}/pyproject.toml +9 -2
  10. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/io/transfer/ftp_to_gcs.py +17 -5
  11. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/tasks/__init__.py +9 -2
  12. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/tasks/crawl.py +11 -12
  13. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/tasks/curation.py +9 -7
  14. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/tasks/fetch.py +5 -8
  15. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/io/transfer/test_ftp_to_gcs.py +2 -3
  16. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/tasks/test_crawl_task.py +2 -2
  17. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/tasks/test_curation_task.py +5 -4
  18. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/tasks/test_fetch_task.py +11 -8
  19. {gentroutils-2.0.0 → gentroutils-3.1.0}/uv.lock +72 -3
  20. gentroutils-2.0.0/.RData +0 -0
  21. gentroutils-2.0.0/.Rhistory +0 -0
  22. gentroutils-2.0.0/Dockerfile +0 -16
  23. gentroutils-2.0.0/config.yaml +0 -32
  24. {gentroutils-2.0.0 → gentroutils-3.1.0}/.github/workflows/labeler.yaml +0 -0
  25. {gentroutils-2.0.0 → gentroutils-3.1.0}/.github/workflows/pr.yaml +0 -0
  26. {gentroutils-2.0.0 → gentroutils-3.1.0}/.github/workflows/release.yaml +0 -0
  27. {gentroutils-2.0.0 → gentroutils-3.1.0}/.github/workflows/release_pr.yaml +0 -0
  28. {gentroutils-2.0.0 → gentroutils-3.1.0}/.gitignore +0 -0
  29. {gentroutils-2.0.0 → gentroutils-3.1.0}/.pre-commit-config.yaml +0 -0
  30. {gentroutils-2.0.0 → gentroutils-3.1.0}/.vscode/extensions.json +0 -0
  31. {gentroutils-2.0.0 → gentroutils-3.1.0}/.vscode/settings.json +0 -0
  32. {gentroutils-2.0.0 → gentroutils-3.1.0}/LICENSE +0 -0
  33. {gentroutils-2.0.0 → gentroutils-3.1.0}/commitlint.config.js +0 -0
  34. {gentroutils-2.0.0 → gentroutils-3.1.0}/conftest.py +0 -0
  35. {gentroutils-2.0.0 → gentroutils-3.1.0}/docs/00_prepare_tables_for_curation.R +0 -0
  36. {gentroutils-2.0.0 → gentroutils-3.1.0}/docs/gwas_catalog_curation.md +0 -0
  37. {gentroutils-2.0.0 → gentroutils-3.1.0}/setup.sh +0 -0
  38. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/__init__.py +0 -0
  39. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/errors.py +0 -0
  40. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/io/path/__init__.py +0 -0
  41. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/io/path/ftp.py +0 -0
  42. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/io/path/gcs.py +0 -0
  43. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/io/transfer/__init__.py +0 -0
  44. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/io/transfer/model.py +0 -0
  45. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/io/transfer/polars_to_gcs.py +0 -0
  46. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/parsers/__init__.py +0 -0
  47. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/parsers/curation.py +0 -0
  48. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/py.typed +0 -0
  49. {gentroutils-2.0.0 → gentroutils-3.1.0}/src/gentroutils/transfer.py +0 -0
  50. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/ftp/test/databases/gwas/summary_statistics/harmonised_list.txt +0 -0
  51. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/gsutil_list.txt +0 -0
  52. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/correct_curation.tsv +0 -0
  53. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_analysisFlag_type.tsv +0 -0
  54. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_analysisFlag_value.tsv +0 -0
  55. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_columns_curation.tsv +0 -0
  56. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_publicationTitle_type.tsv +0 -0
  57. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_pubmedId_type.tsv +0 -0
  58. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_studyId_type.tsv +0 -0
  59. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_studyId_value.tsv +0 -0
  60. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_studyType_type.tsv +0 -0
  61. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_studyType_value.tsv +0 -0
  62. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/incorrect_traitFromSource_type.tsv +0 -0
  63. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/non_unique_studyId.tsv +0 -0
  64. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/manual_curation/null_value_in_studyId.tsv +0 -0
  65. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/data/test.h.tsv.gz +0 -0
  66. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/io/conftest.py +0 -0
  67. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/io/path/conftest.py +0 -0
  68. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/io/path/test_ftp.py +0 -0
  69. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/io/path/test_gcs.py +0 -0
  70. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/io/transfer/conftest.py +0 -0
  71. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/io/transfer/test_model.py +0 -0
  72. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/io/transfer/test_polars_to_gcs.py +0 -0
  73. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/parsers/conftest.py +0 -0
  74. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/parsers/test_curation.py +0 -0
  75. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/tasks/conftest.py +0 -0
  76. {gentroutils-2.0.0 → gentroutils-3.1.0}/tests/test_transfer.py +0 -0
@@ -0,0 +1,90 @@
1
+
2
+ name: build-artifacts
3
+
4
+ on:
5
+ workflow_dispatch:
6
+
7
+ env:
8
+ GCP_PROJECT_ID: "open-targets-genetics-dev"
9
+ GCP_REGION: "europe-west1"
10
+ TAG: "${{ github.ref_name }}"
11
+ REPO: "${{ github.event.repository.name }}"
12
+
13
+
14
+ jobs:
15
+ push-to-ghcr-and-gar:
16
+ if: startsWith(github.ref, 'refs/tags/')
17
+ name: Build docker image and push to GHCR and GAR
18
+ runs-on: ubuntu-22.04
19
+
20
+ permissions:
21
+ packages: write
22
+ contents: read
23
+ attestations: write
24
+ id-token: write
25
+
26
+ steps:
27
+ - id: prepare
28
+ name: Prepare the action and log details
29
+ shell: bash
30
+ env:
31
+ GITHUB_CONTEXT: ${{ toJson(github) }}
32
+ run: |
33
+ TAG=$(echo $TAG | sed 's/^v//')
34
+ echo "TAG=$TAG" >> $GITHUB_ENV
35
+ echo "The tag for this build is $TAG"
36
+ echo "The repo name is: $REPO"
37
+ echo "Github context:\n$GITHUB_CONTEXT"
38
+
39
+ - id: checkout
40
+ name: Check out repo
41
+ uses: actions/checkout@v4
42
+
43
+ - name: Set up Docker Buildx
44
+ uses: docker/setup-buildx-action@v3
45
+
46
+ - id: auth-ghcr
47
+ name: Log in to GitHub Container Registry
48
+ uses: docker/login-action@v3
49
+ with:
50
+ registry: ghcr.io
51
+ username: ${{ github.actor }}
52
+ password: ${{ secrets.GITHUB_TOKEN }}
53
+
54
+ - id: auth-google
55
+ name: Authenticate to Google Cloud
56
+ uses: google-github-actions/auth@v2
57
+ with:
58
+ token_format: access_token
59
+ project_id: ${{ env.GCP_PROJECT_ID }}
60
+ service_account: github-actions@open-targets-genetics-dev.iam.gserviceaccount.com
61
+ workload_identity_provider: projects/234703259993/locations/global/workloadIdentityPools/github/providers/opentargets
62
+ access_token_lifetime: 300s
63
+
64
+ - id: auth-gar
65
+ name: Login to Google Artifact Registry
66
+ uses: docker/login-action@v3
67
+ with:
68
+ registry: ${{ env.GCP_REGION }}-docker.pkg.dev
69
+ username: oauth2accesstoken
70
+ password: ${{ steps.auth-google.outputs.access_token }}
71
+
72
+ - id: push
73
+ name: Build and push Docker image
74
+ uses: docker/build-push-action@v5
75
+ with:
76
+ context: .
77
+ push: true
78
+ tags: |
79
+ ghcr.io/${{ github.repository }}:latest
80
+ ghcr.io/${{ github.repository }}:${{ env.TAG }}
81
+ ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/opentargets/${{ env.REPO }}:latest
82
+ ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/opentargets/${{ env.REPO }}:${{ env.TAG }}
83
+
84
+ - id: generate-attestations
85
+ name: Generate artifact attestation
86
+ uses: actions/attest-build-provenance@v1
87
+ with:
88
+ subject-name: ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/opentargets/${{ env.REPO }}
89
+ subject-digest: ${{ steps.push.outputs.digest }}
90
+ push-to-registry: true
@@ -4,12 +4,16 @@ on:
4
4
  branches:
5
5
  - dev
6
6
  - main
7
+ tags-ignore: # Prevent running the action on the tag
8
+ - '*'
7
9
 
8
10
 
9
11
  jobs:
10
12
  create-tag:
11
- # NOTE: only trigger the workflow when the commit is not from the GitHubActions bot (prevent self-triggering)
12
- if: github.event.commits[0].author.name != 'github-actions[botg]'
13
+ # NOTE: trigger the workflow only when a human user pushed to the branch (prevent self-triggering)
14
+ if: |
15
+ !contains(fromJSON('["github-actions[bot]", "semantic-release"]'), github.actor) &&
16
+ github.event.head_commit.author.name != 'semantic-release'
13
17
  runs-on: ubuntu-latest
14
18
  concurrency: release
15
19
  environment: DEV
@@ -1,6 +1,209 @@
1
1
  # CHANGELOG
2
2
 
3
3
 
4
+ ## v3.1.0 (2025-09-02)
5
+
6
+ ### Chores
7
+
8
+ - Trigger release process ([#36](https://github.com/opentargets/gentroutils/pull/36),
9
+ [`a90fdc7`](https://github.com/opentargets/gentroutils/commit/a90fdc7cd26dcb1263590f93f79bff6ccc867868))
10
+
11
+ * fix: update auth
12
+
13
+ * 3.0.1-dev.1
14
+
15
+ Automatically generated by python-semantic-release
16
+
17
+ * ci: add service account to impersonate
18
+
19
+ * 3.0.1-dev.2
20
+
21
+ * ci: prevent running create-tag on tag
22
+
23
+ * 3.0.1-dev.3
24
+
25
+ * ci: prevent running create-tag by semantic-release
26
+
27
+ * fix: workflow file
28
+
29
+ * 3.0.1-dev.4
30
+
31
+ * chore: update readme
32
+
33
+ * ci: run artifact build only from tag
34
+
35
+ * 3.0.1-dev.5
36
+
37
+ * ci: prevent tag action to run after semvar
38
+
39
+ * 3.0.1-dev.6
40
+
41
+ * build: remove obscured gcs scope from polars
42
+
43
+ * feat: rebuild docker image
44
+
45
+ * feat: add docker build command
46
+
47
+ * 3.1.0-dev.1
48
+
49
+ * ci: fix container name
50
+
51
+ * 3.1.0-dev.2
52
+
53
+ * ci: change image name structure for gcs
54
+
55
+ * 3.1.0-dev.3
56
+
57
+ * ci: update path to attestations
58
+
59
+ * 3.1.0-dev.4
60
+
61
+ * 3.1.0-dev.5
62
+
63
+ ---------
64
+
65
+ Co-authored-by: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com>
66
+
67
+ Co-authored-by: semantic-release <semantic-release>
68
+
69
+ Co-authored-by: project-defiant <szymonszyszkowski@gmail.com>
70
+
71
+
72
+ ## v3.1.0-dev.5 (2025-08-29)
73
+
74
+ ### Continuous Integration
75
+
76
+ - Update path to attestations
77
+ ([`febea43`](https://github.com/opentargets/gentroutils/commit/febea4366fe94de09b82154d92b90baadfb08871))
78
+
79
+
80
+ ## v3.1.0-dev.4 (2025-08-29)
81
+
82
+ ### Continuous Integration
83
+
84
+ - Update path to attestations
85
+ ([`55a80f1`](https://github.com/opentargets/gentroutils/commit/55a80f11838d6569f4919b730e291be665f33dad))
86
+
87
+
88
+ ## v3.1.0-dev.3 (2025-08-29)
89
+
90
+ ### Continuous Integration
91
+
92
+ - Change image name structure for gcs
93
+ ([`88dbbbd`](https://github.com/opentargets/gentroutils/commit/88dbbbd7637bf7c9ec8c405a9811ad9e29416d48))
94
+
95
+
96
+ ## v3.1.0-dev.2 (2025-08-29)
97
+
98
+ ### Continuous Integration
99
+
100
+ - Fix container name
101
+ ([`9c64ae1`](https://github.com/opentargets/gentroutils/commit/9c64ae1e4d43b1861625ef673df8edd7b5127b48))
102
+
103
+
104
+ ## v3.1.0-dev.1 (2025-08-29)
105
+
106
+ ### Build System
107
+
108
+ - Remove obscured gcs scope from polars
109
+ ([`895bfed`](https://github.com/opentargets/gentroutils/commit/895bfed2486c6ca5123cc6908760020d832638ec))
110
+
111
+ ### Features
112
+
113
+ - Add docker build command
114
+ ([`8f42913`](https://github.com/opentargets/gentroutils/commit/8f42913c6c41b160a28ca230cb801dd2da0bebf1))
115
+
116
+ - Rebuild docker image
117
+ ([`94f2a49`](https://github.com/opentargets/gentroutils/commit/94f2a49ccb25a8c9eb9238cd56ea75024e26ff50))
118
+
119
+
120
+ ## v3.0.1-dev.6 (2025-08-29)
121
+
122
+ ### Continuous Integration
123
+
124
+ - Prevent tag action to run after semvar
125
+ ([`847e36b`](https://github.com/opentargets/gentroutils/commit/847e36ba4a5848116c2f6311849a1960dd55b34c))
126
+
127
+
128
+ ## v3.0.1-dev.5 (2025-08-29)
129
+
130
+ ### Chores
131
+
132
+ - Update readme
133
+ ([`9e75c35`](https://github.com/opentargets/gentroutils/commit/9e75c35a4ed6113dae76ed9b0a67762fbcd882e3))
134
+
135
+ ### Continuous Integration
136
+
137
+ - Run artifact build only from tag
138
+ ([`4906b38`](https://github.com/opentargets/gentroutils/commit/4906b38c49d51a7b8a6a28cb12638ecc9a6fdc5e))
139
+
140
+
141
+ ## v3.0.1-dev.4 (2025-08-29)
142
+
143
+ ### Bug Fixes
144
+
145
+ - Workflow file
146
+ ([`f840d55`](https://github.com/opentargets/gentroutils/commit/f840d555bbf4a8415c71b56b34d72393af8c8ebf))
147
+
148
+ ### Continuous Integration
149
+
150
+ - Prevent running create-tag by semantic-release
151
+ ([`57fb068`](https://github.com/opentargets/gentroutils/commit/57fb068a21bf86d048376ab0f4678694abeb2e71))
152
+
153
+
154
+ ## v3.0.1-dev.3 (2025-08-29)
155
+
156
+ ### Continuous Integration
157
+
158
+ - Prevent running create-tag on tag
159
+ ([`963f657`](https://github.com/opentargets/gentroutils/commit/963f657221f2f08f20f1b20a371ad884d584bf0a))
160
+
161
+
162
+ ## v3.0.1-dev.2 (2025-08-29)
163
+
164
+ ### Continuous Integration
165
+
166
+ - Add service account to impersonate
167
+ ([`05d5eb1`](https://github.com/opentargets/gentroutils/commit/05d5eb133efc9a5e3103397ff33233b543c1d2e2))
168
+
169
+
170
+ ## v3.0.1-dev.1 (2025-08-28)
171
+
172
+ ### Bug Fixes
173
+
174
+ - Update auth
175
+ ([`a95b566`](https://github.com/opentargets/gentroutils/commit/a95b566a164d31ad36383fb99d7c88a5aec27b70))
176
+
177
+
178
+ ## v3.0.0 (2025-08-28)
179
+
180
+ ### Bug Fixes
181
+
182
+ - Ensure otter scratchpad works
183
+ ([`b781b1f`](https://github.com/opentargets/gentroutils/commit/b781b1f587f94807daa3abde0dbb785121904e68))
184
+
185
+ ### Build System
186
+
187
+ - Update dependencies
188
+ ([`e9e05c3`](https://github.com/opentargets/gentroutils/commit/e9e05c31ec6ea443d1cb8af28f3583c6486ecb52))
189
+
190
+ ### Chores
191
+
192
+ - Format
193
+ ([`876400b`](https://github.com/opentargets/gentroutils/commit/876400b3d8b5a09c198a8b13db2ffc598a1f218a))
194
+
195
+ - Remove R session files
196
+ ([`ffe093d`](https://github.com/opentargets/gentroutils/commit/ffe093d494ec7c469b8b9f99978bfa002426c189))
197
+
198
+ - Update readme
199
+ ([`06a21db`](https://github.com/opentargets/gentroutils/commit/06a21dbdb0b706e06537f80d378f8837041b3906))
200
+
201
+ ### Continuous Integration
202
+
203
+ - Add build command
204
+ ([`2e67fcd`](https://github.com/opentargets/gentroutils/commit/2e67fcd1d7478b96dc698d5e54e2a077529fcb61))
205
+
206
+
4
207
  ## v2.0.0 (2025-08-28)
5
208
 
6
209
  ### Chores
@@ -16,6 +219,16 @@
16
219
 
17
220
  Co-authored-by: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com>
18
221
 
222
+ ### Continuous Integration
223
+
224
+ - Update ci
225
+ ([`81c39b2`](https://github.com/opentargets/gentroutils/commit/81c39b2d02f8fe318d8921b46c636a3361093263))
226
+
227
+ ### Features
228
+
229
+ - 2.0.0
230
+ ([`3268c38`](https://github.com/opentargets/gentroutils/commit/3268c383cf6dfac4be7748ad6b48e8ded5f6157c))
231
+
19
232
 
20
233
  ## v1.6.0-dev.2 (2025-08-12)
21
234
 
@@ -0,0 +1,29 @@
1
+ # Description: Dockerfile for the gentroutils package
2
+ #
3
+ # To run locally, you must have a credentials file for GCP. Assuming you do,
4
+ # you can run the following command:
5
+ #
6
+ # docker run -v /path/to/credentials.json:/app/credentials.json \
7
+ # -e GOOGLE_APPLICATION_CREDENTIALS=/app/credentials.json \
8
+ # gentroutils -s gwas_catalog_release
9
+ # By default the image uses the `config.yaml` file provided in the repository.
10
+ FROM rust:slim-trixie AS rust-builder
11
+ FROM python:3.13.7-slim-trixie
12
+
13
+ # Copy Rustc and Cargo from the rust-builder stage
14
+ # These are needed to install polars without compiling rust from source
15
+ COPY --from=rust-builder /usr/local/cargo/bin/rustc /usr/local/bin/rustc
16
+ COPY --from=rust-builder /usr/local/cargo/bin/cargo /usr/local/bin/cargo
17
+
18
+ # Copy Python source
19
+ COPY src /app/src
20
+ COPY pyproject.toml /app/pyproject.toml
21
+ COPY uv.lock /app/uv.lock
22
+ COPY README.md /app/README.md
23
+ COPY config.yaml /app/config.yaml
24
+
25
+ # Build the executable
26
+ WORKDIR /app
27
+ RUN python -m pip install .
28
+
29
+ ENTRYPOINT [ "gentroutils", "-c", "/app/config.yaml" ]
@@ -1,6 +1,6 @@
1
1
  SHELL := /bin/bash
2
2
  .PHONY: $(shell sed -n -e '/^$$/ { n ; /^[^ .\#][^ ]*:/ { s/:.*$$// ; p ; } ; }' $(MAKEFILE_LIST))
3
- VERSION := $$(grep '^version' pyproject.toml | sed 's%version = "\(.*\)"%\1%')
3
+ VERSION := $$(grep '^version' pyproject.toml | head -1 | sed 's%version = "\(.*\)"%\1%')
4
4
  APP_NAME := $$(grep '^name' pyproject.toml | head -1 | sed 's%name = "\(.*\)"%\1%')
5
5
 
6
6
  .DEFAULT_GOAL := help
@@ -41,3 +41,6 @@ check: lint format test type-check dep-check ## run all checks
41
41
 
42
42
  help: ## This is help
43
43
  @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
44
+
45
+ build-docker: ## build docker image
46
+ docker build -t $(APP_NAME):$(VERSION) --no-cache -f Dockerfile .
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gentroutils
3
- Version: 2.0.0
3
+ Version: 3.1.0
4
4
  Summary: Open Targets python genetics utility CLI tools
5
5
  Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
6
6
  License-Expression: Apache-2.0
@@ -15,10 +15,11 @@ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
15
  Requires-Python: >=3.13
16
16
  Requires-Dist: aioftp>=0.25.1
17
17
  Requires-Dist: aiohttp>=3.11.18
18
+ Requires-Dist: gcsfs>=2025.7.0
18
19
  Requires-Dist: google-cloud-storage>=3.1.1
19
20
  Requires-Dist: loguru>=0.7.3
20
21
  Requires-Dist: opentargets-otter>=25.0.2
21
- Requires-Dist: polars>=1.31.0
22
+ Requires-Dist: polars[fsspec]>=1.31.0
22
23
  Requires-Dist: pydantic>=2.10.6
23
24
  Requires-Dist: tqdm>=4.67.1
24
25
  Description-Content-Type: text/markdown
@@ -33,7 +34,7 @@ Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
33
34
 
34
35
  ## Installation
35
36
 
36
- ```
37
+ ```{bash}
37
38
  pip install gentroutils
38
39
  ```
39
40
 
@@ -48,6 +49,7 @@ gentroutils --help
48
49
  ## Usage
49
50
 
50
51
  To run a single step run
52
+
51
53
  ```{bash}
52
54
  uv run gentroutils -s gwas_catalog_release # After cloning the repository
53
55
  gentroutils -s gwas_catalog_release -c otter_config.yaml # When installed by pip
@@ -60,6 +62,11 @@ The `gentroutils` repository uses the [otter](https://github.com/opentargets/ott
60
62
 
61
63
  For the top level fields refer to the [otter documentation](https://opentargets.github.io/otter/otter.config.html)
62
64
 
65
+ > [!NOTE]
66
+ > All `destination_template` must point to the Google Cloud Storage (GCS) bucket objects.
67
+ > All `source_template` must point to the FTP server paths.
68
+ > In case this is not enforced, the user may experience silent failures.
69
+
63
70
  ```yaml
64
71
  ---
65
72
  work_path: ./work
@@ -91,7 +98,7 @@ steps:
91
98
  - fetch studies
92
99
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
93
100
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
94
- destination_template: ./work/curation_{release_date}.tsv
101
+ destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
95
102
  promote: true
96
103
  ```
97
104
 
@@ -114,7 +121,7 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
114
121
 
115
122
  This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
116
123
 
117
- > [!NOTE]
124
+ > [!NOTE]
118
125
  > **Task parameters**
119
126
  >
120
127
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -135,7 +142,7 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
135
142
 
136
143
  This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
137
144
 
138
- > [!NOTE]
145
+ > [!NOTE]
139
146
  > **Task parameters**
140
147
  >
141
148
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -203,7 +210,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
203
210
 
204
211
  This task is used to build the GWAS Catalog curation file that is later used as a template for manual curation. It requires the `fetch studies` task to be completed before it can run. This is due to the fact that the curation file is built based on the list of studies fetched from the `download studies` file.
205
212
 
206
- > [!NOTE]
213
+ > [!NOTE]
207
214
  > **Task parameters**
208
215
  >
209
216
  > - The `requires` field specifies that this task depends on the `fetch studies` task, meaning it will only run after the studies have been fetched.
@@ -268,6 +275,7 @@ To check CLI execution manually you need to run
268
275
  ```{bash}
269
276
  uv run gentroutils
270
277
  ```
278
+
271
279
  ---
272
280
 
273
281
  This software was developed as part of the Open Targets project. For more
@@ -8,7 +8,7 @@ Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
8
8
 
9
9
  ## Installation
10
10
 
11
- ```
11
+ ```{bash}
12
12
  pip install gentroutils
13
13
  ```
14
14
 
@@ -23,6 +23,7 @@ gentroutils --help
23
23
  ## Usage
24
24
 
25
25
  To run a single step run
26
+
26
27
  ```{bash}
27
28
  uv run gentroutils -s gwas_catalog_release # After cloning the repository
28
29
  gentroutils -s gwas_catalog_release -c otter_config.yaml # When installed by pip
@@ -35,6 +36,11 @@ The `gentroutils` repository uses the [otter](https://github.com/opentargets/ott
35
36
 
36
37
  For the top level fields refer to the [otter documentation](https://opentargets.github.io/otter/otter.config.html)
37
38
 
39
+ > [!NOTE]
40
+ > All `destination_template` must point to the Google Cloud Storage (GCS) bucket objects.
41
+ > All `source_template` must point to the FTP server paths.
42
+ > In case this is not enforced, the user may experience silent failures.
43
+
38
44
  ```yaml
39
45
  ---
40
46
  work_path: ./work
@@ -66,7 +72,7 @@ steps:
66
72
  - fetch studies
67
73
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
68
74
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
69
- destination_template: ./work/curation_{release_date}.tsv
75
+ destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
70
76
  promote: true
71
77
  ```
72
78
 
@@ -89,7 +95,7 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
89
95
 
90
96
  This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
91
97
 
92
- > [!NOTE]
98
+ > [!NOTE]
93
99
  > **Task parameters**
94
100
  >
95
101
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -110,7 +116,7 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
110
116
 
111
117
  This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
112
118
 
113
- > [!NOTE]
119
+ > [!NOTE]
114
120
  > **Task parameters**
115
121
  >
116
122
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
@@ -178,7 +184,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
178
184
 
179
185
  This task is used to build the GWAS Catalog curation file that is later used as a template for manual curation. It requires the `fetch studies` task to be completed before it can run. This is due to the fact that the curation file is built based on the list of studies fetched from the `download studies` file.
180
186
 
181
- > [!NOTE]
187
+ > [!NOTE]
182
188
  > **Task parameters**
183
189
  >
184
190
  > - The `requires` field specifies that this task depends on the `fetch studies` task, meaning it will only run after the studies have been fetched.
@@ -243,6 +249,7 @@ To check CLI execution manually you need to run
243
249
  ```{bash}
244
250
  uv run gentroutils
245
251
  ```
252
+
246
253
  ---
247
254
 
248
255
  This software was developed as part of the Open Targets project. For more
@@ -0,0 +1,40 @@
1
+ ---
2
+ work_path: ./work
3
+ log_level: DEBUG
4
+ scratchpad:
5
+ gc_stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
6
+ gc_bucket: "gs://gwas_catalog_inputs"
7
+ gc_ftp: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases"
8
+
9
+ steps:
10
+ gwas_catalog_release:
11
+ # - name: crawl release metadata
12
+ # stats_uri: ${gc_stats_uri}
13
+ # destination_template: '${gc_bucket}/gentroutils/{release_date}/stats.json'
14
+ # promote: true
15
+
16
+ # - name: fetch associations
17
+ # stats_uri: ${gc_stats_uri}
18
+ # source_template: '${gc_ftp}/{release_date}/gwas-catalog-associations_ontology-annotated.tsv'
19
+ # destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv'
20
+ # promote: true
21
+
22
+ # - name: fetch studies
23
+ # stats_uri: ${gc_stats_uri}
24
+ # source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt'
25
+ # destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_studies.tsv'
26
+ # promote: true
27
+
28
+ # - name: fetch ancestries
29
+ # stats_uri: ${gc_stats_uri}
30
+ # source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt'
31
+ # destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv'
32
+ # promote: true
33
+
34
+ - name: curation study
35
+ # requires:
36
+ # - fetch studies
37
+ previous_curation: '${gc_bucket}/curation/latest/curated/GWAS_Catalog_study_curation.tsv'
38
+ studies: '${gc_bucket}/gentroutils/latest/gwas_catalog_download_studies.tsv'
39
+ destination_template: '${gc_bucket}/curation/{release_date}/raw/GWAS_Catalog_study_curation.tsv'
40
+ promote: true
@@ -1,17 +1,18 @@
1
1
  [project]
2
2
  authors = [{ name = "Szymon Szyszkowski", email = "ss60@sanger.ac.uk" }]
3
3
  name = "gentroutils"
4
- version = "2.0.0"
4
+ version = "3.1.0"
5
5
  description = "Open Targets python genetics utility CLI tools"
6
6
  dependencies = [
7
7
  "aiohttp>=3.11.18",
8
8
  "aioftp>=0.25.1",
9
- "polars>=1.31.0",
9
+ "polars[fsspec]>=1.31.0",
10
10
  "pydantic>=2.10.6",
11
11
  "loguru>=0.7.3",
12
12
  "tqdm>=4.67.1",
13
13
  "opentargets-otter>=25.0.2",
14
14
  "google-cloud-storage>=3.1.1",
15
+ "gcsfs>=2025.7.0",
15
16
  ]
16
17
  readme = "README.md"
17
18
  requires-python = ">=3.13"
@@ -75,6 +76,12 @@ allow-direct-references = true
75
76
  [tool.hatch.build.targets.wheel]
76
77
  packages = ["src/gentroutils"]
77
78
 
79
+
80
+ # Ignore polars x GCS dependency not imported in code
81
+ [tool.deptry.per_rule_ignores]
82
+ DEP002 = ["gcsfs"]
83
+
84
+
78
85
  # test configuration
79
86
  [tool.pytest.ini_options]
80
87
  markers = ["integration_test: Intergration tests", "unit_test: Unit tests"]
@@ -32,12 +32,24 @@ class FTPtoGCPTransferableObject(TransferableObject):
32
32
  async with aioftp.Client.context(ftp_obj.server, user="anonymous", password="anonymous") as ftp: # noqa: S106
33
33
  bucket = storage.Client().bucket(gcs_obj.bucket)
34
34
  blob = bucket.blob(gcs_obj.object)
35
- logger.info(f"Changing directory to {ftp_obj.base_dir}.")
36
- await ftp.change_directory(ftp_obj.base_dir)
37
- pwd = await ftp.get_current_directory()
38
- dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(pwd))
35
+ logger.info(f"Searching for the release date in the provided ftp path: {ftp_obj.base_dir}.")
36
+ dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(ftp_obj.base_dir))
37
+
39
38
  if dir_match:
40
- logger.info(f"Found release date!: {dir_match.group('release_date')}")
39
+ logger.info(f"Found release date to search in the ftp {dir_match.group('release_date')}.")
40
+ release_date = dir_match.group("release_date")
41
+ try:
42
+ await ftp.change_directory(ftp_obj.base_dir)
43
+ except aioftp.StatusCodeError as e:
44
+ logger.error(f"Failed to change directory to {ftp_obj.base_dir}: {e}")
45
+ logger.warning("Attempting to load the `latest` release.")
46
+ ftp_obj = FTPPath(self.source.replace(release_date, "latest"))
47
+ try:
48
+ await ftp.change_directory(ftp_obj.base_dir)
49
+ except aioftp.StatusCodeError as e:
50
+ logger.error(f"Failed to find the latest release under {ftp_obj}")
51
+ raise
52
+
41
53
  buffer = io.BytesIO()
42
54
  stream = await ftp.download_stream(ftp_obj.filename)
43
55
  async with stream:
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ from collections import defaultdict
6
7
  from dataclasses import dataclass
7
8
  from datetime import date
8
9
 
@@ -13,7 +14,12 @@ from pydantic import AliasPath, BaseModel, Field
13
14
  from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
14
15
 
15
16
 
16
- def _requires_release_date_template(path: str) -> str:
17
+ class KeepMissing(defaultdict[str, str]):
18
+ def __missing__(self, key):
19
+ return "{" + key + "}"
20
+
21
+
22
+ def destination_validator(path: str) -> str:
17
23
  """Ensure that the destination path contains a template for the release date."""
18
24
  if "{release_date}" not in path:
19
25
  raise GentroutilsError(GentroutilsErrorMessage.MISSING_RELEASE_DATE_TEMPLATE, release_date="{release_date}")
@@ -34,7 +40,7 @@ class TemplateDestination:
34
40
 
35
41
  This method returns a new TemplateDestination object (not a copy of the current one) with the formatted destination.
36
42
  """
37
- return TemplateDestination(self.destination.format(**substitutions), True)
43
+ return TemplateDestination(self.destination.format_map(KeepMissing(**substitutions)), True)
38
44
 
39
45
 
40
46
  class GwasCatalogReleaseInfo(BaseModel):
@@ -83,6 +89,7 @@ class GwasCatalogReleaseInfo(BaseModel):
83
89
  @classmethod
84
90
  def from_uri(cls, uri: str) -> GwasCatalogReleaseInfo:
85
91
  """Fetch the release information from the specified URI."""
92
+ logger.debug(f"Fetching release info from {uri}")
86
93
  try:
87
94
  return asyncio.run(cls._get_release_info(uri))
88
95
  except aiohttp.ClientError as e: