gentroutils 2.0.0__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. gentroutils-3.0.0/.github/workflows/build.yaml +87 -0
  2. {gentroutils-2.0.0 → gentroutils-3.0.0}/CHANGELOG.md +39 -0
  3. {gentroutils-2.0.0 → gentroutils-3.0.0}/PKG-INFO +16 -12
  4. {gentroutils-2.0.0 → gentroutils-3.0.0}/README.md +13 -10
  5. gentroutils-3.0.0/config.yaml +40 -0
  6. {gentroutils-2.0.0 → gentroutils-3.0.0}/pyproject.toml +9 -2
  7. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/transfer/ftp_to_gcs.py +17 -5
  8. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/tasks/__init__.py +9 -2
  9. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/tasks/crawl.py +11 -12
  10. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/tasks/curation.py +9 -7
  11. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/tasks/fetch.py +5 -8
  12. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/transfer/test_ftp_to_gcs.py +2 -3
  13. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/tasks/test_crawl_task.py +2 -2
  14. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/tasks/test_curation_task.py +5 -4
  15. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/tasks/test_fetch_task.py +11 -8
  16. {gentroutils-2.0.0 → gentroutils-3.0.0}/uv.lock +72 -3
  17. gentroutils-2.0.0/.RData +0 -0
  18. gentroutils-2.0.0/.Rhistory +0 -0
  19. gentroutils-2.0.0/config.yaml +0 -32
  20. {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/labeler.yaml +0 -0
  21. {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/pr.yaml +0 -0
  22. {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/release.yaml +0 -0
  23. {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/release_pr.yaml +0 -0
  24. {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/tag.yaml +0 -0
  25. {gentroutils-2.0.0 → gentroutils-3.0.0}/.gitignore +0 -0
  26. {gentroutils-2.0.0 → gentroutils-3.0.0}/.pre-commit-config.yaml +0 -0
  27. {gentroutils-2.0.0 → gentroutils-3.0.0}/.vscode/extensions.json +0 -0
  28. {gentroutils-2.0.0 → gentroutils-3.0.0}/.vscode/settings.json +0 -0
  29. {gentroutils-2.0.0 → gentroutils-3.0.0}/Dockerfile +0 -0
  30. {gentroutils-2.0.0 → gentroutils-3.0.0}/LICENSE +0 -0
  31. {gentroutils-2.0.0 → gentroutils-3.0.0}/Makefile +0 -0
  32. {gentroutils-2.0.0 → gentroutils-3.0.0}/commitlint.config.js +0 -0
  33. {gentroutils-2.0.0 → gentroutils-3.0.0}/conftest.py +0 -0
  34. {gentroutils-2.0.0 → gentroutils-3.0.0}/docs/00_prepare_tables_for_curation.R +0 -0
  35. {gentroutils-2.0.0 → gentroutils-3.0.0}/docs/gwas_catalog_curation.md +0 -0
  36. {gentroutils-2.0.0 → gentroutils-3.0.0}/setup.sh +0 -0
  37. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/__init__.py +0 -0
  38. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/errors.py +0 -0
  39. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/path/__init__.py +0 -0
  40. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/path/ftp.py +0 -0
  41. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/path/gcs.py +0 -0
  42. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/transfer/__init__.py +0 -0
  43. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/transfer/model.py +0 -0
  44. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/transfer/polars_to_gcs.py +0 -0
  45. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/parsers/__init__.py +0 -0
  46. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/parsers/curation.py +0 -0
  47. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/py.typed +0 -0
  48. {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/transfer.py +0 -0
  49. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/ftp/test/databases/gwas/summary_statistics/harmonised_list.txt +0 -0
  50. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/gsutil_list.txt +0 -0
  51. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/correct_curation.tsv +0 -0
  52. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_analysisFlag_type.tsv +0 -0
  53. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_analysisFlag_value.tsv +0 -0
  54. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_columns_curation.tsv +0 -0
  55. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_publicationTitle_type.tsv +0 -0
  56. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_pubmedId_type.tsv +0 -0
  57. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyId_type.tsv +0 -0
  58. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyId_value.tsv +0 -0
  59. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyType_type.tsv +0 -0
  60. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyType_value.tsv +0 -0
  61. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_traitFromSource_type.tsv +0 -0
  62. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/non_unique_studyId.tsv +0 -0
  63. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/null_value_in_studyId.tsv +0 -0
  64. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/test.h.tsv.gz +0 -0
  65. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/conftest.py +0 -0
  66. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/path/conftest.py +0 -0
  67. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/path/test_ftp.py +0 -0
  68. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/path/test_gcs.py +0 -0
  69. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/transfer/conftest.py +0 -0
  70. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/transfer/test_model.py +0 -0
  71. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/transfer/test_polars_to_gcs.py +0 -0
  72. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/parsers/conftest.py +0 -0
  73. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/parsers/test_curation.py +0 -0
  74. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/tasks/conftest.py +0 -0
  75. {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/test_transfer.py +0 -0
@@ -0,0 +1,87 @@
1
+
2
+ name: build-artifacts
3
+
4
+ on:
5
+ workflow_dispatch:
6
+
7
+ env:
8
+ GCP_PROJECT_ID: "open-targets-genetics-dev"
9
+ GCP_REGION: "europe-west1"
10
+ TAG: "${{ github.ref_name }}"
11
+ REPO: "${{ github.event.repository_name }}"
12
+
13
+
14
+ jobs:
15
+ push-to-ghcr-and-gar:
16
+ name: Build docker image and push to GHCR and GAR
17
+ runs-on: ubuntu-22.04
18
+
19
+ permissions:
20
+ packages: write
21
+ contents: read
22
+ attestations: write
23
+ id-token: write
24
+
25
+ steps:
26
+ - id: prepare
27
+ name: Prepare the action and log details
28
+ shell: bash
29
+ env:
30
+ GITHUB_CONTEXT: ${{ toJson(github) }}
31
+ run: |
32
+ TAG=$(echo $TAG | sed 's/^v//')
33
+ echo "TAG=$TAG" >> $GITHUB_ENV
34
+ echo "The tag for this build is $TAG"
35
+ echo "The repo name is: $REPO"
36
+ echo "Github context:\n$GITHUB_CONTEXT"
37
+
38
+ - id: checkout
39
+ name: Check out repo
40
+ uses: actions/checkout@v4
41
+
42
+ - name: Set up Docker Buildx
43
+ uses: docker/setup-buildx-action@v3
44
+
45
+ - id: auth-ghcr
46
+ name: Log in to GitHub Container Registry
47
+ uses: docker/login-action@v3
48
+ with:
49
+ registry: ghcr.io
50
+ username: ${{ github.actor }}
51
+ password: ${{ secrets.GITHUB_TOKEN }}
52
+
53
+ - id: auth-google
54
+ name: Authenticate to Google Cloud
55
+ uses: google-github-actions/auth@v2
56
+ with:
57
+ project_id: ${{ env.GCP_PROJECT_ID }}
58
+ workload_identity_provider: projects/234703259993/locations/global/workloadIdentityPools/github/providers/opentargets
59
+ access_token_lifetime: 300s
60
+
61
+ - id: auth-gar
62
+ name: Login to Google Artifact Registry
63
+ uses: docker/login-action@v3
64
+ with:
65
+ registry: ${{ env.GCP_REGION }}-docker.pkg.dev
66
+ username: oauth2accesstoken
67
+ password: ${{ steps.auth-google.outputs.access_token }}
68
+
69
+ - id: push
70
+ name: Build and push Docker image
71
+ uses: docker/build-push-action@v5
72
+ with:
73
+ context: .
74
+ push: true
75
+ tags: |
76
+ ghcr.io/${{ github.repository }}:latest
77
+ ghcr.io/${{ github.repository }}:${{ env.TAG }}
78
+ ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}/${{ env.REPO }}:latest
79
+ ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}:${{ env.TAG }}
80
+
81
+ - id: generate-attestations
82
+ name: Generate artifact attestation
83
+ uses: actions/attest-build-provenance@v1
84
+ with:
85
+ subject-name: ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}/${{ env.REPO }}
86
+ subject-digest: ${{ steps.push.outputs.digest }}
87
+ push-to-registry: true
@@ -1,6 +1,35 @@
1
1
  # CHANGELOG
2
2
 
3
3
 
4
+ ## v3.0.0 (2025-08-28)
5
+
6
+ ### Bug Fixes
7
+
8
+ - Ensure otter scratchpad works
9
+ ([`b781b1f`](https://github.com/opentargets/gentroutils/commit/b781b1f587f94807daa3abde0dbb785121904e68))
10
+
11
+ ### Build System
12
+
13
+ - Update dependencies
14
+ ([`e9e05c3`](https://github.com/opentargets/gentroutils/commit/e9e05c31ec6ea443d1cb8af28f3583c6486ecb52))
15
+
16
+ ### Chores
17
+
18
+ - Format
19
+ ([`876400b`](https://github.com/opentargets/gentroutils/commit/876400b3d8b5a09c198a8b13db2ffc598a1f218a))
20
+
21
+ - Remove R session files
22
+ ([`ffe093d`](https://github.com/opentargets/gentroutils/commit/ffe093d494ec7c469b8b9f99978bfa002426c189))
23
+
24
+ - Update readme
25
+ ([`06a21db`](https://github.com/opentargets/gentroutils/commit/06a21dbdb0b706e06537f80d378f8837041b3906))
26
+
27
+ ### Continuous Integration
28
+
29
+ - Add build command
30
+ ([`2e67fcd`](https://github.com/opentargets/gentroutils/commit/2e67fcd1d7478b96dc698d5e54e2a077529fcb61))
31
+
32
+
4
33
  ## v2.0.0 (2025-08-28)
5
34
 
6
35
  ### Chores
@@ -16,6 +45,16 @@
16
45
 
17
46
  Co-authored-by: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com>
18
47
 
48
+ ### Continuous Integration
49
+
50
+ - Update ci
51
+ ([`81c39b2`](https://github.com/opentargets/gentroutils/commit/81c39b2d02f8fe318d8921b46c636a3361093263))
52
+
53
+ ### Features
54
+
55
+ - 2.0.0
56
+ ([`3268c38`](https://github.com/opentargets/gentroutils/commit/3268c383cf6dfac4be7748ad6b48e8ded5f6157c))
57
+
19
58
 
20
59
  ## v1.6.0-dev.2 (2025-08-12)
21
60
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gentroutils
3
- Version: 2.0.0
3
+ Version: 3.0.0
4
4
  Summary: Open Targets python genetics utility CLI tools
5
5
  Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
6
6
  License-Expression: Apache-2.0
@@ -15,10 +15,11 @@ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
15
  Requires-Python: >=3.13
16
16
  Requires-Dist: aioftp>=0.25.1
17
17
  Requires-Dist: aiohttp>=3.11.18
18
+ Requires-Dist: gcsfs>=2025.7.0
18
19
  Requires-Dist: google-cloud-storage>=3.1.1
19
20
  Requires-Dist: loguru>=0.7.3
20
21
  Requires-Dist: opentargets-otter>=25.0.2
21
- Requires-Dist: polars>=1.31.0
22
+ Requires-Dist: polars[fsspec,gcs]>=1.31.0
22
23
  Requires-Dist: pydantic>=2.10.6
23
24
  Requires-Dist: tqdm>=4.67.1
24
25
  Description-Content-Type: text/markdown
@@ -48,6 +49,7 @@ gentroutils --help
48
49
  ## Usage
49
50
 
50
51
  To run a single step run
52
+
51
53
  ```{bash}
52
54
  uv run gentroutils -s gwas_catalog_release # After cloning the repository
53
55
  gentroutils -s gwas_catalog_release -c otter_config.yaml # When installed by pip
@@ -60,6 +62,11 @@ The `gentroutils` repository uses the [otter](https://github.com/opentargets/ott
60
62
 
61
63
  For the top level fields refer to the [otter documentation](https://opentargets.github.io/otter/otter.config.html)
62
64
 
65
+ > [!NOTE]
66
+ > Every `destination_template` must point to a Google Cloud Storage (GCS) bucket object.
67
+ > Every `source_template` must point to an FTP server path.
68
+ > In case this is not enforced, the user may experience silent failures.
69
+
63
70
  ```yaml
64
71
  ---
65
72
  work_path: ./work
@@ -91,7 +98,7 @@ steps:
91
98
  - fetch studies
92
99
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
93
100
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
94
- destination_template: ./work/curation_{release_date}.tsv
101
+ destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
95
102
  promote: true
96
103
  ```
97
104
 
@@ -114,8 +121,7 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
114
121
 
115
122
  This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
116
123
 
117
- > [!NOTE]
118
- > **Task parameters**
124
+ > [!NOTE] > **Task parameters**
119
125
  >
120
126
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
121
127
  > - The `destination_template` is where the metadata will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. By default it searches for the release directly in the stats_uri json output.
@@ -135,8 +141,7 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
135
141
 
136
142
  This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
137
143
 
138
- > [!NOTE]
139
- > **Task parameters**
144
+ > [!NOTE] > **Task parameters**
140
145
  >
141
146
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
142
147
  > - The `source_template` is the URL of the GWAS Catalog associations file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -157,8 +162,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
157
162
 
158
163
  This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
159
164
 
160
- > [!NOTE]
161
- > **Task parameters**
165
+ > [!NOTE] > **Task parameters**
162
166
  >
163
167
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
164
168
  > - The `source_template` is the URL of the GWAS Catalog studies file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -179,8 +183,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
179
183
 
180
184
  This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
181
185
 
182
- > [!NOTE]
183
- > **Task parameters**
186
+ > [!NOTE] > **Task parameters**
184
187
  >
185
188
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
186
189
  > - The `source_template` is the URL of the GWAS Catalog ancestries file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -203,7 +206,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
203
206
 
204
207
  This task is used to build the GWAS Catalog curation file that is later used as a template for manual curation. It requires the `fetch studies` task to be completed before it can run. This is due to the fact that the curation file is built based on the list of studies fetched from the `download studies` file.
205
208
 
206
- > [!NOTE]
209
+ > [!NOTE]
207
210
  > **Task parameters**
208
211
  >
209
212
  > - The `requires` field specifies that this task depends on the `fetch studies` task, meaning it will only run after the studies have been fetched.
@@ -268,6 +271,7 @@ To check CLI execution manually you need to run
268
271
  ```{bash}
269
272
  uv run gentroutils
270
273
  ```
274
+
271
275
  ---
272
276
 
273
277
  This software was developed as part of the Open Targets project. For more
@@ -23,6 +23,7 @@ gentroutils --help
23
23
  ## Usage
24
24
 
25
25
  To run a single step run
26
+
26
27
  ```{bash}
27
28
  uv run gentroutils -s gwas_catalog_release # After cloning the repository
28
29
  gentroutils -s gwas_catalog_release -c otter_config.yaml # When installed by pip
@@ -35,6 +36,11 @@ The `gentroutils` repository uses the [otter](https://github.com/opentargets/ott
35
36
 
36
37
  For the top level fields refer to the [otter documentation](https://opentargets.github.io/otter/otter.config.html)
37
38
 
39
+ > [!NOTE]
40
+ > Every `destination_template` must point to a Google Cloud Storage (GCS) bucket object.
41
+ > Every `source_template` must point to an FTP server path.
42
+ > In case this is not enforced, the user may experience silent failures.
43
+
38
44
  ```yaml
39
45
  ---
40
46
  work_path: ./work
@@ -66,7 +72,7 @@ steps:
66
72
  - fetch studies
67
73
  previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
68
74
  studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
69
- destination_template: ./work/curation_{release_date}.tsv
75
+ destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
70
76
  promote: true
71
77
  ```
72
78
 
@@ -89,8 +95,7 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
89
95
 
90
96
  This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
91
97
 
92
- > [!NOTE]
93
- > **Task parameters**
98
+ > [!NOTE] > **Task parameters**
94
99
  >
95
100
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
96
101
  > - The `destination_template` is where the metadata will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. By default it searches for the release directly in the stats_uri json output.
@@ -110,8 +115,7 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
110
115
 
111
116
  This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
112
117
 
113
- > [!NOTE]
114
- > **Task parameters**
118
+ > [!NOTE] > **Task parameters**
115
119
  >
116
120
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
117
121
  > - The `source_template` is the URL of the GWAS Catalog associations file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -132,8 +136,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
132
136
 
133
137
  This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
134
138
 
135
- > [!NOTE]
136
- > **Task parameters**
139
+ > [!NOTE] > **Task parameters**
137
140
  >
138
141
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
139
142
  > - The `source_template` is the URL of the GWAS Catalog studies file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -154,8 +157,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
154
157
 
155
158
  This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
156
159
 
157
- > [!NOTE]
158
- > **Task parameters**
160
+ > [!NOTE] > **Task parameters**
159
161
  >
160
162
  > - The `stats_uri` is used to fetch the latest release date and other metadata.
161
163
  > - The `source_template` is the URL of the GWAS Catalog ancestries file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
@@ -178,7 +180,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
178
180
 
179
181
  This task is used to build the GWAS Catalog curation file that is later used as a template for manual curation. It requires the `fetch studies` task to be completed before it can run. This is due to the fact that the curation file is built based on the list of studies fetched from the `download studies` file.
180
182
 
181
- > [!NOTE]
183
+ > [!NOTE]
182
184
  > **Task parameters**
183
185
  >
184
186
  > - The `requires` field specifies that this task depends on the `fetch studies` task, meaning it will only run after the studies have been fetched.
@@ -243,6 +245,7 @@ To check CLI execution manually you need to run
243
245
  ```{bash}
244
246
  uv run gentroutils
245
247
  ```
248
+
246
249
  ---
247
250
 
248
251
  This software was developed as part of the Open Targets project. For more
@@ -0,0 +1,40 @@
1
+ ---
2
+ work_path: ./work
3
+ log_level: DEBUG
4
+ scratchpad:
5
+ gc_stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
6
+ gc_bucket: "gs://gwas_catalog_inputs"
7
+ gc_ftp: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases"
8
+
9
+ steps:
10
+ gwas_catalog_release:
11
+ # - name: crawl release metadata
12
+ # stats_uri: ${gc_stats_uri}
13
+ # destination_template: '${gc_bucket}/gentroutils/{release_date}/stats.json'
14
+ # promote: true
15
+
16
+ # - name: fetch associations
17
+ # stats_uri: ${gc_stats_uri}
18
+ # source_template: '${gc_ftp}/{release_date}/gwas-catalog-associations_ontology-annotated.tsv'
19
+ # destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv'
20
+ # promote: true
21
+
22
+ # - name: fetch studies
23
+ # stats_uri: ${gc_stats_uri}
24
+ # source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt'
25
+ # destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_studies.tsv'
26
+ # promote: true
27
+
28
+ # - name: fetch ancestries
29
+ # stats_uri: ${gc_stats_uri}
30
+ # source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt'
31
+ # destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv'
32
+ # promote: true
33
+
34
+ - name: curation study
35
+ # requires:
36
+ # - fetch studies
37
+ previous_curation: '${gc_bucket}/curation/latest/curated/GWAS_Catalog_study_curation.tsv'
38
+ studies: '${gc_bucket}/gentroutils/latest/gwas_catalog_download_studies.tsv'
39
+ destination_template: '${gc_bucket}/curation/{release_date}/raw/GWAS_Catalog_study_curation.tsv'
40
+ promote: true
@@ -1,17 +1,18 @@
1
1
  [project]
2
2
  authors = [{ name = "Szymon Szyszkowski", email = "ss60@sanger.ac.uk" }]
3
3
  name = "gentroutils"
4
- version = "2.0.0"
4
+ version = "3.0.0"
5
5
  description = "Open Targets python genetics utility CLI tools"
6
6
  dependencies = [
7
7
  "aiohttp>=3.11.18",
8
8
  "aioftp>=0.25.1",
9
- "polars>=1.31.0",
9
+ "polars[fsspec,gcs]>=1.31.0",
10
10
  "pydantic>=2.10.6",
11
11
  "loguru>=0.7.3",
12
12
  "tqdm>=4.67.1",
13
13
  "opentargets-otter>=25.0.2",
14
14
  "google-cloud-storage>=3.1.1",
15
+ "gcsfs>=2025.7.0",
15
16
  ]
16
17
  readme = "README.md"
17
18
  requires-python = ">=3.13"
@@ -75,6 +76,12 @@ allow-direct-references = true
75
76
  [tool.hatch.build.targets.wheel]
76
77
  packages = ["src/gentroutils"]
77
78
 
79
+
80
+ # Ignore polars x GCS dependency not imported in code
81
+ [tool.deptry.per_rule_ignores]
82
+ DEP002 = ["gcsfs"]
83
+
84
+
78
85
  # test configuration
79
86
  [tool.pytest.ini_options]
80
87
  markers = ["integration_test: Intergration tests", "unit_test: Unit tests"]
@@ -32,12 +32,24 @@ class FTPtoGCPTransferableObject(TransferableObject):
32
32
  async with aioftp.Client.context(ftp_obj.server, user="anonymous", password="anonymous") as ftp: # noqa: S106
33
33
  bucket = storage.Client().bucket(gcs_obj.bucket)
34
34
  blob = bucket.blob(gcs_obj.object)
35
- logger.info(f"Changing directory to {ftp_obj.base_dir}.")
36
- await ftp.change_directory(ftp_obj.base_dir)
37
- pwd = await ftp.get_current_directory()
38
- dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(pwd))
35
+ logger.info(f"Searching for the release date in the provided ftp path: {ftp_obj.base_dir}.")
36
+ dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(ftp_obj.base_dir))
37
+
39
38
  if dir_match:
40
- logger.info(f"Found release date!: {dir_match.group('release_date')}")
39
+ logger.info(f"Found release date to search in the ftp {dir_match.group('release_date')}.")
40
+ release_date = dir_match.group("release_date")
41
+ try:
42
+ await ftp.change_directory(ftp_obj.base_dir)
43
+ except aioftp.StatusCodeError as e:
44
+ logger.error(f"Failed to change directory to {ftp_obj.base_dir}: {e}")
45
+ logger.warning("Attempting to load the `latest` release.")
46
+ ftp_obj = FTPPath(self.source.replace(release_date, "latest"))
47
+ try:
48
+ await ftp.change_directory(ftp_obj.base_dir)
49
+ except aioftp.StatusCodeError as e:
50
+ logger.error(f"Failed to find the latest release under {ftp_obj}")
51
+ raise
52
+
41
53
  buffer = io.BytesIO()
42
54
  stream = await ftp.download_stream(ftp_obj.filename)
43
55
  async with stream:
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import asyncio
6
+ from collections import defaultdict
6
7
  from dataclasses import dataclass
7
8
  from datetime import date
8
9
 
@@ -13,7 +14,12 @@ from pydantic import AliasPath, BaseModel, Field
13
14
  from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
14
15
 
15
16
 
16
- def _requires_release_date_template(path: str) -> str:
17
+ class KeepMissing(defaultdict[str, str]):
18
+ def __missing__(self, key):
19
+ return "{" + key + "}"
20
+
21
+
22
+ def destination_validator(path: str) -> str:
17
23
  """Ensure that the destination path contains a template for the release date."""
18
24
  if "{release_date}" not in path:
19
25
  raise GentroutilsError(GentroutilsErrorMessage.MISSING_RELEASE_DATE_TEMPLATE, release_date="{release_date}")
@@ -34,7 +40,7 @@ class TemplateDestination:
34
40
 
35
41
  This method returns a new TemplateDestination object (not a copy of the current one) with the formatted destination.
36
42
  """
37
- return TemplateDestination(self.destination.format(**substitutions), True)
43
+ return TemplateDestination(self.destination.format_map(KeepMissing(**substitutions)), True)
38
44
 
39
45
 
40
46
  class GwasCatalogReleaseInfo(BaseModel):
@@ -83,6 +89,7 @@ class GwasCatalogReleaseInfo(BaseModel):
83
89
  @classmethod
84
90
  def from_uri(cls, uri: str) -> GwasCatalogReleaseInfo:
85
91
  """Fetch the release information from the specified URI."""
92
+ logger.debug(f"Fetching release info from {uri}")
86
93
  try:
87
94
  return asyncio.run(cls._get_release_info(uri))
88
95
  except aiohttp.ClientError as e:
@@ -1,7 +1,6 @@
1
1
  """Module to handle the crawling of GWAS Catalog release information."""
2
2
 
3
3
  import tempfile
4
- from functools import cached_property
5
4
  from pathlib import Path
6
5
  from typing import Annotated, Any, Self
7
6
 
@@ -9,9 +8,10 @@ from loguru import logger
9
8
  from otter.storage import get_remote_storage
10
9
  from otter.task.model import Spec, Task, TaskContext
11
10
  from otter.task.task_reporter import report
12
- from pydantic import AfterValidator, computed_field
11
+ from pydantic import AfterValidator
13
12
 
14
- from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, _requires_release_date_template
13
+ from gentroutils.io.path import GCSPath
14
+ from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, destination_validator
15
15
 
16
16
 
17
17
  class CrawlSpec(Spec):
@@ -68,7 +68,7 @@ class CrawlSpec(Spec):
68
68
  stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
69
69
  """The URI to crawl the release statistics information from."""
70
70
 
71
- destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
71
+ destination_template: Annotated[str, AfterValidator(destination_validator)]
72
72
  """The destination path to save the release information.
73
73
  This path should always be a template string that includes `{release_date}`.
74
74
  For example, `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`.
@@ -91,8 +91,6 @@ class CrawlSpec(Spec):
91
91
  promoting the release as the latest release.
92
92
  """
93
93
 
94
- @computed_field # type: ignore[prop-decorator]
95
- @cached_property
96
94
  def destinations(self) -> list[TemplateDestination]:
97
95
  """Get the list of destinations templates where the release information will be saved.
98
96
 
@@ -105,17 +103,17 @@ class CrawlSpec(Spec):
105
103
  1. The destination template with the release date substituted.
106
104
  2. The destination with the release date substituted to `latest`.
107
105
  """
108
- d1 = self.destination_template
106
+ d1 = TemplateDestination(self.destination_template, False)
109
107
  if self.promote:
110
- d2 = self.destination_template.format(release_date="latest")
111
- return [TemplateDestination(d1, False), TemplateDestination(d2, True)]
112
- return [TemplateDestination(d1, False)]
108
+ d2 = d1.format({"release_date": "latest"})
109
+ return [d1, d2]
110
+ return [d1]
113
111
 
114
112
  def substituted_destinations(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
115
113
  """Safely parse the destination name to ensure it is valid."""
116
114
  substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
117
115
  return [
118
- d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
116
+ d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
119
117
  ]
120
118
 
121
119
  def model_post_init(self, __context: Any) -> None:
@@ -141,9 +139,10 @@ class Crawl(Task):
141
139
  logger.info(f"Destinations for release information: {destinations}")
142
140
  for destination in destinations:
143
141
  storage = get_remote_storage(destination)
142
+ assert "gs://" in destination, f"Invalid GCS path in destination template: {destination}"
144
143
  storage.upload(Path(source.name), destination)
145
144
  logger.info(f"Release information written to {destination}")
146
- return self
145
+ return self
147
146
 
148
147
  @report
149
148
  def run(self) -> Self:
@@ -3,16 +3,16 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from datetime import date
6
- from functools import cached_property
7
6
  from typing import Annotated, Any, Self
8
7
 
8
+ from loguru import logger
9
9
  from otter.task.model import Spec, Task, TaskContext
10
10
  from otter.task.task_reporter import report
11
- from pydantic import AfterValidator, computed_field
11
+ from pydantic import AfterValidator
12
12
 
13
13
  from gentroutils.io.transfer.polars_to_gcs import PolarsDataFrameToGCSTransferableObject
14
14
  from gentroutils.parsers.curation import GWASCatalogCuration
15
- from gentroutils.tasks import TemplateDestination, _requires_release_date_template
15
+ from gentroutils.tasks import TemplateDestination, destination_validator
16
16
  from gentroutils.transfer import TransferManager
17
17
 
18
18
 
@@ -50,14 +50,12 @@ class CurationSpec(Spec):
50
50
  studies: str
51
51
  """The path to the studies data."""
52
52
 
53
- destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
53
+ destination_template: Annotated[str, AfterValidator(destination_validator)]
54
54
  """The destination path for the curation data."""
55
55
 
56
56
  promote: bool = False
57
57
  """Whether to promote the curation data to the latest version."""
58
58
 
59
- @computed_field # type: ignore[prop-decorator]
60
- @cached_property
61
59
  def destinations(self) -> list[TemplateDestination]:
62
60
  """Get the list of destinations templates where the release information will be saved.
63
61
 
@@ -80,7 +78,7 @@ class CurationSpec(Spec):
80
78
  """Safely parse the destination name to ensure it is valid."""
81
79
  substitutions = {"release_date": release_date.strftime("%Y%m%d")}
82
80
  return [
83
- d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
81
+ d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
84
82
  ]
85
83
 
86
84
  def model_post_init(self, __context: Any) -> None:
@@ -99,9 +97,13 @@ class Curation(Task):
99
97
  @report
100
98
  def run(self) -> Self:
101
99
  """Run the curation task."""
100
+ logger.info("Starting curation task.")
102
101
  release_date = date.today()
102
+ logger.debug(f"Using release date: {release_date}")
103
103
  destinations = self.spec.substituted_destinations(release_date)
104
+ logger.debug(f"Destinations for curation data: {destinations}")
104
105
  curation = GWASCatalogCuration.from_prev_curation(self.spec.previous_curation, self.spec.studies)
106
+ logger.debug(f"Curation result preview:\n{curation.result.head()}")
105
107
  transfer_objects = [
106
108
  PolarsDataFrameToGCSTransferableObject(source=curation.result, destination=d) for d in destinations
107
109
  ]
@@ -1,15 +1,14 @@
1
1
  """Module to handle the fetching of GWAS Catalog release files."""
2
2
 
3
- from functools import cached_property
4
3
  from typing import Annotated, Any, Self
5
4
 
6
5
  from loguru import logger
7
6
  from otter.task.model import Spec, Task, TaskContext
8
7
  from otter.task.task_reporter import report
9
- from pydantic import AfterValidator, computed_field
8
+ from pydantic import AfterValidator
10
9
 
11
10
  from gentroutils.io.transfer import FTPtoGCPTransferableObject
12
- from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, _requires_release_date_template
11
+ from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, destination_validator
13
12
  from gentroutils.transfer import TransferManager
14
13
 
15
14
  MAX_CONCURRENT_CONNECTIONS = 10
@@ -57,10 +56,10 @@ class FetchSpec(Spec):
57
56
  stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
58
57
  """The URI to crawl the release statistics information from."""
59
58
 
60
- source_template: Annotated[str, AfterValidator(_requires_release_date_template)]
59
+ source_template: Annotated[str, AfterValidator(destination_validator)]
61
60
  """The template URI of the file to download."""
62
61
 
63
- destination_template: Annotated[str, AfterValidator(_requires_release_date_template)]
62
+ destination_template: Annotated[str, AfterValidator(destination_validator)]
64
63
  """The template URI to upload the file to."""
65
64
 
66
65
  promote: bool = False
@@ -78,8 +77,6 @@ class FetchSpec(Spec):
78
77
  promoting the release as the latest release.
79
78
  """
80
79
 
81
- @computed_field # type: ignore[prop-decorator]
82
- @cached_property
83
80
  def destinations(self) -> list[TemplateDestination]:
84
81
  """Get the list of destinations templates where the release information will be saved.
85
82
 
@@ -102,7 +99,7 @@ class FetchSpec(Spec):
102
99
  """Safely parse the destination name to ensure it is valid."""
103
100
  substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
104
101
  return [
105
- d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
102
+ d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
106
103
  ]
107
104
 
108
105
  def substituted_sources(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
@@ -70,14 +70,13 @@ class TestFTPtoGCPTransferableObject:
70
70
 
71
71
  # Create and execute transfer
72
72
  obj = FTPtoGCPTransferableObject(
73
- source="ftp://example.com/data/file.txt", destination="gs://test-bucket/file.txt"
73
+ source="ftp://example.com/2025/12/12/file.txt", destination="gs://test-bucket/file.txt"
74
74
  )
75
75
  await obj.transfer()
76
76
 
77
77
  # Verify FTP operations
78
78
  mock_ftp_context.assert_called_once_with("example.com", user="anonymous", password="anonymous") # noqa: S106
79
- mock_ftp_client.change_directory.assert_called_once_with("/data")
80
- mock_ftp_client.get_current_directory.assert_called_once()
79
+ mock_ftp_client.change_directory.assert_called()
81
80
  mock_ftp_client.download_stream.assert_called_once_with("file.txt")
82
81
 
83
82
  # Verify GCS operations
@@ -88,7 +88,7 @@ class TestCrawlSpec:
88
88
 
89
89
  def test_crawl_spec_destinations_with_promote(self, crawl_spec):
90
90
  """Test destinations property when promote=True."""
91
- destinations = crawl_spec.destinations
91
+ destinations = crawl_spec.destinations()
92
92
  assert len(destinations) == 2
93
93
  assert destinations[0].destination == "gs://test-bucket/gwas/{release_date}/stats.json"
94
94
  assert destinations[0].is_substituted is False
@@ -97,7 +97,7 @@ class TestCrawlSpec:
97
97
 
98
98
  def test_crawl_spec_destinations_without_promote(self, crawl_spec_no_promote):
99
99
  """Test destinations property when promote=False."""
100
- destinations = crawl_spec_no_promote.destinations
100
+ destinations = crawl_spec_no_promote.destinations()
101
101
  assert len(destinations) == 1
102
102
  assert destinations[0].destination == "gs://test-bucket/gwas/{release_date}/stats.json"
103
103
  assert destinations[0].is_substituted is False
@@ -28,10 +28,11 @@ class TestCurationSpec:
28
28
  assert curation_spec.studies == "gs://test-bucket/studies.json"
29
29
  assert curation_spec.destination_template == "gs://test-bucket/{release_date}/curation.json"
30
30
  assert curation_spec.promote is True
31
- assert curation_spec.destinations[0].destination == "gs://test-bucket/{release_date}/curation.json"
32
- assert curation_spec.destinations[0].is_substituted is False
33
- assert curation_spec.destinations[1].destination == "gs://test-bucket/latest/curation.json"
34
- assert curation_spec.destinations[1].is_substituted is True
31
+ destinations = curation_spec.destinations()
32
+ assert destinations[0].destination == "gs://test-bucket/{release_date}/curation.json"
33
+ assert destinations[0].is_substituted is False
34
+ assert destinations[1].destination == "gs://test-bucket/latest/curation.json"
35
+ assert destinations[1].is_substituted is True
35
36
 
36
37
  def test_curation_spec_requires_release_date_template(self):
37
38
  """Test that CurationSpec validates release date template."""
@@ -46,11 +46,12 @@ class TestFetchSpec:
46
46
  assert fetch_spec.source_template == "https://example.com/{release_date}/data.json"
47
47
  assert fetch_spec.destination_template == "gs://test-bucket/{release_date}/data.json"
48
48
  assert fetch_spec.promote is True
49
- assert len(fetch_spec.destinations) == 2
50
- assert fetch_spec.destinations[0].destination == "gs://test-bucket/{release_date}/data.json"
51
- assert fetch_spec.destinations[0].is_substituted is False
52
- assert fetch_spec.destinations[1].destination == "gs://test-bucket/latest/data.json"
53
- assert fetch_spec.destinations[1].is_substituted is True
49
+ destinations = fetch_spec.destinations()
50
+ assert len(destinations) == 2
51
+ assert destinations[0].destination == "gs://test-bucket/{release_date}/data.json"
52
+ assert destinations[0].is_substituted is False
53
+ assert destinations[1].destination == "gs://test-bucket/latest/data.json"
54
+ assert destinations[1].is_substituted is True
54
55
 
55
56
  def test_initialization_no_promote(self):
56
57
  """Test FetchSpec initialization with promote = False."""
@@ -65,9 +66,11 @@ class TestFetchSpec:
65
66
  assert fetch_spec.source_template == "https://example.com/{release_date}/data.json"
66
67
  assert fetch_spec.destination_template == "gs://test-bucket/{release_date}/data.json"
67
68
  assert fetch_spec.promote is False
68
- assert len(fetch_spec.destinations) == 1
69
- assert fetch_spec.destinations[0].destination == "gs://test-bucket/{release_date}/data.json"
70
- assert fetch_spec.destinations[0].is_substituted is False
69
+
70
+ destinations = fetch_spec.destinations()
71
+ assert len(destinations) == 1
72
+ assert destinations[0].destination == "gs://test-bucket/{release_date}/data.json"
73
+ assert destinations[0].is_substituted is False
71
74
 
72
75
  def test_requires_release_date_template(self):
73
76
  """Test that FetchSpec validates release date template."""
@@ -457,6 +457,15 @@ wheels = [
457
457
  { url = "https://files.pythonhosted.org/packages/b9/5c/a3d95dc1ec6cdeb032d789b552ecc76effa3557ea9186e1566df6aac18df/fs-2.4.16-py2.py3-none-any.whl", hash = "sha256:660064febbccda264ae0b6bace80a8d1be9e089e0a5eb2427b7d517f9a91545c", size = 135261, upload-time = "2022-05-02T09:25:52.363Z" },
458
458
  ]
459
459
 
460
+ [[package]]
461
+ name = "fsspec"
462
+ version = "2025.7.0"
463
+ source = { registry = "https://pypi.org/simple" }
464
+ sdist = { url = "https://files.pythonhosted.org/packages/8b/02/0835e6ab9cfc03916fe3f78c0956cfcdb6ff2669ffa6651065d5ebf7fc98/fsspec-2025.7.0.tar.gz", hash = "sha256:786120687ffa54b8283d942929540d8bc5ccfa820deb555a2b5d0ed2b737bf58", size = 304432, upload-time = "2025-07-15T16:05:21.19Z" }
465
+ wheels = [
466
+ { url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" },
467
+ ]
468
+
460
469
  [[package]]
461
470
  name = "gcloud-storage-emulator"
462
471
  version = "0.5.0"
@@ -471,17 +480,36 @@ wheels = [
471
480
  { url = "https://files.pythonhosted.org/packages/f1/46/b70e4ee5df71b4253c146c3792703c1e63c66c1e63d9bd64dbc07a1c456e/gcloud_storage_emulator-0.5.0-py3-none-any.whl", hash = "sha256:69fe95ab57fc45acdeba7b8dfc44c12b9e7921e640776f7ec79d675d753b03cf", size = 19959, upload-time = "2021-09-29T09:03:26.099Z" },
472
481
  ]
473
482
 
483
+ [[package]]
484
+ name = "gcsfs"
485
+ version = "2025.7.0"
486
+ source = { registry = "https://pypi.org/simple" }
487
+ dependencies = [
488
+ { name = "aiohttp" },
489
+ { name = "decorator" },
490
+ { name = "fsspec" },
491
+ { name = "google-auth" },
492
+ { name = "google-auth-oauthlib" },
493
+ { name = "google-cloud-storage" },
494
+ { name = "requests" },
495
+ ]
496
+ sdist = { url = "https://files.pythonhosted.org/packages/5b/d7/5eafe9f09f1bb09433a473cef7984cd52c398592c8fd09974e0ad87cfea4/gcsfs-2025.7.0.tar.gz", hash = "sha256:ad3ff66cf189ae8fc375ac8a2af409003dbca02357621cb94a66e457e02ba420", size = 82659, upload-time = "2025-07-15T16:49:21.647Z" }
497
+ wheels = [
498
+ { url = "https://files.pythonhosted.org/packages/21/f5/54bccbee01efbc25581db6aafefb6f6c277d880930f7a083b10052382463/gcsfs-2025.7.0-py2.py3-none-any.whl", hash = "sha256:653503331d58cb02bb34e725d4595d166e93f7f2f3ff88e4c66ef535ae66eae5", size = 36815, upload-time = "2025-07-15T16:49:20.333Z" },
499
+ ]
500
+
474
501
  [[package]]
475
502
  name = "gentroutils"
476
- version = "2.0.0"
503
+ version = "3.0.0"
477
504
  source = { editable = "." }
478
505
  dependencies = [
479
506
  { name = "aioftp" },
480
507
  { name = "aiohttp" },
508
+ { name = "gcsfs" },
481
509
  { name = "google-cloud-storage" },
482
510
  { name = "loguru" },
483
511
  { name = "opentargets-otter" },
484
- { name = "polars" },
512
+ { name = "polars", extra = ["fsspec"] },
485
513
  { name = "pydantic" },
486
514
  { name = "tqdm" },
487
515
  ]
@@ -526,10 +554,11 @@ dev = [
526
554
  requires-dist = [
527
555
  { name = "aioftp", specifier = ">=0.25.1" },
528
556
  { name = "aiohttp", specifier = ">=3.11.18" },
557
+ { name = "gcsfs", specifier = ">=2025.7.0" },
529
558
  { name = "google-cloud-storage", specifier = ">=3.1.1" },
530
559
  { name = "loguru", specifier = ">=0.7.3" },
531
560
  { name = "opentargets-otter", specifier = ">=25.0.2" },
532
- { name = "polars", specifier = ">=1.31.0" },
561
+ { name = "polars", extras = ["fsspec", "gcs"], specifier = ">=1.31.0" },
533
562
  { name = "pydantic", specifier = ">=2.10.6" },
534
563
  { name = "tqdm", specifier = ">=4.67.1" },
535
564
  ]
@@ -624,6 +653,19 @@ wheels = [
624
653
  { url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" },
625
654
  ]
626
655
 
656
+ [[package]]
657
+ name = "google-auth-oauthlib"
658
+ version = "1.2.2"
659
+ source = { registry = "https://pypi.org/simple" }
660
+ dependencies = [
661
+ { name = "google-auth" },
662
+ { name = "requests-oauthlib" },
663
+ ]
664
+ sdist = { url = "https://files.pythonhosted.org/packages/fb/87/e10bf24f7bcffc1421b84d6f9c3377c30ec305d082cd737ddaa6d8f77f7c/google_auth_oauthlib-1.2.2.tar.gz", hash = "sha256:11046fb8d3348b296302dd939ace8af0a724042e8029c1b872d87fabc9f41684", size = 20955, upload-time = "2025-04-22T16:40:29.172Z" }
665
+ wheels = [
666
+ { url = "https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl", hash = "sha256:fd619506f4b3908b5df17b65f39ca8d66ea56986e5472eb5978fd8f3786f00a2", size = 19072, upload-time = "2025-04-22T16:40:28.174Z" },
667
+ ]
668
+
627
669
  [[package]]
628
670
  name = "google-cloud-core"
629
671
  version = "2.4.3"
@@ -1038,6 +1080,15 @@ wheels = [
1038
1080
  { url = "https://files.pythonhosted.org/packages/d2/47/ca494f6f2366f17bd2a79b6c1468f6c441a447b017fbdf600aa40b5d27ac/numpy_typing_compat-2.3.20250730-py3-none-any.whl", hash = "sha256:9ab0cd4bb1b4c31debf0bd745554c735a07a7bb3cc3801dc934b2b5856549612", size = 6057, upload-time = "2025-07-30T01:36:06.027Z" },
1039
1081
  ]
1040
1082
 
1083
+ [[package]]
1084
+ name = "oauthlib"
1085
+ version = "3.3.1"
1086
+ source = { registry = "https://pypi.org/simple" }
1087
+ sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918, upload-time = "2025-06-19T22:48:08.269Z" }
1088
+ wheels = [
1089
+ { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" },
1090
+ ]
1091
+
1041
1092
  [[package]]
1042
1093
  name = "opentargets-otter"
1043
1094
  version = "25.0.4"
@@ -1155,6 +1206,11 @@ wheels = [
1155
1206
  { url = "https://files.pythonhosted.org/packages/b8/53/4eaaa4f219add46594db21a05a9a5629ec6af20bd859a90668d5a1448abc/polars-1.32.2-cp39-abi3-win_arm64.whl", hash = "sha256:cd390364f6f3927474bd0aed255103195b9d2b3eef0f0c5bb429db5e6311615e", size = 34059100, upload-time = "2025-08-07T10:50:26.445Z" },
1156
1207
  ]
1157
1208
 
1209
+ [package.optional-dependencies]
1210
+ fsspec = [
1211
+ { name = "fsspec" },
1212
+ ]
1213
+
1158
1214
  [[package]]
1159
1215
  name = "pre-commit"
1160
1216
  version = "4.3.0"
@@ -1541,6 +1597,19 @@ wheels = [
1541
1597
  { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" },
1542
1598
  ]
1543
1599
 
1600
+ [[package]]
1601
+ name = "requests-oauthlib"
1602
+ version = "2.0.0"
1603
+ source = { registry = "https://pypi.org/simple" }
1604
+ dependencies = [
1605
+ { name = "oauthlib" },
1606
+ { name = "requests" },
1607
+ ]
1608
+ sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650, upload-time = "2024-03-22T20:32:29.939Z" }
1609
+ wheels = [
1610
+ { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179, upload-time = "2024-03-22T20:32:28.055Z" },
1611
+ ]
1612
+
1544
1613
  [[package]]
1545
1614
  name = "requests-toolbelt"
1546
1615
  version = "1.0.0"
gentroutils-2.0.0/.RData DELETED
Binary file
File without changes
@@ -1,32 +0,0 @@
1
- ---
2
- work_path: ./work
3
- log_level: DEBUG
4
- scratchpad:
5
- steps:
6
- gwas_catalog_release:
7
- - name: crawl release metadata
8
- stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
9
- destination_template: 'gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json'
10
- promote: 'true'
11
- - name: fetch associations
12
- stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
13
- source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-associations_ontology-annotated.tsv"
14
- destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv"
15
- promote: true
16
- - name: fetch studies
17
- stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
18
- source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt"
19
- destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_download_studies.tsv"
20
- promote: true
21
- - name: fetch ancestries
22
- stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
23
- source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt"
24
- destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv"
25
- promote: true
26
- - name: curation study
27
- requires:
28
- - fetch studies
29
- previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
30
- studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
31
- destination_template: ./work/curation_{release_date}.tsv
32
- promote: true
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes