gentroutils 2.0.0__tar.gz → 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gentroutils-3.0.0/.github/workflows/build.yaml +87 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/CHANGELOG.md +39 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/PKG-INFO +16 -12
- {gentroutils-2.0.0 → gentroutils-3.0.0}/README.md +13 -10
- gentroutils-3.0.0/config.yaml +40 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/pyproject.toml +9 -2
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/transfer/ftp_to_gcs.py +17 -5
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/tasks/__init__.py +9 -2
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/tasks/crawl.py +11 -12
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/tasks/curation.py +9 -7
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/tasks/fetch.py +5 -8
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/transfer/test_ftp_to_gcs.py +2 -3
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/tasks/test_crawl_task.py +2 -2
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/tasks/test_curation_task.py +5 -4
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/tasks/test_fetch_task.py +11 -8
- {gentroutils-2.0.0 → gentroutils-3.0.0}/uv.lock +72 -3
- gentroutils-2.0.0/.RData +0 -0
- gentroutils-2.0.0/.Rhistory +0 -0
- gentroutils-2.0.0/config.yaml +0 -32
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/labeler.yaml +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/pr.yaml +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/release.yaml +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/release_pr.yaml +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.github/workflows/tag.yaml +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.gitignore +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.pre-commit-config.yaml +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.vscode/extensions.json +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/.vscode/settings.json +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/Dockerfile +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/LICENSE +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/Makefile +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/commitlint.config.js +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/conftest.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/docs/00_prepare_tables_for_curation.R +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/docs/gwas_catalog_curation.md +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/setup.sh +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/__init__.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/errors.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/path/__init__.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/path/ftp.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/path/gcs.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/transfer/__init__.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/transfer/model.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/io/transfer/polars_to_gcs.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/parsers/__init__.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/parsers/curation.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/py.typed +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/src/gentroutils/transfer.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/ftp/test/databases/gwas/summary_statistics/harmonised_list.txt +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/gsutil_list.txt +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/correct_curation.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_analysisFlag_type.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_analysisFlag_value.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_columns_curation.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_publicationTitle_type.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_pubmedId_type.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyId_type.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyId_value.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyType_type.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyType_value.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_traitFromSource_type.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/non_unique_studyId.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/null_value_in_studyId.tsv +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/test.h.tsv.gz +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/conftest.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/path/conftest.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/path/test_ftp.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/path/test_gcs.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/transfer/conftest.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/transfer/test_model.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/io/transfer/test_polars_to_gcs.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/parsers/conftest.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/parsers/test_curation.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/tasks/conftest.py +0 -0
- {gentroutils-2.0.0 → gentroutils-3.0.0}/tests/test_transfer.py +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
|
|
2
|
+
name: build-artifacts
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
workflow_dispatch:
|
|
6
|
+
|
|
7
|
+
env:
|
|
8
|
+
GCP_PROJECT_ID: "open-targets-genetics-dev"
|
|
9
|
+
GCP_REGION: "europe-west1"
|
|
10
|
+
TAG: "${{ github.ref_name }}"
|
|
11
|
+
REPO: "${{ github.event.repository.name }}" # repository name taken from the triggering event payload
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
push-to-ghcr-and-gar:
|
|
16
|
+
name: Build docker image and push to GHCR and GAR
|
|
17
|
+
runs-on: ubuntu-22.04
|
|
18
|
+
|
|
19
|
+
permissions:
|
|
20
|
+
packages: write
|
|
21
|
+
contents: read
|
|
22
|
+
attestations: write
|
|
23
|
+
id-token: write
|
|
24
|
+
|
|
25
|
+
steps:
|
|
26
|
+
- id: prepare
|
|
27
|
+
name: Prepare the action and log details
|
|
28
|
+
shell: bash
|
|
29
|
+
env:
|
|
30
|
+
GITHUB_CONTEXT: ${{ toJson(github) }}
|
|
31
|
+
run: |
|
|
32
|
+
TAG=$(echo $TAG | sed 's/^v//')
|
|
33
|
+
echo "TAG=$TAG" >> $GITHUB_ENV
|
|
34
|
+
echo "The tag for this build is $TAG"
|
|
35
|
+
echo "The repo name is: $REPO"
|
|
36
|
+
printf 'Github context:\n%s\n' "$GITHUB_CONTEXT"
|
|
37
|
+
|
|
38
|
+
- id: checkout
|
|
39
|
+
name: Check out repo
|
|
40
|
+
uses: actions/checkout@v4
|
|
41
|
+
|
|
42
|
+
- name: Set up Docker Buildx
|
|
43
|
+
uses: docker/setup-buildx-action@v3
|
|
44
|
+
|
|
45
|
+
- id: auth-ghcr
|
|
46
|
+
name: Log in to GitHub Container Registry
|
|
47
|
+
uses: docker/login-action@v3
|
|
48
|
+
with:
|
|
49
|
+
registry: ghcr.io
|
|
50
|
+
username: ${{ github.actor }}
|
|
51
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
52
|
+
|
|
53
|
+
- id: auth-google
|
|
54
|
+
name: Authenticate to Google Cloud
|
|
55
|
+
uses: google-github-actions/auth@v2
|
|
56
|
+
with:
|
|
57
|
+
project_id: ${{ env.GCP_PROJECT_ID }}
|
|
58
|
+
workload_identity_provider: projects/234703259993/locations/global/workloadIdentityPools/github/providers/opentargets
|
|
59
|
+
access_token_lifetime: 300s
|
|
60
|
+
|
|
61
|
+
- id: auth-gar
|
|
62
|
+
name: Login to Google Artifact Registry
|
|
63
|
+
uses: docker/login-action@v3
|
|
64
|
+
with:
|
|
65
|
+
registry: ${{ env.GCP_REGION }}-docker.pkg.dev
|
|
66
|
+
username: oauth2accesstoken
|
|
67
|
+
password: ${{ steps.auth-google.outputs.access_token }}
|
|
68
|
+
|
|
69
|
+
- id: push
|
|
70
|
+
name: Build and push Docker image
|
|
71
|
+
uses: docker/build-push-action@v5
|
|
72
|
+
with:
|
|
73
|
+
context: .
|
|
74
|
+
push: true
|
|
75
|
+
tags: |
|
|
76
|
+
ghcr.io/${{ github.repository }}:latest
|
|
77
|
+
ghcr.io/${{ github.repository }}:${{ env.TAG }}
|
|
78
|
+
${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}/${{ env.REPO }}:latest
|
|
79
|
+
${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}/${{ env.REPO }}:${{ env.TAG }}
|
|
80
|
+
|
|
81
|
+
- id: generate-attestations
|
|
82
|
+
name: Generate artifact attestation
|
|
83
|
+
uses: actions/attest-build-provenance@v1
|
|
84
|
+
with:
|
|
85
|
+
subject-name: ${{ env.GCP_REGION }}-docker.pkg.dev/${{ env.GCP_PROJECT_ID }}/${{ env.REPO }}/${{ env.REPO }}
|
|
86
|
+
subject-digest: ${{ steps.push.outputs.digest }}
|
|
87
|
+
push-to-registry: true
|
|
@@ -1,6 +1,35 @@
|
|
|
1
1
|
# CHANGELOG
|
|
2
2
|
|
|
3
3
|
|
|
4
|
+
## v3.0.0 (2025-08-28)
|
|
5
|
+
|
|
6
|
+
### Bug Fixes
|
|
7
|
+
|
|
8
|
+
- Ensure otter scratchpad works
|
|
9
|
+
([`b781b1f`](https://github.com/opentargets/gentroutils/commit/b781b1f587f94807daa3abde0dbb785121904e68))
|
|
10
|
+
|
|
11
|
+
### Build System
|
|
12
|
+
|
|
13
|
+
- Update dependencies
|
|
14
|
+
([`e9e05c3`](https://github.com/opentargets/gentroutils/commit/e9e05c31ec6ea443d1cb8af28f3583c6486ecb52))
|
|
15
|
+
|
|
16
|
+
### Chores
|
|
17
|
+
|
|
18
|
+
- Format
|
|
19
|
+
([`876400b`](https://github.com/opentargets/gentroutils/commit/876400b3d8b5a09c198a8b13db2ffc598a1f218a))
|
|
20
|
+
|
|
21
|
+
- Remove R session files
|
|
22
|
+
([`ffe093d`](https://github.com/opentargets/gentroutils/commit/ffe093d494ec7c469b8b9f99978bfa002426c189))
|
|
23
|
+
|
|
24
|
+
- Update readme
|
|
25
|
+
([`06a21db`](https://github.com/opentargets/gentroutils/commit/06a21dbdb0b706e06537f80d378f8837041b3906))
|
|
26
|
+
|
|
27
|
+
### Continuous Integration
|
|
28
|
+
|
|
29
|
+
- Add build command
|
|
30
|
+
([`2e67fcd`](https://github.com/opentargets/gentroutils/commit/2e67fcd1d7478b96dc698d5e54e2a077529fcb61))
|
|
31
|
+
|
|
32
|
+
|
|
4
33
|
## v2.0.0 (2025-08-28)
|
|
5
34
|
|
|
6
35
|
### Chores
|
|
@@ -16,6 +45,16 @@
|
|
|
16
45
|
|
|
17
46
|
Co-authored-by: Szymon Szyszkowski <69353402+project-defiant@users.noreply.github.com>
|
|
18
47
|
|
|
48
|
+
### Continuous Integration
|
|
49
|
+
|
|
50
|
+
- Update ci
|
|
51
|
+
([`81c39b2`](https://github.com/opentargets/gentroutils/commit/81c39b2d02f8fe318d8921b46c636a3361093263))
|
|
52
|
+
|
|
53
|
+
### Features
|
|
54
|
+
|
|
55
|
+
- 2.0.0
|
|
56
|
+
([`3268c38`](https://github.com/opentargets/gentroutils/commit/3268c383cf6dfac4be7748ad6b48e8ded5f6157c))
|
|
57
|
+
|
|
19
58
|
|
|
20
59
|
## v1.6.0-dev.2 (2025-08-12)
|
|
21
60
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gentroutils
|
|
3
|
-
Version:
|
|
3
|
+
Version: 3.0.0
|
|
4
4
|
Summary: Open Targets python genetics utility CLI tools
|
|
5
5
|
Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -15,10 +15,11 @@ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
|
15
15
|
Requires-Python: >=3.13
|
|
16
16
|
Requires-Dist: aioftp>=0.25.1
|
|
17
17
|
Requires-Dist: aiohttp>=3.11.18
|
|
18
|
+
Requires-Dist: gcsfs>=2025.7.0
|
|
18
19
|
Requires-Dist: google-cloud-storage>=3.1.1
|
|
19
20
|
Requires-Dist: loguru>=0.7.3
|
|
20
21
|
Requires-Dist: opentargets-otter>=25.0.2
|
|
21
|
-
Requires-Dist: polars>=1.31.0
|
|
22
|
+
Requires-Dist: polars[fsspec,gcs]>=1.31.0
|
|
22
23
|
Requires-Dist: pydantic>=2.10.6
|
|
23
24
|
Requires-Dist: tqdm>=4.67.1
|
|
24
25
|
Description-Content-Type: text/markdown
|
|
@@ -48,6 +49,7 @@ gentroutils --help
|
|
|
48
49
|
## Usage
|
|
49
50
|
|
|
50
51
|
To run a single step run
|
|
52
|
+
|
|
51
53
|
```{bash}
|
|
52
54
|
uv run gentroutils -s gwas_catalog_release # After cloning the repository
|
|
53
55
|
gentroutils -s gwas_catalog_release -c otter_config.yaml # When installed by pip
|
|
@@ -60,6 +62,11 @@ The `gentroutils` repository uses the [otter](https://github.com/opentargets/ott
|
|
|
60
62
|
|
|
61
63
|
For the top level fields refer to the [otter documentation](https://opentargets.github.io/otter/otter.config.html)
|
|
62
64
|
|
|
65
|
+
> [!NOTE]
|
|
66
|
+
> All `destination_template` values must point to Google Cloud Storage (GCS) bucket objects.
|
|
67
|
+
> All `source_template` values must point to FTP server paths.
|
|
68
|
+
> If these constraints are not respected, the user may experience silent failures.
|
|
69
|
+
|
|
63
70
|
```yaml
|
|
64
71
|
---
|
|
65
72
|
work_path: ./work
|
|
@@ -91,7 +98,7 @@ steps:
|
|
|
91
98
|
- fetch studies
|
|
92
99
|
previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
|
|
93
100
|
studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
|
|
94
|
-
destination_template:
|
|
101
|
+
destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
|
|
95
102
|
promote: true
|
|
96
103
|
```
|
|
97
104
|
|
|
@@ -114,8 +121,7 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
|
|
|
114
121
|
|
|
115
122
|
This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
|
|
116
123
|
|
|
117
|
-
> [!NOTE]
|
|
118
|
-
> **Task parameters**
|
|
124
|
+
> [!NOTE]
> **Task parameters**
|
|
119
125
|
>
|
|
120
126
|
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
121
127
|
> - The `destination_template` is where the metadata will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. By default it searches for the release directly in the stats_uri json output.
|
|
@@ -135,8 +141,7 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
|
|
|
135
141
|
|
|
136
142
|
This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
|
|
137
143
|
|
|
138
|
-
> [!NOTE]
|
|
139
|
-
> **Task parameters**
|
|
144
|
+
> [!NOTE]
> **Task parameters**
|
|
140
145
|
>
|
|
141
146
|
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
142
147
|
> - The `source_template` is the URL of the GWAS Catalog associations file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
@@ -157,8 +162,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
|
|
|
157
162
|
|
|
158
163
|
This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
|
|
159
164
|
|
|
160
|
-
> [!NOTE]
|
|
161
|
-
> **Task parameters**
|
|
165
|
+
> [!NOTE]
> **Task parameters**
|
|
162
166
|
>
|
|
163
167
|
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
164
168
|
> - The `source_template` is the URL of the GWAS Catalog studies file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
@@ -179,8 +183,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
|
|
|
179
183
|
|
|
180
184
|
This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
|
|
181
185
|
|
|
182
|
-
> [!NOTE]
|
|
183
|
-
> **Task parameters**
|
|
186
|
+
> [!NOTE]
> **Task parameters**
|
|
184
187
|
>
|
|
185
188
|
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
186
189
|
> - The `source_template` is the URL of the GWAS Catalog ancestries file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
@@ -203,7 +206,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
|
|
|
203
206
|
|
|
204
207
|
This task is used to build the GWAS Catalog curation file that is later used as a template for manual curation. It requires the `fetch studies` task to be completed before it can run. This is due to the fact that the curation file is built based on the list of studies fetched from the `download studies` file.
|
|
205
208
|
|
|
206
|
-
> [!NOTE]
|
|
209
|
+
> [!NOTE]
|
|
207
210
|
> **Task parameters**
|
|
208
211
|
>
|
|
209
212
|
> - The `requires` field specifies that this task depends on the `fetch studies` task, meaning it will only run after the studies have been fetched.
|
|
@@ -268,6 +271,7 @@ To check CLI execution manually you need to run
|
|
|
268
271
|
```{bash}
|
|
269
272
|
uv run gentroutils
|
|
270
273
|
```
|
|
274
|
+
|
|
271
275
|
---
|
|
272
276
|
|
|
273
277
|
This software was developed as part of the Open Targets project. For more
|
|
@@ -23,6 +23,7 @@ gentroutils --help
|
|
|
23
23
|
## Usage
|
|
24
24
|
|
|
25
25
|
To run a single step run
|
|
26
|
+
|
|
26
27
|
```{bash}
|
|
27
28
|
uv run gentroutils -s gwas_catalog_release # After cloning the repository
|
|
28
29
|
gentroutils -s gwas_catalog_release -c otter_config.yaml # When installed by pip
|
|
@@ -35,6 +36,11 @@ The `gentroutils` repository uses the [otter](https://github.com/opentargets/ott
|
|
|
35
36
|
|
|
36
37
|
For the top level fields refer to the [otter documentation](https://opentargets.github.io/otter/otter.config.html)
|
|
37
38
|
|
|
39
|
+
> [!NOTE]
|
|
40
|
+
> All `destination_template` values must point to Google Cloud Storage (GCS) bucket objects.
|
|
41
|
+
> All `source_template` values must point to FTP server paths.
|
|
42
|
+
> If these constraints are not respected, the user may experience silent failures.
|
|
43
|
+
|
|
38
44
|
```yaml
|
|
39
45
|
---
|
|
40
46
|
work_path: ./work
|
|
@@ -66,7 +72,7 @@ steps:
|
|
|
66
72
|
- fetch studies
|
|
67
73
|
previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
|
|
68
74
|
studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
|
|
69
|
-
destination_template:
|
|
75
|
+
destination_template: gs://gwas_catalog_inputs/gentroutils/curation/{release_date}/GWAS_Catalog_study_curation.tsv
|
|
70
76
|
promote: true
|
|
71
77
|
```
|
|
72
78
|
|
|
@@ -89,8 +95,7 @@ The list of tasks (defined in the `config.yaml` file) that can be run are:
|
|
|
89
95
|
|
|
90
96
|
This task fetches the latest GWAS Catalog release metadata from the `https://www.ebi.ac.uk/gwas/api/search/stats` endpoint and saves it to the specified destination.
|
|
91
97
|
|
|
92
|
-
> [!NOTE]
|
|
93
|
-
> **Task parameters**
|
|
98
|
+
> [!NOTE]
> **Task parameters**
|
|
94
99
|
>
|
|
95
100
|
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
96
101
|
> - The `destination_template` is where the metadata will be saved, and it uses the `{release_date}` placeholder to specify the release date dynamically. By default it searches for the release directly in the stats_uri json output.
|
|
@@ -110,8 +115,7 @@ This task fetches the latest GWAS Catalog release metadata from the `https://www
|
|
|
110
115
|
|
|
111
116
|
This task fetches the GWAS Catalog associations file from the specified FTP server and saves it to the specified destination.
|
|
112
117
|
|
|
113
|
-
> [!NOTE]
|
|
114
|
-
> **Task parameters**
|
|
118
|
+
> [!NOTE]
> **Task parameters**
|
|
115
119
|
>
|
|
116
120
|
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
117
121
|
> - The `source_template` is the URL of the GWAS Catalog associations file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
@@ -132,8 +136,7 @@ This task fetches the GWAS Catalog associations file from the specified FTP serv
|
|
|
132
136
|
|
|
133
137
|
This task fetches the GWAS Catalog studies file from the specified FTP server and saves it to the specified destination.
|
|
134
138
|
|
|
135
|
-
> [!NOTE]
|
|
136
|
-
> **Task parameters**
|
|
139
|
+
> [!NOTE]
> **Task parameters**
|
|
137
140
|
>
|
|
138
141
|
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
139
142
|
> - The `source_template` is the URL of the GWAS Catalog studies file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
@@ -154,8 +157,7 @@ This task fetches the GWAS Catalog studies file from the specified FTP server an
|
|
|
154
157
|
|
|
155
158
|
This task fetches the GWAS Catalog ancestries file from the specified FTP server and saves it to the specified destination.
|
|
156
159
|
|
|
157
|
-
> [!NOTE]
|
|
158
|
-
> **Task parameters**
|
|
160
|
+
> [!NOTE]
> **Task parameters**
|
|
159
161
|
>
|
|
160
162
|
> - The `stats_uri` is used to fetch the latest release date and other metadata.
|
|
161
163
|
> - The `source_template` is the URL of the GWAS Catalog ancestries file, which uses the `{release_date}` placeholder to specify the release date dynamically. The release date is fetched from the `stats_uri` endpoint.
|
|
@@ -178,7 +180,7 @@ This task fetches the GWAS Catalog ancestries file from the specified FTP server
|
|
|
178
180
|
|
|
179
181
|
This task is used to build the GWAS Catalog curation file that is later used as a template for manual curation. It requires the `fetch studies` task to be completed before it can run. This is due to the fact that the curation file is built based on the list of studies fetched from the `download studies` file.
|
|
180
182
|
|
|
181
|
-
> [!NOTE]
|
|
183
|
+
> [!NOTE]
|
|
182
184
|
> **Task parameters**
|
|
183
185
|
>
|
|
184
186
|
> - The `requires` field specifies that this task depends on the `fetch studies` task, meaning it will only run after the studies have been fetched.
|
|
@@ -243,6 +245,7 @@ To check CLI execution manually you need to run
|
|
|
243
245
|
```{bash}
|
|
244
246
|
uv run gentroutils
|
|
245
247
|
```
|
|
248
|
+
|
|
246
249
|
---
|
|
247
250
|
|
|
248
251
|
This software was developed as part of the Open Targets project. For more
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
---
|
|
2
|
+
work_path: ./work
|
|
3
|
+
log_level: DEBUG
|
|
4
|
+
scratchpad:
|
|
5
|
+
gc_stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
6
|
+
gc_bucket: "gs://gwas_catalog_inputs"
|
|
7
|
+
gc_ftp: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases"
|
|
8
|
+
|
|
9
|
+
steps:
|
|
10
|
+
gwas_catalog_release:
|
|
11
|
+
# - name: crawl release metadata
|
|
12
|
+
# stats_uri: ${gc_stats_uri}
|
|
13
|
+
# destination_template: '${gc_bucket}/gentroutils/{release_date}/stats.json'
|
|
14
|
+
# promote: true
|
|
15
|
+
|
|
16
|
+
# - name: fetch associations
|
|
17
|
+
# stats_uri: ${gc_stats_uri}
|
|
18
|
+
# source_template: '${gc_ftp}/{release_date}/gwas-catalog-associations_ontology-annotated.tsv'
|
|
19
|
+
# destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv'
|
|
20
|
+
# promote: true
|
|
21
|
+
|
|
22
|
+
# - name: fetch studies
|
|
23
|
+
# stats_uri: ${gc_stats_uri}
|
|
24
|
+
# source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt'
|
|
25
|
+
# destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_studies.tsv'
|
|
26
|
+
# promote: true
|
|
27
|
+
|
|
28
|
+
# - name: fetch ancestries
|
|
29
|
+
# stats_uri: ${gc_stats_uri}
|
|
30
|
+
# source_template: '${gc_ftp}/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt'
|
|
31
|
+
# destination_template: '${gc_bucket}/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv'
|
|
32
|
+
# promote: true
|
|
33
|
+
|
|
34
|
+
- name: curation study
|
|
35
|
+
# requires:
|
|
36
|
+
# - fetch studies
|
|
37
|
+
previous_curation: '${gc_bucket}/curation/latest/curated/GWAS_Catalog_study_curation.tsv'
|
|
38
|
+
studies: '${gc_bucket}/gentroutils/latest/gwas_catalog_download_studies.tsv'
|
|
39
|
+
destination_template: '${gc_bucket}/curation/{release_date}/raw/GWAS_Catalog_study_curation.tsv'
|
|
40
|
+
promote: true
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
authors = [{ name = "Szymon Szyszkowski", email = "ss60@sanger.ac.uk" }]
|
|
3
3
|
name = "gentroutils"
|
|
4
|
-
version = "
|
|
4
|
+
version = "3.0.0"
|
|
5
5
|
description = "Open Targets python genetics utility CLI tools"
|
|
6
6
|
dependencies = [
|
|
7
7
|
"aiohttp>=3.11.18",
|
|
8
8
|
"aioftp>=0.25.1",
|
|
9
|
-
"polars>=1.31.0",
|
|
9
|
+
"polars[fsspec,gcs]>=1.31.0",
|
|
10
10
|
"pydantic>=2.10.6",
|
|
11
11
|
"loguru>=0.7.3",
|
|
12
12
|
"tqdm>=4.67.1",
|
|
13
13
|
"opentargets-otter>=25.0.2",
|
|
14
14
|
"google-cloud-storage>=3.1.1",
|
|
15
|
+
"gcsfs>=2025.7.0",
|
|
15
16
|
]
|
|
16
17
|
readme = "README.md"
|
|
17
18
|
requires-python = ">=3.13"
|
|
@@ -75,6 +76,12 @@ allow-direct-references = true
|
|
|
75
76
|
[tool.hatch.build.targets.wheel]
|
|
76
77
|
packages = ["src/gentroutils"]
|
|
77
78
|
|
|
79
|
+
|
|
80
|
+
# Ignore the polars x GCS dependency (gcsfs) that is not imported in code
|
|
81
|
+
[tool.deptry.per_rule_ignores]
|
|
82
|
+
DEP002 = ["gcsfs"]
|
|
83
|
+
|
|
84
|
+
|
|
78
85
|
# test configuration
|
|
79
86
|
[tool.pytest.ini_options]
|
|
80
87
|
markers = ["integration_test: Intergration tests", "unit_test: Unit tests"]
|
|
@@ -32,12 +32,24 @@ class FTPtoGCPTransferableObject(TransferableObject):
|
|
|
32
32
|
async with aioftp.Client.context(ftp_obj.server, user="anonymous", password="anonymous") as ftp: # noqa: S106
|
|
33
33
|
bucket = storage.Client().bucket(gcs_obj.bucket)
|
|
34
34
|
blob = bucket.blob(gcs_obj.object)
|
|
35
|
-
logger.info(f"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(pwd))
|
|
35
|
+
logger.info(f"Searching for the release date in the provided ftp path: {ftp_obj.base_dir}.")
|
|
36
|
+
dir_match = re.match(r"^.*(?P<release_date>\d{4}\/\d{2}\/\d{2}){1}$", str(ftp_obj.base_dir))
|
|
37
|
+
|
|
39
38
|
if dir_match:
|
|
40
|
-
logger.info(f"Found release date
|
|
39
|
+
logger.info(f"Found release date to search in the ftp {dir_match.group('release_date')}.")
|
|
40
|
+
release_date = dir_match.group("release_date")
|
|
41
|
+
try:
|
|
42
|
+
await ftp.change_directory(ftp_obj.base_dir)
|
|
43
|
+
except aioftp.StatusCodeError as e:
|
|
44
|
+
logger.error(f"Failed to change directory to {ftp_obj.base_dir}: {e}")
|
|
45
|
+
logger.warning("Attempting to load the `latest` release.")
|
|
46
|
+
ftp_obj = FTPPath(self.source.replace(release_date, "latest"))
|
|
47
|
+
try:
|
|
48
|
+
await ftp.change_directory(ftp_obj.base_dir)
|
|
49
|
+
except aioftp.StatusCodeError as e:
|
|
50
|
+
logger.error(f"Failed to find the latest release under {ftp_obj}")
|
|
51
|
+
raise
|
|
52
|
+
|
|
41
53
|
buffer = io.BytesIO()
|
|
42
54
|
stream = await ftp.download_stream(ftp_obj.filename)
|
|
43
55
|
async with stream:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
+
from collections import defaultdict
|
|
6
7
|
from dataclasses import dataclass
|
|
7
8
|
from datetime import date
|
|
8
9
|
|
|
@@ -13,7 +14,12 @@ from pydantic import AliasPath, BaseModel, Field
|
|
|
13
14
|
from gentroutils.errors import GentroutilsError, GentroutilsErrorMessage
|
|
14
15
|
|
|
15
16
|
|
|
16
|
-
|
|
17
|
+
class KeepMissing(defaultdict[str, str]):
|
|
18
|
+
def __missing__(self, key):
|
|
19
|
+
return "{" + key + "}"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def destination_validator(path: str) -> str:
|
|
17
23
|
"""Ensure that the destination path contains a template for the release date."""
|
|
18
24
|
if "{release_date}" not in path:
|
|
19
25
|
raise GentroutilsError(GentroutilsErrorMessage.MISSING_RELEASE_DATE_TEMPLATE, release_date="{release_date}")
|
|
@@ -34,7 +40,7 @@ class TemplateDestination:
|
|
|
34
40
|
|
|
35
41
|
This method returns a new TemplateDestination object (not a copy of the current one) with the formatted destination.
|
|
36
42
|
"""
|
|
37
|
-
return TemplateDestination(self.destination.
|
|
43
|
+
return TemplateDestination(self.destination.format_map(KeepMissing(**substitutions)), True)
|
|
38
44
|
|
|
39
45
|
|
|
40
46
|
class GwasCatalogReleaseInfo(BaseModel):
|
|
@@ -83,6 +89,7 @@ class GwasCatalogReleaseInfo(BaseModel):
|
|
|
83
89
|
@classmethod
|
|
84
90
|
def from_uri(cls, uri: str) -> GwasCatalogReleaseInfo:
|
|
85
91
|
"""Fetch the release information from the specified URI."""
|
|
92
|
+
logger.debug(f"Fetching release info from {uri}")
|
|
86
93
|
try:
|
|
87
94
|
return asyncio.run(cls._get_release_info(uri))
|
|
88
95
|
except aiohttp.ClientError as e:
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""Module to handle the crawling of GWAS Catalog release information."""
|
|
2
2
|
|
|
3
3
|
import tempfile
|
|
4
|
-
from functools import cached_property
|
|
5
4
|
from pathlib import Path
|
|
6
5
|
from typing import Annotated, Any, Self
|
|
7
6
|
|
|
@@ -9,9 +8,10 @@ from loguru import logger
|
|
|
9
8
|
from otter.storage import get_remote_storage
|
|
10
9
|
from otter.task.model import Spec, Task, TaskContext
|
|
11
10
|
from otter.task.task_reporter import report
|
|
12
|
-
from pydantic import AfterValidator
|
|
11
|
+
from pydantic import AfterValidator
|
|
13
12
|
|
|
14
|
-
from gentroutils.
|
|
13
|
+
from gentroutils.io.path import GCSPath
|
|
14
|
+
from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, destination_validator
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class CrawlSpec(Spec):
|
|
@@ -68,7 +68,7 @@ class CrawlSpec(Spec):
|
|
|
68
68
|
stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
69
69
|
"""The URI to crawl the release statistics information from."""
|
|
70
70
|
|
|
71
|
-
destination_template: Annotated[str, AfterValidator(
|
|
71
|
+
destination_template: Annotated[str, AfterValidator(destination_validator)]
|
|
72
72
|
"""The destination path to save the release information.
|
|
73
73
|
This path should always be a template string that includes `{release_date}`.
|
|
74
74
|
For example, `gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json`.
|
|
@@ -91,8 +91,6 @@ class CrawlSpec(Spec):
|
|
|
91
91
|
promoting the release as the latest release.
|
|
92
92
|
"""
|
|
93
93
|
|
|
94
|
-
@computed_field # type: ignore[prop-decorator]
|
|
95
|
-
@cached_property
|
|
96
94
|
def destinations(self) -> list[TemplateDestination]:
|
|
97
95
|
"""Get the list of destinations templates where the release information will be saved.
|
|
98
96
|
|
|
@@ -105,17 +103,17 @@ class CrawlSpec(Spec):
|
|
|
105
103
|
1. The destination template with the release date substituted.
|
|
106
104
|
2. The destination with the release date substituted to `latest`.
|
|
107
105
|
"""
|
|
108
|
-
d1 = self.destination_template
|
|
106
|
+
d1 = TemplateDestination(self.destination_template, False)
|
|
109
107
|
if self.promote:
|
|
110
|
-
d2 =
|
|
111
|
-
return [
|
|
112
|
-
return [
|
|
108
|
+
d2 = d1.format({"release_date": "latest"})
|
|
109
|
+
return [d1, d2]
|
|
110
|
+
return [d1]
|
|
113
111
|
|
|
114
112
|
def substituted_destinations(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
|
|
115
113
|
"""Safely parse the destination name to ensure it is valid."""
|
|
116
114
|
substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
|
|
117
115
|
return [
|
|
118
|
-
d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
|
|
116
|
+
d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
|
|
119
117
|
]
|
|
120
118
|
|
|
121
119
|
def model_post_init(self, __context: Any) -> None:
|
|
@@ -141,9 +139,10 @@ class Crawl(Task):
|
|
|
141
139
|
logger.info(f"Destinations for release information: {destinations}")
|
|
142
140
|
for destination in destinations:
|
|
143
141
|
storage = get_remote_storage(destination)
|
|
142
|
+
assert "gs://" in destination, f"Invalid GCS path in destination template: {destination}"
|
|
144
143
|
storage.upload(Path(source.name), destination)
|
|
145
144
|
logger.info(f"Release information written to {destination}")
|
|
146
|
-
|
|
145
|
+
return self
|
|
147
146
|
|
|
148
147
|
@report
|
|
149
148
|
def run(self) -> Self:
|
|
@@ -3,16 +3,16 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from datetime import date
|
|
6
|
-
from functools import cached_property
|
|
7
6
|
from typing import Annotated, Any, Self
|
|
8
7
|
|
|
8
|
+
from loguru import logger
|
|
9
9
|
from otter.task.model import Spec, Task, TaskContext
|
|
10
10
|
from otter.task.task_reporter import report
|
|
11
|
-
from pydantic import AfterValidator
|
|
11
|
+
from pydantic import AfterValidator
|
|
12
12
|
|
|
13
13
|
from gentroutils.io.transfer.polars_to_gcs import PolarsDataFrameToGCSTransferableObject
|
|
14
14
|
from gentroutils.parsers.curation import GWASCatalogCuration
|
|
15
|
-
from gentroutils.tasks import TemplateDestination,
|
|
15
|
+
from gentroutils.tasks import TemplateDestination, destination_validator
|
|
16
16
|
from gentroutils.transfer import TransferManager
|
|
17
17
|
|
|
18
18
|
|
|
@@ -50,14 +50,12 @@ class CurationSpec(Spec):
|
|
|
50
50
|
studies: str
|
|
51
51
|
"""The path to the studies data."""
|
|
52
52
|
|
|
53
|
-
destination_template: Annotated[str, AfterValidator(
|
|
53
|
+
destination_template: Annotated[str, AfterValidator(destination_validator)]
|
|
54
54
|
"""The destination path for the curation data."""
|
|
55
55
|
|
|
56
56
|
promote: bool = False
|
|
57
57
|
"""Whether to promote the curation data to the latest version."""
|
|
58
58
|
|
|
59
|
-
@computed_field # type: ignore[prop-decorator]
|
|
60
|
-
@cached_property
|
|
61
59
|
def destinations(self) -> list[TemplateDestination]:
|
|
62
60
|
"""Get the list of destinations templates where the release information will be saved.
|
|
63
61
|
|
|
@@ -80,7 +78,7 @@ class CurationSpec(Spec):
|
|
|
80
78
|
"""Safely parse the destination name to ensure it is valid."""
|
|
81
79
|
substitutions = {"release_date": release_date.strftime("%Y%m%d")}
|
|
82
80
|
return [
|
|
83
|
-
d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
|
|
81
|
+
d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
|
|
84
82
|
]
|
|
85
83
|
|
|
86
84
|
def model_post_init(self, __context: Any) -> None:
|
|
@@ -99,9 +97,13 @@ class Curation(Task):
|
|
|
99
97
|
@report
|
|
100
98
|
def run(self) -> Self:
|
|
101
99
|
"""Run the curation task."""
|
|
100
|
+
logger.info("Starting curation task.")
|
|
102
101
|
release_date = date.today()
|
|
102
|
+
logger.debug(f"Using release date: {release_date}")
|
|
103
103
|
destinations = self.spec.substituted_destinations(release_date)
|
|
104
|
+
logger.debug(f"Destinations for curation data: {destinations}")
|
|
104
105
|
curation = GWASCatalogCuration.from_prev_curation(self.spec.previous_curation, self.spec.studies)
|
|
106
|
+
logger.debug(f"Curation result preview:\n{curation.result.head()}")
|
|
105
107
|
transfer_objects = [
|
|
106
108
|
PolarsDataFrameToGCSTransferableObject(source=curation.result, destination=d) for d in destinations
|
|
107
109
|
]
|
|
@@ -1,15 +1,14 @@
|
|
|
1
1
|
"""Module to handle the fetching of GWAS Catalog release files."""
|
|
2
2
|
|
|
3
|
-
from functools import cached_property
|
|
4
3
|
from typing import Annotated, Any, Self
|
|
5
4
|
|
|
6
5
|
from loguru import logger
|
|
7
6
|
from otter.task.model import Spec, Task, TaskContext
|
|
8
7
|
from otter.task.task_reporter import report
|
|
9
|
-
from pydantic import AfterValidator
|
|
8
|
+
from pydantic import AfterValidator
|
|
10
9
|
|
|
11
10
|
from gentroutils.io.transfer import FTPtoGCPTransferableObject
|
|
12
|
-
from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination,
|
|
11
|
+
from gentroutils.tasks import GwasCatalogReleaseInfo, TemplateDestination, destination_validator
|
|
13
12
|
from gentroutils.transfer import TransferManager
|
|
14
13
|
|
|
15
14
|
MAX_CONCURRENT_CONNECTIONS = 10
|
|
@@ -57,10 +56,10 @@ class FetchSpec(Spec):
|
|
|
57
56
|
stats_uri: str = "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
58
57
|
"""The URI to crawl the release statistics information from."""
|
|
59
58
|
|
|
60
|
-
source_template: Annotated[str, AfterValidator(
|
|
59
|
+
source_template: Annotated[str, AfterValidator(destination_validator)]
|
|
61
60
|
"""The template URI of the file to download."""
|
|
62
61
|
|
|
63
|
-
destination_template: Annotated[str, AfterValidator(
|
|
62
|
+
destination_template: Annotated[str, AfterValidator(destination_validator)]
|
|
64
63
|
"""The template URI to upload the file to."""
|
|
65
64
|
|
|
66
65
|
promote: bool = False
|
|
@@ -78,8 +77,6 @@ class FetchSpec(Spec):
|
|
|
78
77
|
promoting the release as the latest release.
|
|
79
78
|
"""
|
|
80
79
|
|
|
81
|
-
@computed_field # type: ignore[prop-decorator]
|
|
82
|
-
@cached_property
|
|
83
80
|
def destinations(self) -> list[TemplateDestination]:
|
|
84
81
|
"""Get the list of destinations templates where the release information will be saved.
|
|
85
82
|
|
|
@@ -102,7 +99,7 @@ class FetchSpec(Spec):
|
|
|
102
99
|
"""Safely parse the destination name to ensure it is valid."""
|
|
103
100
|
substitutions = {"release_date": release_info.strfmt("%Y%m%d")}
|
|
104
101
|
return [
|
|
105
|
-
d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations
|
|
102
|
+
d.format(substitutions).destination if not d.is_substituted else d.destination for d in self.destinations()
|
|
106
103
|
]
|
|
107
104
|
|
|
108
105
|
def substituted_sources(self, release_info: GwasCatalogReleaseInfo) -> list[str]:
|
|
@@ -70,14 +70,13 @@ class TestFTPtoGCPTransferableObject:
|
|
|
70
70
|
|
|
71
71
|
# Create and execute transfer
|
|
72
72
|
obj = FTPtoGCPTransferableObject(
|
|
73
|
-
source="ftp://example.com/
|
|
73
|
+
source="ftp://example.com/2025/12/12/file.txt", destination="gs://test-bucket/file.txt"
|
|
74
74
|
)
|
|
75
75
|
await obj.transfer()
|
|
76
76
|
|
|
77
77
|
# Verify FTP operations
|
|
78
78
|
mock_ftp_context.assert_called_once_with("example.com", user="anonymous", password="anonymous") # noqa: S106
|
|
79
|
-
mock_ftp_client.change_directory.
|
|
80
|
-
mock_ftp_client.get_current_directory.assert_called_once()
|
|
79
|
+
mock_ftp_client.change_directory.assert_called()
|
|
81
80
|
mock_ftp_client.download_stream.assert_called_once_with("file.txt")
|
|
82
81
|
|
|
83
82
|
# Verify GCS operations
|
|
@@ -88,7 +88,7 @@ class TestCrawlSpec:
|
|
|
88
88
|
|
|
89
89
|
def test_crawl_spec_destinations_with_promote(self, crawl_spec):
|
|
90
90
|
"""Test destinations property when promote=True."""
|
|
91
|
-
destinations = crawl_spec.destinations
|
|
91
|
+
destinations = crawl_spec.destinations()
|
|
92
92
|
assert len(destinations) == 2
|
|
93
93
|
assert destinations[0].destination == "gs://test-bucket/gwas/{release_date}/stats.json"
|
|
94
94
|
assert destinations[0].is_substituted is False
|
|
@@ -97,7 +97,7 @@ class TestCrawlSpec:
|
|
|
97
97
|
|
|
98
98
|
def test_crawl_spec_destinations_without_promote(self, crawl_spec_no_promote):
|
|
99
99
|
"""Test destinations property when promote=False."""
|
|
100
|
-
destinations = crawl_spec_no_promote.destinations
|
|
100
|
+
destinations = crawl_spec_no_promote.destinations()
|
|
101
101
|
assert len(destinations) == 1
|
|
102
102
|
assert destinations[0].destination == "gs://test-bucket/gwas/{release_date}/stats.json"
|
|
103
103
|
assert destinations[0].is_substituted is False
|
|
@@ -28,10 +28,11 @@ class TestCurationSpec:
|
|
|
28
28
|
assert curation_spec.studies == "gs://test-bucket/studies.json"
|
|
29
29
|
assert curation_spec.destination_template == "gs://test-bucket/{release_date}/curation.json"
|
|
30
30
|
assert curation_spec.promote is True
|
|
31
|
-
|
|
32
|
-
assert
|
|
33
|
-
assert
|
|
34
|
-
assert
|
|
31
|
+
destinations = curation_spec.destinations()
|
|
32
|
+
assert destinations[0].destination == "gs://test-bucket/{release_date}/curation.json"
|
|
33
|
+
assert destinations[0].is_substituted is False
|
|
34
|
+
assert destinations[1].destination == "gs://test-bucket/latest/curation.json"
|
|
35
|
+
assert destinations[1].is_substituted is True
|
|
35
36
|
|
|
36
37
|
def test_curation_spec_requires_release_date_template(self):
|
|
37
38
|
"""Test that CurationSpec validates release date template."""
|
|
@@ -46,11 +46,12 @@ class TestFetchSpec:
|
|
|
46
46
|
assert fetch_spec.source_template == "https://example.com/{release_date}/data.json"
|
|
47
47
|
assert fetch_spec.destination_template == "gs://test-bucket/{release_date}/data.json"
|
|
48
48
|
assert fetch_spec.promote is True
|
|
49
|
-
|
|
50
|
-
assert
|
|
51
|
-
assert
|
|
52
|
-
assert
|
|
53
|
-
assert
|
|
49
|
+
destinations = fetch_spec.destinations()
|
|
50
|
+
assert len(destinations) == 2
|
|
51
|
+
assert destinations[0].destination == "gs://test-bucket/{release_date}/data.json"
|
|
52
|
+
assert destinations[0].is_substituted is False
|
|
53
|
+
assert destinations[1].destination == "gs://test-bucket/latest/data.json"
|
|
54
|
+
assert destinations[1].is_substituted is True
|
|
54
55
|
|
|
55
56
|
def test_initialization_no_promote(self):
|
|
56
57
|
"""Test FetchSpec initialization with promote = False."""
|
|
@@ -65,9 +66,11 @@ class TestFetchSpec:
|
|
|
65
66
|
assert fetch_spec.source_template == "https://example.com/{release_date}/data.json"
|
|
66
67
|
assert fetch_spec.destination_template == "gs://test-bucket/{release_date}/data.json"
|
|
67
68
|
assert fetch_spec.promote is False
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
assert
|
|
69
|
+
|
|
70
|
+
destinations = fetch_spec.destinations()
|
|
71
|
+
assert len(destinations) == 1
|
|
72
|
+
assert destinations[0].destination == "gs://test-bucket/{release_date}/data.json"
|
|
73
|
+
assert destinations[0].is_substituted is False
|
|
71
74
|
|
|
72
75
|
def test_requires_release_date_template(self):
|
|
73
76
|
"""Test that FetchSpec validates release date template."""
|
|
@@ -457,6 +457,15 @@ wheels = [
|
|
|
457
457
|
{ url = "https://files.pythonhosted.org/packages/b9/5c/a3d95dc1ec6cdeb032d789b552ecc76effa3557ea9186e1566df6aac18df/fs-2.4.16-py2.py3-none-any.whl", hash = "sha256:660064febbccda264ae0b6bace80a8d1be9e089e0a5eb2427b7d517f9a91545c", size = 135261, upload-time = "2022-05-02T09:25:52.363Z" },
|
|
458
458
|
]
|
|
459
459
|
|
|
460
|
+
[[package]]
|
|
461
|
+
name = "fsspec"
|
|
462
|
+
version = "2025.7.0"
|
|
463
|
+
source = { registry = "https://pypi.org/simple" }
|
|
464
|
+
sdist = { url = "https://files.pythonhosted.org/packages/8b/02/0835e6ab9cfc03916fe3f78c0956cfcdb6ff2669ffa6651065d5ebf7fc98/fsspec-2025.7.0.tar.gz", hash = "sha256:786120687ffa54b8283d942929540d8bc5ccfa820deb555a2b5d0ed2b737bf58", size = 304432, upload-time = "2025-07-15T16:05:21.19Z" }
|
|
465
|
+
wheels = [
|
|
466
|
+
{ url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" },
|
|
467
|
+
]
|
|
468
|
+
|
|
460
469
|
[[package]]
|
|
461
470
|
name = "gcloud-storage-emulator"
|
|
462
471
|
version = "0.5.0"
|
|
@@ -471,17 +480,36 @@ wheels = [
|
|
|
471
480
|
{ url = "https://files.pythonhosted.org/packages/f1/46/b70e4ee5df71b4253c146c3792703c1e63c66c1e63d9bd64dbc07a1c456e/gcloud_storage_emulator-0.5.0-py3-none-any.whl", hash = "sha256:69fe95ab57fc45acdeba7b8dfc44c12b9e7921e640776f7ec79d675d753b03cf", size = 19959, upload-time = "2021-09-29T09:03:26.099Z" },
|
|
472
481
|
]
|
|
473
482
|
|
|
483
|
+
[[package]]
|
|
484
|
+
name = "gcsfs"
|
|
485
|
+
version = "2025.7.0"
|
|
486
|
+
source = { registry = "https://pypi.org/simple" }
|
|
487
|
+
dependencies = [
|
|
488
|
+
{ name = "aiohttp" },
|
|
489
|
+
{ name = "decorator" },
|
|
490
|
+
{ name = "fsspec" },
|
|
491
|
+
{ name = "google-auth" },
|
|
492
|
+
{ name = "google-auth-oauthlib" },
|
|
493
|
+
{ name = "google-cloud-storage" },
|
|
494
|
+
{ name = "requests" },
|
|
495
|
+
]
|
|
496
|
+
sdist = { url = "https://files.pythonhosted.org/packages/5b/d7/5eafe9f09f1bb09433a473cef7984cd52c398592c8fd09974e0ad87cfea4/gcsfs-2025.7.0.tar.gz", hash = "sha256:ad3ff66cf189ae8fc375ac8a2af409003dbca02357621cb94a66e457e02ba420", size = 82659, upload-time = "2025-07-15T16:49:21.647Z" }
|
|
497
|
+
wheels = [
|
|
498
|
+
{ url = "https://files.pythonhosted.org/packages/21/f5/54bccbee01efbc25581db6aafefb6f6c277d880930f7a083b10052382463/gcsfs-2025.7.0-py2.py3-none-any.whl", hash = "sha256:653503331d58cb02bb34e725d4595d166e93f7f2f3ff88e4c66ef535ae66eae5", size = 36815, upload-time = "2025-07-15T16:49:20.333Z" },
|
|
499
|
+
]
|
|
500
|
+
|
|
474
501
|
[[package]]
|
|
475
502
|
name = "gentroutils"
|
|
476
|
-
version = "
|
|
503
|
+
version = "3.0.0"
|
|
477
504
|
source = { editable = "." }
|
|
478
505
|
dependencies = [
|
|
479
506
|
{ name = "aioftp" },
|
|
480
507
|
{ name = "aiohttp" },
|
|
508
|
+
{ name = "gcsfs" },
|
|
481
509
|
{ name = "google-cloud-storage" },
|
|
482
510
|
{ name = "loguru" },
|
|
483
511
|
{ name = "opentargets-otter" },
|
|
484
|
-
{ name = "polars" },
|
|
512
|
+
{ name = "polars", extra = ["fsspec"] },
|
|
485
513
|
{ name = "pydantic" },
|
|
486
514
|
{ name = "tqdm" },
|
|
487
515
|
]
|
|
@@ -526,10 +554,11 @@ dev = [
|
|
|
526
554
|
requires-dist = [
|
|
527
555
|
{ name = "aioftp", specifier = ">=0.25.1" },
|
|
528
556
|
{ name = "aiohttp", specifier = ">=3.11.18" },
|
|
557
|
+
{ name = "gcsfs", specifier = ">=2025.7.0" },
|
|
529
558
|
{ name = "google-cloud-storage", specifier = ">=3.1.1" },
|
|
530
559
|
{ name = "loguru", specifier = ">=0.7.3" },
|
|
531
560
|
{ name = "opentargets-otter", specifier = ">=25.0.2" },
|
|
532
|
-
{ name = "polars", specifier = ">=1.31.0" },
|
|
561
|
+
{ name = "polars", extras = ["fsspec", "gcs"], specifier = ">=1.31.0" },
|
|
533
562
|
{ name = "pydantic", specifier = ">=2.10.6" },
|
|
534
563
|
{ name = "tqdm", specifier = ">=4.67.1" },
|
|
535
564
|
]
|
|
@@ -624,6 +653,19 @@ wheels = [
|
|
|
624
653
|
{ url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" },
|
|
625
654
|
]
|
|
626
655
|
|
|
656
|
+
[[package]]
|
|
657
|
+
name = "google-auth-oauthlib"
|
|
658
|
+
version = "1.2.2"
|
|
659
|
+
source = { registry = "https://pypi.org/simple" }
|
|
660
|
+
dependencies = [
|
|
661
|
+
{ name = "google-auth" },
|
|
662
|
+
{ name = "requests-oauthlib" },
|
|
663
|
+
]
|
|
664
|
+
sdist = { url = "https://files.pythonhosted.org/packages/fb/87/e10bf24f7bcffc1421b84d6f9c3377c30ec305d082cd737ddaa6d8f77f7c/google_auth_oauthlib-1.2.2.tar.gz", hash = "sha256:11046fb8d3348b296302dd939ace8af0a724042e8029c1b872d87fabc9f41684", size = 20955, upload-time = "2025-04-22T16:40:29.172Z" }
|
|
665
|
+
wheels = [
|
|
666
|
+
{ url = "https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl", hash = "sha256:fd619506f4b3908b5df17b65f39ca8d66ea56986e5472eb5978fd8f3786f00a2", size = 19072, upload-time = "2025-04-22T16:40:28.174Z" },
|
|
667
|
+
]
|
|
668
|
+
|
|
627
669
|
[[package]]
|
|
628
670
|
name = "google-cloud-core"
|
|
629
671
|
version = "2.4.3"
|
|
@@ -1038,6 +1080,15 @@ wheels = [
|
|
|
1038
1080
|
{ url = "https://files.pythonhosted.org/packages/d2/47/ca494f6f2366f17bd2a79b6c1468f6c441a447b017fbdf600aa40b5d27ac/numpy_typing_compat-2.3.20250730-py3-none-any.whl", hash = "sha256:9ab0cd4bb1b4c31debf0bd745554c735a07a7bb3cc3801dc934b2b5856549612", size = 6057, upload-time = "2025-07-30T01:36:06.027Z" },
|
|
1039
1081
|
]
|
|
1040
1082
|
|
|
1083
|
+
[[package]]
|
|
1084
|
+
name = "oauthlib"
|
|
1085
|
+
version = "3.3.1"
|
|
1086
|
+
source = { registry = "https://pypi.org/simple" }
|
|
1087
|
+
sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918, upload-time = "2025-06-19T22:48:08.269Z" }
|
|
1088
|
+
wheels = [
|
|
1089
|
+
{ url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" },
|
|
1090
|
+
]
|
|
1091
|
+
|
|
1041
1092
|
[[package]]
|
|
1042
1093
|
name = "opentargets-otter"
|
|
1043
1094
|
version = "25.0.4"
|
|
@@ -1155,6 +1206,11 @@ wheels = [
|
|
|
1155
1206
|
{ url = "https://files.pythonhosted.org/packages/b8/53/4eaaa4f219add46594db21a05a9a5629ec6af20bd859a90668d5a1448abc/polars-1.32.2-cp39-abi3-win_arm64.whl", hash = "sha256:cd390364f6f3927474bd0aed255103195b9d2b3eef0f0c5bb429db5e6311615e", size = 34059100, upload-time = "2025-08-07T10:50:26.445Z" },
|
|
1156
1207
|
]
|
|
1157
1208
|
|
|
1209
|
+
[package.optional-dependencies]
|
|
1210
|
+
fsspec = [
|
|
1211
|
+
{ name = "fsspec" },
|
|
1212
|
+
]
|
|
1213
|
+
|
|
1158
1214
|
[[package]]
|
|
1159
1215
|
name = "pre-commit"
|
|
1160
1216
|
version = "4.3.0"
|
|
@@ -1541,6 +1597,19 @@ wheels = [
|
|
|
1541
1597
|
{ url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" },
|
|
1542
1598
|
]
|
|
1543
1599
|
|
|
1600
|
+
[[package]]
|
|
1601
|
+
name = "requests-oauthlib"
|
|
1602
|
+
version = "2.0.0"
|
|
1603
|
+
source = { registry = "https://pypi.org/simple" }
|
|
1604
|
+
dependencies = [
|
|
1605
|
+
{ name = "oauthlib" },
|
|
1606
|
+
{ name = "requests" },
|
|
1607
|
+
]
|
|
1608
|
+
sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650, upload-time = "2024-03-22T20:32:29.939Z" }
|
|
1609
|
+
wheels = [
|
|
1610
|
+
{ url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179, upload-time = "2024-03-22T20:32:28.055Z" },
|
|
1611
|
+
]
|
|
1612
|
+
|
|
1544
1613
|
[[package]]
|
|
1545
1614
|
name = "requests-toolbelt"
|
|
1546
1615
|
version = "1.0.0"
|
gentroutils-2.0.0/.RData
DELETED
|
Binary file
|
gentroutils-2.0.0/.Rhistory
DELETED
|
File without changes
|
gentroutils-2.0.0/config.yaml
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
work_path: ./work
|
|
3
|
-
log_level: DEBUG
|
|
4
|
-
scratchpad:
|
|
5
|
-
steps:
|
|
6
|
-
gwas_catalog_release:
|
|
7
|
-
- name: crawl release metadata
|
|
8
|
-
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
9
|
-
destination_template: 'gs://gwas_catalog_inputs/gentroutils/{release_date}/stats.json'
|
|
10
|
-
promote: 'true'
|
|
11
|
-
- name: fetch associations
|
|
12
|
-
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
13
|
-
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-associations_ontology-annotated.tsv"
|
|
14
|
-
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_associations_ontology_annotated.tsv"
|
|
15
|
-
promote: true
|
|
16
|
-
- name: fetch studies
|
|
17
|
-
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
18
|
-
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-download-studies-v1.0.3.1.txt"
|
|
19
|
-
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_download_studies.tsv"
|
|
20
|
-
promote: true
|
|
21
|
-
- name: fetch ancestries
|
|
22
|
-
stats_uri: "https://www.ebi.ac.uk/gwas/api/search/stats"
|
|
23
|
-
source_template: "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/{release_date}/gwas-catalog-download-ancestries-v1.0.3.1.txt"
|
|
24
|
-
destination_template: "gs://gwas_catalog_inputs/gentroutils/{release_date}/gwas_catalog_download_ancestries.tsv"
|
|
25
|
-
promote: true
|
|
26
|
-
- name: curation study
|
|
27
|
-
requires:
|
|
28
|
-
- fetch studies
|
|
29
|
-
previous_curation: gs://gwas_catalog_inputs/curation/latest/curated/GWAS_Catalog_study_curation.tsv
|
|
30
|
-
studies: gs://gwas_catalog_inputs/gentroutils/latest/gwas_catalog_download_studies.tsv
|
|
31
|
-
destination_template: ./work/curation_{release_date}.tsv
|
|
32
|
-
promote: true
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_analysisFlag_type.tsv
RENAMED
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_analysisFlag_value.tsv
RENAMED
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_columns_curation.tsv
RENAMED
|
File without changes
|
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_pubmedId_type.tsv
RENAMED
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyId_type.tsv
RENAMED
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyId_value.tsv
RENAMED
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyType_type.tsv
RENAMED
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/incorrect_studyType_value.tsv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gentroutils-2.0.0 → gentroutils-3.0.0}/tests/data/manual_curation/null_value_in_studyId.tsv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|