gentroutils 0.1.5__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gentroutils/__init__.py +3 -1
- gentroutils/commands/__init__.py +5 -1
- gentroutils/commands/update_gwas_curation_metadata.py +12 -38
- gentroutils/commands/utils.py +51 -34
- gentroutils/commands/validate_gwas_curation.py +165 -0
- gentroutils/py.typed +0 -0
- gentroutils-1.5.0.dist-info/METADATA +135 -0
- gentroutils-1.5.0.dist-info/RECORD +11 -0
- {gentroutils-0.1.5.dist-info → gentroutils-1.5.0.dist-info}/WHEEL +1 -1
- gentroutils-0.1.5.dist-info/METADATA +0 -71
- gentroutils-0.1.5.dist-info/RECORD +0 -9
- {gentroutils-0.1.5.dist-info → gentroutils-1.5.0.dist-info}/entry_points.txt +0 -0
- {gentroutils-0.1.5.dist-info → gentroutils-1.5.0.dist-info}/licenses/LICENSE +0 -0
gentroutils/__init__.py
CHANGED
|
@@ -8,8 +8,9 @@ import time
|
|
|
8
8
|
import click
|
|
9
9
|
import pyfiglet
|
|
10
10
|
|
|
11
|
-
from gentroutils.commands
|
|
11
|
+
from gentroutils.commands import (
|
|
12
12
|
update_gwas_curation_metadata_command,
|
|
13
|
+
validate_gwas_curation,
|
|
13
14
|
)
|
|
14
15
|
from gentroutils.commands.utils import set_log_file, set_log_lvl, teardown_cli
|
|
15
16
|
|
|
@@ -40,5 +41,6 @@ def cli(ctx: click.Context, **kwargs: dict[str, str]) -> None:
|
|
|
40
41
|
|
|
41
42
|
|
|
42
43
|
cli.add_command(update_gwas_curation_metadata_command)
|
|
44
|
+
cli.add_command(validate_gwas_curation)
|
|
43
45
|
|
|
44
46
|
__all__ = ["cli"]
|
gentroutils/commands/__init__.py
CHANGED
|
@@ -3,5 +3,9 @@
|
|
|
3
3
|
from gentroutils.commands.update_gwas_curation_metadata import (
|
|
4
4
|
update_gwas_curation_metadata_command,
|
|
5
5
|
)
|
|
6
|
+
from gentroutils.commands.validate_gwas_curation import validate_gwas_curation
|
|
6
7
|
|
|
7
|
-
__all__ = [
|
|
8
|
+
__all__ = [
|
|
9
|
+
"update_gwas_curation_metadata_command",
|
|
10
|
+
"validate_gwas_curation",
|
|
11
|
+
]
|
|
@@ -21,27 +21,15 @@ MAX_CONCURRENT_CONNECTIONS = 10
|
|
|
21
21
|
CURATED_INPUTS = (
|
|
22
22
|
(
|
|
23
23
|
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv",
|
|
24
|
-
"gs://
|
|
24
|
+
"gs://gwas_catalog_inputs/gwas_catalog_associations_ontology_annotated.tsv",
|
|
25
25
|
),
|
|
26
26
|
(
|
|
27
27
|
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt",
|
|
28
|
-
"gs://
|
|
29
|
-
),
|
|
30
|
-
(
|
|
31
|
-
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-unpublished-studies-v1.0.3.1.tsv",
|
|
32
|
-
"gs://gwas_catalog_data/curated_inputs/gwas_catalog_unpublished_studies.tsv",
|
|
28
|
+
"gs://gwas_catalog_inputs/gwas_catalog_download_studies.tsv",
|
|
33
29
|
),
|
|
34
30
|
(
|
|
35
31
|
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt",
|
|
36
|
-
"gs://
|
|
37
|
-
),
|
|
38
|
-
(
|
|
39
|
-
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-unpublished-ancestries-v1.0.3.1.tsv",
|
|
40
|
-
"gs://gwas_catalog_data/curated_inputs/gwas_catalog_unpublished_ancestries.tsv",
|
|
41
|
-
),
|
|
42
|
-
(
|
|
43
|
-
"ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/harmonised_list.txt",
|
|
44
|
-
"gs://gwas_catalog_data/curated_inputs/harmonised_list.txt",
|
|
32
|
+
"gs://gwas_catalog_inputs/gwas_catalog_download_ancestries.tsv",
|
|
45
33
|
),
|
|
46
34
|
)
|
|
47
35
|
|
|
@@ -75,9 +63,7 @@ async def update_gwas_curation_metadata_command(
|
|
|
75
63
|
This is the script to fetch the latest GWAS Catalog data files that include:
|
|
76
64
|
- [x] gwas-catalog-associations_ontology-annotated.tsv - list of associations with ontology annotations by GWAS Catalog
|
|
77
65
|
- [x] gwas-catalog-download-studies-v1.0.3.1.txt - list of published studies by GWAS Catalog
|
|
78
|
-
- [x] gwas-catalog-unpublished-studies-v1.0.3.1.tsv - list of unpublished studies by GWAS Catalog
|
|
79
66
|
- [x] gwas-catalog-download-ancestries-v1.0.3.1.txt - list of published studies by GWAS Catalog
|
|
80
|
-
- [x] gwas-catalog-unpublished-ancestries-v1.0.3.1.tsv - list of unpublished studies by GWAS Catalog
|
|
81
67
|
|
|
82
68
|
\b
|
|
83
69
|
By default all GWAS Catalog data files are uploaded from GWAS Catalog FTP server to Open Targets GCP bucket.
|
|
@@ -86,13 +72,9 @@ async def update_gwas_curation_metadata_command(
|
|
|
86
72
|
|
|
87
73
|
\b
|
|
88
74
|
gentroutils --log-file gs://gwas_catalog_data/curated_inputs/log.txt update-gwas-curation-metadata \\
|
|
89
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv gs://gwas_catalog_data/
|
|
90
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt gs://
|
|
91
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-
|
|
92
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt gs://gwas_catalog_data/curated_inputs/gwas_catalog_download_ancestries.tsv \\
|
|
93
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-unpublished-ancestries-v1.0.3.1.tsv gs://gwas_catalog_data/curated_inputs/gwas_catalog_unpublished_ancestries.tsv \\
|
|
94
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/harmonised_list.txt gs://gwas_catalog_data/curated_inputs/harmonised_list.txt \\
|
|
95
|
-
-f https://raw.githubusercontent.com/opentargets/curation/master/genetics/GWAS_Catalog_study_curation.tsv gs://gwas_catalog_data/manifests/gwas_catalog_study_curation.tsv \\
|
|
75
|
+
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv gs://gwas_catalog_data/gwas_catalog_associations_ontology_annotated.tsv \\
|
|
76
|
+
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt gs://gwas_catalog_inputs/gwas_catalog_download_studies.tsv \\
|
|
77
|
+
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt gs://gwas_catalog_inputs/gwas_catalog_download_ancestries.tsv \\
|
|
96
78
|
-g https://www.ebi.ac.uk/gwas/api/search/stats
|
|
97
79
|
|
|
98
80
|
|
|
@@ -108,10 +90,7 @@ async def update_gwas_curation_metadata_command(
|
|
|
108
90
|
MAX_CONCURRENT_CONNECTIONS,
|
|
109
91
|
)
|
|
110
92
|
sys.exit(1)
|
|
111
|
-
uri_map = [
|
|
112
|
-
{"input": urlparse(ftp_file), "output": urlparse(gcp_file)}
|
|
113
|
-
for ftp_file, gcp_file in file_to_transfer
|
|
114
|
-
]
|
|
93
|
+
uri_map = [{"input": urlparse(ftp_file), "output": urlparse(gcp_file)} for ftp_file, gcp_file in file_to_transfer]
|
|
115
94
|
transfer_tasks = generate_transfer_tasks(uri_map, dry_run)
|
|
116
95
|
|
|
117
96
|
# capture latest release metadata
|
|
@@ -135,9 +114,7 @@ async def update_gwas_curation_metadata_command(
|
|
|
135
114
|
logger.info("gwas_curation_update step completed.")
|
|
136
115
|
|
|
137
116
|
|
|
138
|
-
def generate_transfer_tasks(
|
|
139
|
-
uri_map: list[dict[str, ParseResult]], dry_run: bool
|
|
140
|
-
) -> list[asyncio.Task[None]]:
|
|
117
|
+
def generate_transfer_tasks(uri_map: list[dict[str, ParseResult]], dry_run: bool) -> list[asyncio.Task[None]]:
|
|
141
118
|
"""Generate transfer tasks.
|
|
142
119
|
|
|
143
120
|
Args:
|
|
@@ -214,9 +191,7 @@ def generate_transfer_tasks(
|
|
|
214
191
|
return transfer_tasks
|
|
215
192
|
|
|
216
193
|
|
|
217
|
-
async def sync_from_http_to_gcp(
|
|
218
|
-
url: str, gcp_bucket: str, gcp_prefix: str, gcp_file: str, *, dry_run: bool = True
|
|
219
|
-
) -> None:
|
|
194
|
+
async def sync_from_http_to_gcp(url: str, gcp_bucket: str, gcp_prefix: str, gcp_file: str, *, dry_run: bool = True) -> None:
|
|
220
195
|
"""Sync file from HTTP and upload to GCP.
|
|
221
196
|
|
|
222
197
|
This function fetches the data from the provided HTTP URL and uploads the content
|
|
@@ -289,7 +264,8 @@ async def sync_from_ftp_to_gcp(
|
|
|
289
264
|
gcp_file,
|
|
290
265
|
)
|
|
291
266
|
return
|
|
292
|
-
with FTP(
|
|
267
|
+
with FTP() as ftp:
|
|
268
|
+
ftp.connect(ftp_server)
|
|
293
269
|
ftp.login()
|
|
294
270
|
bucket = storage.Client().bucket(gcp_bucket)
|
|
295
271
|
gcp_path = f"{gcp_prefix}/{gcp_file}" if gcp_prefix else gcp_file
|
|
@@ -300,9 +276,7 @@ async def sync_from_ftp_to_gcp(
|
|
|
300
276
|
if dir_match:
|
|
301
277
|
logger.info("Found release date!: %s", dir_match.group("release_date"))
|
|
302
278
|
buffer = io.BytesIO()
|
|
303
|
-
logger.info(
|
|
304
|
-
"Retrieving data from: ftp://%s/%s/%s.", ftp_server, ftp_prefix, ftp_file
|
|
305
|
-
)
|
|
279
|
+
logger.info("Retrieving data from: ftp://%s/%s/%s.", ftp_server, ftp_prefix, ftp_file)
|
|
306
280
|
ftp.retrbinary(f"RETR {ftp_file}", lambda x: buffer.write(x))
|
|
307
281
|
content = buffer.getvalue().decode("utf-8")
|
|
308
282
|
buffer.close()
|
gentroutils/commands/utils.py
CHANGED
|
@@ -6,6 +6,7 @@ import sys
|
|
|
6
6
|
import time
|
|
7
7
|
from functools import wraps
|
|
8
8
|
from pathlib import Path
|
|
9
|
+
from tempfile import NamedTemporaryFile
|
|
9
10
|
from urllib.parse import urlparse
|
|
10
11
|
|
|
11
12
|
import click
|
|
@@ -39,33 +40,43 @@ def set_log_file(ctx: click.Context, param: click.Option, log_file: str) -> str:
|
|
|
39
40
|
return ""
|
|
40
41
|
logger.info("Extracting log file from the %s", param)
|
|
41
42
|
upload_to_gcp = False
|
|
43
|
+
|
|
42
44
|
if "://" in log_file:
|
|
43
45
|
upload_to_gcp = True
|
|
46
|
+
ctx.obj["upload_to_gcp"] = upload_to_gcp
|
|
47
|
+
|
|
44
48
|
if upload_to_gcp:
|
|
45
49
|
parsed_uri = urlparse(log_file)
|
|
46
|
-
ctx.obj["gcp_log_file"] = log_file
|
|
47
50
|
if parsed_uri.scheme != "gs":
|
|
48
51
|
raise click.BadParameter("Only GCS is supported for logging upload")
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
52
|
+
tmp_file = NamedTemporaryFile(delete=False)
|
|
53
|
+
logger.info("Logging to temporary file %s", tmp_file.name)
|
|
54
|
+
handler = logging.FileHandler(tmp_file.name)
|
|
55
|
+
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
|
56
|
+
handler.setFormatter(formatter)
|
|
57
|
+
handler.setLevel(logging.DEBUG)
|
|
58
|
+
logger.addHandler(handler)
|
|
59
|
+
ctx.obj["local_log_file"] = tmp_file.name
|
|
60
|
+
ctx.obj["local_log_file_obj"] = tmp_file
|
|
61
|
+
ctx.obj["gcp_log_file"] = log_file
|
|
62
|
+
return tmp_file.name
|
|
63
|
+
|
|
64
|
+
else:
|
|
65
|
+
local_file = Path(log_file)
|
|
66
|
+
if local_file.exists() and local_file.is_dir():
|
|
67
|
+
raise click.BadParameter("Log file is a directory")
|
|
68
|
+
if local_file.exists() and local_file.is_file():
|
|
69
|
+
local_file.unlink()
|
|
70
|
+
if not local_file.exists():
|
|
71
|
+
local_file.parent.mkdir(parents=True, exist_ok=True)
|
|
72
|
+
local_file.touch()
|
|
73
|
+
logger.info("Logging to %s", local_file)
|
|
74
|
+
handler = logging.FileHandler(local_file)
|
|
75
|
+
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
|
76
|
+
handler.setFormatter(formatter)
|
|
77
|
+
handler.setLevel(logging.DEBUG)
|
|
78
|
+
logger.addHandler(handler)
|
|
79
|
+
return str(local_file)
|
|
69
80
|
|
|
70
81
|
|
|
71
82
|
def teardown_cli(ctx: click.Context) -> None:
|
|
@@ -80,16 +91,24 @@ def teardown_cli(ctx: click.Context) -> None:
|
|
|
80
91
|
if "upload_to_gcp" in ctx.obj and ctx.obj["upload_to_gcp"]:
|
|
81
92
|
gcp_file = ctx.obj["gcp_log_file"]
|
|
82
93
|
local_file = ctx.obj["local_log_file"]
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
94
|
+
with open(local_file, "r") as f:
|
|
95
|
+
content = f.read()
|
|
96
|
+
try:
|
|
97
|
+
client = storage.Client()
|
|
98
|
+
bucket_name = urlparse(gcp_file).netloc
|
|
99
|
+
bucket = client.bucket(bucket_name=bucket_name)
|
|
100
|
+
file_name = urlparse(gcp_file).path.lstrip("/")
|
|
101
|
+
blob = bucket.blob(file_name)
|
|
102
|
+
logger.info("Uploading %s to %s", local_file, gcp_file)
|
|
103
|
+
if ctx.obj["dry_run"]:
|
|
104
|
+
logger.info("Dry run, skipping the upload of the log file")
|
|
105
|
+
else:
|
|
106
|
+
blob.upload_from_string(content)
|
|
107
|
+
ctx.obj["local_log_file_obj"].close()
|
|
108
|
+
except Exception as e:
|
|
109
|
+
msg = f"Failed to upload log file to GCP {e}"
|
|
110
|
+
logger.error(click.style(msg, fg="red"))
|
|
111
|
+
logger.info("Finished, elapsed time %s seconds", time.time() - ctx.obj["execution_start"])
|
|
93
112
|
|
|
94
113
|
|
|
95
114
|
def set_log_lvl(_: click.Context, param: click.Option, value: int) -> int:
|
|
@@ -112,9 +131,7 @@ def set_log_lvl(_: click.Context, param: click.Option, value: int) -> int:
|
|
|
112
131
|
log_lvls = {0: logging.ERROR, 1: logging.INFO, 2: logging.DEBUG}
|
|
113
132
|
log_lvl = log_lvls.get(value, logging.DEBUG)
|
|
114
133
|
handler = logging.StreamHandler(sys.stdout)
|
|
115
|
-
formatter = logging.Formatter(
|
|
116
|
-
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
|
117
|
-
)
|
|
134
|
+
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
|
118
135
|
handler.setFormatter(formatter)
|
|
119
136
|
handler.setLevel(log_lvl)
|
|
120
137
|
logger.addHandler(handler)
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Validate gwas catalog manual curation file."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
import sys
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TypeVar
|
|
11
|
+
|
|
12
|
+
import click
|
|
13
|
+
import great_expectations as gx
|
|
14
|
+
from click import Argument, BadParameter
|
|
15
|
+
from great_expectations import expectations as gxe
|
|
16
|
+
|
|
17
|
+
T = TypeVar("T")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger("gentroutils")
|
|
21
|
+
DATASOURCE_NAME = "GWAS Catalog curation"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Lnum(Enum):
    """Enum base class whose member values can be exported as a list."""

    @classmethod
    def as_list(cls) -> list[T]:
        """Return the value of every member, in definition order.

        Returns:
            list[T]: Member values (not the members themselves).
        """
        return [member.value for member in cls]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ColumnSet(Lnum):
    """Names of the columns the curation file must contain.

    The full list of values is checked as an exact column set during validation.
    """

    STUDY_ID = "studyId"
    STUDY_TYPE = "studyType"
    FLAG = "analysisFlag"
    QUALITY_CONTROL = "qualityControl"
    IS_CURATED = "isCurated"
    PUBMED = "pubmedId"
    PUBLICATION_TITLE = "publicationTitle"
    TRAIT = "traitFromSource"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class StudyType(Lnum):
    """Values allowed in the `studyType` column."""

    NO_LICENCE = "no_licence"
    PQTL = "pQTL"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class AnalysisFlag(Lnum):
    """Values allowed in the `analysisFlag` column."""

    CC = "Case-case study"
    EXWAS = "ExWAS"
    GXE = "GxE"
    GXG = "GxG"
    METABOLITE = "Metabolite"
    MULTIVARIATE = "Multivariate analysis"
    NON_ADDITIVE = "Non-additive model"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class IsCurated(Lnum):
    """Values allowed in the `isCurated` column."""

    # The only member is a boolean, not a string.
    YES = True
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _validate_input_file_name(_: click.Context, param: Argument, value: str) -> str:
    """Check that the argument is a path to an existing local file.

    Intended to be used as a click argument callback.

    Args:
        _ (click.Context): Click context (unused).
        param (Argument): Click argument being validated; only used for logging.
        value (str): Path supplied on the command line.

    Returns:
        str: The path after passing it through `pathlib.Path`.

    Raises:
        BadParameter: If the value contains characters outside the allowed set,
            points to a directory, or does not exist.
    """
    logger.debug("Validating %s variable with %s value", param, value)
    # Relative paths are resolved against the working directory, so log it with a label.
    logger.info("Current working directory: %s", Path.cwd())

    # Only word characters, '*', '/', '.' and '-' are accepted. A ':' (as in 'gs://...')
    # or whitespace is outside this set, so URIs are rejected as non-local.
    if re.fullmatch(r"[\w*/.-]*", value) is None:
        logger.error("%s is not a local file.", value)
        raise BadParameter("Provided path is not local.")

    path = Path(value)
    if path.is_dir():
        logger.error("%s is a directory.", value)
        raise BadParameter("Provided path is a directory.")
    if not path.exists():
        logger.error("%s does not exist.", value)
        raise BadParameter("Provided path does not exist.")
    return str(path)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def split_source_path(source_path: str) -> tuple[Path, str]:
    """Split a path into its parent directory and its final component.

    Args:
        source_path (str): Path to split.

    Returns:
        tuple[Path, str]: Parent directory as a `Path` and the file name as a string.
    """
    location = Path(source_path)
    directory, filename = location.parent, location.name
    return directory, filename
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@click.command(name="validate-gwas-curation")
@click.argument("source_path", type=click.UNPROCESSED, callback=_validate_input_file_name)
@click.pass_context
def validate_gwas_curation(ctx: click.Context, source_path: str) -> None:  # noqa: DOC101, DOC103
    """Validate GWAS catalog manual curation file.

    \b
    gentroutils -vvv validate-gwas-curation GWAS_Catalog_study_curation.tsv

    """
    # NOTE: the docstring above is rendered by click as the command's help text,
    # so it is kept verbatim; implementation notes live in comments instead.
    #
    # Flow: read the TSV through an ephemeral great expectations context, run the
    # "Curation Validation" suite against it, log every failed expectation and
    # exit with status 1 when the validation as a whole did not succeed.
    logger.info("Using %s as curation input.", source_path)

    if ctx.obj["dry_run"]:
        logger.info("Running in --dry-run mode, exitting.")
        sys.exit(0)

    logger.info("Building great expectations context...")
    context = gx.get_context(mode="ephemeral")
    directory, file_name = split_source_path(source_path)
    data_source = context.data_sources.add_pandas_filesystem(name=DATASOURCE_NAME, base_directory=directory)

    logger.info("Using %s datasource.", DATASOURCE_NAME)
    logger.debug("Adding csv asset from %s", file_name)
    file_tsv_asset = data_source.add_csv_asset(name="manual_curation", sep="\t", header=0)
    logger.debug("Adding batch definion path %s", file_name)
    batch_definition = file_tsv_asset.add_batch_definition_path(name="manual_curation", path=file_name)

    logger.info("Building expectation suite...")

    suite = gx.ExpectationSuite(name="Curation Validation")
    context.suites.add(suite)

    # Expected type per column; the order is kept because it determines the order
    # in which expectations are added to the suite.
    column_types = (
        (ColumnSet.PUBMED, "int"),
        (ColumnSet.PUBLICATION_TITLE, "str"),
        (ColumnSet.TRAIT, "str"),
        (ColumnSet.STUDY_ID, "str"),
        (ColumnSet.STUDY_TYPE, "str"),
        (ColumnSet.FLAG, "str"),
    )
    expectations = [
        gxe.ExpectTableColumnsToMatchSet(column_set=ColumnSet.as_list(), exact_match=True),
        *(gxe.ExpectColumnValuesToBeOfType(column=column.value, type_=type_name) for column, type_name in column_types),
        gxe.ExpectColumnDistinctValuesToBeInSet(column=ColumnSet.STUDY_TYPE.value, value_set=StudyType.as_list()),
        gxe.ExpectColumnDistinctValuesToBeInSet(column=ColumnSet.FLAG.value, value_set=AnalysisFlag.as_list()),
        gxe.ExpectColumnValueLengthsToEqual(column=ColumnSet.PUBMED.value, value=8),
        gxe.ExpectColumnValuesToMatchRegex(column=ColumnSet.STUDY_ID.value, regex=r"^GCST\d+$"),
        gxe.ExpectColumnValuesToNotBeNull(column=ColumnSet.STUDY_ID.value),
        gxe.ExpectColumnValuesToBeUnique(column=ColumnSet.STUDY_ID.value),
    ]
    for expectation in expectations:
        suite.add_expectation(expectation)
    suite.save()

    logger.info("Building validation definition...")
    validation_definition = gx.ValidationDefinition(data=batch_definition, suite=suite, name="Curation Validation")
    result = validation_definition.run()

    if result["success"]:
        logger.info(click.style("Validation succeded", "green"))
        return

    logger.info(click.style("Validation failed", "red"))
    for res in result["results"]:
        if res["success"]:
            continue
        config = res["expectation_config"]
        kwargs = config["kwargs"]
        # Table-level expectations carry `column_set` instead of `column`.
        target = kwargs["column"] if "column" in kwargs else kwargs["column_set"]
        # Only failed expectations reach this point, so the outcome is always "failed".
        logger.error("Expectation %s for column %s run with %s ", config["type"], target, "failed")
        logger.error(res)
    sys.exit(1)
|
gentroutils/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gentroutils
|
|
3
|
+
Version: 1.5.0
|
|
4
|
+
Summary: Open Targets python genetics utility CLI tools
|
|
5
|
+
Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Healthcare Industry
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Operating System :: Unix
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: <3.13,>=3.10
|
|
19
|
+
Requires-Dist: click>=8.1.7
|
|
20
|
+
Requires-Dist: google-cloud-storage>=2.18.1
|
|
21
|
+
Requires-Dist: great-expectations>=1.3.4
|
|
22
|
+
Requires-Dist: pyfiglet>=1.0.2
|
|
23
|
+
Requires-Dist: requests>=2.32.3
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# gentroutils
|
|
27
|
+
[](https://github.com/opentargets/gentroutils/actions/workflows/pr.yaml)
|
|
28
|
+

|
|
29
|
+
[](https://github.com/opentargets/gentroutils/actions/workflows/release.yaml)
|
|
30
|
+
|
|
31
|
+
Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
pip install gentroutils
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Available commands
|
|
40
|
+
|
|
41
|
+
To see all available commands after installation run
|
|
42
|
+
|
|
43
|
+
```{bash}
|
|
44
|
+
gentroutils --help
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Updating gwas catalog metadata
|
|
48
|
+
|
|
49
|
+
To update the GWAS Catalog metadata, run the following command:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
gentroutils -vvv -q gs://ot_orchestration/tests/gentroutils/log.txt update-gwas-curation-metadata \
|
|
53
|
+
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv gs://ot_orchestration/tests/gentroutils/gwas-catalog-associations_ontology-annotated.tsv \
|
|
54
|
+
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt gs://ot_orchestration/tests/gentroutils/gwas-catalog-download-studies-v1.0.3.1.txt \
|
|
55
|
+
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt gs://ot_orchestration/tests/gentroutils/gwas-catalog-download-ancestries-v1.0.3.1.txt \
|
|
56
|
+
-g https://www.ebi.ac.uk/gwas/api/search/stats
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
The command `update-gwas-curation-metadata` fetches the data from the FTP server and transfers it to GCP without intermediate temporary files. The download(s) and upload(s) are performed asynchronously.
|
|
60
|
+
|
|
61
|
+
### Validate gwas catalog curation file
|
|
62
|
+
|
|
63
|
+
Validate the GWAS Catalog curation file after manual curation to check that all expectation tests pass.
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
gentroutils -vvv validate-gwas-curation GWAS_Catalog_study_curation.tsv
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Validation is only supported for local files. The curation file should follow the format requirements defined in [OT curation](https://github.com/opentargets/curation/blob/master/genetics/GWAS_Catalog_study_curation.tsv).
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
### Validate manual curation
|
|
73
|
+
|
|
74
|
+
To validate the manually curated file, run the following command
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
gentroutils -vvv validate-gwas-curation tests/data/manual_curation/correct_curation.tsv
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
The command will validate the file and return the results of the validation if the issues are found.
|
|
81
|
+
|
|
82
|
+
## Read before running
|
|
83
|
+
|
|
84
|
+
The logs from the command are saved to the log file given with `-q`. If a `gcp` log file is specified, the file is uploaded after the command has run.
|
|
85
|
+
|
|
86
|
+
To test the command run it with `-d` == `--dry-run`, this will just mark the input and output destinations.
|
|
87
|
+
To allow for full logs to be transmitted to the log file, use `-vvv` to increase the verbosity of the logs
|
|
88
|
+
|
|
89
|
+
> [!NOTE]
|
|
90
|
+
> Change the path to the output `gcp` files to make sure they are saved under requested path
|
|
91
|
+
|
|
92
|
+
> [!WARNING]
|
|
93
|
+
> Please read before running the command!:
|
|
94
|
+
>
|
|
95
|
+
> * The above command has some default values set for the input and output files, make sure you test them in `--dry-run` so the existing files will not get overwritten!
|
|
96
|
+
> * Make sure to run `gcloud auth application-default login` to allow to use Google Cloud Python SDK before running the command
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
## Contribute
|
|
101
|
+
|
|
102
|
+
To be able to contribute to the project you need to set it up. This project
|
|
103
|
+
runs on:
|
|
104
|
+
|
|
105
|
+
- [x] python 3.10.8
|
|
106
|
+
- [x] uv (dependency manager)
|
|
107
|
+
|
|
108
|
+
To set up the project run
|
|
109
|
+
|
|
110
|
+
```{bash}
|
|
111
|
+
make dev
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
The command will install above dependencies (initial requirements are curl and bash) if not present and
|
|
115
|
+
install all python dependencies listed in `pyproject.toml`. Finally the command will install `pre-commit` hooks
|
|
116
|
+
required to be run before the commit is created.
|
|
117
|
+
|
|
118
|
+
The project has additional `dev` dependencies that include the list of packages used for testing purposes.
|
|
119
|
+
All of the `dev` dependencies are automatically installed by `uv`.
|
|
120
|
+
|
|
121
|
+
To see all available dev commands
|
|
122
|
+
|
|
123
|
+
Run following command to see all available dev commands
|
|
124
|
+
|
|
125
|
+
```{bash}
|
|
126
|
+
make help
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Manual testing of CLI module
|
|
130
|
+
|
|
131
|
+
To check CLI execution manually you need to run
|
|
132
|
+
|
|
133
|
+
```{bash}
|
|
134
|
+
uv run gentroutils
|
|
135
|
+
```
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
gentroutils/__init__.py,sha256=lCl9VmPVU2hkZa_kh5NipCrNufbcTH00iKF-6WvE0gg,1319
|
|
2
|
+
gentroutils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
gentroutils/commands/__init__.py,sha256=940jwl8BvK0gTN2mRb_js8Kl74rgff-benDSEO5MuBc,326
|
|
4
|
+
gentroutils/commands/update_gwas_curation_metadata.py,sha256=zOIv5l_84kBlqyXAXJHnwH5OrPKVlObuHl6WhwlZC90,10903
|
|
5
|
+
gentroutils/commands/utils.py,sha256=XEZogKn6ZQKaSwKms7UtJU5h_eGrrHM-FofCoU2k884,5312
|
|
6
|
+
gentroutils/commands/validate_gwas_curation.py,sha256=CWlQ8uI8JAK9l4MyjFNmjDLeMOolrcA3neVaBdAGN80,6148
|
|
7
|
+
gentroutils-1.5.0.dist-info/METADATA,sha256=LYbG6KpN19EN1O3ZCGhXOv_yRAYVPj1odQ5742ELoDQ,5154
|
|
8
|
+
gentroutils-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
9
|
+
gentroutils-1.5.0.dist-info/entry_points.txt,sha256=IvxZyBBD71Ota0aPMtVaJzI9OSX5_f-iH4ZJx6sY53w,48
|
|
10
|
+
gentroutils-1.5.0.dist-info/licenses/LICENSE,sha256=RFhQPdSOiMTguUX7JSoIuTxA7HVzCbj_p8WU36HjUQQ,10947
|
|
11
|
+
gentroutils-1.5.0.dist-info/RECORD,,
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: gentroutils
|
|
3
|
-
Version: 0.1.5
|
|
4
|
-
Summary: Add your description here
|
|
5
|
-
Author-email: Szymon Szyszkowski <ss60@mib117351s.internal.sanger.ac.uk>
|
|
6
|
-
License-File: LICENSE
|
|
7
|
-
Requires-Python: >=3.10
|
|
8
|
-
Requires-Dist: click>=8.1.7
|
|
9
|
-
Requires-Dist: google-cloud-storage>=2.18.1
|
|
10
|
-
Requires-Dist: pyfiglet>=1.0.2
|
|
11
|
-
Requires-Dist: requests>=2.32.3
|
|
12
|
-
Description-Content-Type: text/markdown
|
|
13
|
-
|
|
14
|
-
# gentroutils
|
|
15
|
-
|
|
16
|
-
[](https://github.com/opentargets/gentroutils/actions/workflows/test.yaml)
|
|
17
|
-

|
|
18
|
-
|
|
19
|
-
Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
|
|
20
|
-
|
|
21
|
-
## Installation
|
|
22
|
-
|
|
23
|
-
```
|
|
24
|
-
pip install gentroutils
|
|
25
|
-
```
|
|
26
|
-
|
|
27
|
-
## Available commands
|
|
28
|
-
|
|
29
|
-
To see all available commands after installation run
|
|
30
|
-
|
|
31
|
-
```{bash}
|
|
32
|
-
gentroutils --help
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
## Contribute
|
|
36
|
-
|
|
37
|
-
To be able to contribute to the project you need to set it up. This project
|
|
38
|
-
runs on:
|
|
39
|
-
|
|
40
|
-
- [x] python 3.10.8
|
|
41
|
-
- [x] rye (package manager)
|
|
42
|
-
- [x] uv (dependency manager)
|
|
43
|
-
|
|
44
|
-
To set up the project run
|
|
45
|
-
|
|
46
|
-
```{bash}
|
|
47
|
-
make dev
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
The command will install above dependencies (initial requirements are curl and bash) if not present and
|
|
51
|
-
install all python dependencies listed in `pyproject.toml`. Finally the command will install `pre-commit` hooks
|
|
52
|
-
requred to be run before the commit is created.
|
|
53
|
-
|
|
54
|
-
The project has additional `dev` dependencies that include the list of packages used for testing purposes.
|
|
55
|
-
All of the `dev` depnendencies are automatically installed by `rye`.
|
|
56
|
-
|
|
57
|
-
To see all available dev commands
|
|
58
|
-
|
|
59
|
-
Run following command to see all available dev commands
|
|
60
|
-
|
|
61
|
-
```{bash}
|
|
62
|
-
make help
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
### Manual testing of CLI module
|
|
66
|
-
|
|
67
|
-
To check CLI execution manually you need to run
|
|
68
|
-
|
|
69
|
-
```{bash}
|
|
70
|
-
rye run gentroutils
|
|
71
|
-
```
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
gentroutils/__init__.py,sha256=aHDzbBMrnsgdcO_FfsYCbbPXProynwB7_2nfyc4UGp8,1281
|
|
2
|
-
gentroutils/commands/__init__.py,sha256=avkqzwa1ck__rLVN0Wqfpr3eHtKS6TvyPeeaHcguJuw,210
|
|
3
|
-
gentroutils/commands/update_gwas_curation_metadata.py,sha256=7pBBkB6JF3VfT12xiP78MT_pmn0Wv4CF7Tm5TPgBXf8,12525
|
|
4
|
-
gentroutils/commands/utils.py,sha256=9Wyptjww9hiAufCFILdnjdDOE6X6TdtyTWJOTkoIRqg,4316
|
|
5
|
-
gentroutils-0.1.5.dist-info/METADATA,sha256=9PFlHuJakF2bnJfF9d6kPepH0jdRJM3g70GevV5Q7fM,1795
|
|
6
|
-
gentroutils-0.1.5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
7
|
-
gentroutils-0.1.5.dist-info/entry_points.txt,sha256=IvxZyBBD71Ota0aPMtVaJzI9OSX5_f-iH4ZJx6sY53w,48
|
|
8
|
-
gentroutils-0.1.5.dist-info/licenses/LICENSE,sha256=RFhQPdSOiMTguUX7JSoIuTxA7HVzCbj_p8WU36HjUQQ,10947
|
|
9
|
-
gentroutils-0.1.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|