gentroutils 0.1.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gentroutils/__init__.py CHANGED
@@ -8,8 +8,9 @@ import time
8
8
  import click
9
9
  import pyfiglet
10
10
 
11
- from gentroutils.commands.update_gwas_curation_metadata import (
11
+ from gentroutils.commands import (
12
12
  update_gwas_curation_metadata_command,
13
+ validate_gwas_curation,
13
14
  )
14
15
  from gentroutils.commands.utils import set_log_file, set_log_lvl, teardown_cli
15
16
 
@@ -40,5 +41,6 @@ def cli(ctx: click.Context, **kwargs: dict[str, str]) -> None:
40
41
 
41
42
 
42
43
  cli.add_command(update_gwas_curation_metadata_command)
44
+ cli.add_command(validate_gwas_curation)
43
45
 
44
46
  __all__ = ["cli"]
@@ -3,5 +3,9 @@
3
3
  from gentroutils.commands.update_gwas_curation_metadata import (
4
4
  update_gwas_curation_metadata_command,
5
5
  )
6
+ from gentroutils.commands.validate_gwas_curation import validate_gwas_curation
6
7
 
7
- __all__ = ["update_gwas_curation_metadata_command"]
8
+ __all__ = [
9
+ "update_gwas_curation_metadata_command",
10
+ "validate_gwas_curation",
11
+ ]
@@ -21,27 +21,15 @@ MAX_CONCURRENT_CONNECTIONS = 10
21
21
  CURATED_INPUTS = (
22
22
  (
23
23
  "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv",
24
- "gs://gwas_catalog_data/curated_inputs/gwas_catalog_associations_ontology_annotated.tsv",
24
+ "gs://gwas_catalog_inputs/gwas_catalog_associations_ontology_annotated.tsv",
25
25
  ),
26
26
  (
27
27
  "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt",
28
- "gs://gwas_catalog_data/curated_inputs/gwas_catalog_download_studies.tsv",
29
- ),
30
- (
31
- "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-unpublished-studies-v1.0.3.1.tsv",
32
- "gs://gwas_catalog_data/curated_inputs/gwas_catalog_unpublished_studies.tsv",
28
+ "gs://gwas_catalog_inputs/gwas_catalog_download_studies.tsv",
33
29
  ),
34
30
  (
35
31
  "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt",
36
- "gs://gwas_catalog_data/curated_inputs/gwas_catalog_download_ancestries.tsv",
37
- ),
38
- (
39
- "ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-unpublished-ancestries-v1.0.3.1.tsv",
40
- "gs://gwas_catalog_data/curated_inputs/gwas_catalog_unpublished_ancestries.tsv",
41
- ),
42
- (
43
- "ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/harmonised_list.txt",
44
- "gs://gwas_catalog_data/curated_inputs/harmonised_list.txt",
32
+ "gs://gwas_catalog_inputs/gwas_catalog_download_ancestries.tsv",
45
33
  ),
46
34
  )
47
35
 
@@ -75,9 +63,7 @@ async def update_gwas_curation_metadata_command(
75
63
  This is the script to fetch the latest GWAS Catalog data files that include:
76
64
  - [x] gwas-catalog-associations_ontology-annotated.tsv - list of associations with ontology annotations by GWAS Catalog
77
65
  - [x] gwas-catalog-download-studies-v1.0.3.1.txt - list of published studies by GWAS Catalog
78
- - [x] gwas-catalog-unpublished-studies-v1.0.3.1.tsv - list of unpublished studies by GWAS Catalog
79
66
  - [x] gwas-catalog-download-ancestries-v1.0.3.1.txt - list of published studies by GWAS Catalog
80
- - [x] gwas-catalog-unpublished-ancestries-v1.0.3.1.tsv - list of unpublished studies by GWAS Catalog
81
67
 
82
68
  \b
83
69
  By default all GWAS Catalog data files are uploaded from GWAS Catalog FTP server to Open Targets GCP bucket.
@@ -86,13 +72,9 @@ async def update_gwas_curation_metadata_command(
86
72
 
87
73
  \b
88
74
  gentroutils --log-file gs://gwas_catalog_data/curated_inputs/log.txt update-gwas-curation-metadata \\
89
- -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv gs://gwas_catalog_data/curated_inputs/gwas_catalog_associations_ontology_annotated.tsv \\
90
- -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt gs://gwas_catalog_data/curated_inputs/gwas_catalog_download_studies.tsv \\
91
- -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-unpublished-studies-v1.0.3.1.tsv gs://gwas_catalog_data/curated_inputs/gwas_catalog_unpublished_studies.tsv \\
92
- -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt gs://gwas_catalog_data/curated_inputs/gwas_catalog_download_ancestries.tsv \\
93
- -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-unpublished-ancestries-v1.0.3.1.tsv gs://gwas_catalog_data/curated_inputs/gwas_catalog_unpublished_ancestries.tsv \\
94
- -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/harmonised_list.txt gs://gwas_catalog_data/curated_inputs/harmonised_list.txt \\
95
- -f https://raw.githubusercontent.com/opentargets/curation/master/genetics/GWAS_Catalog_study_curation.tsv gs://gwas_catalog_data/manifests/gwas_catalog_study_curation.tsv \\
75
+ -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv gs://gwas_catalog_data/gwas_catalog_associations_ontology_annotated.tsv \\
76
+ -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt gs://gwas_catalog_inputs/gwas_catalog_download_studies.tsv \\
77
+ -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt gs://gwas_catalog_inputs/gwas_catalog_download_ancestries.tsv \\
96
78
  -g https://www.ebi.ac.uk/gwas/api/search/stats
97
79
 
98
80
 
@@ -108,10 +90,7 @@ async def update_gwas_curation_metadata_command(
108
90
  MAX_CONCURRENT_CONNECTIONS,
109
91
  )
110
92
  sys.exit(1)
111
- uri_map = [
112
- {"input": urlparse(ftp_file), "output": urlparse(gcp_file)}
113
- for ftp_file, gcp_file in file_to_transfer
114
- ]
93
+ uri_map = [{"input": urlparse(ftp_file), "output": urlparse(gcp_file)} for ftp_file, gcp_file in file_to_transfer]
115
94
  transfer_tasks = generate_transfer_tasks(uri_map, dry_run)
116
95
 
117
96
  # capture latest release metadata
@@ -135,9 +114,7 @@ async def update_gwas_curation_metadata_command(
135
114
  logger.info("gwas_curation_update step completed.")
136
115
 
137
116
 
138
- def generate_transfer_tasks(
139
- uri_map: list[dict[str, ParseResult]], dry_run: bool
140
- ) -> list[asyncio.Task[None]]:
117
+ def generate_transfer_tasks(uri_map: list[dict[str, ParseResult]], dry_run: bool) -> list[asyncio.Task[None]]:
141
118
  """Generate transfer tasks.
142
119
 
143
120
  Args:
@@ -214,9 +191,7 @@ def generate_transfer_tasks(
214
191
  return transfer_tasks
215
192
 
216
193
 
217
- async def sync_from_http_to_gcp(
218
- url: str, gcp_bucket: str, gcp_prefix: str, gcp_file: str, *, dry_run: bool = True
219
- ) -> None:
194
+ async def sync_from_http_to_gcp(url: str, gcp_bucket: str, gcp_prefix: str, gcp_file: str, *, dry_run: bool = True) -> None:
220
195
  """Sync file from HTTP and upload to GCP.
221
196
 
222
197
  This function fetches the data from the provided HTTP URL and uploads the content
@@ -289,7 +264,8 @@ async def sync_from_ftp_to_gcp(
289
264
  gcp_file,
290
265
  )
291
266
  return
292
- with FTP(ftp_server) as ftp:
267
+ with FTP() as ftp:
268
+ ftp.connect(ftp_server)
293
269
  ftp.login()
294
270
  bucket = storage.Client().bucket(gcp_bucket)
295
271
  gcp_path = f"{gcp_prefix}/{gcp_file}" if gcp_prefix else gcp_file
@@ -300,9 +276,7 @@ async def sync_from_ftp_to_gcp(
300
276
  if dir_match:
301
277
  logger.info("Found release date!: %s", dir_match.group("release_date"))
302
278
  buffer = io.BytesIO()
303
- logger.info(
304
- "Retrieving data from: ftp://%s/%s/%s.", ftp_server, ftp_prefix, ftp_file
305
- )
279
+ logger.info("Retrieving data from: ftp://%s/%s/%s.", ftp_server, ftp_prefix, ftp_file)
306
280
  ftp.retrbinary(f"RETR {ftp_file}", lambda x: buffer.write(x))
307
281
  content = buffer.getvalue().decode("utf-8")
308
282
  buffer.close()
@@ -6,6 +6,7 @@ import sys
6
6
  import time
7
7
  from functools import wraps
8
8
  from pathlib import Path
9
+ from tempfile import NamedTemporaryFile
9
10
  from urllib.parse import urlparse
10
11
 
11
12
  import click
@@ -39,33 +40,43 @@ def set_log_file(ctx: click.Context, param: click.Option, log_file: str) -> str:
39
40
  return ""
40
41
  logger.info("Extracting log file from the %s", param)
41
42
  upload_to_gcp = False
43
+
42
44
  if "://" in log_file:
43
45
  upload_to_gcp = True
46
+ ctx.obj["upload_to_gcp"] = upload_to_gcp
47
+
44
48
  if upload_to_gcp:
45
49
  parsed_uri = urlparse(log_file)
46
- ctx.obj["gcp_log_file"] = log_file
47
50
  if parsed_uri.scheme != "gs":
48
51
  raise click.BadParameter("Only GCS is supported for logging upload")
49
- log_file = parsed_uri.path.strip("/")
50
- ctx.obj["local_log_file"] = log_file
51
- ctx.obj["upload_to_gcp"] = upload_to_gcp
52
-
53
- local_file = Path(log_file)
54
- if local_file.exists() and local_file.is_dir():
55
- raise click.BadParameter("Log file is a directory")
56
- if local_file.exists() and local_file.is_file():
57
- local_file.unlink()
58
- if not local_file.exists():
59
- local_file.touch()
60
- logger.info("Logging to %s", local_file)
61
- handler = logging.FileHandler(local_file)
62
- formatter = logging.Formatter(
63
- "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
64
- )
65
- handler.setFormatter(formatter)
66
- handler.setLevel(logging.DEBUG)
67
- logger.addHandler(handler)
68
- return str(local_file)
52
+ tmp_file = NamedTemporaryFile(delete=False)
53
+ logger.info("Logging to temporary file %s", tmp_file.name)
54
+ handler = logging.FileHandler(tmp_file.name)
55
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
56
+ handler.setFormatter(formatter)
57
+ handler.setLevel(logging.DEBUG)
58
+ logger.addHandler(handler)
59
+ ctx.obj["local_log_file"] = tmp_file.name
60
+ ctx.obj["local_log_file_obj"] = tmp_file
61
+ ctx.obj["gcp_log_file"] = log_file
62
+ return tmp_file.name
63
+
64
+ else:
65
+ local_file = Path(log_file)
66
+ if local_file.exists() and local_file.is_dir():
67
+ raise click.BadParameter("Log file is a directory")
68
+ if local_file.exists() and local_file.is_file():
69
+ local_file.unlink()
70
+ if not local_file.exists():
71
+ local_file.parent.mkdir(parents=True, exist_ok=True)
72
+ local_file.touch()
73
+ logger.info("Logging to %s", local_file)
74
+ handler = logging.FileHandler(local_file)
75
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
76
+ handler.setFormatter(formatter)
77
+ handler.setLevel(logging.DEBUG)
78
+ logger.addHandler(handler)
79
+ return str(local_file)
69
80
 
70
81
 
71
82
  def teardown_cli(ctx: click.Context) -> None:
@@ -80,16 +91,24 @@ def teardown_cli(ctx: click.Context) -> None:
80
91
  if "upload_to_gcp" in ctx.obj and ctx.obj["upload_to_gcp"]:
81
92
  gcp_file = ctx.obj["gcp_log_file"]
82
93
  local_file = ctx.obj["local_log_file"]
83
- client = storage.Client()
84
- bucket_name = urlparse(gcp_file).netloc
85
- bucket = client.bucket(bucket_name=bucket_name)
86
- blob = bucket.blob(Path(local_file).name)
87
- logger.info("Uploading %s to %s", local_file, gcp_file)
88
- blob.upload_from_filename(local_file)
89
- Path(local_file).unlink()
90
- logger.info(
91
- "Finished, elapsed time %s seconds", time.time() - ctx.obj["execution_start"]
92
- )
94
+ with open(local_file, "r") as f:
95
+ content = f.read()
96
+ try:
97
+ client = storage.Client()
98
+ bucket_name = urlparse(gcp_file).netloc
99
+ bucket = client.bucket(bucket_name=bucket_name)
100
+ file_name = urlparse(gcp_file).path.lstrip("/")
101
+ blob = bucket.blob(file_name)
102
+ logger.info("Uploading %s to %s", local_file, gcp_file)
103
+ if ctx.obj["dry_run"]:
104
+ logger.info("Dry run, skipping the upload of the log file")
105
+ else:
106
+ blob.upload_from_string(content)
107
+ ctx.obj["local_log_file_obj"].close()
108
+ except Exception as e:
109
+ msg = f"Failed to upload log file to GCP {e}"
110
+ logger.error(click.style(msg, fg="red"))
111
+ logger.info("Finished, elapsed time %s seconds", time.time() - ctx.obj["execution_start"])
93
112
 
94
113
 
95
114
  def set_log_lvl(_: click.Context, param: click.Option, value: int) -> int:
@@ -112,9 +131,7 @@ def set_log_lvl(_: click.Context, param: click.Option, value: int) -> int:
112
131
  log_lvls = {0: logging.ERROR, 1: logging.INFO, 2: logging.DEBUG}
113
132
  log_lvl = log_lvls.get(value, logging.DEBUG)
114
133
  handler = logging.StreamHandler(sys.stdout)
115
- formatter = logging.Formatter(
116
- "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
117
- )
134
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
118
135
  handler.setFormatter(formatter)
119
136
  handler.setLevel(log_lvl)
120
137
  logger.addHandler(handler)
@@ -0,0 +1,165 @@
1
+ """Validate gwas catalog manual curation file."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ import sys
8
+ from enum import Enum
9
+ from pathlib import Path
10
+ from typing import TypeVar
11
+
12
+ import click
13
+ import great_expectations as gx
14
+ from click import Argument, BadParameter
15
+ from great_expectations import expectations as gxe
16
+
17
+ T = TypeVar("T")
18
+
19
+
20
+ logger = logging.getLogger("gentroutils")
21
+ DATASOURCE_NAME = "GWAS Catalog curation"
22
+
23
+
24
+ class Lnum(Enum):
25
+ """List convertable enum."""
26
+
27
+ @classmethod
28
+ def as_list(cls) -> list[T]:
29
+ """Convert enum to list of strings."""
30
+ return list(map(lambda c: c.value, cls))
31
+
32
+
33
+ class ColumnSet(Lnum):
34
+ """Expected column names for curation file."""
35
+
36
+ STUDY_ID = "studyId"
37
+ STUDY_TYPE = "studyType"
38
+ FLAG = "analysisFlag"
39
+ QUALITY_CONTROL = "qualityControl"
40
+ IS_CURATED = "isCurated"
41
+ PUBMED = "pubmedId"
42
+ PUBLICATION_TITLE = "publicationTitle"
43
+ TRAIT = "traitFromSource"
44
+
45
+
46
+ class StudyType(Lnum):
47
+ """Expected studyType column values."""
48
+
49
+ NO_LICENCE = "no_licence"
50
+ PQTL = "pQTL"
51
+
52
+
53
+ class AnalysisFlag(Lnum):
54
+ """Expected analysisFlag column values."""
55
+
56
+ CC = "Case-case study"
57
+ EXWAS = "ExWAS"
58
+ GXE = "GxE"
59
+ GXG = "GxG"
60
+ METABOLITE = "Metabolite"
61
+ MULTIVARIATE = "Multivariate analysis"
62
+ NON_ADDITIVE = "Non-additive model"
63
+
64
+
65
+ class IsCurated(Lnum):
66
+ """Expected isCurated column values."""
67
+
68
+ YES = True
69
+
70
+
71
+ def _validate_input_file_name(_: click.Context, param: Argument, value: str) -> str:
72
+ """Assert file comes from local fs and exists."""
73
+ logger.debug("Validating %s variable with %s value", param, value)
74
+ import os
75
+
76
+ logger.info(os.getcwd())
77
+ pattern = re.compile(r"^[\w*/.-]*$")
78
+ _match = pattern.fullmatch(value)
79
+ if not _match:
80
+ logger.error("%s is not a local file.", value)
81
+ raise BadParameter("Provided path is not local.")
82
+ p = Path(value)
83
+ if p.is_dir():
84
+ logger.error("%s is a directory.", value)
85
+ raise BadParameter("Provided path is a directory.")
86
+ if not p.exists():
87
+ logger.error("%s does not exit.", value)
88
+ raise BadParameter("Provided path does not exist.")
89
+ return str(p)
90
+
91
+
92
+ def split_source_path(source_path: str) -> tuple[Path, str]:
93
+ """Split the source path into directory name and filename"""
94
+ p = Path(source_path)
95
+ return p.parent, p.name
96
+
97
+
98
+ @click.command(name="validate-gwas-curation")
99
+ @click.argument("source_path", type=click.UNPROCESSED, callback=_validate_input_file_name)
100
+ @click.pass_context
101
+ def validate_gwas_curation(ctx: click.Context, source_path: str) -> None: # noqa: DOC101, D0C103
102
+ """Validate GWAS catalog manual curation file.
103
+
104
+ \b
105
+ gentroutils -vvv validate-gwas-curation GWAS_Catalog_study_curation.tsv
106
+
107
+ """
108
+ logger.info("Using %s as curation input.", source_path)
109
+
110
+ dry_run = ctx.obj["dry_run"]
111
+ if dry_run:
112
+ logger.info("Running in --dry-run mode, exitting.")
113
+ sys.exit(0)
114
+
115
+ logger.info("Building great expectations context...")
116
+ context = gx.get_context(mode="ephemeral")
117
+ directory, file = split_source_path(source_path)
118
+ data_source = context.data_sources.add_pandas_filesystem(name=DATASOURCE_NAME, base_directory=directory)
119
+
120
+ logger.info("Using %s datasource.", DATASOURCE_NAME)
121
+ logger.debug("Adding csv asset from %s", file)
122
+ file_tsv_asset = data_source.add_csv_asset(name="manual_curation", sep="\t", header=0)
123
+ logger.debug("Adding batch definion path %s", file)
124
+ batch_definition = file_tsv_asset.add_batch_definition_path(name="manual_curation", path=file)
125
+
126
+ logger.info("Building expectation suite...")
127
+
128
+ suite = gx.ExpectationSuite(name="Curation Validation")
129
+ context.suites.add(suite)
130
+ suite.add_expectation(gxe.ExpectTableColumnsToMatchSet(column_set=ColumnSet.as_list(), exact_match=True))
131
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.PUBMED.value, type_="int"))
132
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.PUBLICATION_TITLE.value, type_="str"))
133
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.TRAIT.value, type_="str"))
134
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.STUDY_ID.value, type_="str"))
135
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.STUDY_TYPE.value, type_="str"))
136
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.FLAG.value, type_="str"))
137
+ suite.add_expectation(
138
+ gxe.ExpectColumnDistinctValuesToBeInSet(column=ColumnSet.STUDY_TYPE.value, value_set=StudyType.as_list())
139
+ )
140
+ suite.add_expectation(gxe.ExpectColumnDistinctValuesToBeInSet(column=ColumnSet.FLAG.value, value_set=AnalysisFlag.as_list()))
141
+ suite.add_expectation(gxe.ExpectColumnValueLengthsToEqual(column=ColumnSet.PUBMED.value, value=8))
142
+ suite.add_expectation(gxe.ExpectColumnValuesToMatchRegex(column=ColumnSet.STUDY_ID.value, regex=r"^GCST\d+$"))
143
+ suite.add_expectation(gxe.ExpectColumnValuesToNotBeNull(column=ColumnSet.STUDY_ID.value))
144
+ suite.add_expectation(gxe.ExpectColumnValuesToBeUnique(column=ColumnSet.STUDY_ID.value))
145
+ suite.save()
146
+ logger.info("Building validation definition...")
147
+ validation_definition = gx.ValidationDefinition(data=batch_definition, suite=suite, name="Curation Validation")
148
+ result = validation_definition.run()
149
+
150
+ logger.info(
151
+ click.style("Validation succeded" if result["success"] else "Validation failed", "green" if result["success"] else "red")
152
+ )
153
+ if not result["success"]:
154
+ for res in result["results"]:
155
+ if not res["success"]:
156
+ logger.error(
157
+ "Expectation %s for column %s run with %s ",
158
+ res["expectation_config"]["type"],
159
+ res["expectation_config"]["kwargs"]["column"]
160
+ if "column" in res["expectation_config"]["kwargs"]
161
+ else res["expectation_config"]["kwargs"]["column_set"],
162
+ "succeded" if res["success"] else "failed",
163
+ )
164
+ logger.error(res)
165
+ sys.exit(1)
gentroutils/py.typed ADDED
File without changes
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.4
2
+ Name: gentroutils
3
+ Version: 1.5.0
4
+ Summary: Open Targets python genetics utility CLI tools
5
+ Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
6
+ License-Expression: Apache-2.0
7
+ License-File: LICENSE
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Healthcare Industry
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Operating System :: Unix
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
+ Requires-Python: <3.13,>=3.10
19
+ Requires-Dist: click>=8.1.7
20
+ Requires-Dist: google-cloud-storage>=2.18.1
21
+ Requires-Dist: great-expectations>=1.3.4
22
+ Requires-Dist: pyfiglet>=1.0.2
23
+ Requires-Dist: requests>=2.32.3
24
+ Description-Content-Type: text/markdown
25
+
26
+ # gentroutils
27
+ [![checks](https://github.com/opentargets/gentroutils/actions/workflows/pr.yaml/badge.svg?branch=dev)](https://github.com/opentargets/gentroutils/actions/workflows/pr.yaml)
28
+ ![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)
29
+ [![release](https://github.com/opentargets/gentroutils/actions/workflows/release.yaml/badge.svg)](https://github.com/opentargets/gentroutils/actions/workflows/release.yaml)
30
+
31
+ Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
32
+
33
+ ## Installation
34
+
35
+ ```
36
+ pip install gentroutils
37
+ ```
38
+
39
+ ## Available commands
40
+
41
+ To see all available commands after installation run
42
+
43
+ ```{bash}
44
+ gentroutils --help
45
+ ```
46
+
47
+ ### Updating gwas catalog metadata
48
+
49
+ To update the GWAS Catalog metadata, run the following command:
50
+
51
+ ```bash
52
+ gentroutils -vvv -q gs://ot_orchestration/tests/gentroutils/log.txt update-gwas-curation-metadata \
53
+ -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv gs://ot_orchestration/tests/gentroutils/gwas-catalog-associations_ontology-annotated.tsv \
54
+ -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt gs://ot_orchestration/tests/gentroutils/gwas-catalog-download-studies-v1.0.3.1.txt \
55
+ -f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt gs://ot_orchestration/tests/gentroutils/gwas-catalog-download-ancestries-v1.0.3.1.txt \
56
+ -g https://www.ebi.ac.uk/gwas/api/search/stats
57
+ ```
58
+
59
+ The command `update-gwas-curation-metadata` fetches the data from the FTP server and transfers it to GCP without intermediate temporary files. The downloads and uploads are performed asynchronously.
60
+
61
+ ### Validate gwas catalog curation file
62
+
63
+ Run this command after manual curation to validate the GWAS Catalog curation file and check that all expectation tests pass.
64
+
65
+ ```bash
66
+ gentroutils -vvv validate-gwas-curation GWAS_Catalog_study_curation.tsv
67
+ ```
68
+
69
+ Validation is only allowed on a local file. The curation file should follow the format requirements defined in [OT curation](https://github.com/opentargets/curation/blob/master/genetics/GWAS_Catalog_study_curation.tsv).
70
+
71
+
72
+ ### Validate manual curation
73
+
74
+ To validate the manually curated file, run the following command
75
+
76
+ ```bash
77
+ gentroutils -vvv validate-gwas-curation tests/data/manual_curation/correct_curation.tsv
78
+ ```
79
+
80
+ The command validates the file and reports the validation results if any issues are found.
81
+
82
+ ## Read before running
83
+
84
+ The logs from the command are saved to the log file given with `-q`. If a `gcp` (`gs://`) log file is specified, the log is uploaded there after the command has run.
85
+
86
+ To test the command run it with `-d` == `--dry-run`, this will just mark the input and output destinations.
87
+ To allow for full logs to be transmitted to the log file, use `-vvv` to increase the verbosity of the logs
88
+
89
+ > [!NOTE]
90
+ > Change the path to the output `gcp` files to make sure they are saved under requested path
91
+
92
+ > [!WARNING]
93
+ > Please read before running the command!:
94
+ >
95
+ > * The above command has some default values set for the input and output files, make sure you test them in `--dry-run` so the existing files will not get overwritten!
96
+ > * Make sure to run `gcloud auth application-default login` to allow to use Google Cloud Python SDK before running the command
97
+
98
+
99
+
100
+ ## Contribute
101
+
102
+ To be able to contribute to the project you need to set it up. This project
103
+ runs on:
104
+
105
+ - [x] python 3.10.8
106
+ - [x] uv (dependency manager)
107
+
108
+ To set up the project run
109
+
110
+ ```{bash}
111
+ make dev
112
+ ```
113
+
114
+ The command will install above dependencies (initial requirements are curl and bash) if not present and
115
+ install all python dependencies listed in `pyproject.toml`. Finally the command will install `pre-commit` hooks
116
+ required to be run before the commit is created.
117
+
118
+ The project has additional `dev` dependencies that include the list of packages used for testing purposes.
119
+ All of the `dev` dependencies are automatically installed by `uv`.
120
+
121
+ To see all available dev commands
122
+
123
+ Run following command to see all available dev commands
124
+
125
+ ```{bash}
126
+ make help
127
+ ```
128
+
129
+ ### Manual testing of CLI module
130
+
131
+ To check CLI execution manually you need to run
132
+
133
+ ```{bash}
134
+ uv run gentroutils
135
+ ```
@@ -0,0 +1,11 @@
1
+ gentroutils/__init__.py,sha256=lCl9VmPVU2hkZa_kh5NipCrNufbcTH00iKF-6WvE0gg,1319
2
+ gentroutils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ gentroutils/commands/__init__.py,sha256=940jwl8BvK0gTN2mRb_js8Kl74rgff-benDSEO5MuBc,326
4
+ gentroutils/commands/update_gwas_curation_metadata.py,sha256=zOIv5l_84kBlqyXAXJHnwH5OrPKVlObuHl6WhwlZC90,10903
5
+ gentroutils/commands/utils.py,sha256=XEZogKn6ZQKaSwKms7UtJU5h_eGrrHM-FofCoU2k884,5312
6
+ gentroutils/commands/validate_gwas_curation.py,sha256=CWlQ8uI8JAK9l4MyjFNmjDLeMOolrcA3neVaBdAGN80,6148
7
+ gentroutils-1.5.0.dist-info/METADATA,sha256=LYbG6KpN19EN1O3ZCGhXOv_yRAYVPj1odQ5742ELoDQ,5154
8
+ gentroutils-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
+ gentroutils-1.5.0.dist-info/entry_points.txt,sha256=IvxZyBBD71Ota0aPMtVaJzI9OSX5_f-iH4ZJx6sY53w,48
10
+ gentroutils-1.5.0.dist-info/licenses/LICENSE,sha256=RFhQPdSOiMTguUX7JSoIuTxA7HVzCbj_p8WU36HjUQQ,10947
11
+ gentroutils-1.5.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,71 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: gentroutils
3
- Version: 0.1.5
4
- Summary: Add your description here
5
- Author-email: Szymon Szyszkowski <ss60@mib117351s.internal.sanger.ac.uk>
6
- License-File: LICENSE
7
- Requires-Python: >=3.10
8
- Requires-Dist: click>=8.1.7
9
- Requires-Dist: google-cloud-storage>=2.18.1
10
- Requires-Dist: pyfiglet>=1.0.2
11
- Requires-Dist: requests>=2.32.3
12
- Description-Content-Type: text/markdown
13
-
14
- # gentroutils
15
-
16
- [![Tests](https://github.com/opentargets/gentroutils/actions/workflows/test.yaml/badge.svg?event=push)](https://github.com/opentargets/gentroutils/actions/workflows/test.yaml)
17
- ![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)
18
-
19
- Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
20
-
21
- ## Installation
22
-
23
- ```
24
- pip install gentroutils
25
- ```
26
-
27
- ## Available commands
28
-
29
- To see all available commands after installation run
30
-
31
- ```{bash}
32
- gentroutils --help
33
- ```
34
-
35
- ## Contribute
36
-
37
- To be able to contribute to the project you need to set it up. This project
38
- runs on:
39
-
40
- - [x] python 3.10.8
41
- - [x] rye (package manager)
42
- - [x] uv (dependency manager)
43
-
44
- To set up the project run
45
-
46
- ```{bash}
47
- make dev
48
- ```
49
-
50
- The command will install above dependencies (initial requirements are curl and bash) if not present and
51
- install all python dependencies listed in `pyproject.toml`. Finally the command will install `pre-commit` hooks
52
- requred to be run before the commit is created.
53
-
54
- The project has additional `dev` dependencies that include the list of packages used for testing purposes.
55
- All of the `dev` depnendencies are automatically installed by `rye`.
56
-
57
- To see all available dev commands
58
-
59
- Run following command to see all available dev commands
60
-
61
- ```{bash}
62
- make help
63
- ```
64
-
65
- ### Manual testing of CLI module
66
-
67
- To check CLI execution manually you need to run
68
-
69
- ```{bash}
70
- rye run gentroutils
71
- ```
@@ -1,9 +0,0 @@
1
- gentroutils/__init__.py,sha256=aHDzbBMrnsgdcO_FfsYCbbPXProynwB7_2nfyc4UGp8,1281
2
- gentroutils/commands/__init__.py,sha256=avkqzwa1ck__rLVN0Wqfpr3eHtKS6TvyPeeaHcguJuw,210
3
- gentroutils/commands/update_gwas_curation_metadata.py,sha256=7pBBkB6JF3VfT12xiP78MT_pmn0Wv4CF7Tm5TPgBXf8,12525
4
- gentroutils/commands/utils.py,sha256=9Wyptjww9hiAufCFILdnjdDOE6X6TdtyTWJOTkoIRqg,4316
5
- gentroutils-0.1.5.dist-info/METADATA,sha256=9PFlHuJakF2bnJfF9d6kPepH0jdRJM3g70GevV5Q7fM,1795
6
- gentroutils-0.1.5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
7
- gentroutils-0.1.5.dist-info/entry_points.txt,sha256=IvxZyBBD71Ota0aPMtVaJzI9OSX5_f-iH4ZJx6sY53w,48
8
- gentroutils-0.1.5.dist-info/licenses/LICENSE,sha256=RFhQPdSOiMTguUX7JSoIuTxA7HVzCbj_p8WU36HjUQQ,10947
9
- gentroutils-0.1.5.dist-info/RECORD,,