gentroutils 0.2.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gentroutils/__init__.py CHANGED
@@ -8,8 +8,9 @@ import time
8
8
  import click
9
9
  import pyfiglet
10
10
 
11
- from gentroutils.commands.update_gwas_curation_metadata import (
11
+ from gentroutils.commands import (
12
12
  update_gwas_curation_metadata_command,
13
+ validate_gwas_curation,
13
14
  )
14
15
  from gentroutils.commands.utils import set_log_file, set_log_lvl, teardown_cli
15
16
 
@@ -40,5 +41,6 @@ def cli(ctx: click.Context, **kwargs: dict[str, str]) -> None:
40
41
 
41
42
 
42
43
  cli.add_command(update_gwas_curation_metadata_command)
44
+ cli.add_command(validate_gwas_curation)
43
45
 
44
46
  __all__ = ["cli"]
@@ -3,5 +3,9 @@
3
3
  from gentroutils.commands.update_gwas_curation_metadata import (
4
4
  update_gwas_curation_metadata_command,
5
5
  )
6
+ from gentroutils.commands.validate_gwas_curation import validate_gwas_curation
6
7
 
7
- __all__ = ["update_gwas_curation_metadata_command"]
8
+ __all__ = [
9
+ "update_gwas_curation_metadata_command",
10
+ "validate_gwas_curation",
11
+ ]
@@ -90,10 +90,7 @@ async def update_gwas_curation_metadata_command(
90
90
  MAX_CONCURRENT_CONNECTIONS,
91
91
  )
92
92
  sys.exit(1)
93
- uri_map = [
94
- {"input": urlparse(ftp_file), "output": urlparse(gcp_file)}
95
- for ftp_file, gcp_file in file_to_transfer
96
- ]
93
+ uri_map = [{"input": urlparse(ftp_file), "output": urlparse(gcp_file)} for ftp_file, gcp_file in file_to_transfer]
97
94
  transfer_tasks = generate_transfer_tasks(uri_map, dry_run)
98
95
 
99
96
  # capture latest release metadata
@@ -117,9 +114,7 @@ async def update_gwas_curation_metadata_command(
117
114
  logger.info("gwas_curation_update step completed.")
118
115
 
119
116
 
120
- def generate_transfer_tasks(
121
- uri_map: list[dict[str, ParseResult]], dry_run: bool
122
- ) -> list[asyncio.Task[None]]:
117
+ def generate_transfer_tasks(uri_map: list[dict[str, ParseResult]], dry_run: bool) -> list[asyncio.Task[None]]:
123
118
  """Generate transfer tasks.
124
119
 
125
120
  Args:
@@ -196,9 +191,7 @@ def generate_transfer_tasks(
196
191
  return transfer_tasks
197
192
 
198
193
 
199
- async def sync_from_http_to_gcp(
200
- url: str, gcp_bucket: str, gcp_prefix: str, gcp_file: str, *, dry_run: bool = True
201
- ) -> None:
194
+ async def sync_from_http_to_gcp(url: str, gcp_bucket: str, gcp_prefix: str, gcp_file: str, *, dry_run: bool = True) -> None:
202
195
  """Sync file from HTTP and upload to GCP.
203
196
 
204
197
  This function fetches the data from the provided HTTP URL and uploads the content
@@ -271,7 +264,8 @@ async def sync_from_ftp_to_gcp(
271
264
  gcp_file,
272
265
  )
273
266
  return
274
- with FTP(ftp_server) as ftp:
267
+ with FTP() as ftp:
268
+ ftp.connect(ftp_server)
275
269
  ftp.login()
276
270
  bucket = storage.Client().bucket(gcp_bucket)
277
271
  gcp_path = f"{gcp_prefix}/{gcp_file}" if gcp_prefix else gcp_file
@@ -282,9 +276,7 @@ async def sync_from_ftp_to_gcp(
282
276
  if dir_match:
283
277
  logger.info("Found release date!: %s", dir_match.group("release_date"))
284
278
  buffer = io.BytesIO()
285
- logger.info(
286
- "Retrieving data from: ftp://%s/%s/%s.", ftp_server, ftp_prefix, ftp_file
287
- )
279
+ logger.info("Retrieving data from: ftp://%s/%s/%s.", ftp_server, ftp_prefix, ftp_file)
288
280
  ftp.retrbinary(f"RETR {ftp_file}", lambda x: buffer.write(x))
289
281
  content = buffer.getvalue().decode("utf-8")
290
282
  buffer.close()
@@ -52,9 +52,7 @@ def set_log_file(ctx: click.Context, param: click.Option, log_file: str) -> str:
52
52
  tmp_file = NamedTemporaryFile(delete=False)
53
53
  logger.info("Logging to temporary file %s", tmp_file.name)
54
54
  handler = logging.FileHandler(tmp_file.name)
55
- formatter = logging.Formatter(
56
- "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
57
- )
55
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
58
56
  handler.setFormatter(formatter)
59
57
  handler.setLevel(logging.DEBUG)
60
58
  logger.addHandler(handler)
@@ -74,9 +72,7 @@ def set_log_file(ctx: click.Context, param: click.Option, log_file: str) -> str:
74
72
  local_file.touch()
75
73
  logger.info("Logging to %s", local_file)
76
74
  handler = logging.FileHandler(local_file)
77
- formatter = logging.Formatter(
78
- "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
79
- )
75
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
80
76
  handler.setFormatter(formatter)
81
77
  handler.setLevel(logging.DEBUG)
82
78
  logger.addHandler(handler)
@@ -112,9 +108,7 @@ def teardown_cli(ctx: click.Context) -> None:
112
108
  except Exception as e:
113
109
  msg = f"Failed to upload log file to GCP {e}"
114
110
  logger.error(click.style(msg, fg="red"))
115
- logger.info(
116
- "Finished, elapsed time %s seconds", time.time() - ctx.obj["execution_start"]
117
- )
111
+ logger.info("Finished, elapsed time %s seconds", time.time() - ctx.obj["execution_start"])
118
112
 
119
113
 
120
114
  def set_log_lvl(_: click.Context, param: click.Option, value: int) -> int:
@@ -137,9 +131,7 @@ def set_log_lvl(_: click.Context, param: click.Option, value: int) -> int:
137
131
  log_lvls = {0: logging.ERROR, 1: logging.INFO, 2: logging.DEBUG}
138
132
  log_lvl = log_lvls.get(value, logging.DEBUG)
139
133
  handler = logging.StreamHandler(sys.stdout)
140
- formatter = logging.Formatter(
141
- "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
142
- )
134
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
143
135
  handler.setFormatter(formatter)
144
136
  handler.setLevel(log_lvl)
145
137
  logger.addHandler(handler)
@@ -0,0 +1,165 @@
1
+ """Validate gwas catalog manual curation file."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ import sys
8
+ from enum import Enum
9
+ from pathlib import Path
10
+ from typing import TypeVar
11
+
12
+ import click
13
+ import great_expectations as gx
14
+ from click import Argument, BadParameter
15
+ from great_expectations import expectations as gxe
16
+
17
+ T = TypeVar("T")
18
+
19
+
20
+ logger = logging.getLogger("gentroutils")
21
+ DATASOURCE_NAME = "GWAS Catalog curation"
22
+
23
+
24
+ class Lnum(Enum):
25
+ """List convertable enum."""
26
+
27
+ @classmethod
28
+ def as_list(cls) -> list[T]:
29
+ """Convert enum to list of strings."""
30
+ return list(map(lambda c: c.value, cls))
31
+
32
+
33
+ class ColumnSet(Lnum):
34
+ """Expected column names for curation file."""
35
+
36
+ STUDY_ID = "studyId"
37
+ STUDY_TYPE = "studyType"
38
+ FLAG = "analysisFlag"
39
+ QUALITY_CONTROL = "qualityControl"
40
+ IS_CURATED = "isCurated"
41
+ PUBMED = "pubmedId"
42
+ PUBLICATION_TITLE = "publicationTitle"
43
+ TRAIT = "traitFromSource"
44
+
45
+
46
+ class StudyType(Lnum):
47
+ """Expected studyType column values."""
48
+
49
+ NO_LICENCE = "no_licence"
50
+ PQTL = "pQTL"
51
+
52
+
53
+ class AnalysisFlag(Lnum):
54
+ """Expected analysisFlag column values."""
55
+
56
+ CC = "Case-case study"
57
+ EXWAS = "ExWAS"
58
+ GXE = "GxE"
59
+ GXG = "GxG"
60
+ METABOLITE = "Metabolite"
61
+ MULTIVARIATE = "Multivariate analysis"
62
+ NON_ADDITIVE = "Non-additive model"
63
+
64
+
65
+ class IsCurated(Lnum):
66
+ """Expected isCurated column values."""
67
+
68
+ YES = True
69
+
70
+
71
+ def _validate_input_file_name(_: click.Context, param: Argument, value: str) -> str:
72
+ """Assert file comes from local fs and exists."""
73
+ logger.debug("Validating %s variable with %s value", param, value)
74
+ import os
75
+
76
+ logger.info(os.getcwd())
77
+ pattern = re.compile(r"^[\w*/.-]*$")
78
+ _match = pattern.fullmatch(value)
79
+ if not _match:
80
+ logger.error("%s is not a local file.", value)
81
+ raise BadParameter("Provided path is not local.")
82
+ p = Path(value)
83
+ if p.is_dir():
84
+ logger.error("%s is a directory.", value)
85
+ raise BadParameter("Provided path is a directory.")
86
+ if not p.exists():
87
+ logger.error("%s does not exit.", value)
88
+ raise BadParameter("Provided path does not exist.")
89
+ return str(p)
90
+
91
+
92
+ def split_source_path(source_path: str) -> tuple[Path, str]:
93
+ """Split the source path into directory name and filename"""
94
+ p = Path(source_path)
95
+ return p.parent, p.name
96
+
97
+
98
+ @click.command(name="validate-gwas-curation")
99
+ @click.argument("source_path", type=click.UNPROCESSED, callback=_validate_input_file_name)
100
+ @click.pass_context
101
+ def validate_gwas_curation(ctx: click.Context, source_path: str) -> None: # noqa: DOC101, D0C103
102
+ """Validate GWAS catalog manual curation file.
103
+
104
+ \b
105
+ gentroutils -vvv validate-gwas-curation GWAS_Catalog_study_curation.tsv
106
+
107
+ """
108
+ logger.info("Using %s as curation input.", source_path)
109
+
110
+ dry_run = ctx.obj["dry_run"]
111
+ if dry_run:
112
+ logger.info("Running in --dry-run mode, exitting.")
113
+ sys.exit(0)
114
+
115
+ logger.info("Building great expectations context...")
116
+ context = gx.get_context(mode="ephemeral")
117
+ directory, file = split_source_path(source_path)
118
+ data_source = context.data_sources.add_pandas_filesystem(name=DATASOURCE_NAME, base_directory=directory)
119
+
120
+ logger.info("Using %s datasource.", DATASOURCE_NAME)
121
+ logger.debug("Adding csv asset from %s", file)
122
+ file_tsv_asset = data_source.add_csv_asset(name="manual_curation", sep="\t", header=0)
123
+ logger.debug("Adding batch definion path %s", file)
124
+ batch_definition = file_tsv_asset.add_batch_definition_path(name="manual_curation", path=file)
125
+
126
+ logger.info("Building expectation suite...")
127
+
128
+ suite = gx.ExpectationSuite(name="Curation Validation")
129
+ context.suites.add(suite)
130
+ suite.add_expectation(gxe.ExpectTableColumnsToMatchSet(column_set=ColumnSet.as_list(), exact_match=True))
131
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.PUBMED.value, type_="int"))
132
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.PUBLICATION_TITLE.value, type_="str"))
133
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.TRAIT.value, type_="str"))
134
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.STUDY_ID.value, type_="str"))
135
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.STUDY_TYPE.value, type_="str"))
136
+ suite.add_expectation(gxe.ExpectColumnValuesToBeOfType(column=ColumnSet.FLAG.value, type_="str"))
137
+ suite.add_expectation(
138
+ gxe.ExpectColumnDistinctValuesToBeInSet(column=ColumnSet.STUDY_TYPE.value, value_set=StudyType.as_list())
139
+ )
140
+ suite.add_expectation(gxe.ExpectColumnDistinctValuesToBeInSet(column=ColumnSet.FLAG.value, value_set=AnalysisFlag.as_list()))
141
+ suite.add_expectation(gxe.ExpectColumnValueLengthsToEqual(column=ColumnSet.PUBMED.value, value=8))
142
+ suite.add_expectation(gxe.ExpectColumnValuesToMatchRegex(column=ColumnSet.STUDY_ID.value, regex=r"^GCST\d+$"))
143
+ suite.add_expectation(gxe.ExpectColumnValuesToNotBeNull(column=ColumnSet.STUDY_ID.value))
144
+ suite.add_expectation(gxe.ExpectColumnValuesToBeUnique(column=ColumnSet.STUDY_ID.value))
145
+ suite.save()
146
+ logger.info("Building validation definition...")
147
+ validation_definition = gx.ValidationDefinition(data=batch_definition, suite=suite, name="Curation Validation")
148
+ result = validation_definition.run()
149
+
150
+ logger.info(
151
+ click.style("Validation succeded" if result["success"] else "Validation failed", "green" if result["success"] else "red")
152
+ )
153
+ if not result["success"]:
154
+ for res in result["results"]:
155
+ if not res["success"]:
156
+ logger.error(
157
+ "Expectation %s for column %s run with %s ",
158
+ res["expectation_config"]["type"],
159
+ res["expectation_config"]["kwargs"]["column"]
160
+ if "column" in res["expectation_config"]["kwargs"]
161
+ else res["expectation_config"]["kwargs"]["column_set"],
162
+ "succeded" if res["success"] else "failed",
163
+ )
164
+ logger.error(res)
165
+ sys.exit(1)
gentroutils/py.typed ADDED
File without changes
@@ -1,9 +1,10 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: gentroutils
3
- Version: 0.2.0
3
+ Version: 1.5.0
4
4
  Summary: Open Targets python genetics utility CLI tools
5
5
  Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
6
- License: Apache-2.0
6
+ License-Expression: Apache-2.0
7
+ License-File: LICENSE
7
8
  Classifier: Development Status :: 3 - Alpha
8
9
  Classifier: Intended Audience :: Healthcare Industry
9
10
  Classifier: Intended Audience :: Science/Research
@@ -11,18 +12,21 @@ Classifier: License :: OSI Approved :: Apache Software License
11
12
  Classifier: Operating System :: Unix
12
13
  Classifier: Programming Language :: Python :: 3
13
14
  Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
14
17
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
- Requires-Python: >=3.10
18
+ Requires-Python: <3.13,>=3.10
16
19
  Requires-Dist: click>=8.1.7
17
20
  Requires-Dist: google-cloud-storage>=2.18.1
21
+ Requires-Dist: great-expectations>=1.3.4
18
22
  Requires-Dist: pyfiglet>=1.0.2
19
23
  Requires-Dist: requests>=2.32.3
20
24
  Description-Content-Type: text/markdown
21
25
 
22
26
  # gentroutils
23
-
24
- [![Tests](https://github.com/opentargets/gentroutils/actions/workflows/test.yaml/badge.svg?event=push)](https://github.com/opentargets/gentroutils/actions/workflows/test.yaml)
27
+ [![checks](https://github.com/opentargets/gentroutils/actions/workflows/pr.yaml/badge.svg?branch=dev)](https://github.com/opentargets/gentroutils/actions/workflows/pr.yaml)
25
28
  ![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)
29
+ [![release](https://github.com/opentargets/gentroutils/actions/workflows/release.yaml/badge.svg)](https://github.com/opentargets/gentroutils/actions/workflows/release.yaml)
26
30
 
27
31
  Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
28
32
 
@@ -54,6 +58,29 @@ gentroutils -vvv -q gs://ot_orchestration/tests/gentroutils/log.txt update-gwa
54
58
 
55
59
  The command `update-gwas-curation-metadata` fetches the data from the ftp server and transfers them to the gcp without intermediate temporary files. The download(s) and upload(s) are made asynchronously.
56
60
 
61
+ ### Validate gwas catalog curation file
62
+
63
+ Validate the GWAS Catalog curation file after manual curation to check that all expectation tests pass.
64
+
65
+ ```bash
66
+ gentroutils -vvv validate-gwas-curation GWAS_Catalog_study_curation.tsv
67
+ ```
68
+
69
+ Validation is only allowed on a local file. The curation file should follow the format requirements defined in [OT curation](https://github.com/opentargets/curation/blob/master/genetics/GWAS_Catalog_study_curation.tsv)
70
+
71
+
72
+ ### Validate manual curation
73
+
74
+ To validate the manually curated file, run the following command
75
+
76
+ ```bash
77
+ gentroutils -vvv validate-gwas-curation tests/data/manual_curation/correct_curation.tsv
78
+ ```
79
+
80
+ The command will validate the file and report the validation results if any issues are found.
81
+
82
+ ## Read before running
83
+
57
84
  The logs from the command are saved to the log file given with `-q`; if a `gcp` log file is specified, it will be uploaded after the command has run.
58
85
 
59
86
  To test the command, run it with `-d` (`--dry-run`); this will just mark the input and output destinations.
@@ -68,13 +95,14 @@ To allow for full logs to be transmitted to the log file, use `-vvv` to increase
68
95
  > * The above command has some default values set for the input and output files, make sure you test them in `--dry-run` so the existing files will not get overwritten!
69
96
  > * Make sure to run `gcloud auth application-default login` to allow to use Google Cloud Python SDK before running the command
70
97
 
98
+
99
+
71
100
  ## Contribute
72
101
 
73
102
  To be able to contribute to the project you need to set it up. This project
74
103
  runs on:
75
104
 
76
105
  - [x] python 3.10.8
77
- - [x] rye (package manager)
78
106
  - [x] uv (dependency manager)
79
107
 
80
108
  To set up the project run
@@ -88,7 +116,7 @@ install all python dependencies listed in `pyproject.toml`. Finally the command
88
116
  required to be run before the commit is created.
89
117
 
90
118
  The project has additional `dev` dependencies that include the list of packages used for testing purposes.
91
- All of the `dev` depnendencies are automatically installed by `rye`.
119
+ All of the `dev` dependencies are automatically installed by `uv`.
92
120
 
93
121
  To see all available dev commands
94
122
 
@@ -103,5 +131,5 @@ make help
103
131
  To check CLI execution manually you need to run
104
132
 
105
133
  ```{bash}
106
- rye run gentroutils
134
+ uv run gentroutils
107
135
  ```
@@ -0,0 +1,11 @@
1
+ gentroutils/__init__.py,sha256=lCl9VmPVU2hkZa_kh5NipCrNufbcTH00iKF-6WvE0gg,1319
2
+ gentroutils/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ gentroutils/commands/__init__.py,sha256=940jwl8BvK0gTN2mRb_js8Kl74rgff-benDSEO5MuBc,326
4
+ gentroutils/commands/update_gwas_curation_metadata.py,sha256=zOIv5l_84kBlqyXAXJHnwH5OrPKVlObuHl6WhwlZC90,10903
5
+ gentroutils/commands/utils.py,sha256=XEZogKn6ZQKaSwKms7UtJU5h_eGrrHM-FofCoU2k884,5312
6
+ gentroutils/commands/validate_gwas_curation.py,sha256=CWlQ8uI8JAK9l4MyjFNmjDLeMOolrcA3neVaBdAGN80,6148
7
+ gentroutils-1.5.0.dist-info/METADATA,sha256=LYbG6KpN19EN1O3ZCGhXOv_yRAYVPj1odQ5742ELoDQ,5154
8
+ gentroutils-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
+ gentroutils-1.5.0.dist-info/entry_points.txt,sha256=IvxZyBBD71Ota0aPMtVaJzI9OSX5_f-iH4ZJx6sY53w,48
10
+ gentroutils-1.5.0.dist-info/licenses/LICENSE,sha256=RFhQPdSOiMTguUX7JSoIuTxA7HVzCbj_p8WU36HjUQQ,10947
11
+ gentroutils-1.5.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.26.3
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,9 +0,0 @@
1
- gentroutils/__init__.py,sha256=aHDzbBMrnsgdcO_FfsYCbbPXProynwB7_2nfyc4UGp8,1281
2
- gentroutils/commands/__init__.py,sha256=avkqzwa1ck__rLVN0Wqfpr3eHtKS6TvyPeeaHcguJuw,210
3
- gentroutils/commands/update_gwas_curation_metadata.py,sha256=4Pb2YdEnfulQklFh0KBvAOBnylCsDIAye7Keq2dC0mY,10937
4
- gentroutils/commands/utils.py,sha256=zYIzu47f-_a3nBeVXRR5xg5QiklrwES8uYNNhjed7gA,5384
5
- gentroutils-0.2.0.dist-info/METADATA,sha256=lMJ2JdqokHojQaY-hWhs9IvCJ4ei4vBpOfsOAfgBw4E,4061
6
- gentroutils-0.2.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
7
- gentroutils-0.2.0.dist-info/entry_points.txt,sha256=IvxZyBBD71Ota0aPMtVaJzI9OSX5_f-iH4ZJx6sY53w,48
8
- gentroutils-0.2.0.dist-info/licenses/LICENSE,sha256=RFhQPdSOiMTguUX7JSoIuTxA7HVzCbj_p8WU36HjUQQ,10947
9
- gentroutils-0.2.0.dist-info/RECORD,,