gentroutils 0.2.0__py3-none-any.whl → 1.6.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gentroutils/__init__.py +8 -41
- gentroutils/errors.py +39 -0
- gentroutils/io/path/__init__.py +6 -0
- gentroutils/io/path/ftp.py +48 -0
- gentroutils/io/path/gcs.py +45 -0
- gentroutils/io/transfer/__init__.py +6 -0
- gentroutils/io/transfer/ftp_to_gcs.py +49 -0
- gentroutils/io/transfer/model.py +36 -0
- gentroutils/io/transfer/polars_to_gcs.py +20 -0
- gentroutils/parsers/__init__.py +1 -0
- gentroutils/parsers/curation.py +168 -0
- gentroutils/py.typed +0 -0
- gentroutils/tasks/__init__.py +90 -0
- gentroutils/tasks/crawl.py +156 -0
- gentroutils/tasks/curation.py +110 -0
- gentroutils/tasks/fetch.py +141 -0
- gentroutils/transfer.py +81 -0
- gentroutils-1.6.0.dev1.dist-info/METADATA +274 -0
- gentroutils-1.6.0.dev1.dist-info/RECORD +22 -0
- {gentroutils-0.2.0.dist-info → gentroutils-1.6.0.dev1.dist-info}/WHEEL +1 -1
- gentroutils-1.6.0.dev1.dist-info/entry_points.txt +2 -0
- {gentroutils-0.2.0.dist-info → gentroutils-1.6.0.dev1.dist-info}/licenses/LICENSE +1 -1
- gentroutils/commands/__init__.py +0 -7
- gentroutils/commands/update_gwas_curation_metadata.py +0 -295
- gentroutils/commands/utils.py +0 -160
- gentroutils-0.2.0.dist-info/METADATA +0 -107
- gentroutils-0.2.0.dist-info/RECORD +0 -9
- gentroutils-0.2.0.dist-info/entry_points.txt +0 -2
gentroutils/commands/utils.py
DELETED
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
"""Ütility functions for the CLI."""
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import logging
|
|
5
|
-
import sys
|
|
6
|
-
import time
|
|
7
|
-
from functools import wraps
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
from tempfile import NamedTemporaryFile
|
|
10
|
-
from urllib.parse import urlparse
|
|
11
|
-
|
|
12
|
-
import click
|
|
13
|
-
from google.cloud import storage
|
|
14
|
-
|
|
15
|
-
# Package-level logger; the click option callbacks defined in this module attach its handlers.
logger = logging.getLogger("gentroutils")
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def _attach_file_handler(path) -> None:
    """Attach a DEBUG-level file handler writing to `path` to the package logger."""
    handler = logging.FileHandler(path)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)


def set_log_file(ctx: click.Context, param: click.Option, log_file: str) -> str:
    """Set the logging file based on the provided `log-file` flag.

    This is a callback function called by the click.Option [--log-file] flag.
    When `log_file` is a GCS URI (`gs://bucket/object`), logging goes to a local
    temporary file and the returned value is that temporary file path. Both
    paths (remote and local) are stored in the click context object so that the
    upload can be performed at the end of the CLI run.

    Keys written to `ctx.obj`:
        upload_to_gcp: True when `log_file` contains a URI scheme separator.
        local_log_file: path of the temporary file (GCS case only).
        local_log_file_obj: open temporary file object (GCS case only).
        gcp_log_file: the original `gs://` URI (GCS case only).

    Args:
        ctx (click.Context): click context
        param (click.Option): click option
        log_file (str): log file path or `gs://` URI

    Raises:
        click.BadParameter: If the log file is a directory, the URI scheme is
            not GCS, or the GCS URI lacks a bucket or an object path.

    Returns:
        str: local log file path, or an empty string when no log file was requested
    """
    ctx.ensure_object(dict)
    if not log_file:
        return ""
    logger.info("Extracting log file from the %s", param)

    # Anything that looks like a URI is treated as a remote destination.
    upload_to_gcp = "://" in log_file
    ctx.obj["upload_to_gcp"] = upload_to_gcp

    if upload_to_gcp:
        parsed_uri = urlparse(log_file)
        if parsed_uri.scheme != "gs":
            raise click.BadParameter("Only GCS is supported for logging upload")
        # Fail early instead of at teardown, when the upload would be attempted
        # against an empty bucket or blob name.
        if not parsed_uri.netloc or not parsed_uri.path.strip("/"):
            raise click.BadParameter(
                "GCS log file must be given as gs://<bucket>/<object path>"
            )
        # delete=False: the file has to survive until it is uploaded at teardown.
        tmp_file = NamedTemporaryFile(delete=False)
        logger.info("Logging to temporary file %s", tmp_file.name)
        _attach_file_handler(tmp_file.name)
        ctx.obj["local_log_file"] = tmp_file.name
        ctx.obj["local_log_file_obj"] = tmp_file
        ctx.obj["gcp_log_file"] = log_file
        return tmp_file.name

    local_file = Path(log_file)
    if local_file.is_dir():
        raise click.BadParameter("Log file is a directory")
    # Start every run with an empty log file.
    if local_file.is_file():
        local_file.unlink()
    if not local_file.exists():
        local_file.parent.mkdir(parents=True, exist_ok=True)
        local_file.touch()
    logger.info("Logging to %s", local_file)
    _attach_file_handler(local_file)
    return str(local_file)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def teardown_cli(ctx: click.Context) -> None:
    """Teardown the gentroutils cli.

    This function is used as a teardown function for the CLI.
    It uploads the log file to the GCP bucket if the `upload_to_gcp` flag is
    set in the context object, then logs the elapsed time of the run.

    The upload is best effort: any failure is logged and does not abort the
    teardown. When `dry_run` is set in the context object nothing is uploaded
    and no GCP client is created; a missing `dry_run` key is treated as False.

    Args:
        ctx (click.Context): click context
    """
    if ctx.obj.get("upload_to_gcp"):
        gcp_file = ctx.obj["gcp_log_file"]
        local_file = ctx.obj["local_log_file"]
        try:
            logger.info("Uploading %s to %s", local_file, gcp_file)
            # Check the dry-run flag first so a dry run needs no GCP credentials.
            if ctx.obj.get("dry_run", False):
                logger.info("Dry run, skipping the upload of the log file")
            else:
                with open(local_file, "r") as f:
                    content = f.read()
                parsed_uri = urlparse(gcp_file)
                client = storage.Client()
                bucket = client.bucket(bucket_name=parsed_uri.netloc)
                blob = bucket.blob(parsed_uri.path.lstrip("/"))
                blob.upload_from_string(content)
        except Exception as e:
            msg = f"Failed to upload log file to GCP {e}"
            logger.error(click.style(msg, fg="red"))
        finally:
            # Release the temporary file handle whether or not the upload succeeded.
            tmp_file_obj = ctx.obj.get("local_log_file_obj")
            if tmp_file_obj is not None:
                tmp_file_obj.close()
    logger.info(
        "Finished, elapsed time %s seconds", time.time() - ctx.obj["execution_start"]
    )
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def set_log_lvl(_: click.Context, param: click.Option, value: int) -> int:
    """Set logging level based on the number of provided `v` flags.

    This is a callback function called by the click.Option [-v] flag.
    For example
    `-vv` - DEBUG
    `-v` - INFO
    no flag - ERROR

    Any count above two also maps to DEBUG. The level is applied to a stdout
    handler attached to the package logger.

    Args:
        _ (click.Context): click context (unused)
        param (click.Option): click option
        value (int): number of `-v` flags provided

    Returns:
        int: logging level
    """
    logger.info("Extracting log level from the %s", param)
    log_lvls = {0: logging.ERROR, 1: logging.INFO, 2: logging.DEBUG}
    log_lvl = log_lvls.get(value, logging.DEBUG)

    # Replace a stdout handler left by an earlier invocation in the same process;
    # otherwise every message is emitted once per invocation, possibly to a
    # stale `sys.stdout`.
    handler_name = "gentroutils-stdout"
    for stale in [h for h in logger.handlers if h.get_name() == handler_name]:
        logger.removeHandler(stale)

    handler = logging.StreamHandler(sys.stdout)
    handler.set_name(handler_name)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    handler.setLevel(log_lvl)
    logger.addHandler(handler)
    return log_lvl
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
def coro(f):
    """Make a coroutine function callable synchronously.

    The returned wrapper calls `f` with the given arguments and drives the
    resulting coroutine to completion with `asyncio.run`, returning its result.
    `asyncio.run` cannot be used while another event loop is running in the
    same thread, so the wrapper must be called from synchronous code.

    Args:
        f: coroutine function to wrap

    Returns:
        Synchronous function with the same name and docstring as `f`.
    """

    @wraps(f)
    def wrapper(*args, **kwargs):
        """Run the wrapped coroutine function to completion."""
        return asyncio.run(f(*args, **kwargs))

    return wrapper
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
# Public names exported by this module.
__all__ = ["set_log_file", "set_log_lvl", "coro", "logger", "teardown_cli"]
|
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: gentroutils
|
|
3
|
-
Version: 0.2.0
|
|
4
|
-
Summary: Open Targets python genetics utility CLI tools
|
|
5
|
-
Author-email: Szymon Szyszkowski <ss60@sanger.ac.uk>
|
|
6
|
-
License: Apache-2.0
|
|
7
|
-
Classifier: Development Status :: 3 - Alpha
|
|
8
|
-
Classifier: Intended Audience :: Healthcare Industry
|
|
9
|
-
Classifier: Intended Audience :: Science/Research
|
|
10
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
-
Classifier: Operating System :: Unix
|
|
12
|
-
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
15
|
-
Requires-Python: >=3.10
|
|
16
|
-
Requires-Dist: click>=8.1.7
|
|
17
|
-
Requires-Dist: google-cloud-storage>=2.18.1
|
|
18
|
-
Requires-Dist: pyfiglet>=1.0.2
|
|
19
|
-
Requires-Dist: requests>=2.32.3
|
|
20
|
-
Description-Content-Type: text/markdown
|
|
21
|
-
|
|
22
|
-
# gentroutils
|
|
23
|
-
|
|
24
|
-
[](https://github.com/opentargets/gentroutils/actions/workflows/test.yaml)
|
|
25
|
-

|
|
26
|
-
|
|
27
|
-
Set of Command Line Interface tools to process Open Targets Genetics GWAS data.
|
|
28
|
-
|
|
29
|
-
## Installation
|
|
30
|
-
|
|
31
|
-
```
|
|
32
|
-
pip install gentroutils
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
## Available commands
|
|
36
|
-
|
|
37
|
-
To see all available commands after installation run
|
|
38
|
-
|
|
39
|
-
```{bash}
|
|
40
|
-
gentroutils --help
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
### Updating gwas catalog metadata
|
|
44
|
-
|
|
45
|
-
To update the GWAS Catalog metadata, run the following command:
|
|
46
|
-
|
|
47
|
-
```bash
|
|
48
|
-
gentroutils -vvv -q gs://ot_orchestration/tests/gentroutils/log.txt update-gwas-curation-metadata \
|
|
49
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-associations_ontology-annotated.tsv gs://ot_orchestration/tests/gentroutils/gwas-catalog-associations_ontology-annotated.tsv \
|
|
50
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-studies-v1.0.3.1.txt gs://ot_orchestration/tests/gentroutils/gwas-catalog-download-studies-v1.0.3.1.txt \
|
|
51
|
-
-f ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/latest/gwas-catalog-download-ancestries-v1.0.3.1.txt gs://ot_orchestration/tests/gentroutils/gwas-catalog-download-ancestries-v1.0.3.1.txt \
|
|
52
|
-
-g https://www.ebi.ac.uk/gwas/api/search/stats
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
The command `update-gwas-curation-metadata` fetches the data from the FTP server and transfers it to GCP without intermediate temporary files. The download(s) and upload(s) are made asynchronously.
|
|
56
|
-
|
|
57
|
-
The logs from the command are saved to the log file given with `-q`. If that log file is a `gcp` path, the file is uploaded after the command has run.
|
|
58
|
-
|
|
59
|
-
To test the command run it with `-d` == `--dry-run`, this will just mark the input and output destinations.
|
|
60
|
-
To allow for full logs to be transmitted to the log file, use `-vvv` to increase the verbosity of the logs
|
|
61
|
-
|
|
62
|
-
> [!NOTE]
|
|
63
|
-
> Change the path to the output `gcp` files to make sure they are saved under requested path
|
|
64
|
-
|
|
65
|
-
> [!WARNING]
|
|
66
|
-
> Please read before running the command!:
|
|
67
|
-
>
|
|
68
|
-
> * The above command has some default values set for the input and output files, make sure you test them in `--dry-run` so the existing files will not get overwritten!
|
|
69
|
-
> * Make sure to run `gcloud auth application-default login` before running the command, so that the Google Cloud Python SDK can authenticate
|
|
70
|
-
|
|
71
|
-
## Contribute
|
|
72
|
-
|
|
73
|
-
To be able to contribute to the project you need to set it up. This project
|
|
74
|
-
runs on:
|
|
75
|
-
|
|
76
|
-
- [x] python 3.10.8
|
|
77
|
-
- [x] rye (package manager)
|
|
78
|
-
- [x] uv (dependency manager)
|
|
79
|
-
|
|
80
|
-
To set up the project run
|
|
81
|
-
|
|
82
|
-
```{bash}
|
|
83
|
-
make dev
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
The command will install the above dependencies (initial requirements are curl and bash) if not present and
|
|
87
|
-
install all python dependencies listed in `pyproject.toml`. Finally the command will install `pre-commit` hooks
|
|
88
|
-
required to be run before the commit is created.
|
|
89
|
-
|
|
90
|
-
The project has additional `dev` dependencies that include the list of packages used for testing purposes.
|
|
91
|
-
All of the `dev` dependencies are automatically installed by `rye`.
|
|
92
|
-
|
|
93
|
-
To see all available dev commands
|
|
94
|
-
|
|
95
|
-
Run the following command to see all available dev commands:
|
|
96
|
-
|
|
97
|
-
```{bash}
|
|
98
|
-
make help
|
|
99
|
-
```
|
|
100
|
-
|
|
101
|
-
### Manual testing of CLI module
|
|
102
|
-
|
|
103
|
-
To check CLI execution manually you need to run
|
|
104
|
-
|
|
105
|
-
```{bash}
|
|
106
|
-
rye run gentroutils
|
|
107
|
-
```
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
gentroutils/__init__.py,sha256=aHDzbBMrnsgdcO_FfsYCbbPXProynwB7_2nfyc4UGp8,1281
|
|
2
|
-
gentroutils/commands/__init__.py,sha256=avkqzwa1ck__rLVN0Wqfpr3eHtKS6TvyPeeaHcguJuw,210
|
|
3
|
-
gentroutils/commands/update_gwas_curation_metadata.py,sha256=4Pb2YdEnfulQklFh0KBvAOBnylCsDIAye7Keq2dC0mY,10937
|
|
4
|
-
gentroutils/commands/utils.py,sha256=zYIzu47f-_a3nBeVXRR5xg5QiklrwES8uYNNhjed7gA,5384
|
|
5
|
-
gentroutils-0.2.0.dist-info/METADATA,sha256=lMJ2JdqokHojQaY-hWhs9IvCJ4ei4vBpOfsOAfgBw4E,4061
|
|
6
|
-
gentroutils-0.2.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
|
|
7
|
-
gentroutils-0.2.0.dist-info/entry_points.txt,sha256=IvxZyBBD71Ota0aPMtVaJzI9OSX5_f-iH4ZJx6sY53w,48
|
|
8
|
-
gentroutils-0.2.0.dist-info/licenses/LICENSE,sha256=RFhQPdSOiMTguUX7JSoIuTxA7HVzCbj_p8WU36HjUQQ,10947
|
|
9
|
-
gentroutils-0.2.0.dist-info/RECORD,,
|