kegg-pull 3.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kegg_pull/__init__.py +16 -0
- kegg_pull/__main__.py +53 -0
- kegg_pull/_utils.py +129 -0
- kegg_pull/_version.py +34 -0
- kegg_pull/entry_ids.py +89 -0
- kegg_pull/entry_ids_cli.py +38 -0
- kegg_pull/kegg_url.py +632 -0
- kegg_pull/map.py +370 -0
- kegg_pull/map_cli.py +60 -0
- kegg_pull/pathway_organizer.py +200 -0
- kegg_pull/pathway_organizer_cli.py +36 -0
- kegg_pull/pull.py +525 -0
- kegg_pull/pull_cli.py +97 -0
- kegg_pull/rest.py +248 -0
- kegg_pull/rest_cli.py +140 -0
- kegg_pull-3.2.1.dist-info/METADATA +125 -0
- kegg_pull-3.2.1.dist-info/RECORD +21 -0
- kegg_pull-3.2.1.dist-info/WHEEL +5 -0
- kegg_pull-3.2.1.dist-info/entry_points.txt +2 -0
- kegg_pull-3.2.1.dist-info/licenses/LICENSE +24 -0
- kegg_pull-3.2.1.dist-info/top_level.txt +1 -0
kegg_pull/__init__.py
ADDED
kegg_pull/__main__.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Usage:
|
|
3
|
+
kegg_pull -h | --help Show this help message.
|
|
4
|
+
kegg_pull -v | --version Displays the package version.
|
|
5
|
+
kegg_pull --full-help Show the help message of all sub commands.
|
|
6
|
+
kegg_pull pull ... Pull, separate, and store an arbitrary number of KEGG entries to the local file system.
|
|
7
|
+
kegg_pull entry-ids ... Obtain a list of KEGG entry IDs.
|
|
8
|
+
kegg_pull map ... Obtain a mapping of entry IDs (KEGG or outside databases) to the IDs of related entries.
|
|
9
|
+
kegg_pull pathway-organizer ... Creates a flattened version of a pathways Brite hierarchy.
|
|
10
|
+
kegg_pull rest ... Executes one of the KEGG REST API operations.
|
|
11
|
+
"""
|
|
12
|
+
import sys
|
|
13
|
+
from . import __version__
|
|
14
|
+
from . import pull_cli as p_cli
|
|
15
|
+
from . import entry_ids_cli as ei_cli
|
|
16
|
+
from . import map_cli as map_cli
|
|
17
|
+
from . import pathway_organizer_cli as po_cli
|
|
18
|
+
from . import rest_cli as r_cli
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def main() -> None:
    """ Entry point of the top-level command line interface.

    Inspects the first command line argument and either hands control to the matching sub command module, prints the help
    messages of every sub command, prints the package version, or falls back to printing the top-level usage message.
    """
    # Insertion order matters: it is also the order in which the help messages are printed for "--full-help".
    sub_command_modules = {
        'pull': p_cli, 'entry-ids': ei_cli, 'map': map_cli, 'pathway-organizer': po_cli, 'rest': r_cli}
    first_arg: str | None = sys.argv[1] if len(sys.argv) > 1 else None
    if first_arg in sub_command_modules:
        sub_command_modules[first_arg].main()
        return
    if first_arg == '--full-help':
        separator = '-'*80
        print(__doc__)
        for sub_command_module in sub_command_modules.values():
            print(separator)
            print(sub_command_module.__doc__)
        return
    if first_arg in ('--version', '-v'):
        print(__version__)
        return
    # No argument, "-h", "--help", or anything unrecognized: show the usage message.
    print(__doc__)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Only dispatch the command line interface when this module is executed as a script, not when it is imported.
if __name__ == '__main__':  # pragma: no cover
    main()  # pragma: no cover
|
kegg_pull/_utils.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import logging as log
|
|
2
|
+
import typing as t
|
|
3
|
+
import zipfile as zf
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import json
|
|
7
|
+
import jsonschema as js
|
|
8
|
+
import inspect as ins
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_molecular_attribute_args(args: dict) -> tuple[str | None, float | tuple[float, float] | None, int | tuple[int, int] | None]:
|
|
12
|
+
formula: str | None = args['--formula']
|
|
13
|
+
exact_mass: list[str] | None = args['--em']
|
|
14
|
+
molecular_weight: list[str] | None = args['--mw']
|
|
15
|
+
# exact_mass and molecular_weight will be [] (empty list) if not specified in the commandline args
|
|
16
|
+
if exact_mass:
|
|
17
|
+
exact_mass: float | tuple[float, float] = _get_range_values(range_values=exact_mass, value_type=float)
|
|
18
|
+
else:
|
|
19
|
+
exact_mass = None
|
|
20
|
+
if molecular_weight:
|
|
21
|
+
molecular_weight: int | tuple[int, int] = _get_range_values(range_values=molecular_weight, value_type=int)
|
|
22
|
+
else:
|
|
23
|
+
molecular_weight = None
|
|
24
|
+
return formula, exact_mass, molecular_weight
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _get_range_values(
|
|
28
|
+
range_values: list[str], value_type: type[int | float]) -> int | float | tuple[int, int] | tuple[float, float]:
|
|
29
|
+
if len(range_values) == 1:
|
|
30
|
+
[val] = range_values
|
|
31
|
+
return value_type(val)
|
|
32
|
+
elif len(range_values) == 2:
|
|
33
|
+
[min_val, max_val] = range_values
|
|
34
|
+
return value_type(min_val), value_type(max_val)
|
|
35
|
+
else:
|
|
36
|
+
raise ValueError(
|
|
37
|
+
f'Range can only be specified by two values but {len(range_values)} values were provided: '
|
|
38
|
+
f'{", ".join(range_value for range_value in range_values)}')
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def load_json_file(file_path: str, json_schema: dict, validation_error_message: str) -> dict:
    """ Loads a JSON file from the file system or from within a ZIP archive and validates it against a schema.

    :param file_path: The path to the JSON file. If it contains ".zip:", the part before it (with ".zip" re-appended) is opened as a ZIP archive and the part after it is the name of the member to read.
    :param json_schema: The JSON schema that the loaded object must conform to.
    :param validation_error_message: The message to log if the loaded object does not conform to the schema.
    :return: The loaded and validated JSON object.
    :raises jsonschema.exceptions.ValidationError: Raised if the loaded object does not conform to the schema.
    """
    if '.zip:' in file_path:
        [file_location, file_name] = file_path.split('.zip:')
        file_location = file_location + '.zip'
        with zf.ZipFile(file_location, 'r') as zip_file:
            json_bytes: bytes = zip_file.read(file_name)
            # json.loads accepts bytes and detects the UTF encoding itself
            json_object: dict = json.loads(s=json_bytes)
    else:
        # Read as UTF-8 explicitly rather than with the platform's default encoding: JSON is UTF-8 by specification
        # and save_file() writes text files as UTF-8.
        with open(file_path, 'r', encoding='utf-8') as file:
            json_object: dict = json.load(file)
    validate_json_object(json_object=json_object, json_schema=json_schema, validation_error_message=validation_error_message)
    return json_object
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def validate_json_object(json_object: dict, json_schema: dict, validation_error_message: str) -> None:
    """ Validates a JSON object against a JSON schema, logging a message before propagating a validation failure.

    :param json_object: The JSON object to validate.
    :param json_schema: The JSON schema the object must conform to.
    :param validation_error_message: The message logged at the error level if validation fails.
    :raises jsonschema.exceptions.ValidationError: Raised (after logging) if the object does not conform to the schema.
    """
    try:
        js.validate(instance=json_object, schema=json_schema)
    except js.exceptions.ValidationError:
        log.error(validation_error_message)
        # Propagate the same exception to the caller
        raise
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def parse_input_sequence(input_source: str) -> list[str]:
    """ Parses a sequence of input strings from either standard input or a comma separated list.

    :param input_source: If equal to "-", the inputs are read from standard input with one input per line. Otherwise, it is split as a comma separated list.
    :return: The inputs with surrounding whitespace removed and blank items discarded.
    :raises ValueError: Raised if no non-blank inputs remain.
    """
    from_stdin = input_source == '-'
    if from_stdin:
        raw_inputs = sys.stdin.read().strip().split('\n')
    else:
        raw_inputs = input_source.split(',')
    inputs: list = []
    for raw_input in raw_inputs:
        stripped_input = raw_input.strip()
        if stripped_input != '':
            inputs.append(stripped_input)
    if inputs:
        return inputs
    # Nothing usable was provided; describe where the empty list came from
    source_description = 'standard input' if from_stdin else f'comma separated list: "{input_source}"'
    raise ValueError(f'Empty list provided from {source_description}')
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def print_or_save(output_target: str, output_content: str | bytes) -> None:
|
|
80
|
+
if output_target is None:
|
|
81
|
+
if type(output_content) is bytes:
|
|
82
|
+
log.warning('Printing binary output...')
|
|
83
|
+
print(output_content)
|
|
84
|
+
else:
|
|
85
|
+
save_output(output_target=output_target, output_content=output_content)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def save_output(output_target: str, output_content: str | bytes) -> None:
    """ Splits an output target into a location and a file name, then saves the content there.

    :param output_target: Either a regular file path or a path of the form "<archive>.zip:<file name>" pointing to a file within a ZIP archive.
    :param output_content: The text or binary content to save.
    """
    zip_marker = '.zip:'
    if zip_marker in output_target:
        archive_stem, file_name = output_target.split(zip_marker)
        # The split consumed the ".zip" extension, so put it back
        file_location: str = f'{archive_stem}.zip'
    else:
        directory, file_name = os.path.split(output_target)
        # A bare file name has no directory part; use the current directory
        file_location = directory or '.'
    save_file(file_location=file_location, file_content=output_content, file_name=file_name)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def save_file(file_location: str, file_content: str | bytes, file_name: str) -> None:
|
|
99
|
+
if os.name == 'nt': # pragma: no cover
|
|
100
|
+
# If the OS is Windows, replace colons with underscores (Windows does not support colons in file names).
|
|
101
|
+
file_name = file_name.replace(':', '_') # pragma: no cover
|
|
102
|
+
if file_location.endswith('.zip'):
|
|
103
|
+
with zf.ZipFile(file_location, 'a') as zip_file:
|
|
104
|
+
zip_file.writestr(file_name, file_content)
|
|
105
|
+
else:
|
|
106
|
+
if not os.path.isdir(file_location):
|
|
107
|
+
os.makedirs(file_location)
|
|
108
|
+
file_path = os.path.join(file_location, file_name)
|
|
109
|
+
save_type = 'wb' if type(file_content) is bytes else 'w'
|
|
110
|
+
encoding: str | None = None if type(file_content) is bytes else 'utf-8'
|
|
111
|
+
with open(file_path, save_type, encoding=encoding) as file:
|
|
112
|
+
file.write(file_content)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class NonInstantiable:
    """Classes deriving from this class are only instantiable in the same module that they are defined in."""
    @classmethod
    def __init__(cls) -> None:
        # __init__ is wrapped in @classmethod, so "cls" is bound to a class rather than to the new instance.
        # NOTE(review): index 2 of the stack presumably skips this frame and one intermediate frame (e.g. a subclass
        # __init__ delegating here) to reach the code that requested the instantiation - TODO confirm against the subclasses.
        caller_module_path = ins.stack()[2].filename
        class_module_path = ins.getfile(cls)
        # Ensure the python module of the caller matches that of the class
        # This ensures the class is only instantiated in the same module that it's defined in
        if caller_module_path != class_module_path:
            raise RuntimeError(f'The class "{cls.__name__}" cannot be instantiated outside of its module.')
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class staticproperty(staticmethod):
    """ A static method that behaves like a read-only property: accessing the attribute calls the wrapped
    function with no arguments and yields its return value instead of the function itself.
    """
    def __get__(self, *_) -> t.Any:
        # The instance and owner passed by the descriptor protocol are ignored since the function takes no arguments
        wrapped_function = self.__func__
        return wrapped_function()
|
kegg_pull/_version.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
# Names exported by "from ._version import *".
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Always False at runtime, so the typing imports below are never executed and both aliases fall back to "object".
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    VERSION_TUPLE = object
    COMMIT_ID = object

# Annotations for the module attributes assigned below.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

# Each value is bound to both a dunder name and a plain name.
__version__ = version = '3.2.1'
__version_tuple__ = version_tuple = (3, 2, 1)

__commit_id__ = commit_id = None
|
kegg_pull/entry_ids.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pulling Lists of KEGG Entry IDs
|
|
3
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
4
|
+
|Functionality| for pulling lists of KEGG entry IDs from the KEGG REST API.
|
|
5
|
+
"""
|
|
6
|
+
from . import rest as r
|
|
7
|
+
from . import kegg_url as ku
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def from_database(database: str, kegg_rest: r.KEGGrest | None = None) -> list[str]:
    """ Pulls the KEGG entry IDs of a given database.

    :param database: The KEGG database to pull the entry IDs from. If equal to "brite", the "br:" prefix is prepended to each entry ID that does not already have it such that they succeed if used in downstream use of the KEGG "get" operation (e.g. for the "pull" API module or CLI subcommand).
    :param kegg_rest: The KEGGrest object to request the entry IDs. If None, one is created with the default parameters.
    :return: The list of resulting entry IDs.
    :raises RuntimeError: Raised if the request to the KEGG REST API fails or times out.
    """
    entry_ids = _process_response(KEGGurl=ku.ListKEGGurl, kegg_rest=kegg_rest, database=database)
    if database == 'brite':
        # Only add the prefix where it is missing. IDs that already carry it are kept as they are rather than
        # being dropped from the result.
        entry_ids = [entry_id if entry_id.startswith('br:') else f'br:{entry_id}' for entry_id in entry_ids]
    return entry_ids
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _process_response(KEGGurl: type[ku.AbstractKEGGurl], kegg_rest: r.KEGGrest | None, **kwargs) -> list[str]:
    """ Makes an entry IDs related request to the KEGG REST API and parses the entry IDs out of the response body.

    :param KEGGurl: The URL class for the request.
    :param kegg_rest: The KEGGrest object to make the request with. If None, one is created with the default parameters.
    :param kwargs: The arguments forwarded to the request along with the URL class.
    :return: The list of KEGG entry IDs.
    :raises RuntimeError: Raised if the KEGG response indicates a failure or time out.
    """
    response: r.KEGGresponse = r.request_and_check_error(kegg_rest=kegg_rest, KEGGurl=KEGGurl, **kwargs)
    response_text = response.text_body
    return _parse_entry_ids_string(entry_ids_string=response_text)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _parse_entry_ids_string(entry_ids_string: str) -> list[str]:
|
|
39
|
+
""" Parses the entry IDs contained in a string.
|
|
40
|
+
|
|
41
|
+
:param entry_ids_string: The string containing the entry IDs.
|
|
42
|
+
:return: The list of parsed entry IDs.
|
|
43
|
+
"""
|
|
44
|
+
entry_ids = entry_ids_string.strip().split('\n')
|
|
45
|
+
return [entry_id.split('\t')[0].strip() for entry_id in entry_ids if entry_id.strip() != '']
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def from_file(file_path: str) -> list[str]:
    """ Reads KEGG entry IDs from a file that lists one entry ID per line.

    :param file_path: The path to the file containing the entry IDs.
    :return: The list of entry IDs.
    :raises ValueError: Raised if the file is empty.
    """
    with open(file_path, 'r') as file:
        file_contents: str = file.read()
    if not file_contents:
        raise ValueError(f'Attempted to load entry IDs from {file_path}. But the file is empty')
    return _parse_entry_ids_string(entry_ids_string=file_contents)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def from_keywords(database: str, keywords: list[str], kegg_rest: r.KEGGrest | None = None) -> list[str]:
    """ Pulls the IDs of the entries in a KEGG database that are found by searching with keywords.

    :param database: The name of the database to pull entry IDs from.
    :param keywords: The keywords to search entries in the database with.
    :param kegg_rest: The KEGGrest object to request the entry IDs. If None, one is created with the default parameters.
    :return: The list of entry IDs.
    :raises RuntimeError: Raised if the request to the KEGG REST API fails or times out.
    """
    request_kwargs = {'database': database, 'keywords': keywords}
    return _process_response(KEGGurl=ku.KeywordsFindKEGGurl, kegg_rest=kegg_rest, **request_kwargs)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def from_molecular_attribute(
        database: str, formula: str | None = None, exact_mass: float | tuple[float, float] | None = None,
        molecular_weight: int | tuple[int, int] | None = None, kegg_rest: r.KEGGrest | None = None) -> list[str]:
    """ Pulls entry IDs from a KEGG database of chemical entries by searching on one (and only one) of three molecular attributes.

    :param database: The name of the database containing chemical entries.
    :param formula: The chemical formula to search for.
    :param exact_mass: The exact mass of the compound to search for (a single value or a range).
    :param molecular_weight: The molecular weight of the compound to search for (a single value or a range).
    :param kegg_rest: The KEGGrest object to request the entry IDs. If None, one is created with the default parameters.
    :return: The list of entry IDs.
    :raises RuntimeError: Raised if the request to the KEGG REST API fails or times out.
    """
    request_kwargs = {
        'database': database, 'formula': formula, 'exact_mass': exact_mass, 'molecular_weight': molecular_weight}
    return _process_response(KEGGurl=ku.MolecularFindKEGGurl, kegg_rest=kegg_rest, **request_kwargs)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Usage:
|
|
3
|
+
kegg_pull entry-ids -h | --help
|
|
4
|
+
kegg_pull entry-ids database <database> [--output=<output>]
|
|
5
|
+
kegg_pull entry-ids keywords <database> <keywords> [--output=<output>]
|
|
6
|
+
kegg_pull entry-ids molec-attr <database> (--formula=<formula>|--em=<exact-mass>...|--mw=<molecular-weight>...) [--output=<output>]
|
|
7
|
+
|
|
8
|
+
Options:
|
|
9
|
+
-h --help Show this help message.
|
|
10
|
+
database Pulls all the entry IDs within a given database.
|
|
11
|
+
<database> The KEGG database from which to pull a list of entry IDs.
|
|
12
|
+
--output=<output> Path to the file (either in a directory or ZIP archive) to store the output (1 entry ID per line). Prints to the console if not specified. If a ZIP archive, the file path must be in the form of /path/to/zip-archive.zip:/path/to/file (e.g. ./archive.zip:file.txt).
|
|
13
|
+
keywords Searches for entries within a database based on provided keywords.
|
|
14
|
+
<keywords> Comma separated list of keywords to search entries with (e.g. kw1,kw2,kw3 etc.). Or if equal to "-", keywords are read from standard input, one keyword per line; Press CTRL+D to finalize input or pipe (e.g. cat file.txt | kegg_pull rest find brite - ...).
|
|
15
|
+
molec-attr Searches a database of molecule-type KEGG entries by molecular attributes.
|
|
16
|
+
--formula=<formula> Sequence of atoms in a chemical formula format to search for (e.g. "O5C7" searches for molecule entries containing 5 oxygen atoms and/or 7 carbon atoms).
|
|
17
|
+
--em=<exact-mass> Either a single number (e.g. "--em=155.5") or two numbers (e.g. "--em=155.5 --em=244.4"). If a single number, searches for molecule entries with an exact mass equal to that value rounded by the last decimal point. If two numbers, searches for molecule entries with an exact mass within the two values (a range).
|
|
18
|
+
--mw=<molecular-weight> Same as "--em=<exact-mass>" but searches based on the molecular weight.
|
|
19
|
+
"""
|
|
20
|
+
import docopt as d
|
|
21
|
+
from . import entry_ids as ei
|
|
22
|
+
from . import _utils as u
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def main() -> None:
    """ Runs the "entry-ids" sub command: pulls a list of entry IDs by database, keywords, or molecular attribute
    (depending on the parsed command line arguments) and prints or saves them with one entry ID per line.
    """
    args = d.docopt(__doc__)
    database: str = args['<database>']
    pull_whole_database: bool = args['database']
    search_by_keywords: bool = args['keywords']
    if pull_whole_database:
        entry_ids = ei.from_database(database=database)
    elif search_by_keywords:
        keywords: list = u.parse_input_sequence(input_source=args['<keywords>'])
        entry_ids = ei.from_keywords(database=database, keywords=keywords)
    else:
        # The only remaining usage pattern is "molec-attr"
        formula, exact_mass, molecular_weight = u.get_molecular_attribute_args(args=args)
        entry_ids = ei.from_molecular_attribute(
            database=database, formula=formula, exact_mass=exact_mass, molecular_weight=molecular_weight)
    u.print_or_save(output_target=args['--output'], output_content='\n'.join(entry_ids))
|