gsrap 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsrap/.ipynb_checkpoints/__init__-checkpoint.py +2 -0
- gsrap/__init__.py +2 -0
- gsrap/assets/kegg_compound_to_others.pickle +0 -0
- gsrap/assets/kegg_reaction_to_others.pickle +0 -0
- gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +96 -4
- gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +72 -1
- gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +2 -2
- gsrap/commons/downloads.py +96 -4
- gsrap/commons/escherutils.py +72 -1
- gsrap/commons/excelhub.py +2 -2
- gsrap/getmaps/.ipynb_checkpoints/getmaps-checkpoint.py +14 -5
- gsrap/getmaps/.ipynb_checkpoints/kdown-checkpoint.py +75 -4
- gsrap/getmaps/getmaps.py +14 -5
- gsrap/getmaps/kdown.py +75 -4
- gsrap/parsedb/.ipynb_checkpoints/annotation-checkpoint.py +9 -0
- gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +45 -11
- gsrap/parsedb/.ipynb_checkpoints/manual-checkpoint.py +10 -0
- gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +40 -19
- gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +2 -2
- gsrap/parsedb/annotation.py +9 -0
- gsrap/parsedb/completeness.py +45 -11
- gsrap/parsedb/manual.py +10 -0
- gsrap/parsedb/parsedb.py +40 -19
- gsrap/parsedb/repeating.py +2 -2
- {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/METADATA +1 -1
- {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/RECORD +29 -29
- {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/LICENSE.txt +0 -0
- {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/WHEEL +0 -0
- {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/entry_points.txt +0 -0
gsrap/getmaps/getmaps.py
CHANGED
|
@@ -4,6 +4,7 @@ import pickle
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
from .kdown import download_raw_txtfiles
|
|
7
|
+
from .kdown import create_dict_keggorg
|
|
7
8
|
from .kdown import create_dict_ko
|
|
8
9
|
from .kdown import create_dict_c
|
|
9
10
|
from .kdown import create_dict_r
|
|
@@ -20,13 +21,19 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
20
21
|
logger.info(f"Respectfully retrieving metabolic information from KEGG. Raw data are being saved into '{outdir}/kdown/'. Be patient, could take a couple of days...")
|
|
21
22
|
os.makedirs(f'{outdir}/kdown/', exist_ok=True)
|
|
22
23
|
|
|
24
|
+
|
|
23
25
|
response = download_raw_txtfiles(logger, outdir, usecache)
|
|
24
26
|
if type(response) == int: return 1
|
|
25
27
|
else: RELEASE_kegg = response
|
|
26
28
|
|
|
29
|
+
|
|
27
30
|
|
|
28
31
|
logger.info("Parsing downloaded KEGG information...")
|
|
29
|
-
|
|
32
|
+
|
|
33
|
+
response = create_dict_keggorg(logger, outdir)
|
|
34
|
+
if type(response) == int: return 1
|
|
35
|
+
else: dict_keggorg = response
|
|
36
|
+
|
|
30
37
|
response = create_dict_ko(logger, outdir)
|
|
31
38
|
if type(response) == int: return 1
|
|
32
39
|
else: dict_ko = response
|
|
@@ -49,7 +56,7 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
49
56
|
|
|
50
57
|
|
|
51
58
|
# create 'idcollection_dict' and 'summary_dict' dictionaries
|
|
52
|
-
idcollection_dict = create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md)
|
|
59
|
+
idcollection_dict = create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md)
|
|
53
60
|
summary_dict = create_summary_dict(dict_c, dict_r, dict_map, dict_md)
|
|
54
61
|
|
|
55
62
|
|
|
@@ -57,7 +64,6 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
57
64
|
|
|
58
65
|
|
|
59
66
|
|
|
60
|
-
|
|
61
67
|
def main(args, logger):
|
|
62
68
|
|
|
63
69
|
|
|
@@ -67,7 +73,7 @@ def main(args, logger):
|
|
|
67
73
|
os.makedirs(f'{args.outdir}/', exist_ok=True)
|
|
68
74
|
|
|
69
75
|
|
|
70
|
-
# KEGG
|
|
76
|
+
# KEGG download
|
|
71
77
|
response = do_kdown(logger, args.outdir, args.usecache, args.keeptmp)
|
|
72
78
|
if type(response) == int: return 1
|
|
73
79
|
else: RELEASE_kegg, idcollection_dict, summary_dict = response[0], response[1], response[2]
|
|
@@ -76,7 +82,9 @@ def main(args, logger):
|
|
|
76
82
|
# create 'gsrap.maps':
|
|
77
83
|
with open(f'{args.outdir}/gsrap.maps', 'wb') as wb_handler:
|
|
78
84
|
pickle.dump({
|
|
79
|
-
'RELEASE_kegg': RELEASE_kegg,
|
|
85
|
+
'RELEASE_kegg': RELEASE_kegg,
|
|
86
|
+
'idcollection_dict': idcollection_dict,
|
|
87
|
+
'summary_dict': summary_dict,
|
|
80
88
|
}, wb_handler)
|
|
81
89
|
logger.info(f"'{args.outdir}/gsrap.maps' created!")
|
|
82
90
|
|
|
@@ -87,4 +95,5 @@ def main(args, logger):
|
|
|
87
95
|
logger.info(f"Temporary raw files deleted!")
|
|
88
96
|
|
|
89
97
|
|
|
98
|
+
|
|
90
99
|
return 0
|
gsrap/getmaps/kdown.py
CHANGED
|
@@ -34,6 +34,7 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
34
34
|
'orthology',
|
|
35
35
|
'module',
|
|
36
36
|
'pathway',
|
|
37
|
+
'organism',
|
|
37
38
|
]
|
|
38
39
|
for db in databases:
|
|
39
40
|
time.sleep(0.5)
|
|
@@ -45,8 +46,9 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
45
46
|
|
|
46
47
|
# mix the items to download to be respectful/compliant
|
|
47
48
|
items_to_download = []
|
|
48
|
-
|
|
49
49
|
for db in databases:
|
|
50
|
+
if db == 'organism':
|
|
51
|
+
continue # here we just need the list
|
|
50
52
|
with open(f"{outdir}/kdown/{db}.txt", 'r') as file:
|
|
51
53
|
res_string = file.read()
|
|
52
54
|
rows = res_string.split('\n')
|
|
@@ -54,7 +56,6 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
54
56
|
item_id = row.split('\t', 1)[0]
|
|
55
57
|
if item_id == '': continue
|
|
56
58
|
items_to_download.append({'db': db, 'id': item_id})
|
|
57
|
-
|
|
58
59
|
random.shuffle(items_to_download)
|
|
59
60
|
|
|
60
61
|
|
|
@@ -79,6 +80,51 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
79
80
|
|
|
80
81
|
|
|
81
82
|
|
|
83
|
+
def create_dict_keggorg(logger, outdir):
    """Parse the KEGG organism list into a per-organism taxonomy dictionary.

    Reads '{outdir}/kdown/organism.txt'. Each row is expected to hold four
    tab-separated fields [tnumber, keggorg, name, classification], where
    'classification' is a ';'-separated lineage whose first three levels are
    taken as domain, kingdom and phylum.

    Args:
        logger: logger used for warnings and for the final summary. May be
            None, in which case nothing is logged.
        outdir: output folder containing the 'kdown/' subfolder.

    Returns:
        dict mapping each organism code (keggorg) to
        {'kingdom': str, 'phylum': str}. Organism names are deliberately not
        stored, to save disk space.

    Raises:
        ValueError: if the same organism code appears on more than one row.
    """

    # context manager: the handle is closed even if reading fails
    with open(f'{outdir}/kdown/organism.txt', 'r') as handle:
        organisms_raw = handle.read()

    dict_keggorg = {}
    for line in organisms_raw.strip().split("\n"):
        fields = line.split("\t")

        if len(fields) != 4:
            # malformed row: report it (if possible) and skip it
            if logger is not None:
                logger.warning(f'Strange number of fields found in this line of "organism.txt": """{line}""".')
            continue

        _tnumber, keggorg, _name, classification = fields
        levels = classification.split(";")
        if len(levels) < 3:
            # lineage too short to provide domain;kingdom;phylum: skip instead of crashing
            if logger is not None:
                logger.warning(f'Unexpected classification found in this line of "organism.txt": """{line}""".')
            continue

        # organism codes must be unique (same guarantee previously enforced
        # through the dataframe index integrity check)
        if keggorg in dict_keggorg:
            raise ValueError(f"Duplicated organism code in 'organism.txt': '{keggorg}'.")

        dict_keggorg[keggorg] = {
            'kingdom': levels[1],
            'phylum': levels[2],
        }

    if logger is not None:
        logger.info(f'Number of unique items (org): {len(dict_keggorg)}.')
    return dict_keggorg
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
|
|
82
128
|
def create_dict_ko(logger, outdir):
|
|
83
129
|
|
|
84
130
|
dict_ko = {} # main output
|
|
@@ -98,6 +144,7 @@ def create_dict_ko(logger, outdir):
|
|
|
98
144
|
'ecs': set(),
|
|
99
145
|
'cogs': set(),
|
|
100
146
|
'gos': set(),
|
|
147
|
+
'keggorgs': set(),
|
|
101
148
|
}
|
|
102
149
|
else:
|
|
103
150
|
logger.error(f"{ko_id} already included!")
|
|
@@ -175,7 +222,13 @@ def create_dict_ko(logger, outdir):
|
|
|
175
222
|
gos = content[len('GO: '):].strip().split(' ')
|
|
176
223
|
for go in gos:
|
|
177
224
|
dict_ko[ko_id]['gos'].add(go)
|
|
178
|
-
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# parse the organism-specific genes
|
|
228
|
+
if curr_header == 'GENES ':
|
|
229
|
+
keggorg = content.split(': ',1)[0]
|
|
230
|
+
dict_ko[ko_id]['keggorgs'].add(keggorg.lower()) # organism.txt has IDs in lowercase
|
|
231
|
+
|
|
179
232
|
|
|
180
233
|
# parse the reactions
|
|
181
234
|
if curr_header == 'REACTION ':
|
|
@@ -547,7 +600,7 @@ def create_dict_md(logger, outdir):
|
|
|
547
600
|
|
|
548
601
|
|
|
549
602
|
|
|
550
|
-
def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
603
|
+
def create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
551
604
|
|
|
552
605
|
idcollection_dict = {}
|
|
553
606
|
|
|
@@ -620,6 +673,24 @@ def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
|
620
673
|
for go in dict_ko[ko_id]['gos']:
|
|
621
674
|
idcollection_dict['ko_to_gos'][ko_id].add(go)
|
|
622
675
|
|
|
676
|
+
|
|
677
|
+
# creation of 'ko_to_keggorgs' skipped as it takes too much disk space. Replaced with 'ko_to_taxa'.
|
|
678
|
+
idcollection_dict['ko_to_taxa'] = {}
|
|
679
|
+
missing_keggorgs = set()
|
|
680
|
+
for ko_id in dict_ko.keys():
|
|
681
|
+
idcollection_dict['ko_to_taxa'][ko_id] = {'kingdom': set(), 'phylum': set()}
|
|
682
|
+
for keggorg in dict_ko[ko_id]['keggorgs']:
|
|
683
|
+
try:
|
|
684
|
+
kingdom = dict_keggorg[keggorg]['kingdom']
|
|
685
|
+
phylum = dict_keggorg[keggorg]['phylum']
|
|
686
|
+
except:
|
|
687
|
+
if keggorg not in missing_keggorgs:
|
|
688
|
+
missing_keggorgs.add(keggorg)
|
|
689
|
+
#print(f"Organism '{keggorg}' appears in 'orthology/' but not in 'organism.txt'.")
|
|
690
|
+
continue
|
|
691
|
+
idcollection_dict['ko_to_taxa'][ko_id]['kingdom'].add(kingdom)
|
|
692
|
+
idcollection_dict['ko_to_taxa'][ko_id]['phylum'].add(phylum)
|
|
693
|
+
|
|
623
694
|
|
|
624
695
|
idcollection_dict['map_to_name'] = {}
|
|
625
696
|
for map_id in dict_map.keys():
|
|
@@ -138,6 +138,15 @@ def set_up_groups(logger, model, idcollection_dict):
|
|
|
138
138
|
|
|
139
139
|
# insert custom groups:
|
|
140
140
|
custom_groups = get_custom_groups()
|
|
141
|
+
#
|
|
142
|
+
# create a group for transporters on-the-fly
|
|
143
|
+
custom_groups['transport'] = []
|
|
144
|
+
for r in model.reactions:
|
|
145
|
+
if len(r.metabolites) == 1: # exchanges / sinks/ demands
|
|
146
|
+
custom_groups['transport'].append(r.id)
|
|
147
|
+
if len(set([m.id.rsplit('_', 1)[-1] for m in r.metabolites])) > 1: # transport reactions
|
|
148
|
+
custom_groups['transport'].append(r.id)
|
|
149
|
+
#
|
|
141
150
|
for group_id in custom_groups.keys():
|
|
142
151
|
actual_group = cobra.core.Group(
|
|
143
152
|
group_id,
|
|
@@ -6,6 +6,9 @@ import os
|
|
|
6
6
|
import pandas as pnd
|
|
7
7
|
|
|
8
8
|
|
|
9
|
+
from .manual import get_krs_to_exclude
|
|
10
|
+
|
|
11
|
+
|
|
9
12
|
|
|
10
13
|
def parse_eggnog(model, eggnog, idcollection_dict):
|
|
11
14
|
|
|
@@ -27,9 +30,8 @@ def parse_eggnog(model, eggnog, idcollection_dict):
|
|
|
27
30
|
|
|
28
31
|
|
|
29
32
|
# PART 2. get reactions in the organism (even the GPR is not complete)
|
|
30
|
-
kr_to_kos = idcollection_dict['kr_to_kos']
|
|
31
33
|
krs_org = set()
|
|
32
|
-
for kr, kos in kr_to_kos.items():
|
|
34
|
+
for kr, kos in idcollection_dict['kr_to_kos'].items():
|
|
33
35
|
if any([ko in kos_org for ko in kos]):
|
|
34
36
|
krs_org.add(kr)
|
|
35
37
|
|
|
@@ -49,9 +51,34 @@ def parse_keggorg(keggorg, outdir, idcollection_dict):
|
|
|
49
51
|
|
|
50
52
|
|
|
51
53
|
# PART 2. get reactions in the organism (even the GPR is not complete)
|
|
52
|
-
kr_to_kos = idcollection_dict['kr_to_kos']
|
|
53
54
|
krs_org = set()
|
|
54
|
-
for kr, kos in kr_to_kos.items():
|
|
55
|
+
for kr, kos in idcollection_dict['kr_to_kos'].items():
|
|
56
|
+
if any([ko in kos_org for ko in kos]):
|
|
57
|
+
krs_org.add(kr)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
return krs_org
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def parse_taxon(taxon, idcollection_dict):
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# formatting of --taxon was already verified at startup.
|
|
68
|
+
# the presence of 'ko_to_taxa' in idcollection_dict was also verified at startup.
|
|
69
|
+
level, name = taxon.split(':')
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# PART 1. get KO codes available
|
|
73
|
+
kos_org = set()
|
|
74
|
+
for ko in idcollection_dict['ko_to_taxa'].keys():
|
|
75
|
+
if name in idcollection_dict['ko_to_taxa'][ko][level]:
|
|
76
|
+
kos_org.add(ko)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# PART 2. get reactions in the organism (even the GPR is not complete)
|
|
80
|
+
krs_org = set()
|
|
81
|
+
for kr, kos in idcollection_dict['kr_to_kos'].items():
|
|
55
82
|
if any([ko in kos_org for ko in kos]):
|
|
56
83
|
krs_org.add(kr)
|
|
57
84
|
|
|
@@ -60,7 +87,7 @@ def parse_keggorg(keggorg, outdir, idcollection_dict):
|
|
|
60
87
|
|
|
61
88
|
|
|
62
89
|
|
|
63
|
-
def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
|
|
90
|
+
def check_completeness(logger, model, progress, module, focus, taxon, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
|
|
64
91
|
# check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
|
|
65
92
|
|
|
66
93
|
|
|
@@ -69,6 +96,9 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
69
96
|
if keggorg != '-': # keggorg has precedence
|
|
70
97
|
kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
|
|
71
98
|
kr_uni_label = f"organism code '{keggorg}'"
|
|
99
|
+
elif taxon != '-':
|
|
100
|
+
kr_uni = parse_taxon(taxon, idcollection_dict)
|
|
101
|
+
kr_uni_label = f"taxon '{taxon}'"
|
|
72
102
|
elif eggnog != '-':
|
|
73
103
|
for eggfile in eggnog:
|
|
74
104
|
eggset = parse_eggnog(model, eggfile, idcollection_dict)
|
|
@@ -85,7 +115,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
85
115
|
if 'kegg.reaction' in r.annotation.keys():
|
|
86
116
|
for kr_id in r.annotation['kegg.reaction']:
|
|
87
117
|
kr_ids_modeled.add(kr_id)
|
|
88
|
-
kr_uni_missing = kr_uni - kr_ids_modeled
|
|
118
|
+
kr_uni_missing = (kr_uni - kr_ids_modeled) - get_krs_to_exclude()
|
|
89
119
|
kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
|
|
90
120
|
logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
|
|
91
121
|
|
|
@@ -114,8 +144,12 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
114
144
|
|
|
115
145
|
# check if 'focus' exist
|
|
116
146
|
if focus != '-' and focus not in map_ids and focus not in md_ids:
|
|
117
|
-
|
|
118
|
-
|
|
147
|
+
if focus == 'transport':
|
|
148
|
+
df_coverage = None
|
|
149
|
+
return df_coverage # just the jeneration of 'transport.json' for Escher drawing is needed here
|
|
150
|
+
else:
|
|
151
|
+
logger.error(f"The ID provided with --focus does not exist: {focus}.")
|
|
152
|
+
return 1
|
|
119
153
|
if focus.startswith('map'):
|
|
120
154
|
logger.debug(f"With --focus {focus}, --module will switch to False.")
|
|
121
155
|
module = False
|
|
@@ -148,7 +182,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
148
182
|
|
|
149
183
|
# check if this map was (at least partially) covered:
|
|
150
184
|
map_krs = set([kr for kr in i['kr_ids'] if kr in kr_uni])
|
|
151
|
-
missing = map_krs - kr_ids_modeled
|
|
185
|
+
missing = (map_krs - kr_ids_modeled) - get_krs_to_exclude()
|
|
152
186
|
present = kr_ids_modeled.intersection(map_krs)
|
|
153
187
|
if focus == map_id:
|
|
154
188
|
missing_logger = (map_id, missing)
|
|
@@ -260,7 +294,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
260
294
|
|
|
261
295
|
# check if this module was (at least partially) covered:
|
|
262
296
|
md_krs = set([kr for kr in z['kr_ids_md'] if kr in kr_uni])
|
|
263
|
-
missing = md_krs - kr_ids_modeled
|
|
297
|
+
missing = (md_krs - kr_ids_modeled) - get_krs_to_exclude()
|
|
264
298
|
present = kr_ids_modeled.intersection(md_krs)
|
|
265
299
|
if focus == md_id:
|
|
266
300
|
missing_logger = (md_id, missing)
|
|
@@ -309,7 +343,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
309
343
|
if module and focus=='-':
|
|
310
344
|
logger.info(f"{spacer}Modules of {right_item['map_id']}: completed {len(mds_completed)} - partial {len(mds_partial)} - missing {len(mds_missing)} - noreac {len(mds_noreac)}")
|
|
311
345
|
if focus != '-':
|
|
312
|
-
logger.info(f"Missing reactions focusing on {missing_logger[0]}: {' '.join(list(missing_logger[1]))}.")
|
|
346
|
+
logger.info(f"Missing reactions focusing on '{missing_logger[0]}': {' '.join(list(missing_logger[1]))}.")
|
|
313
347
|
if progress:
|
|
314
348
|
logger.info(f"Maps: finished {len(maps_finished)} - partial {len(maps_partial)} - missing {len(maps_missing)} - noreac {len(maps_noreac)}")
|
|
315
349
|
|
|
@@ -5,11 +5,21 @@ def get_deprecated_kos():
|
|
|
5
5
|
deprecated_kos = [
|
|
6
6
|
'K11189', # should be K02784
|
|
7
7
|
'K07011', # linked to lp_1215(cps3A) and lp_1216(cps3B) during 2018 and not replaced
|
|
8
|
+
#'K24301', # to be introduced in GPRs
|
|
8
9
|
]
|
|
9
10
|
return deprecated_kos
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
def get_krs_to_exclude():
    """Return a new set with the KEGG reaction IDs to exclude (generic fatty acid reactions)."""
    # general forms of fatty acid biosynthesis
    biosynthesis = ('R12328', 'R05190')
    # general forms of fatty acid degradation
    degradation = ('R01347', 'R04121')
    return set(biosynthesis + degradation)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
13
23
|
def get_rids_with_mancheck_gpr():
|
|
14
24
|
rids_mancheck_gpr = [ # reactions with manually checked GPRs
|
|
15
25
|
'SUCD1', 'ALKP', 'PFK_3', 'TCMPTS', 'PPA', 'APSR',
|
|
@@ -16,7 +16,10 @@ from ..commons import write_excel_model
|
|
|
16
16
|
from ..commons import show_contributions
|
|
17
17
|
from ..commons import adjust_biomass_precursors
|
|
18
18
|
from ..commons import count_undrawn_rids
|
|
19
|
+
from ..commons import count_undrawn_rids_focus
|
|
20
|
+
|
|
19
21
|
from ..commons import format_expansion
|
|
22
|
+
from ..commons import check_taxon
|
|
20
23
|
from ..commons import download_keggorg
|
|
21
24
|
from ..commons import initialize_model
|
|
22
25
|
from ..commons import get_memote_results_dict
|
|
@@ -46,6 +49,7 @@ from .cycles import verify_egc_all
|
|
|
46
49
|
def main(args, logger):
|
|
47
50
|
|
|
48
51
|
|
|
52
|
+
###### FORMAT ARGS NOT REQUIRING RESOURCES
|
|
49
53
|
# adjust out folder path
|
|
50
54
|
while args.outdir.endswith('/'):
|
|
51
55
|
args.outdir = args.outdir[:-1]
|
|
@@ -77,17 +81,8 @@ def main(args, logger):
|
|
|
77
81
|
if args.onlyauthor == '-': args.onlyauthor = None
|
|
78
82
|
|
|
79
83
|
|
|
80
|
-
# format the --eggnog param
|
|
81
|
-
args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
|
|
82
|
-
|
|
83
|
-
# get the kegg organism if requested
|
|
84
|
-
if args.keggorg != '-':
|
|
85
|
-
response = download_keggorg(logger, args.keggorg, args.outdir)
|
|
86
|
-
if response == 1: return 1
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
84
|
|
|
85
|
+
###### LOAD LOCAL RESOURCES
|
|
91
86
|
# check and extract the required 'gsrap.maps' file
|
|
92
87
|
if os.path.exists(f'{args.inmaps}') == False:
|
|
93
88
|
logger.error(f"File 'gsrap.maps' not found at {args.inmaps}.")
|
|
@@ -108,9 +103,27 @@ def main(args, logger):
|
|
|
108
103
|
kegg_compound_to_others = pickle.load(handle)
|
|
109
104
|
with resources.path("gsrap.assets", f"kegg_reaction_to_others.pickle") as asset_path:
|
|
110
105
|
with open(asset_path, 'rb') as handle:
|
|
111
|
-
kegg_reaction_to_others = pickle.load(handle)
|
|
106
|
+
kegg_reaction_to_others = pickle.load(handle)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
###### FORMAT/CHECK FOCUSING ARGS
|
|
111
|
+
# format the --eggnog param
|
|
112
|
+
args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
|
|
112
113
|
|
|
114
|
+
# check the --taxon param
|
|
115
|
+
if args.taxon != '-':
|
|
116
|
+
response = check_taxon(logger, args.taxon, idcollection_dict)
|
|
117
|
+
if response == 1: return 1
|
|
113
118
|
|
|
119
|
+
# get the kegg organism if requested
|
|
120
|
+
if args.keggorg != '-':
|
|
121
|
+
response = download_keggorg(logger, args.keggorg, args.outdir)
|
|
122
|
+
if response == 1: return 1
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# DOWNLOAD ONLINE RESOURCES
|
|
114
127
|
# get dbuni and dbexp:
|
|
115
128
|
logger.info("Downloading gsrap database...")
|
|
116
129
|
response = get_databases(logger)
|
|
@@ -166,14 +179,15 @@ def main(args, logger):
|
|
|
166
179
|
|
|
167
180
|
###### CHECKS 1
|
|
168
181
|
# check universe completeness
|
|
169
|
-
df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, args.keggorg, idcollection_dict, summary_dict, args.outdir)
|
|
182
|
+
df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.taxon, args.eggnog, args.keggorg, idcollection_dict, summary_dict, args.outdir)
|
|
170
183
|
if type(df_C)==int: return 1
|
|
171
184
|
|
|
172
185
|
|
|
173
186
|
|
|
174
187
|
###### POLISHING 1
|
|
175
188
|
# remove disconnected metabolites
|
|
176
|
-
|
|
189
|
+
if args.keepdisconn == False:
|
|
190
|
+
universe = remove_disconnected(logger, universe) # can be commented when using booster.py
|
|
177
191
|
|
|
178
192
|
|
|
179
193
|
|
|
@@ -182,9 +196,9 @@ def main(args, logger):
|
|
|
182
196
|
verify_egc_all(logger, universe, args.outdir)
|
|
183
197
|
|
|
184
198
|
|
|
199
|
+
|
|
185
200
|
if not args.justparse:
|
|
186
201
|
|
|
187
|
-
|
|
188
202
|
###### CHECKS 3
|
|
189
203
|
# check growth on minimal media
|
|
190
204
|
df_G = grow_on_media(logger, universe, dbexp, args.media, '-', True)
|
|
@@ -217,10 +231,15 @@ def main(args, logger):
|
|
|
217
231
|
|
|
218
232
|
|
|
219
233
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
234
|
+
# output the universe (even when --justparse)
|
|
235
|
+
logger.info("Writing universal model...")
|
|
236
|
+
cobra.io.save_json_model(universe, f'{args.outdir}/universe.json')
|
|
237
|
+
logger.info(f"'{args.outdir}/universe.json' created!")
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
if not args.justparse:
|
|
241
|
+
|
|
242
|
+
# output in the remaining formats:
|
|
224
243
|
cobra.io.write_sbml_model(universe, f'{args.outdir}/universe.xml') # groups are saved only to SBML
|
|
225
244
|
logger.info(f"'{args.outdir}/universe.xml' created!")
|
|
226
245
|
force_id_on_sbml(f'{args.outdir}/universe.xml', 'universe') # force introduction of the 'id=""' field
|
|
@@ -231,7 +250,9 @@ def main(args, logger):
|
|
|
231
250
|
|
|
232
251
|
###### CHECKS 4
|
|
233
252
|
# check if universal escher map is updated:
|
|
234
|
-
count_undrawn_rids(logger, universe, lastmap)
|
|
253
|
+
count_undrawn_rids(logger, universe, lastmap, args.focus)
|
|
254
|
+
if args.focus != '-':
|
|
255
|
+
count_undrawn_rids_focus(logger, universe, lastmap, args.focus, args.outdir)
|
|
235
256
|
|
|
236
257
|
|
|
237
258
|
return 0
|
|
@@ -45,7 +45,7 @@ def check_gpr(logger, rid, row, kr_ids, idcollection_dict, addtype='R'):
|
|
|
45
45
|
pass
|
|
46
46
|
elif ko_id not in idcollection_dict['ko'] and ko_id != 'spontaneous' and ko_id != 'orphan':
|
|
47
47
|
logger.error(f"{itemtype} '{rid}' has an invalid KEGG Ortholog: '{ko_id}'.")
|
|
48
|
-
return 1
|
|
48
|
+
return 1 # can be commented when migrating to new kegg release
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
# check if these ko_ids are really assigned to this reaction:
|
|
@@ -61,7 +61,7 @@ def check_gpr(logger, rid, row, kr_ids, idcollection_dict, addtype='R'):
|
|
|
61
61
|
missing_ko_ids = ko_for_rid - (set(ko_ids_parsed) - set(['spontaneous', 'orphan']))
|
|
62
62
|
if len(missing_ko_ids) > 0:
|
|
63
63
|
logger.error(f"Orthologs {missing_ko_ids} are missing from reaction '{rid}' ({kr_ids}).")
|
|
64
|
-
return 1
|
|
64
|
+
return 1 # can be commented when migrating to new kegg release
|
|
65
65
|
|
|
66
66
|
|
|
67
67
|
return 0
|
gsrap/parsedb/annotation.py
CHANGED
|
@@ -138,6 +138,15 @@ def set_up_groups(logger, model, idcollection_dict):
|
|
|
138
138
|
|
|
139
139
|
# insert custom groups:
|
|
140
140
|
custom_groups = get_custom_groups()
|
|
141
|
+
#
|
|
142
|
+
# create a group for transporters on-the-fly
|
|
143
|
+
custom_groups['transport'] = []
|
|
144
|
+
for r in model.reactions:
|
|
145
|
+
if len(r.metabolites) == 1: # exchanges / sinks/ demands
|
|
146
|
+
custom_groups['transport'].append(r.id)
|
|
147
|
+
if len(set([m.id.rsplit('_', 1)[-1] for m in r.metabolites])) > 1: # transport reactions
|
|
148
|
+
custom_groups['transport'].append(r.id)
|
|
149
|
+
#
|
|
141
150
|
for group_id in custom_groups.keys():
|
|
142
151
|
actual_group = cobra.core.Group(
|
|
143
152
|
group_id,
|
gsrap/parsedb/completeness.py
CHANGED
|
@@ -6,6 +6,9 @@ import os
|
|
|
6
6
|
import pandas as pnd
|
|
7
7
|
|
|
8
8
|
|
|
9
|
+
from .manual import get_krs_to_exclude
|
|
10
|
+
|
|
11
|
+
|
|
9
12
|
|
|
10
13
|
def parse_eggnog(model, eggnog, idcollection_dict):
|
|
11
14
|
|
|
@@ -27,9 +30,8 @@ def parse_eggnog(model, eggnog, idcollection_dict):
|
|
|
27
30
|
|
|
28
31
|
|
|
29
32
|
# PART 2. get reactions in the organism (even the GPR is not complete)
|
|
30
|
-
kr_to_kos = idcollection_dict['kr_to_kos']
|
|
31
33
|
krs_org = set()
|
|
32
|
-
for kr, kos in kr_to_kos.items():
|
|
34
|
+
for kr, kos in idcollection_dict['kr_to_kos'].items():
|
|
33
35
|
if any([ko in kos_org for ko in kos]):
|
|
34
36
|
krs_org.add(kr)
|
|
35
37
|
|
|
@@ -49,9 +51,34 @@ def parse_keggorg(keggorg, outdir, idcollection_dict):
|
|
|
49
51
|
|
|
50
52
|
|
|
51
53
|
# PART 2. get reactions in the organism (even the GPR is not complete)
|
|
52
|
-
kr_to_kos = idcollection_dict['kr_to_kos']
|
|
53
54
|
krs_org = set()
|
|
54
|
-
for kr, kos in kr_to_kos.items():
|
|
55
|
+
for kr, kos in idcollection_dict['kr_to_kos'].items():
|
|
56
|
+
if any([ko in kos_org for ko in kos]):
|
|
57
|
+
krs_org.add(kr)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
return krs_org
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def parse_taxon(taxon, idcollection_dict):
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# formatting of --taxon was already verified at startup.
|
|
68
|
+
# the presence of 'ko_to_taxa' in idcollection_dict was also verified at startup.
|
|
69
|
+
level, name = taxon.split(':')
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# PART 1. get KO codes available
|
|
73
|
+
kos_org = set()
|
|
74
|
+
for ko in idcollection_dict['ko_to_taxa'].keys():
|
|
75
|
+
if name in idcollection_dict['ko_to_taxa'][ko][level]:
|
|
76
|
+
kos_org.add(ko)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# PART 2. get reactions in the organism (even the GPR is not complete)
|
|
80
|
+
krs_org = set()
|
|
81
|
+
for kr, kos in idcollection_dict['kr_to_kos'].items():
|
|
55
82
|
if any([ko in kos_org for ko in kos]):
|
|
56
83
|
krs_org.add(kr)
|
|
57
84
|
|
|
@@ -60,7 +87,7 @@ def parse_keggorg(keggorg, outdir, idcollection_dict):
|
|
|
60
87
|
|
|
61
88
|
|
|
62
89
|
|
|
63
|
-
def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
|
|
90
|
+
def check_completeness(logger, model, progress, module, focus, taxon, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
|
|
64
91
|
# check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
|
|
65
92
|
|
|
66
93
|
|
|
@@ -69,6 +96,9 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
69
96
|
if keggorg != '-': # keggorg has precedence
|
|
70
97
|
kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
|
|
71
98
|
kr_uni_label = f"organism code '{keggorg}'"
|
|
99
|
+
elif taxon != '-':
|
|
100
|
+
kr_uni = parse_taxon(taxon, idcollection_dict)
|
|
101
|
+
kr_uni_label = f"taxon '{taxon}'"
|
|
72
102
|
elif eggnog != '-':
|
|
73
103
|
for eggfile in eggnog:
|
|
74
104
|
eggset = parse_eggnog(model, eggfile, idcollection_dict)
|
|
@@ -85,7 +115,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
85
115
|
if 'kegg.reaction' in r.annotation.keys():
|
|
86
116
|
for kr_id in r.annotation['kegg.reaction']:
|
|
87
117
|
kr_ids_modeled.add(kr_id)
|
|
88
|
-
kr_uni_missing = kr_uni - kr_ids_modeled
|
|
118
|
+
kr_uni_missing = (kr_uni - kr_ids_modeled) - get_krs_to_exclude()
|
|
89
119
|
kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
|
|
90
120
|
logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
|
|
91
121
|
|
|
@@ -114,8 +144,12 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
114
144
|
|
|
115
145
|
# check if 'focus' exist
|
|
116
146
|
if focus != '-' and focus not in map_ids and focus not in md_ids:
|
|
117
|
-
|
|
118
|
-
|
|
147
|
+
if focus == 'transport':
|
|
148
|
+
df_coverage = None
|
|
149
|
+
return df_coverage # just the jeneration of 'transport.json' for Escher drawing is needed here
|
|
150
|
+
else:
|
|
151
|
+
logger.error(f"The ID provided with --focus does not exist: {focus}.")
|
|
152
|
+
return 1
|
|
119
153
|
if focus.startswith('map'):
|
|
120
154
|
logger.debug(f"With --focus {focus}, --module will switch to False.")
|
|
121
155
|
module = False
|
|
@@ -148,7 +182,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
148
182
|
|
|
149
183
|
# check if this map was (at least partially) covered:
|
|
150
184
|
map_krs = set([kr for kr in i['kr_ids'] if kr in kr_uni])
|
|
151
|
-
missing = map_krs - kr_ids_modeled
|
|
185
|
+
missing = (map_krs - kr_ids_modeled) - get_krs_to_exclude()
|
|
152
186
|
present = kr_ids_modeled.intersection(map_krs)
|
|
153
187
|
if focus == map_id:
|
|
154
188
|
missing_logger = (map_id, missing)
|
|
@@ -260,7 +294,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
260
294
|
|
|
261
295
|
# check if this module was (at least partially) covered:
|
|
262
296
|
md_krs = set([kr for kr in z['kr_ids_md'] if kr in kr_uni])
|
|
263
|
-
missing = md_krs - kr_ids_modeled
|
|
297
|
+
missing = (md_krs - kr_ids_modeled) - get_krs_to_exclude()
|
|
264
298
|
present = kr_ids_modeled.intersection(md_krs)
|
|
265
299
|
if focus == md_id:
|
|
266
300
|
missing_logger = (md_id, missing)
|
|
@@ -309,7 +343,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
|
|
|
309
343
|
if module and focus=='-':
|
|
310
344
|
logger.info(f"{spacer}Modules of {right_item['map_id']}: completed {len(mds_completed)} - partial {len(mds_partial)} - missing {len(mds_missing)} - noreac {len(mds_noreac)}")
|
|
311
345
|
if focus != '-':
|
|
312
|
-
logger.info(f"Missing reactions focusing on {missing_logger[0]}: {' '.join(list(missing_logger[1]))}.")
|
|
346
|
+
logger.info(f"Missing reactions focusing on '{missing_logger[0]}': {' '.join(list(missing_logger[1]))}.")
|
|
313
347
|
if progress:
|
|
314
348
|
logger.info(f"Maps: finished {len(maps_finished)} - partial {len(maps_partial)} - missing {len(maps_missing)} - noreac {len(maps_noreac)}")
|
|
315
349
|
|
gsrap/parsedb/manual.py
CHANGED
|
@@ -5,11 +5,21 @@ def get_deprecated_kos():
|
|
|
5
5
|
deprecated_kos = [
|
|
6
6
|
'K11189', # should be K02784
|
|
7
7
|
'K07011', # linked to lp_1215(cps3A) and lp_1216(cps3B) during 2018 and not replaced
|
|
8
|
+
#'K24301', # to be introduced in GPRs
|
|
8
9
|
]
|
|
9
10
|
return deprecated_kos
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
def get_krs_to_exclude():
    """Return a new set with the KEGG reaction IDs to exclude (generic fatty acid reactions)."""
    # general forms of fatty acid biosynthesis
    biosynthesis = ('R12328', 'R05190')
    # general forms of fatty acid degradation
    degradation = ('R01347', 'R01348', 'R04121')
    return set(biosynthesis + degradation)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
13
23
|
def get_rids_with_mancheck_gpr():
|
|
14
24
|
rids_mancheck_gpr = [ # reactions with manually checked GPRs
|
|
15
25
|
'SUCD1', 'ALKP', 'PFK_3', 'TCMPTS', 'PPA', 'APSR',
|