gsrap 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsrap/.ipynb_checkpoints/__init__-checkpoint.py +2 -0
- gsrap/__init__.py +2 -0
- gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
- gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +70 -37
- gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +15 -1
- gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
- gsrap/commons/.ipynb_checkpoints/medium-checkpoint.py +3 -4
- gsrap/commons/__init__.py +1 -0
- gsrap/commons/downloads.py +1 -1
- gsrap/commons/escherutils.py +1 -1
- gsrap/commons/excelhub.py +70 -37
- gsrap/commons/figures.py +15 -1
- gsrap/commons/keggutils.py +145 -0
- gsrap/commons/medium.py +3 -4
- gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +69 -19
- gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
- gsrap/mkmodel/mkmodel.py +69 -19
- gsrap/mkmodel/pruner.py +72 -7
- gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +33 -6
- gsrap/parsedb/.ipynb_checkpoints/cycles-checkpoint.py +128 -0
- gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +9 -9
- gsrap/parsedb/.ipynb_checkpoints/manual-checkpoint.py +27 -0
- gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +15 -2
- gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +9 -0
- gsrap/parsedb/completeness.py +33 -6
- gsrap/parsedb/cycles.py +128 -0
- gsrap/parsedb/introduce.py +9 -9
- gsrap/parsedb/manual.py +27 -0
- gsrap/parsedb/parsedb.py +15 -2
- gsrap/parsedb/repeating.py +9 -0
- {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/METADATA +1 -1
- {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/RECORD +37 -33
- {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/LICENSE.txt +0 -0
- {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/WHEEL +0 -0
- {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/entry_points.txt +0 -0
gsrap/mkmodel/pruner.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import warnings
|
|
3
3
|
import logging
|
|
4
|
+
import pickle
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
import pandas as pnd
|
|
@@ -43,22 +44,57 @@ def load_input_eggnog(logger, eggnog):
|
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
# load eggnog annotations
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
df_eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
|
|
48
|
+
df_eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
|
|
49
|
+
df_eggnog = df_eggnog.set_index('query', drop=True, verify_integrity=True)
|
|
49
50
|
|
|
50
51
|
|
|
51
|
-
return
|
|
52
|
+
return df_eggnog
|
|
52
53
|
|
|
53
54
|
|
|
54
55
|
|
|
55
|
-
def
|
|
56
|
+
def load_keggorg_like_eggnog(logger, keggorg, outdir):
    """Build an eggnog-shaped annotation table from a pickled KEGG organism table.

    Reads ``<outdir>/<keggorg>.keggorg`` (a pickled pandas DataFrame that must
    provide the columns 'gid', 'Pfam' and 'ko') and returns a DataFrame indexed
    by 'query' (the gene ID) whose columns follow the eggnog annotation layout.
    Only 'PFAMs' and 'KEGG_ko' are filled from the KEGG data; every other
    column is set to '-'.

    Parameters
    ----------
    logger
        Not used by this function; kept so the signature stays unchanged.
    keggorg : str
        KEGG organism code; also the stem of the pickled file name.
    outdir : str
        Directory containing the ``<keggorg>.keggorg`` file.

    Returns
    -------
    pandas.DataFrame
        One row per gene ID, indexed by 'query'.

    Raises
    ------
    ValueError
        If gene IDs are duplicated (``verify_integrity=True``).
    """
    # explicit column list: does not depend on tab characters surviving inside a string literal
    eggnog_columns = [
        'query', 'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl',
        'COG_category', 'Description', 'Preferred_name', 'GOs', 'EC', 'KEGG_ko',
        'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',
        'KEGG_TC', 'CAZy', 'BiGG_Reaction', 'PFAMs',
    ]

    # load raw data, downloaded from kegg:
    # NOTE: pickle can execute arbitrary code; only load files written by this tool.
    with open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb') as handle:
        df_keggorg = pickle.load(handle)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)

    # create an eggnog-like dataframe (list of dicts, one per gene):
    rows = []
    for gid, pfams, ko in zip(df_keggorg.index, df_keggorg['Pfam'], df_keggorg['ko']):
        row = dict.fromkeys(eggnog_columns, '-')
        row['query'] = gid
        if isinstance(pfams, list):  # missing values are not lists
            row['PFAMs'] = ','.join(pfams)
        if isinstance(ko, str):  # missing values are not strings
            row['KEGG_ko'] = ko
        rows.append(row)
    df_eggnog_like = pnd.DataFrame(rows, columns=eggnog_columns)

    # set the index like in eggnog
    return df_eggnog_like.set_index('query', drop=True, verify_integrity=True)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def parse_eggnog(df_eggnog):
|
|
56
92
|
|
|
57
93
|
|
|
58
94
|
# PART 1. get KO codes available
|
|
59
95
|
gid_to_kos = {}
|
|
60
96
|
ko_to_gids = {}
|
|
61
|
-
for gid, kos in
|
|
97
|
+
for gid, kos in df_eggnog['KEGG_ko'].items():
|
|
62
98
|
if kos == '-':
|
|
63
99
|
continue
|
|
64
100
|
|
|
@@ -229,8 +265,37 @@ def restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos):
|
|
|
229
265
|
# collect names
|
|
230
266
|
names.append(uni_g.name)
|
|
231
267
|
g.name = '; '.join(names)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def append_keggorg_gene_annots(logger, model, keggorg, outdir):
    """Add KEGG-derived cross-references to the genes of ``model`` in place.

    Reads ``<outdir>/<keggorg>.keggorg`` (a pickled pandas DataFrame with a
    'gid' column). Every gene whose ID appears in that table gets a
    'kegg.genes' annotation of the form ``['<keggorg>:<gene id>']``. For each
    of the optional columns 'NCBI-GeneID', 'NCBI-ProteinID', 'ASAP' and
    'UniProt' present in the table, the matching annotation key is set to the
    stored list, or to ``[]`` when the stored value is not a list.
    Genes missing from the table are left untouched.

    Parameters
    ----------
    logger
        Logger used for the informational message.
    model
        Object exposing ``genes``; each gene needs ``id`` and an ``annotation`` dict.
    keggorg : str
        KEGG organism code; also the stem of the pickled file name.
    outdir : str
        Directory containing the ``<keggorg>.keggorg`` file.

    Raises
    ------
    ValueError
        If gene IDs in the table are duplicated (``verify_integrity=True``).
    """
    # load raw data, downloaded from kegg:
    logger.info("Adding gene annotations retrieved from KEGG...")
    # NOTE: pickle can execute arbitrary code; only load files written by this tool.
    with open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb') as handle:
        df_keggorg = pickle.load(handle)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)

    # KEGG can provide some useful (ie, used in Memote) gene annotations.
    # table column -> annotation key; restricted once to the columns actually present
    column_to_key = {
        'NCBI-GeneID': 'ncbigene',
        'NCBI-ProteinID': 'ncbiprotein',
        'ASAP': 'asap',
        'UniProt': 'uniprot',
    }
    available = {
        column: key
        for column, key in column_to_key.items()
        if column in df_keggorg.columns
    }

    for g in model.genes:
        if g.id not in df_keggorg.index:
            continue

        g.annotation['kegg.genes'] = [keggorg + ':' + g.id]

        for column, key in available.items():
            value = df_keggorg.loc[g.id, column]
            # missing entries are not lists: store an empty list instead
            g.annotation[key] = value if isinstance(value, list) else []
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
|
|
234
299
|
|
|
235
300
|
|
|
236
301
|
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
+
import pickle
|
|
3
|
+
import os
|
|
2
4
|
|
|
3
5
|
|
|
4
6
|
import pandas as pnd
|
|
@@ -35,14 +37,39 @@ def parse_eggnog(model, eggnog, idcollection_dict):
|
|
|
35
37
|
return krs_org
|
|
36
38
|
|
|
37
39
|
|
|
40
|
+
|
|
41
|
+
def parse_keggorg(keggorg, outdir, idcollection_dict):
    """Return the KEGG reaction IDs linked to at least one KO of the organism.

    Reads ``<outdir>/<keggorg>.keggorg`` (a pickled pandas DataFrame with the
    columns 'gid' and 'ko'), collects the non-missing KO codes, and keeps every
    reaction in ``idcollection_dict['kr_to_kos']`` that shares at least one KO
    with the organism (so a reaction is kept even if its GPR is not complete).

    Parameters
    ----------
    keggorg : str
        KEGG organism code; also the stem of the pickled file name.
    outdir : str
        Directory containing the ``<keggorg>.keggorg`` file.
    idcollection_dict : dict
        Must provide 'kr_to_kos': a mapping from reaction ID to an iterable of KO codes.

    Returns
    -------
    set
        Reaction IDs having at least one KO found in the organism.

    Raises
    ------
    ValueError
        If gene IDs in the table are duplicated (``verify_integrity=True``).
    """
    # NOTE: pickle can execute arbitrary code; only load files written by this tool.
    with open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb') as handle:
        df_keggorg = pickle.load(handle)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)

    # PART 1. get KO codes available
    kos_org = set(df_keggorg['ko'].dropna())

    # PART 2. get reactions in the organism (even the GPR is not complete)
    kr_to_kos = idcollection_dict['kr_to_kos']
    return {kr for kr, kos in kr_to_kos.items() if not kos_org.isdisjoint(kos)}
|
|
60
|
+
|
|
61
|
+
|
|
38
62
|
|
|
39
|
-
def check_completeness(logger, model, progress, module, focus, eggnog, idcollection_dict, summary_dict):
|
|
63
|
+
def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
|
|
40
64
|
# check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
|
|
41
65
|
|
|
42
66
|
|
|
43
67
|
# get the reference set of kr codes (all kegg or organism specific):
|
|
44
68
|
kr_uni = set()
|
|
45
|
-
if
|
|
69
|
+
if keggorg != '-': # keggorg has precedence
|
|
70
|
+
kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
|
|
71
|
+
kr_uni_label = f"organism code '{keggorg}'"
|
|
72
|
+
elif eggnog != '-':
|
|
46
73
|
for eggfile in eggnog:
|
|
47
74
|
eggset = parse_eggnog(model, eggfile, idcollection_dict)
|
|
48
75
|
kr_uni = kr_uni.union(eggset)
|
|
@@ -60,7 +87,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, idcollect
|
|
|
60
87
|
kr_ids_modeled.add(kr_id)
|
|
61
88
|
kr_uni_missing = kr_uni - kr_ids_modeled
|
|
62
89
|
kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
|
|
63
|
-
logger.info(f"Coverage for
|
|
90
|
+
logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
|
|
64
91
|
|
|
65
92
|
|
|
66
93
|
# define the map?????, containing krs not included in maps
|
|
@@ -177,15 +204,15 @@ def check_completeness(logger, model, progress, module, focus, eggnog, idcollect
|
|
|
177
204
|
for eggfile in eggnog:
|
|
178
205
|
strain = Path(eggfile).stem
|
|
179
206
|
eggset = parse_eggnog(model, eggfile, idcollection_dict)
|
|
180
|
-
col = df_coverage.index.to_series().isin(eggset).astype(int)
|
|
207
|
+
col = df_coverage.index.to_series().isin(eggset).astype(int) # integer: 0 or 1
|
|
181
208
|
df_strains.append(col.rename(strain))
|
|
182
209
|
df_strains = pnd.concat(df_strains, axis=1)
|
|
183
210
|
# sort rows: upper rows are present in more strains
|
|
184
|
-
df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index]
|
|
211
|
+
#df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index] # commented: now in charge of figures.py
|
|
185
212
|
df_coverage = df_coverage.loc[df_strains.index]
|
|
186
213
|
df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
|
|
187
214
|
# split in 2: modeled above, non-modeled below:
|
|
188
|
-
df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]])
|
|
215
|
+
#df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]]) # commented: now in charge of figures.py
|
|
189
216
|
else: # not interesting in a super-long table without strains in column
|
|
190
217
|
df_coverage = None
|
|
191
218
|
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
import os
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import cobra
|
|
7
|
+
import gempipe
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from ..commons import fba_no_warnings
|
|
11
|
+
from ..commons import get_optthr
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def verify_egc(logger, model, mid, outdir):
    """Check whether ``model`` sustains an erroneous energy-generating cycle (EGC) for ``mid``.

    Inside a ``with model:`` block (so the changes are not permanent) the
    boundaries are closed through ``gempipe.close_boundaries``, a dissipation
    reaction ``__dissip__<mid>`` is added and set as objective, and the model
    is optimized. If the solution is optimal and the objective reaches
    ``get_optthr()``, an EGC is reported: the reactions carrying a flux beyond
    the threshold are saved as ``<outdir>/EGC_<mid>.json`` and a warning
    listing them is logged (reversible reactions are tagged with '(<=>)').

    Parameters
    ----------
    logger
        Logger receiving the warning when an EGC is found.
    model
        COBRApy model to test; it is restored when the ``with`` block exits.
    mid : str
        Metabolite to dissipate: one of 'atp', 'ctp', 'gtp', 'utp', 'itp',
        'nadh', 'nadph', 'fadh2', 'accoa', 'glu__L', 'q8h2'.
    outdir : str
        Directory where the JSON model of the cycle is written.

    Returns
    -------
    bool
        True if an EGC was found for ``mid``, False otherwise.

    Raises
    ------
    ValueError
        If no dissipation reaction is defined for ``mid``.
    """
    # dissipation reaction for each supported metabolite:
    dissip_strings = {
        'atp': 'atp_c + h2o_c --> adp_c + pi_c + h_c',
        'ctp': 'ctp_c + h2o_c --> cdp_c + pi_c + h_c',
        'gtp': 'gtp_c + h2o_c --> gdp_c + pi_c + h_c',
        'utp': 'utp_c + h2o_c --> udp_c + pi_c + h_c',
        'itp': 'itp_c + h2o_c --> idp_c + pi_c + h_c',
        'nadh': 'nadh_c --> nad_c + h_c',
        'nadph': 'nadph_c --> nadp_c + h_c',
        'fadh2': 'fadh2_c --> fad_c + 2.0 h_c',
        'accoa': 'accoa_c + h2o_c --> ac_c + coa_c + h_c',
        'glu__L': 'glu__L_c + h2o_c --> akg_c + nh4_c + 2.0 h_c',
        'q8h2': 'q8h2_c --> q8_c + 2.0 h_c',
    }
    if mid not in dissip_strings:
        # fail early with a clear message, before the model is touched
        raise ValueError(f"No dissipation reaction defined for '{mid}'.")
    dissip_id = f'__dissip__{mid}'

    # changes are not permanent:
    found_egc = False
    with model:

        # close (0; 0) all the exchange reactions:
        gempipe.close_boundaries(model)

        # create and define the dissipation reaction:
        model.add_reactions([cobra.Reaction(dissip_id)])
        dissip = model.reactions.get_by_id(dissip_id)
        dissip.build_reaction_from_string(dissip_strings[mid])

        # set the objective and optimize:
        model.objective = dissip_id
        res, _, _ = fba_no_warnings(model)

        # apply the threshold (values are read from the solution object):
        obj_value = res.objective_value
        status = res.status
        threshold = get_optthr()
        if status == 'optimal' and obj_value >= threshold:
            found_egc = True

            # get suspect !=0 fluxes (the threshold tries to take into account
            # the approximation in glpk and cplex solvers)
            fluxes = res.fluxes
            fluxes_interesting = fluxes[(fluxes > threshold) | (fluxes < -threshold)]

            # create a model for escher, remove Rs not belonging to the cycle
            model_copy = model.copy()
            all_rids = [r.id for r in model_copy.reactions]
            to_delete = set(all_rids) - set(fluxes_interesting.index)

            # trick to avoid the WARNING "cobra/core/group.py:147: UserWarning: need to pass in a list"
            # triggered when trying to remove reactions that are included in groups.
            with warnings.catch_warnings():  # temporarily suppress warnings for this block
                warnings.simplefilter("ignore")  # ignore all warnings
                cobra_logger = logging.getLogger("cobra.util.solver")
                old_level = cobra_logger.level
                cobra_logger.setLevel(logging.ERROR)
                try:
                    # triggering code
                    model_copy.remove_reactions(to_delete)  # should work also with IDs
                finally:
                    # restore original behaviour, even if the removal fails:
                    cobra_logger.setLevel(old_level)

            # save JSON to direct import in Escher:
            outfile = os.path.join(outdir, f'EGC_{mid}.json')
            cobra.io.save_json_model(model_copy, outfile)

            # log some messages
            rid_labels = []
            for rid in fluxes_interesting.index:
                rid_label = "'" + rid + "'"
                # mark reversible reactions composing the cycle:
                r = model.reactions.get_by_id(rid)
                if r.lower_bound < 0 and r.upper_bound > 0:
                    rid_label = rid_label + '(<=>)'
                rid_labels.append(rid_label)
            logger.warning(f"Found erroneous EGC (N={len(model_copy.reactions)}) for '{mid}' (f={obj_value}): [{', '.join(rid_labels)}]. EGC saved to '{outfile}' to be inspected with Escher-FBA.")

    return found_egc
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def verify_egc_all(logger, model, outdir='./', mids_to_check=('atp', 'ctp', 'gtp', 'utp', 'itp', 'nadh', 'nadph', 'fadh2', 'accoa', 'glu__L', 'q8h2')):
    """Run ``verify_egc`` for every metabolite in ``mids_to_check``.

    Every metabolite is checked (no early stop), so each cycle found gets
    reported by ``verify_egc``. If none is found, a single informational
    message is logged. Nothing is returned.

    Parameters
    ----------
    logger
        Logger passed to ``verify_egc`` and used for the summary message.
    model
        Model passed to ``verify_egc``.
    outdir : str
        Output directory passed to ``verify_egc``.
    mids_to_check : iterable of str
        Metabolite IDs to test (immutable default, so it cannot be altered between calls).
    """
    all_results = [verify_egc(logger, model, mid, outdir) for mid in mids_to_check]
    if not any(all_results):
        logger.info("Found 0 erroneous energy-generating cycles (EGCs).")
|
|
127
|
+
|
|
128
|
+
|
|
@@ -143,6 +143,14 @@ def introduce_metabolites(logger, db, model, idcollection_dict, kegg_compound_to
|
|
|
143
143
|
m.annotation[ankey] = list(m.annotation[ankey])
|
|
144
144
|
|
|
145
145
|
|
|
146
|
+
# replace inchikey with manually-curated
|
|
147
|
+
if m.annotation['inchikey'] != [] and m.annotation['inchikey'] != [row['inchikey']]:
|
|
148
|
+
logger.debug(f"Metabolite '{pure_mid}': manual-curated inchikey ({[row['inchikey']]}) is diferent from the one derived from MNX ({m.annotation['inchikey']}).")
|
|
149
|
+
m.annotation['inchikey'] = [row['inchikey']] # force the manual-curated version
|
|
150
|
+
if m.annotation['inchikey'] == ['XXXXXXXXXXXXXX-XXXXXXXXXX-X']:
|
|
151
|
+
m.annotation['inchikey'] = []
|
|
152
|
+
|
|
153
|
+
|
|
146
154
|
# add SBO annotation
|
|
147
155
|
m.annotation['sbo'] = ['SBO:0000247'] # generic metabolite
|
|
148
156
|
|
|
@@ -279,15 +287,7 @@ def introduce_transporters(logger, db, model, idcollection_dict, kegg_reaction_t
|
|
|
279
287
|
r = model.reactions.get_by_id(f'EX_{mid_e}')
|
|
280
288
|
r.name = f"Exchange for {model.metabolites.get_by_id(mid_e).name}"
|
|
281
289
|
r.build_reaction_from_string(f'{mid_e} --> ')
|
|
282
|
-
|
|
283
|
-
# basics:
|
|
284
|
-
'glc__D_e', 'nh4_e', 'pi_e', 'so4_e', 'h2o_e', 'h_e', 'o2_e', 'co2_e',
|
|
285
|
-
# metals:
|
|
286
|
-
'cu2_e', 'mobd_e', 'fe2_e', 'cobalt2_e',
|
|
287
|
-
]:
|
|
288
|
-
r.bounds = (-1000, 1000)
|
|
289
|
-
else:
|
|
290
|
-
r.bounds = (0, 1000)
|
|
290
|
+
r.bounds = (0, 1000)
|
|
291
291
|
|
|
292
292
|
# add SBO annotation
|
|
293
293
|
r.annotation['sbo'] = ['SBO:0000627'] # exchange reaction
|
|
@@ -19,6 +19,33 @@ def get_rids_with_mancheck_gpr():
|
|
|
19
19
|
return rids_mancheck_gpr
|
|
20
20
|
|
|
21
21
|
|
|
22
|
+
def get_rids_with_mancheck_balancing():
    """Return the IDs of ATP-involving reactions whose reversibility was manually checked.

    Some reactions involving ATP can be reversible; the IDs listed here are
    the ones accepted as such, grouped by the evidence noted in the comments.

    Returns
    -------
    list of str
        Reaction IDs; a new list is built on every call.
    """
    rids_mancheck_bal = [  # some reactions involving ATP can be reversible

        # SECTION "reversible both in KEGG and MetaCyc"
        'PGK', 'SUCOAS', 'ADK1', 'GK1', 'NNATr', 'CYTK1', 'ACKr',
        'DGK1', 'PPAKr', 'ATPSr', 'NDPK10',

        ### SECTION "reversible in KEGG but not in MetaCyc" ###
        'CYTK2',  # clearly reversible in KEGG but not in MetaCyc (RXN-7913)
        'DADK',  # clearly reversible in KEGG but not in MetaCyc (DEOXYADENYLATE-KINASE-RXN)
        'UMPK',  # clearly reversible in KEGG but not in MetaCyc (RXN-12002)
        'NDPK1',  # clearly reversible in KEGG but not in MetaCyc (GDPKIN-RXN)
        'NDPK2',  # clearly reversible in KEGG but not in MetaCyc (UDPKIN-RXN)
        'NDPK3',  # clearly reversible in KEGG but not in MetaCyc (CDPKIN-RXN)
        'NDPK4',  # clearly reversible in KEGG but not in MetaCyc (DTDPKIN-RXN)
        'NDPK5',  # clearly reversible in KEGG but not in MetaCyc (DGDPKIN-RXN)
        'NDPK6',  # clearly reversible in KEGG but not in MetaCyc (DUDPKIN-RXN)
        'NDPK7',  # clearly reversible in KEGG but not in MetaCyc (DCDPKIN-RXN)
        'NDPK8',  # clearly reversible in KEGG but not in MetaCyc (DADPKIN-RXN)
        'NDPK9',  # clearly reversible in KEGG but not in MetaCyc (RXN-14120)

        ### SECTION "missing reversibility info" ###
        'LPHERA',
    ]
    return rids_mancheck_bal
|
|
47
|
+
|
|
48
|
+
|
|
22
49
|
|
|
23
50
|
def get_manual_sinks():
|
|
24
51
|
|
|
@@ -17,6 +17,7 @@ from ..commons import show_contributions
|
|
|
17
17
|
from ..commons import adjust_biomass_precursors
|
|
18
18
|
from ..commons import count_undrawn_rids
|
|
19
19
|
from ..commons import format_expansion
|
|
20
|
+
from ..commons import download_keggorg
|
|
20
21
|
|
|
21
22
|
from .introduce import introduce_metabolites
|
|
22
23
|
from .introduce import introduce_reactions
|
|
@@ -34,6 +35,8 @@ from ..runsims.biosynth import biosynthesis_on_media
|
|
|
34
35
|
|
|
35
36
|
from ..mkmodel.polishing import remove_disconnected
|
|
36
37
|
|
|
38
|
+
from .cycles import verify_egc_all
|
|
39
|
+
|
|
37
40
|
|
|
38
41
|
|
|
39
42
|
|
|
@@ -72,7 +75,14 @@ def main(args, logger):
|
|
|
72
75
|
|
|
73
76
|
|
|
74
77
|
# format the --eggnog param
|
|
75
|
-
args.eggnog = format_expansion(logger, args.eggnog)
|
|
78
|
+
args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
|
|
79
|
+
|
|
80
|
+
# get the kegg organism if requested
|
|
81
|
+
if args.keggorg != '-':
|
|
82
|
+
response = download_keggorg(logger, args.keggorg, args.outdir)
|
|
83
|
+
if response == 1: return 1
|
|
84
|
+
|
|
85
|
+
|
|
76
86
|
|
|
77
87
|
|
|
78
88
|
# check and extract the required 'gsrap.maps' file
|
|
@@ -153,7 +163,7 @@ def main(args, logger):
|
|
|
153
163
|
|
|
154
164
|
###### CHECKS 1
|
|
155
165
|
# check universe completeness
|
|
156
|
-
df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, idcollection_dict, summary_dict)
|
|
166
|
+
df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, args.keggorg, idcollection_dict, summary_dict, args.outdir)
|
|
157
167
|
if type(df_C)==int: return 1
|
|
158
168
|
|
|
159
169
|
|
|
@@ -165,6 +175,9 @@ def main(args, logger):
|
|
|
165
175
|
|
|
166
176
|
|
|
167
177
|
###### CHECKS 2
|
|
178
|
+
# check erroneous EGCs
|
|
179
|
+
verify_egc_all(logger, universe, args.outdir)
|
|
180
|
+
|
|
168
181
|
# check growth on minimal media
|
|
169
182
|
df_G = grow_on_media(logger, universe, dbexp, args.media, '-', True)
|
|
170
183
|
if type(df_G)==int: return 1
|
|
@@ -4,6 +4,7 @@ import cobra
|
|
|
4
4
|
|
|
5
5
|
from .manual import get_deprecated_kos
|
|
6
6
|
from .manual import get_rids_with_mancheck_gpr
|
|
7
|
+
from .manual import get_rids_with_mancheck_balancing
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
|
|
@@ -138,6 +139,14 @@ def add_reaction(logger, model, rid, row, kr_ids, kegg_reaction_to_others, addty
|
|
|
138
139
|
return 1
|
|
139
140
|
|
|
140
141
|
|
|
142
|
+
# check if reversible and using ATP
|
|
143
|
+
if r.lower_bound < 0 and r.upper_bound > 0:
|
|
144
|
+
for m in r.metabolites:
|
|
145
|
+
if m.id.rsplit('_', 1)[0] == 'atp':
|
|
146
|
+
if rid not in get_rids_with_mancheck_balancing():
|
|
147
|
+
logger.warning(f"Reaction '{rid}' involves ATP and is reversible: are you sure?")
|
|
148
|
+
|
|
149
|
+
|
|
141
150
|
return 0
|
|
142
151
|
|
|
143
152
|
|
gsrap/parsedb/completeness.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
+
import pickle
|
|
3
|
+
import os
|
|
2
4
|
|
|
3
5
|
|
|
4
6
|
import pandas as pnd
|
|
@@ -35,14 +37,39 @@ def parse_eggnog(model, eggnog, idcollection_dict):
|
|
|
35
37
|
return krs_org
|
|
36
38
|
|
|
37
39
|
|
|
40
|
+
|
|
41
|
+
def parse_keggorg(keggorg, outdir, idcollection_dict):
    """Return the KEGG reaction IDs linked to at least one KO of the organism.

    Reads ``<outdir>/<keggorg>.keggorg`` (a pickled pandas DataFrame with the
    columns 'gid' and 'ko'), collects the non-missing KO codes, and keeps every
    reaction in ``idcollection_dict['kr_to_kos']`` that shares at least one KO
    with the organism (so a reaction is kept even if its GPR is not complete).

    Parameters
    ----------
    keggorg : str
        KEGG organism code; also the stem of the pickled file name.
    outdir : str
        Directory containing the ``<keggorg>.keggorg`` file.
    idcollection_dict : dict
        Must provide 'kr_to_kos': a mapping from reaction ID to an iterable of KO codes.

    Returns
    -------
    set
        Reaction IDs having at least one KO found in the organism.

    Raises
    ------
    ValueError
        If gene IDs in the table are duplicated (``verify_integrity=True``).
    """
    # NOTE: pickle can execute arbitrary code; only load files written by this tool.
    with open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb') as handle:
        df_keggorg = pickle.load(handle)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)

    # PART 1. get KO codes available
    kos_org = set(df_keggorg['ko'].dropna())

    # PART 2. get reactions in the organism (even the GPR is not complete)
    kr_to_kos = idcollection_dict['kr_to_kos']
    return {kr for kr, kos in kr_to_kos.items() if not kos_org.isdisjoint(kos)}
|
|
60
|
+
|
|
61
|
+
|
|
38
62
|
|
|
39
|
-
def check_completeness(logger, model, progress, module, focus, eggnog, idcollection_dict, summary_dict):
|
|
63
|
+
def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
|
|
40
64
|
# check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
|
|
41
65
|
|
|
42
66
|
|
|
43
67
|
# get the reference set of kr codes (all kegg or organism specific):
|
|
44
68
|
kr_uni = set()
|
|
45
|
-
if
|
|
69
|
+
if keggorg != '-': # keggorg has precedence
|
|
70
|
+
kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
|
|
71
|
+
kr_uni_label = f"organism code '{keggorg}'"
|
|
72
|
+
elif eggnog != '-':
|
|
46
73
|
for eggfile in eggnog:
|
|
47
74
|
eggset = parse_eggnog(model, eggfile, idcollection_dict)
|
|
48
75
|
kr_uni = kr_uni.union(eggset)
|
|
@@ -60,7 +87,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, idcollect
|
|
|
60
87
|
kr_ids_modeled.add(kr_id)
|
|
61
88
|
kr_uni_missing = kr_uni - kr_ids_modeled
|
|
62
89
|
kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
|
|
63
|
-
logger.info(f"Coverage for
|
|
90
|
+
logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
|
|
64
91
|
|
|
65
92
|
|
|
66
93
|
# define the map?????, containing krs not included in maps
|
|
@@ -177,15 +204,15 @@ def check_completeness(logger, model, progress, module, focus, eggnog, idcollect
|
|
|
177
204
|
for eggfile in eggnog:
|
|
178
205
|
strain = Path(eggfile).stem
|
|
179
206
|
eggset = parse_eggnog(model, eggfile, idcollection_dict)
|
|
180
|
-
col = df_coverage.index.to_series().isin(eggset).astype(int)
|
|
207
|
+
col = df_coverage.index.to_series().isin(eggset).astype(int) # integer: 0 or 1
|
|
181
208
|
df_strains.append(col.rename(strain))
|
|
182
209
|
df_strains = pnd.concat(df_strains, axis=1)
|
|
183
210
|
# sort rows: upper rows are present in more strains
|
|
184
|
-
df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index]
|
|
211
|
+
#df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index] # commented: now in charge of figures.py
|
|
185
212
|
df_coverage = df_coverage.loc[df_strains.index]
|
|
186
213
|
df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
|
|
187
214
|
# split in 2: modeled above, non-modeled below:
|
|
188
|
-
df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]])
|
|
215
|
+
#df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]]) # commented: now in charge of figures.py
|
|
189
216
|
else: # not interesting in a super-long table without strains in column
|
|
190
217
|
df_coverage = None
|
|
191
218
|
|
gsrap/parsedb/cycles.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
import os
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import cobra
|
|
7
|
+
import gempipe
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from ..commons import fba_no_warnings
|
|
11
|
+
from ..commons import get_optthr
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def verify_egc(logger, model, mid, outdir):
    """Check whether ``model`` sustains an erroneous energy-generating cycle (EGC) for ``mid``.

    Inside a ``with model:`` block (so the changes are not permanent) the
    boundaries are closed through ``gempipe.close_boundaries``, a dissipation
    reaction ``__dissip__<mid>`` is added and set as objective, and the model
    is optimized. If the solution is optimal and the objective reaches
    ``get_optthr()``, an EGC is reported: the reactions carrying a flux beyond
    the threshold are saved as ``<outdir>/EGC_<mid>.json`` and a warning
    listing them is logged (reversible reactions are tagged with '(<=>)').

    Parameters
    ----------
    logger
        Logger receiving the warning when an EGC is found.
    model
        COBRApy model to test; it is restored when the ``with`` block exits.
    mid : str
        Metabolite to dissipate: one of 'atp', 'ctp', 'gtp', 'utp', 'itp',
        'nadh', 'nadph', 'fadh2', 'accoa', 'glu__L', 'q8h2'.
    outdir : str
        Directory where the JSON model of the cycle is written.

    Returns
    -------
    bool
        True if an EGC was found for ``mid``, False otherwise.

    Raises
    ------
    ValueError
        If no dissipation reaction is defined for ``mid``.
    """
    # dissipation reaction for each supported metabolite:
    dissip_strings = {
        'atp': 'atp_c + h2o_c --> adp_c + pi_c + h_c',
        'ctp': 'ctp_c + h2o_c --> cdp_c + pi_c + h_c',
        'gtp': 'gtp_c + h2o_c --> gdp_c + pi_c + h_c',
        'utp': 'utp_c + h2o_c --> udp_c + pi_c + h_c',
        'itp': 'itp_c + h2o_c --> idp_c + pi_c + h_c',
        'nadh': 'nadh_c --> nad_c + h_c',
        'nadph': 'nadph_c --> nadp_c + h_c',
        'fadh2': 'fadh2_c --> fad_c + 2.0 h_c',
        'accoa': 'accoa_c + h2o_c --> ac_c + coa_c + h_c',
        'glu__L': 'glu__L_c + h2o_c --> akg_c + nh4_c + 2.0 h_c',
        'q8h2': 'q8h2_c --> q8_c + 2.0 h_c',
    }
    if mid not in dissip_strings:
        # fail early with a clear message, before the model is touched
        raise ValueError(f"No dissipation reaction defined for '{mid}'.")
    dissip_id = f'__dissip__{mid}'

    # changes are not permanent:
    found_egc = False
    with model:

        # close (0; 0) all the exchange reactions:
        gempipe.close_boundaries(model)

        # create and define the dissipation reaction:
        model.add_reactions([cobra.Reaction(dissip_id)])
        dissip = model.reactions.get_by_id(dissip_id)
        dissip.build_reaction_from_string(dissip_strings[mid])

        # set the objective and optimize:
        model.objective = dissip_id
        res, _, _ = fba_no_warnings(model)

        # apply the threshold (values are read from the solution object):
        obj_value = res.objective_value
        status = res.status
        threshold = get_optthr()
        if status == 'optimal' and obj_value >= threshold:
            found_egc = True

            # get suspect !=0 fluxes (the threshold tries to take into account
            # the approximation in glpk and cplex solvers)
            fluxes = res.fluxes
            fluxes_interesting = fluxes[(fluxes > threshold) | (fluxes < -threshold)]

            # create a model for escher, remove Rs not belonging to the cycle
            model_copy = model.copy()
            all_rids = [r.id for r in model_copy.reactions]
            to_delete = set(all_rids) - set(fluxes_interesting.index)

            # trick to avoid the WARNING "cobra/core/group.py:147: UserWarning: need to pass in a list"
            # triggered when trying to remove reactions that are included in groups.
            with warnings.catch_warnings():  # temporarily suppress warnings for this block
                warnings.simplefilter("ignore")  # ignore all warnings
                cobra_logger = logging.getLogger("cobra.util.solver")
                old_level = cobra_logger.level
                cobra_logger.setLevel(logging.ERROR)
                try:
                    # triggering code
                    model_copy.remove_reactions(to_delete)  # should work also with IDs
                finally:
                    # restore original behaviour, even if the removal fails:
                    cobra_logger.setLevel(old_level)

            # save JSON to direct import in Escher:
            outfile = os.path.join(outdir, f'EGC_{mid}.json')
            cobra.io.save_json_model(model_copy, outfile)

            # log some messages
            rid_labels = []
            for rid in fluxes_interesting.index:
                rid_label = "'" + rid + "'"
                # mark reversible reactions composing the cycle:
                r = model.reactions.get_by_id(rid)
                if r.lower_bound < 0 and r.upper_bound > 0:
                    rid_label = rid_label + '(<=>)'
                rid_labels.append(rid_label)
            logger.warning(f"Found erroneous EGC (N={len(model_copy.reactions)}) for '{mid}' (f={obj_value}): [{', '.join(rid_labels)}]. EGC saved to '{outfile}' to be inspected with Escher-FBA.")

    return found_egc
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def verify_egc_all(logger, model, outdir='./', mids_to_check=('atp', 'ctp', 'gtp', 'utp', 'itp', 'nadh', 'nadph', 'fadh2', 'accoa', 'glu__L', 'q8h2')):
    """Run ``verify_egc`` for every metabolite in ``mids_to_check``.

    Every metabolite is checked (no early stop), so each cycle found gets
    reported by ``verify_egc``. If none is found, a single informational
    message is logged. Nothing is returned.

    Parameters
    ----------
    logger
        Logger passed to ``verify_egc`` and used for the summary message.
    model
        Model passed to ``verify_egc``.
    outdir : str
        Output directory passed to ``verify_egc``.
    mids_to_check : iterable of str
        Metabolite IDs to test (immutable default, so it cannot be altered between calls).
    """
    all_results = [verify_egc(logger, model, mid, outdir) for mid in mids_to_check]
    if not any(all_results):
        logger.info("Found 0 erroneous energy-generating cycles (EGCs).")
|
|
127
|
+
|
|
128
|
+
|