gsrap 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. gsrap/.ipynb_checkpoints/__init__-checkpoint.py +2 -0
  2. gsrap/__init__.py +2 -0
  3. gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
  4. gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
  5. gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
  6. gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +70 -37
  7. gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +15 -1
  8. gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
  9. gsrap/commons/.ipynb_checkpoints/medium-checkpoint.py +3 -4
  10. gsrap/commons/__init__.py +1 -0
  11. gsrap/commons/downloads.py +1 -1
  12. gsrap/commons/escherutils.py +1 -1
  13. gsrap/commons/excelhub.py +70 -37
  14. gsrap/commons/figures.py +15 -1
  15. gsrap/commons/keggutils.py +145 -0
  16. gsrap/commons/medium.py +3 -4
  17. gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +69 -19
  18. gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
  19. gsrap/mkmodel/mkmodel.py +69 -19
  20. gsrap/mkmodel/pruner.py +72 -7
  21. gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +33 -6
  22. gsrap/parsedb/.ipynb_checkpoints/cycles-checkpoint.py +128 -0
  23. gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +9 -9
  24. gsrap/parsedb/.ipynb_checkpoints/manual-checkpoint.py +27 -0
  25. gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +15 -2
  26. gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +9 -0
  27. gsrap/parsedb/completeness.py +33 -6
  28. gsrap/parsedb/cycles.py +128 -0
  29. gsrap/parsedb/introduce.py +9 -9
  30. gsrap/parsedb/manual.py +27 -0
  31. gsrap/parsedb/parsedb.py +15 -2
  32. gsrap/parsedb/repeating.py +9 -0
  33. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/METADATA +1 -1
  34. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/RECORD +37 -33
  35. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/LICENSE.txt +0 -0
  36. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/WHEEL +0 -0
  37. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/entry_points.txt +0 -0
gsrap/mkmodel/pruner.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import os
2
2
  import warnings
3
3
  import logging
4
+ import pickle
4
5
 
5
6
 
6
7
  import pandas as pnd
@@ -43,22 +44,57 @@ def load_input_eggnog(logger, eggnog):
43
44
 
44
45
 
45
46
  # load eggnog annotations
46
- eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
47
- eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
48
- eggnog = eggnog.set_index('query', drop=True, verify_integrity=True)
47
+ df_eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
48
+ df_eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
49
+ df_eggnog = df_eggnog.set_index('query', drop=True, verify_integrity=True)
49
50
 
50
51
 
51
- return eggnog
52
+ return df_eggnog
52
53
 
53
54
 
54
55
 
55
- def parse_eggnog(eggnog):
56
def load_keggorg_like_eggnog(logger, keggorg, outdir):
    """Load a pickled KEGG organism table and reshape it like an eggNOG table.

    Parameters
    ----------
    logger
        Accepted for signature parity with the other loaders; not used here.
    keggorg : str
        KEGG organism code. The table is read from
        ``<outdir>/<keggorg>.keggorg``.
    outdir : str
        Directory holding the pickled table.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'query' (the gene id) and carrying the eggNOG annotation
        columns. Only 'KEGG_ko' and 'PFAMs' are filled from the KEGG table;
        every other column is set to '-'.
    """

    # load raw data, downloaded from KEGG.
    # NOTE(review): pickle executes arbitrary code on load; this assumes the
    # file was produced by this tool and is never user-supplied -- confirm.
    keggorg_path = os.path.join(outdir, f'{keggorg}.keggorg')
    with open(keggorg_path, 'rb') as handle:
        df_keggorg = pickle.load(handle)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)

    # create an eggnog-like dataframe (one record per gene).
    # Cells that are not a list (Pfam) / not a string (ko) are treated as
    # missing and rendered as '-', the placeholder used by eggNOG tables.
    records = [
        {
            'query': gid,
            'PFAMs': ','.join(pfams) if isinstance(pfams, list) else '-',
            'KEGG_ko': ko if isinstance(ko, str) else '-',
        }
        for gid, pfams, ko in zip(
            df_keggorg.index, df_keggorg['Pfam'], df_keggorg['ko']
        )
    ]
    df_eggnog_like = pnd.DataFrame.from_records(records)

    # append missing columns and sort them like in eggnog.
    # Splitting on any whitespace keeps the header robust whether the literal
    # is written with tabs or with spaces (no column name contains a blank).
    eggnog_columns = (
        'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl '
        'COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway '
        'KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy '
        'BiGG_Reaction PFAMs'
    ).split()
    for column in eggnog_columns:
        if column not in df_eggnog_like.columns:
            df_eggnog_like[column] = '-'
    df_eggnog_like = df_eggnog_like[eggnog_columns]

    # set the index like in eggnog
    return df_eggnog_like.set_index('query', drop=True, verify_integrity=True)
88
+
89
+
90
+
91
+ def parse_eggnog(df_eggnog):
56
92
 
57
93
 
58
94
  # PART 1. get KO codes available
59
95
  gid_to_kos = {}
60
96
  ko_to_gids = {}
61
- for gid, kos in eggnog['KEGG_ko'].items():
97
+ for gid, kos in df_eggnog['KEGG_ko'].items():
62
98
  if kos == '-':
63
99
  continue
64
100
 
@@ -229,8 +265,37 @@ def restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos):
229
265
  # collect names
230
266
  names.append(uni_g.name)
231
267
  g.name = '; '.join(names)
268
+
269
+
270
+
271
def append_keggorg_gene_annots(logger, model, keggorg, outdir):
    """Add KEGG-derived cross-references to the genes of ``model`` in place.

    Parameters
    ----------
    logger
        Logger used for the progress message.
    model
        Object exposing ``genes``; each gene needs ``id`` and a dict-like
        ``annotation``.
    keggorg : str
        KEGG organism code. The table is read from
        ``<outdir>/<keggorg>.keggorg``.
    outdir : str
        Directory holding the pickled table.

    Returns
    -------
    None
        Genes found in the KEGG table get a 'kegg.genes' annotation plus one
        entry per available cross-reference column; other genes are untouched.
    """

    # load raw data, downloaded from KEGG.
    # NOTE(review): pickle executes arbitrary code on load; this assumes the
    # file was produced by this tool and is never user-supplied -- confirm.
    logger.info("Adding gene annotations retrieved from KEGG...")
    keggorg_path = os.path.join(outdir, f'{keggorg}.keggorg')
    with open(keggorg_path, 'rb') as handle:
        df_keggorg = pickle.load(handle)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)

    # KEGG table column -> annotation key written on the gene.
    column_to_key = (
        ('NCBI-GeneID', 'ncbigene'),
        ('NCBI-ProteinID', 'ncbiprotein'),
        ('ASAP', 'asap'),
        ('UniProt', 'uniprot'),
    )
    # Columns are optional in the table: resolve once which ones exist.
    available = [
        (column, key)
        for column, key in column_to_key
        if column in df_keggorg.columns
    ]

    # KEGG can provide some useful (ie, used in Memote) gene annotations:
    for g in model.genes:
        if g.id not in df_keggorg.index:
            continue

        g.annotation['kegg.genes'] = [keggorg + ':' + g.id]

        for column, key in available:
            value = df_keggorg.loc[g.id, column]
            # anything that is not a list (e.g. a missing value) becomes []
            g.annotation[key] = value if isinstance(value, list) else []
294
+
295
+
296
+
297
+
298
+
234
299
 
235
300
 
236
301
 
@@ -1,4 +1,6 @@
1
1
  from pathlib import Path
2
+ import pickle
3
+ import os
2
4
 
3
5
 
4
6
  import pandas as pnd
@@ -35,14 +37,39 @@ def parse_eggnog(model, eggnog, idcollection_dict):
35
37
  return krs_org
36
38
 
37
39
 
40
+
41
def parse_keggorg(keggorg, outdir, idcollection_dict):
    """Return the KEGG reaction ids linked to the KOs of a KEGG organism.

    Parameters
    ----------
    keggorg : str
        KEGG organism code. The table is read from
        ``<outdir>/<keggorg>.keggorg``.
    outdir : str
        Directory holding the pickled table.
    idcollection_dict : dict
        Must provide 'kr_to_kos', mapping each reaction id to an iterable of
        KO codes.

    Returns
    -------
    set
        Reaction ids having at least one KO present in the organism.
    """

    # NOTE(review): pickle executes arbitrary code on load; this assumes the
    # file was produced by this tool and is never user-supplied -- confirm.
    keggorg_path = os.path.join(outdir, f'{keggorg}.keggorg')
    with open(keggorg_path, 'rb') as handle:
        df_keggorg = pickle.load(handle)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)

    # PART 1. get KO codes available (genes without a KO hold a missing value)
    kos_org = set(df_keggorg['ko'].dropna())

    # PART 2. get reactions in the organism (even if the GPR is not complete):
    # a single matching KO is enough to count the reaction as present.
    kr_to_kos = idcollection_dict['kr_to_kos']
    return {
        kr
        for kr, kos in kr_to_kos.items()
        if any(ko in kos_org for ko in kos)
    }
60
+
61
+
38
62
 
39
- def check_completeness(logger, model, progress, module, focus, eggnog, idcollection_dict, summary_dict):
63
+ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
40
64
  # check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
41
65
 
42
66
 
43
67
  # get the reference set of kr codes (all kegg or organism specific):
44
68
  kr_uni = set()
45
- if eggnog != '-':
69
+ if keggorg != '-': # keggorg has precedence
70
+ kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
71
+ kr_uni_label = f"organism code '{keggorg}'"
72
+ elif eggnog != '-':
46
73
  for eggfile in eggnog:
47
74
  eggset = parse_eggnog(model, eggfile, idcollection_dict)
48
75
  kr_uni = kr_uni.union(eggset)
@@ -60,7 +87,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, idcollect
60
87
  kr_ids_modeled.add(kr_id)
61
88
  kr_uni_missing = kr_uni - kr_ids_modeled
62
89
  kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
63
- logger.info(f"Coverage for '{kr_uni_label}': {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
90
+ logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
64
91
 
65
92
 
66
93
  # define the map?????, containing krs not included in maps
@@ -177,15 +204,15 @@ def check_completeness(logger, model, progress, module, focus, eggnog, idcollect
177
204
  for eggfile in eggnog:
178
205
  strain = Path(eggfile).stem
179
206
  eggset = parse_eggnog(model, eggfile, idcollection_dict)
180
- col = df_coverage.index.to_series().isin(eggset).astype(int)
207
+ col = df_coverage.index.to_series().isin(eggset).astype(int) # integer: 0 or 1
181
208
  df_strains.append(col.rename(strain))
182
209
  df_strains = pnd.concat(df_strains, axis=1)
183
210
  # sort rows: upper rows are present in more strains
184
- df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index]
211
+ #df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index] # commented: now in charge of figures.py
185
212
  df_coverage = df_coverage.loc[df_strains.index]
186
213
  df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
187
214
  # split in 2: modeled above, non-modeled below:
188
- df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]])
215
+ #df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]]) # commented: now in charge of figures.py
189
216
  else: # not interesting in a super-long table without strains in column
190
217
  df_coverage = None
191
218
 
@@ -0,0 +1,128 @@
1
+ import warnings
2
+ import os
3
+ import logging
4
+
5
+
6
+ import cobra
7
+ import gempipe
8
+
9
+
10
+ from ..commons import fba_no_warnings
11
+ from ..commons import get_optthr
12
+
13
+
14
+
15
# Dissipation reaction used to probe each supported metabolite.
_DISSIPATION_REACTIONS = {
    'atp': 'atp_c + h2o_c --> adp_c + pi_c + h_c',
    'ctp': 'ctp_c + h2o_c --> cdp_c + pi_c + h_c',
    'gtp': 'gtp_c + h2o_c --> gdp_c + pi_c + h_c',
    'utp': 'utp_c + h2o_c --> udp_c + pi_c + h_c',
    'itp': 'itp_c + h2o_c --> idp_c + pi_c + h_c',
    'nadh': 'nadh_c --> nad_c + h_c',
    'nadph': 'nadph_c --> nadp_c + h_c',
    'fadh2': 'fadh2_c --> fad_c + 2.0 h_c',
    'accoa': 'accoa_c + h2o_c --> ac_c + coa_c + h_c',
    'glu__L': 'glu__L_c + h2o_c --> akg_c + nh4_c + 2.0 h_c',
    'q8h2': 'q8h2_c --> q8_c + 2.0 h_c',
}


def verify_egc(logger, model, mid, outdir):
    """Check whether ``model`` can dissipate ``mid`` with all boundaries closed.

    A temporary dissipation reaction for ``mid`` is added and maximized after
    closing the boundaries. When the optimum reaches the threshold returned by
    ``get_optthr()``, the reactions carrying flux are saved as
    ``<outdir>/EGC_<mid>.json`` and a warning listing them is logged.

    Parameters
    ----------
    logger
        Logger receiving the warning when a cycle is found.
    model
        Model to test; edits are made inside ``with model:``.
    mid : str
        Metabolite to probe; must be a key of ``_DISSIPATION_REACTIONS``.
    outdir : str
        Directory where the JSON of the cycle is written.

    Returns
    -------
    bool
        True when an erroneous energy-generating cycle was found.

    Raises
    ------
    ValueError
        If ``mid`` has no dissipation reaction defined.
    """

    # Validate before touching the model, so an unsupported id fails with a
    # clear message instead of an unbound-variable error half-way through.
    try:
        dissip_string = _DISSIPATION_REACTIONS[mid]
    except KeyError:
        supported = ', '.join(sorted(_DISSIPATION_REACTIONS))
        raise ValueError(
            f"No dissipation reaction defined for '{mid}' (supported: {supported})."
        ) from None

    dissip_id = f'__dissip__{mid}'
    optthr = get_optthr()

    # changes are not permanent:
    found_egc = False
    with model:

        # close (0; 0) all the exchange reactions:
        gempipe.close_boundaries(model)

        # create and define the dissipation reaction:
        model.add_reactions([cobra.Reaction(dissip_id)])
        dissip = model.reactions.get_by_id(dissip_id)
        dissip.build_reaction_from_string(dissip_string)

        # set the objective and optimize:
        model.objective = dissip_id
        res, _, _ = fba_no_warnings(model)

        # apply the threshold:
        obj_value = res.objective_value
        if res.status == 'optimal' and obj_value >= optthr:
            found_egc = True

            # get suspect !=0 fluxes (get_optthr() tries to take into account
            # the approximation in glpk and cplex solvers)
            fluxes = res.fluxes
            fluxes_interesting = fluxes[(fluxes > optthr) | (fluxes < -optthr)]

            # create a model for escher, remove Rs not belonging to the cycle
            model_copy = model.copy()
            to_delete = {r.id for r in model_copy.reactions} - set(fluxes_interesting.index)

            # trick to avoid the WARNING "cobra/core/group.py:147: UserWarning:
            # need to pass in a list", triggered when trying to remove
            # reactions that are included in groups.
            cobra_logger = logging.getLogger("cobra.util.solver")
            old_level = cobra_logger.level
            with warnings.catch_warnings():  # temporarily suppress warnings
                warnings.simplefilter("ignore")
                cobra_logger.setLevel(logging.ERROR)
                try:
                    model_copy.remove_reactions(to_delete)  # should work also with IDs
                finally:
                    # restore original behaviour even if the removal fails
                    cobra_logger.setLevel(old_level)

            # save JSON to direct import in Escher:
            outfile = os.path.join(outdir, f'EGC_{mid}.json')
            cobra.io.save_json_model(model_copy, outfile)

            # log some messages, marking reversible reactions composing the cycle:
            rid_labels = []
            for rid in fluxes_interesting.index:
                r = model.reactions.get_by_id(rid)
                rid_label = "'" + rid + "'"
                if r.lower_bound < 0 and r.upper_bound > 0:
                    rid_label = rid_label + '(<=>)'
                rid_labels.append(rid_label)
            logger.warning(f"Found erroneous EGC (N={len(model_copy.reactions)}) for '{mid}' (f={obj_value}): [{', '.join(rid_labels)}]. EGC saved to '{outfile}' to be inspected with Escher-FBA.")

    return found_egc
116
+
117
+
118
+
119
def verify_egc_all(logger, model, outdir='./', mids_to_check=('atp', 'ctp', 'gtp', 'utp', 'itp', 'nadh', 'nadph', 'fadh2', 'accoa', 'glu__L', 'q8h2')):
    """Run ``verify_egc`` on every metabolite in ``mids_to_check``.

    Parameters
    ----------
    logger
        Logger passed to ``verify_egc`` and used for the summary message.
    model
        Model to test.
    outdir : str
        Directory passed to ``verify_egc``.
    mids_to_check : iterable of str
        Metabolites to probe. The default is a tuple so the default value
        cannot be mutated between calls.

    Returns
    -------
    None
        An info message is logged when no check reports a cycle.
    """

    # Every metabolite is checked (no short-circuit): each call does its own
    # logging, so all of them must run.
    all_results = [verify_egc(logger, model, mid, outdir) for mid in mids_to_check]
    if not any(all_results):
        logger.info("Found 0 erroneous energy-generating cycles (EGCs).")
127
+
128
+
@@ -143,6 +143,14 @@ def introduce_metabolites(logger, db, model, idcollection_dict, kegg_compound_to
143
143
  m.annotation[ankey] = list(m.annotation[ankey])
144
144
 
145
145
 
146
+ # replace inchikey with manually-curated
147
+ if m.annotation['inchikey'] != [] and m.annotation['inchikey'] != [row['inchikey']]:
148
+ logger.debug(f"Metabolite '{pure_mid}': manual-curated inchikey ({[row['inchikey']]}) is diferent from the one derived from MNX ({m.annotation['inchikey']}).")
149
+ m.annotation['inchikey'] = [row['inchikey']] # force the manual-curated version
150
+ if m.annotation['inchikey'] == ['XXXXXXXXXXXXXX-XXXXXXXXXX-X']:
151
+ m.annotation['inchikey'] = []
152
+
153
+
146
154
  # add SBO annotation
147
155
  m.annotation['sbo'] = ['SBO:0000247'] # generic metabolite
148
156
 
@@ -279,15 +287,7 @@ def introduce_transporters(logger, db, model, idcollection_dict, kegg_reaction_t
279
287
  r = model.reactions.get_by_id(f'EX_{mid_e}')
280
288
  r.name = f"Exchange for {model.metabolites.get_by_id(mid_e).name}"
281
289
  r.build_reaction_from_string(f'{mid_e} --> ')
282
- if mid_e in [
283
- # basics:
284
- 'glc__D_e', 'nh4_e', 'pi_e', 'so4_e', 'h2o_e', 'h_e', 'o2_e', 'co2_e',
285
- # metals:
286
- 'cu2_e', 'mobd_e', 'fe2_e', 'cobalt2_e',
287
- ]:
288
- r.bounds = (-1000, 1000)
289
- else:
290
- r.bounds = (0, 1000)
290
+ r.bounds = (0, 1000)
291
291
 
292
292
  # add SBO annotation
293
293
  r.annotation['sbo'] = ['SBO:0000627'] # exchange reaction
@@ -19,6 +19,33 @@ def get_rids_with_mancheck_gpr():
19
19
  return rids_mancheck_gpr
20
20
 
21
21
 
22
def get_rids_with_mancheck_balancing():
    """Return the ids of manually checked reactions involving ATP.

    Some reactions involving ATP can be reversible; the ids listed here are
    the ones that were reviewed by hand. A new list is built on every call.
    """

    # SECTION "reversible both in KEGG and MetaCyc"
    reversible_in_both = [
        'PGK', 'SUCOAS', 'ADK1', 'GK1', 'NNATr', 'CYTK1', 'ACKr',
        'DGK1', 'PPAKr', 'ATPSr', 'NDPK10',
    ]

    # SECTION "reversible in KEGG but not in MetaCyc"
    # (clearly reversible in KEGG; the MetaCyc reaction is given per entry)
    reversible_in_kegg_only = [
        'CYTK2',  # RXN-7913
        'DADK',   # DEOXYADENYLATE-KINASE-RXN
        'UMPK',   # RXN-12002
        'NDPK1',  # GDPKIN-RXN
        'NDPK2',  # UDPKIN-RXN
        'NDPK3',  # CDPKIN-RXN
        'NDPK4',  # DTDPKIN-RXN
        'NDPK5',  # DGDPKIN-RXN
        'NDPK6',  # DUDPKIN-RXN
        'NDPK7',  # DCDPKIN-RXN
        'NDPK8',  # DADPKIN-RXN
        'NDPK9',  # RXN-14120
    ]

    # SECTION "missing reversibility info"
    missing_reversibility_info = ['LPHERA']

    return reversible_in_both + reversible_in_kegg_only + missing_reversibility_info
47
+
48
+
22
49
 
23
50
  def get_manual_sinks():
24
51
 
@@ -17,6 +17,7 @@ from ..commons import show_contributions
17
17
  from ..commons import adjust_biomass_precursors
18
18
  from ..commons import count_undrawn_rids
19
19
  from ..commons import format_expansion
20
+ from ..commons import download_keggorg
20
21
 
21
22
  from .introduce import introduce_metabolites
22
23
  from .introduce import introduce_reactions
@@ -34,6 +35,8 @@ from ..runsims.biosynth import biosynthesis_on_media
34
35
 
35
36
  from ..mkmodel.polishing import remove_disconnected
36
37
 
38
+ from .cycles import verify_egc_all
39
+
37
40
 
38
41
 
39
42
 
@@ -72,7 +75,14 @@ def main(args, logger):
72
75
 
73
76
 
74
77
  # format the --eggnog param
75
- args.eggnog = format_expansion(logger, args.eggnog)
78
+ args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
79
+
80
+ # get the kegg organism if requested
81
+ if args.keggorg != '-':
82
+ response = download_keggorg(logger, args.keggorg, args.outdir)
83
+ if response == 1: return 1
84
+
85
+
76
86
 
77
87
 
78
88
  # check and extract the required 'gsrap.maps' file
@@ -153,7 +163,7 @@ def main(args, logger):
153
163
 
154
164
  ###### CHECKS 1
155
165
  # check universe completness
156
- df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, idcollection_dict, summary_dict)
166
+ df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, args.keggorg, idcollection_dict, summary_dict, args.outdir)
157
167
  if type(df_C)==int: return 1
158
168
 
159
169
 
@@ -165,6 +175,9 @@ def main(args, logger):
165
175
 
166
176
 
167
177
  ###### CHECKS 2
178
+ # check erroneous EGCs
179
+ verify_egc_all(logger, universe, args.outdir)
180
+
168
181
  # check growth on minmal media
169
182
  df_G = grow_on_media(logger, universe, dbexp, args.media, '-', True)
170
183
  if type(df_G)==int: return 1
@@ -4,6 +4,7 @@ import cobra
4
4
 
5
5
  from .manual import get_deprecated_kos
6
6
  from .manual import get_rids_with_mancheck_gpr
7
+ from .manual import get_rids_with_mancheck_balancing
7
8
 
8
9
 
9
10
 
@@ -138,6 +139,14 @@ def add_reaction(logger, model, rid, row, kr_ids, kegg_reaction_to_others, addty
138
139
  return 1
139
140
 
140
141
 
142
+ # check if reversible and using ATP
143
+ if r.lower_bound < 0 and r.upper_bound > 0:
144
+ for m in r.metabolites:
145
+ if m.id.rsplit('_', 1)[0] == 'atp':
146
+ if rid not in get_rids_with_mancheck_balancing():
147
+ logger.warning(f"Reaction '{rid}' involves ATP and is reversible: are you sure?")
148
+
149
+
141
150
  return 0
142
151
 
143
152
 
@@ -1,4 +1,6 @@
1
1
  from pathlib import Path
2
+ import pickle
3
+ import os
2
4
 
3
5
 
4
6
  import pandas as pnd
@@ -35,14 +37,39 @@ def parse_eggnog(model, eggnog, idcollection_dict):
35
37
  return krs_org
36
38
 
37
39
 
40
+
41
def parse_keggorg(keggorg, outdir, idcollection_dict):
    """Return the KEGG reaction ids linked to the KOs of a KEGG organism.

    Parameters
    ----------
    keggorg : str
        KEGG organism code. The table is read from
        ``<outdir>/<keggorg>.keggorg``.
    outdir : str
        Directory holding the pickled table.
    idcollection_dict : dict
        Must provide 'kr_to_kos', mapping each reaction id to an iterable of
        KO codes.

    Returns
    -------
    set
        Reaction ids having at least one KO present in the organism.
    """

    # NOTE(review): pickle executes arbitrary code on load; this assumes the
    # file was produced by this tool and is never user-supplied -- confirm.
    keggorg_path = os.path.join(outdir, f'{keggorg}.keggorg')
    with open(keggorg_path, 'rb') as handle:
        df_keggorg = pickle.load(handle)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)

    # PART 1. get KO codes available (genes without a KO hold a missing value)
    kos_org = set(df_keggorg['ko'].dropna())

    # PART 2. get reactions in the organism (even if the GPR is not complete):
    # a single matching KO is enough to count the reaction as present.
    kr_to_kos = idcollection_dict['kr_to_kos']
    return {
        kr
        for kr, kos in kr_to_kos.items()
        if any(ko in kos_org for ko in kos)
    }
60
+
61
+
38
62
 
39
- def check_completeness(logger, model, progress, module, focus, eggnog, idcollection_dict, summary_dict):
63
+ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
40
64
  # check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
41
65
 
42
66
 
43
67
  # get the reference set of kr codes (all kegg or organism specific):
44
68
  kr_uni = set()
45
- if eggnog != '-':
69
+ if keggorg != '-': # keggorg has precedence
70
+ kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
71
+ kr_uni_label = f"organism code '{keggorg}'"
72
+ elif eggnog != '-':
46
73
  for eggfile in eggnog:
47
74
  eggset = parse_eggnog(model, eggfile, idcollection_dict)
48
75
  kr_uni = kr_uni.union(eggset)
@@ -60,7 +87,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, idcollect
60
87
  kr_ids_modeled.add(kr_id)
61
88
  kr_uni_missing = kr_uni - kr_ids_modeled
62
89
  kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
63
- logger.info(f"Coverage for '{kr_uni_label}': {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
90
+ logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
64
91
 
65
92
 
66
93
  # define the map?????, containing krs not included in maps
@@ -177,15 +204,15 @@ def check_completeness(logger, model, progress, module, focus, eggnog, idcollect
177
204
  for eggfile in eggnog:
178
205
  strain = Path(eggfile).stem
179
206
  eggset = parse_eggnog(model, eggfile, idcollection_dict)
180
- col = df_coverage.index.to_series().isin(eggset).astype(int)
207
+ col = df_coverage.index.to_series().isin(eggset).astype(int) # integer: 0 or 1
181
208
  df_strains.append(col.rename(strain))
182
209
  df_strains = pnd.concat(df_strains, axis=1)
183
210
  # sort rows: upper rows are present in more strains
184
- df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index]
211
+ #df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index] # commented: now in charge of figures.py
185
212
  df_coverage = df_coverage.loc[df_strains.index]
186
213
  df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
187
214
  # split in 2: modeled above, non-modeled below:
188
- df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]])
215
+ #df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]]) # commented: now in charge of figures.py
189
216
  else: # not interesting in a super-long table without strains in column
190
217
  df_coverage = None
191
218
 
@@ -0,0 +1,128 @@
1
+ import warnings
2
+ import os
3
+ import logging
4
+
5
+
6
+ import cobra
7
+ import gempipe
8
+
9
+
10
+ from ..commons import fba_no_warnings
11
+ from ..commons import get_optthr
12
+
13
+
14
+
15
# Dissipation reaction used to probe each supported metabolite.
_DISSIPATION_REACTIONS = {
    'atp': 'atp_c + h2o_c --> adp_c + pi_c + h_c',
    'ctp': 'ctp_c + h2o_c --> cdp_c + pi_c + h_c',
    'gtp': 'gtp_c + h2o_c --> gdp_c + pi_c + h_c',
    'utp': 'utp_c + h2o_c --> udp_c + pi_c + h_c',
    'itp': 'itp_c + h2o_c --> idp_c + pi_c + h_c',
    'nadh': 'nadh_c --> nad_c + h_c',
    'nadph': 'nadph_c --> nadp_c + h_c',
    'fadh2': 'fadh2_c --> fad_c + 2.0 h_c',
    'accoa': 'accoa_c + h2o_c --> ac_c + coa_c + h_c',
    'glu__L': 'glu__L_c + h2o_c --> akg_c + nh4_c + 2.0 h_c',
    'q8h2': 'q8h2_c --> q8_c + 2.0 h_c',
}


def verify_egc(logger, model, mid, outdir):
    """Check whether ``model`` can dissipate ``mid`` with all boundaries closed.

    A temporary dissipation reaction for ``mid`` is added and maximized after
    closing the boundaries. When the optimum reaches the threshold returned by
    ``get_optthr()``, the reactions carrying flux are saved as
    ``<outdir>/EGC_<mid>.json`` and a warning listing them is logged.

    Parameters
    ----------
    logger
        Logger receiving the warning when a cycle is found.
    model
        Model to test; edits are made inside ``with model:``.
    mid : str
        Metabolite to probe; must be a key of ``_DISSIPATION_REACTIONS``.
    outdir : str
        Directory where the JSON of the cycle is written.

    Returns
    -------
    bool
        True when an erroneous energy-generating cycle was found.

    Raises
    ------
    ValueError
        If ``mid`` has no dissipation reaction defined.
    """

    # Validate before touching the model, so an unsupported id fails with a
    # clear message instead of an unbound-variable error half-way through.
    try:
        dissip_string = _DISSIPATION_REACTIONS[mid]
    except KeyError:
        supported = ', '.join(sorted(_DISSIPATION_REACTIONS))
        raise ValueError(
            f"No dissipation reaction defined for '{mid}' (supported: {supported})."
        ) from None

    dissip_id = f'__dissip__{mid}'
    optthr = get_optthr()

    # changes are not permanent:
    found_egc = False
    with model:

        # close (0; 0) all the exchange reactions:
        gempipe.close_boundaries(model)

        # create and define the dissipation reaction:
        model.add_reactions([cobra.Reaction(dissip_id)])
        dissip = model.reactions.get_by_id(dissip_id)
        dissip.build_reaction_from_string(dissip_string)

        # set the objective and optimize:
        model.objective = dissip_id
        res, _, _ = fba_no_warnings(model)

        # apply the threshold:
        obj_value = res.objective_value
        if res.status == 'optimal' and obj_value >= optthr:
            found_egc = True

            # get suspect !=0 fluxes (get_optthr() tries to take into account
            # the approximation in glpk and cplex solvers)
            fluxes = res.fluxes
            fluxes_interesting = fluxes[(fluxes > optthr) | (fluxes < -optthr)]

            # create a model for escher, remove Rs not belonging to the cycle
            model_copy = model.copy()
            to_delete = {r.id for r in model_copy.reactions} - set(fluxes_interesting.index)

            # trick to avoid the WARNING "cobra/core/group.py:147: UserWarning:
            # need to pass in a list", triggered when trying to remove
            # reactions that are included in groups.
            cobra_logger = logging.getLogger("cobra.util.solver")
            old_level = cobra_logger.level
            with warnings.catch_warnings():  # temporarily suppress warnings
                warnings.simplefilter("ignore")
                cobra_logger.setLevel(logging.ERROR)
                try:
                    model_copy.remove_reactions(to_delete)  # should work also with IDs
                finally:
                    # restore original behaviour even if the removal fails
                    cobra_logger.setLevel(old_level)

            # save JSON to direct import in Escher:
            outfile = os.path.join(outdir, f'EGC_{mid}.json')
            cobra.io.save_json_model(model_copy, outfile)

            # log some messages, marking reversible reactions composing the cycle:
            rid_labels = []
            for rid in fluxes_interesting.index:
                r = model.reactions.get_by_id(rid)
                rid_label = "'" + rid + "'"
                if r.lower_bound < 0 and r.upper_bound > 0:
                    rid_label = rid_label + '(<=>)'
                rid_labels.append(rid_label)
            logger.warning(f"Found erroneous EGC (N={len(model_copy.reactions)}) for '{mid}' (f={obj_value}): [{', '.join(rid_labels)}]. EGC saved to '{outfile}' to be inspected with Escher-FBA.")

    return found_egc
116
+
117
+
118
+
119
def verify_egc_all(logger, model, outdir='./', mids_to_check=('atp', 'ctp', 'gtp', 'utp', 'itp', 'nadh', 'nadph', 'fadh2', 'accoa', 'glu__L', 'q8h2')):
    """Run ``verify_egc`` on every metabolite in ``mids_to_check``.

    Parameters
    ----------
    logger
        Logger passed to ``verify_egc`` and used for the summary message.
    model
        Model to test.
    outdir : str
        Directory passed to ``verify_egc``.
    mids_to_check : iterable of str
        Metabolites to probe. The default is a tuple so the default value
        cannot be mutated between calls.

    Returns
    -------
    None
        An info message is logged when no check reports a cycle.
    """

    # Every metabolite is checked (no short-circuit): each call does its own
    # logging, so all of them must run.
    all_results = [verify_egc(logger, model, mid, outdir) for mid in mids_to_check]
    if not any(all_results):
        logger.info("Found 0 erroneous energy-generating cycles (EGCs).")
127
+
128
+