gsrap 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. gsrap/.ipynb_checkpoints/__init__-checkpoint.py +5 -1
  2. gsrap/__init__.py +5 -1
  3. gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
  4. gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
  5. gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
  6. gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +94 -37
  7. gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +119 -0
  8. gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
  9. gsrap/commons/__init__.py +1 -0
  10. gsrap/commons/downloads.py +1 -1
  11. gsrap/commons/escherutils.py +1 -1
  12. gsrap/commons/excelhub.py +94 -37
  13. gsrap/commons/figures.py +119 -0
  14. gsrap/commons/keggutils.py +145 -0
  15. gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +64 -20
  16. gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
  17. gsrap/mkmodel/mkmodel.py +64 -20
  18. gsrap/mkmodel/pruner.py +72 -7
  19. gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +124 -64
  20. gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +8 -0
  21. gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +12 -5
  22. gsrap/parsedb/completeness.py +124 -64
  23. gsrap/parsedb/introduce.py +8 -0
  24. gsrap/parsedb/parsedb.py +12 -5
  25. gsrap/runsims/.ipynb_checkpoints/simplegrowth-checkpoint.py +2 -2
  26. gsrap/runsims/simplegrowth.py +2 -2
  27. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/METADATA +3 -1
  28. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/RECORD +31 -27
  29. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/LICENSE.txt +0 -0
  30. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/WHEEL +0 -0
  31. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  import os
2
2
  import warnings
3
3
  import logging
4
+ import pickle
4
5
 
5
6
 
6
7
  import pandas as pnd
@@ -43,22 +44,57 @@ def load_input_eggnog(logger, eggnog):
43
44
 
44
45
 
45
46
  # load eggnog annotations
46
- eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
47
- eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
48
- eggnog = eggnog.set_index('query', drop=True, verify_integrity=True)
47
+ df_eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
48
+ df_eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
49
+ df_eggnog = df_eggnog.set_index('query', drop=True, verify_integrity=True)
49
50
 
50
51
 
51
- return eggnog
52
+ return df_eggnog
52
53
 
53
54
 
54
55
 
55
- def parse_eggnog(eggnog):
56
+ def load_keggorg_like_eggnog(logger, keggorg, outdir):
57
+
58
+
59
+ # load raw data, downloaded from KEGG:
60
+ df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
61
+ df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
62
+
63
+
64
+ # create an eggnog-like dataframe:
65
+ df_eggnog_like = [] # list of dicts, future df
66
+ for gid in df_keggorg.index:
67
+ row_dict = {}
68
+
69
+ row_dict['query'] = gid
70
+ row_dict['PFAMs'] = ','.join(df_keggorg.loc[gid, 'Pfam']) if type(df_keggorg.loc[gid, 'Pfam'])==list else '-'
71
+ row_dict['KEGG_ko'] = df_keggorg.loc[gid, 'ko'] if type(df_keggorg.loc[gid, 'ko'])==str else '-'
72
+
73
+ df_eggnog_like.append(row_dict)
74
+ df_eggnog_like = pnd.DataFrame.from_records(df_eggnog_like)
75
+
76
+
77
+ # append missing columns and sort
78
+ eggnog_columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
79
+ for c in eggnog_columns:
80
+ if c not in df_eggnog_like.columns:
81
+ df_eggnog_like[c] = '-'
82
+ df_eggnog_like = df_eggnog_like[eggnog_columns]
83
+
84
+
85
+ # set the index like in eggnog
86
+ df_eggnog_like = df_eggnog_like.set_index('query', drop=True, verify_integrity=True)
87
+ return df_eggnog_like
88
+
89
+
90
+
91
+ def parse_eggnog(df_eggnog):
56
92
 
57
93
 
58
94
  # PART 1. get KO codes available
59
95
  gid_to_kos = {}
60
96
  ko_to_gids = {}
61
- for gid, kos in eggnog['KEGG_ko'].items():
97
+ for gid, kos in df_eggnog['KEGG_ko'].items():
62
98
  if kos == '-':
63
99
  continue
64
100
 
@@ -229,8 +265,37 @@ def restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos):
229
265
  # collect names
230
266
  names.append(uni_g.name)
231
267
  g.name = '; '.join(names)
268
+
269
+
270
+
271
+ def append_keggorg_gene_annots(logger, model, keggorg, outdir):
272
+
232
273
 
233
-
274
+ # load raw data, downloaded from KEGG:
275
+ logger.info("Adding gene annotations retrieved from KEGG...")
276
+ df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
277
+ df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
278
+
279
+
280
+ # KEGG can provide some useful (ie, used in Memote) gene annotations:
281
+ for g in model.genes:
282
+ if g.id in df_keggorg.index:
283
+
284
+ g.annotation['kegg.genes'] = [keggorg + ':' + g.id]
285
+
286
+ if 'NCBI-GeneID' in df_keggorg.columns:
287
+ g.annotation['ncbigene'] = df_keggorg.loc[g.id, 'NCBI-GeneID'] if type(df_keggorg.loc[g.id, 'NCBI-GeneID'])==list else []
288
+ if 'NCBI-ProteinID' in df_keggorg.columns:
289
+ g.annotation['ncbiprotein'] = df_keggorg.loc[g.id, 'NCBI-ProteinID'] if type(df_keggorg.loc[g.id, 'NCBI-ProteinID'])==list else []
290
+ if 'ASAP' in df_keggorg.columns:
291
+ g.annotation['asap'] = df_keggorg.loc[g.id, 'ASAP'] if type(df_keggorg.loc[g.id, 'ASAP'])==list else []
292
+ if 'UniProt' in df_keggorg.columns:
293
+ g.annotation['uniprot'] = df_keggorg.loc[g.id, 'UniProt'] if type(df_keggorg.loc[g.id, 'UniProt'])==list else []
294
+
295
+
296
+
297
+
298
+
234
299
 
235
300
 
236
301
 
gsrap/mkmodel/mkmodel.py CHANGED
@@ -12,10 +12,12 @@ import gempipe
12
12
 
13
13
  from .pruner import load_input_universe
14
14
  from .pruner import load_input_eggnog
15
+ from .pruner import load_keggorg_like_eggnog
15
16
  from .pruner import parse_eggnog
16
17
  from .pruner import subtract_kos
17
18
  from .pruner import translate_remaining_kos
18
19
  from .pruner import restore_gene_annotations
20
+ from .pruner import append_keggorg_gene_annots
19
21
 
20
22
  from .gapfillutils import include_forced
21
23
 
@@ -38,26 +40,37 @@ from ..commons import log_metrics
38
40
  from ..commons import log_unbalances
39
41
  from ..commons import format_expansion
40
42
  from ..commons import comparative_table
43
+ from ..commons import download_keggorg
41
44
 
42
45
  from ..runsims.biosynth import biosynthesis_on_media
43
46
 
44
47
 
45
48
 
46
49
  def create_model_incore(params):
47
- universe, eggpath, dbexp, args, multistrain = params
50
+ annotation_source, universe, eggpath, dbexp, args, multistrain = params
51
+
52
+ # get the logger:
48
53
  logger = get_logger('gsrap_queued', args.verbose) # loggers can't be pickled!
54
+
55
+
56
+ # only errors will be recorded if multistrain mode
49
57
  if multistrain:
50
- # only errors will be recorded
51
58
  logger.setLevel(logging.ERROR)
52
59
 
53
60
 
54
61
  # load the annotation
55
- eggnog = load_input_eggnog(logger, eggpath)
62
+ if annotation_source == 'keggorg':
63
+ eggnog_style_table = load_keggorg_like_eggnog(logger, args.keggorg, args.outdir)
64
+ elif annotation_source == 'eggnog':
65
+ eggnog_style_table = load_input_eggnog(logger, eggpath)
56
66
 
57
67
 
58
- # create a copy of the universe
68
+ # create a copy of the universe and define the model ID
59
69
  model = universe.copy()
60
- model.id = Path(eggpath).stem
70
+ if annotation_source == 'keggorg':
71
+ model.id = args.keggorg
72
+ elif annotation_source == 'eggnog':
73
+ model.id = Path(eggpath).stem
61
74
 
62
75
 
63
76
  ###### POLISHING 1
@@ -67,9 +80,10 @@ def create_model_incore(params):
67
80
 
68
81
 
69
82
  ###### PRUNING
70
- logger.info("Reading provided eggnog-mapper annotation...")
83
+ if annotation_source == 'keggorg': logger.info(f"Reading annotation for organism code '{args.keggorg}'...")
84
+ elif annotation_source == 'eggnog': logger.info("Reading provided eggnog-mapper annotation...")
71
85
  # get important dictionaries: 'eggnog_ko_to_gids' and 'eggonog_gid_to_kos'
72
- eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog)
86
+ eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog_style_table)
73
87
 
74
88
  # prune reactions
75
89
  subtract_kos(logger, model, eggnog_ko_to_gids)
@@ -77,6 +91,10 @@ def create_model_incore(params):
77
91
  # translate KOs to the actual genes
78
92
  translate_remaining_kos(logger, model, eggnog_ko_to_gids)
79
93
  restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)
94
+
95
+ # insert gene annotation if starting from kegg organisms:
96
+ if annotation_source == 'keggorg':
97
+ append_keggorg_gene_annots(logger, model, args.keggorg, args.outdir)
80
98
 
81
99
 
82
100
 
@@ -141,7 +159,7 @@ def create_model_incore(params):
141
159
  cobra.io.write_sbml_model(model, f'{args.outdir}/{model.id}.xml') # SBML # groups are saved only to SBML
142
160
  logger.info(f"'{args.outdir}/{model.id}.xml' created!")
143
161
  force_id_on_sbml(f'{args.outdir}/{model.id}.xml', model.id) # force introduction of the 'id=""' field
144
- sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', None, df_B, df_P, df_S)
162
+ sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', args.nofigs, None, df_B, df_P, df_S)
145
163
  logger.info(f"'{args.outdir}/{model.id}.mkmodel.xlsx' created!")
146
164
 
147
165
 
@@ -171,13 +189,28 @@ def main(args, logger):
171
189
 
172
190
 
173
191
  # format the --eggnog param
174
- args.eggnog = format_expansion(logger, args.eggnog)
175
- if args.eggnog == '-':
176
- logger.error("No valid eggnog-mapper annotations provided.")
192
+ args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
193
+
194
+ # get the kegg organism if requested
195
+ if args.keggorg != '-':
196
+ response = download_keggorg(logger, args.keggorg, args.outdir)
197
+ if response == 1: return 1
198
+
199
+
200
+
201
+ # determine the source of functional annotation:
202
+ annotation_source = None
203
+ if args.keggorg != '-': # keggorg has precedence
204
+ annotation_source = 'keggorg'
205
+ elif args.eggnog != '-':
206
+ annotation_source = 'eggnog'
207
+ if args.cores > len(args.eggnog):
208
+ logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
209
+ args.cores = len(args.eggnog)
210
+ else:
211
+ logger.error("No valid functional annotations provided: please use '--keggorg' or '--eggnog'.")
177
212
  return 1
178
- if args.cores > len(args.eggnog):
179
- logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
180
- args.cores = len(args.eggnog)
213
+
181
214
 
182
215
 
183
216
  # check compatibility of input parameters:
@@ -201,17 +234,26 @@ def main(args, logger):
201
234
 
202
235
 
203
236
  # disable logging (swith to txt) if strains are more than 1:
204
- multistrain = len(args.eggnog) > 1
205
- if multistrain:
206
- logger.info(f"Number of provided strains is >1: logging will be disabled.")
207
- logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
208
- # actualy this is done inside child processess!
237
+ if annotation_source == 'keggorg':
238
+ multistrain = False
239
+ elif annotation_source == 'eggnog':
240
+ multistrain = len(args.eggnog) > 1
241
+ if multistrain:
242
+ logger.info(f"Number of provided strains is >1: logging will be disabled.")
243
+ logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
244
+ # actually this is done inside child processes!
245
+
209
246
 
210
247
  # create strain-specific GSMMs using multi-core
211
248
  error_raised = False
212
249
  sheets_dicts = []
213
250
  executor = confu.ProcessPoolExecutor(max_workers=args.cores)
214
- futures = [executor.submit(create_model_incore, (universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
251
+
252
+ if annotation_source == 'keggorg':
253
+ futures = [executor.submit(create_model_incore, (annotation_source, universe, None, dbexp, args, multistrain))]
254
+ elif annotation_source == 'eggnog':
255
+ futures = [executor.submit(create_model_incore, (annotation_source, universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
256
+
215
257
  for f in confu.as_completed(futures):
216
258
  sheets_dict = f.result()
217
259
 
@@ -226,12 +268,14 @@ def main(args, logger):
226
268
  sheets_dicts.append(sheets_dict)
227
269
  print(f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!", end='\r', file=sys.stderr)
228
270
 
271
+
229
272
  # hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
230
273
  if multistrain and sheets_dicts != []:
231
274
  last_trace = f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!"
232
275
  whitewash = ''.join([' ' for i in range(len(last_trace))])
233
276
  print(whitewash, end='\r', file=sys.stderr)
234
277
 
278
+
235
279
  # multiproces part terminated: safely shut down the executor
236
280
  executor.shutdown(wait=True)
237
281
 
gsrap/mkmodel/pruner.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import os
2
2
  import warnings
3
3
  import logging
4
+ import pickle
4
5
 
5
6
 
6
7
  import pandas as pnd
@@ -43,22 +44,57 @@ def load_input_eggnog(logger, eggnog):
43
44
 
44
45
 
45
46
  # load eggnog annotations
46
- eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
47
- eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
48
- eggnog = eggnog.set_index('query', drop=True, verify_integrity=True)
47
+ df_eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
48
+ df_eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
49
+ df_eggnog = df_eggnog.set_index('query', drop=True, verify_integrity=True)
49
50
 
50
51
 
51
- return eggnog
52
+ return df_eggnog
52
53
 
53
54
 
54
55
 
55
- def parse_eggnog(eggnog):
56
+ def load_keggorg_like_eggnog(logger, keggorg, outdir):
57
+
58
+
59
+ # load raw data, downloaded from KEGG:
60
+ df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
61
+ df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
62
+
63
+
64
+ # create an eggnog-like dataframe:
65
+ df_eggnog_like = [] # list of dicts, future df
66
+ for gid in df_keggorg.index:
67
+ row_dict = {}
68
+
69
+ row_dict['query'] = gid
70
+ row_dict['PFAMs'] = ','.join(df_keggorg.loc[gid, 'Pfam']) if type(df_keggorg.loc[gid, 'Pfam'])==list else '-'
71
+ row_dict['KEGG_ko'] = df_keggorg.loc[gid, 'ko'] if type(df_keggorg.loc[gid, 'ko'])==str else '-'
72
+
73
+ df_eggnog_like.append(row_dict)
74
+ df_eggnog_like = pnd.DataFrame.from_records(df_eggnog_like)
75
+
76
+
77
+ # append missing columns and sort
78
+ eggnog_columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
79
+ for c in eggnog_columns:
80
+ if c not in df_eggnog_like.columns:
81
+ df_eggnog_like[c] = '-'
82
+ df_eggnog_like = df_eggnog_like[eggnog_columns]
83
+
84
+
85
+ # set the index like in eggnog
86
+ df_eggnog_like = df_eggnog_like.set_index('query', drop=True, verify_integrity=True)
87
+ return df_eggnog_like
88
+
89
+
90
+
91
+ def parse_eggnog(df_eggnog):
56
92
 
57
93
 
58
94
  # PART 1. get KO codes available
59
95
  gid_to_kos = {}
60
96
  ko_to_gids = {}
61
- for gid, kos in eggnog['KEGG_ko'].items():
97
+ for gid, kos in df_eggnog['KEGG_ko'].items():
62
98
  if kos == '-':
63
99
  continue
64
100
 
@@ -229,8 +265,37 @@ def restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos):
229
265
  # collect names
230
266
  names.append(uni_g.name)
231
267
  g.name = '; '.join(names)
268
+
269
+
270
+
271
+ def append_keggorg_gene_annots(logger, model, keggorg, outdir):
272
+
232
273
 
233
-
274
+ # load raw data, downloaded from KEGG:
275
+ logger.info("Adding gene annotations retrieved from KEGG...")
276
+ df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
277
+ df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
278
+
279
+
280
+ # KEGG can provide some useful (ie, used in Memote) gene annotations:
281
+ for g in model.genes:
282
+ if g.id in df_keggorg.index:
283
+
284
+ g.annotation['kegg.genes'] = [keggorg + ':' + g.id]
285
+
286
+ if 'NCBI-GeneID' in df_keggorg.columns:
287
+ g.annotation['ncbigene'] = df_keggorg.loc[g.id, 'NCBI-GeneID'] if type(df_keggorg.loc[g.id, 'NCBI-GeneID'])==list else []
288
+ if 'NCBI-ProteinID' in df_keggorg.columns:
289
+ g.annotation['ncbiprotein'] = df_keggorg.loc[g.id, 'NCBI-ProteinID'] if type(df_keggorg.loc[g.id, 'NCBI-ProteinID'])==list else []
290
+ if 'ASAP' in df_keggorg.columns:
291
+ g.annotation['asap'] = df_keggorg.loc[g.id, 'ASAP'] if type(df_keggorg.loc[g.id, 'ASAP'])==list else []
292
+ if 'UniProt' in df_keggorg.columns:
293
+ g.annotation['uniprot'] = df_keggorg.loc[g.id, 'UniProt'] if type(df_keggorg.loc[g.id, 'UniProt'])==list else []
294
+
295
+
296
+
297
+
298
+
234
299
 
235
300
 
236
301
 
@@ -1,3 +1,8 @@
1
+ from pathlib import Path
2
+ import pickle
3
+ import os
4
+
5
+
1
6
  import pandas as pnd
2
7
 
3
8
 
@@ -32,14 +37,39 @@ def parse_eggnog(model, eggnog, idcollection_dict):
32
37
  return krs_org
33
38
 
34
39
 
40
+
41
+ def parse_keggorg(keggorg, outdir, idcollection_dict):
42
+
43
+ df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
44
+ df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
45
+
46
+
47
+ # PART 1. get KO codes available
48
+ kos_org = set([i for i in df_keggorg['ko'] if pnd.isna(i)==False])
49
+
50
+
51
+ # PART 2. get reactions in the organism (even the GPR is not complete)
52
+ kr_to_kos = idcollection_dict['kr_to_kos']
53
+ krs_org = set()
54
+ for kr, kos in kr_to_kos.items():
55
+ if any([ko in kos_org for ko in kos]):
56
+ krs_org.add(kr)
35
57
 
36
- def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, idcollection_dict, summary_dict):
58
+
59
+ return krs_org
60
+
61
+
62
+
63
+ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
37
64
  # check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
38
65
 
39
66
 
40
67
  # get the reference set of kr codes (all kegg or organism specific):
41
68
  kr_uni = set()
42
- if eggnog != '-':
69
+ if keggorg != '-': # keggorg has precedence
70
+ kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
71
+ kr_uni_label = f"organism code '{keggorg}'"
72
+ elif eggnog != '-':
43
73
  for eggfile in eggnog:
44
74
  eggset = parse_eggnog(model, eggfile, idcollection_dict)
45
75
  kr_uni = kr_uni.union(eggset)
@@ -55,10 +85,22 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
55
85
  if 'kegg.reaction' in r.annotation.keys():
56
86
  for kr_id in r.annotation['kegg.reaction']:
57
87
  kr_ids_modeled.add(kr_id)
58
- kr_uni_missing = len(kr_uni - kr_ids_modeled.intersection(kr_uni))
88
+ kr_uni_missing = kr_uni - kr_ids_modeled
59
89
  kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
60
- logger.info(f"Coverage for '{kr_uni_label}': {round(kr_uni_coverage, 0)}% ({kr_uni_missing} missing).")
90
+ logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
91
+
61
92
 
93
+ # define the map?????, containing krs not included in maps
94
+ krs_in_maps = set()
95
+ for i in summary_dict: krs_in_maps = krs_in_maps.union(i['kr_ids'])
96
+ krs_not_in_maps = idcollection_dict['kr'] - krs_in_maps
97
+ summary_dict.append({
98
+ 'map_id': 'map?????',
99
+ 'map_name': 'Not included in maps',
100
+ 'kr_ids': krs_not_in_maps,
101
+ 'cnt_r': len(krs_not_in_maps),
102
+ 'mds': []
103
+ })
62
104
 
63
105
 
64
106
  # get all the map / md codes:
@@ -112,52 +154,77 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
112
154
  missing_logger = (map_id, missing)
113
155
 
114
156
 
157
+ # put the map in the right bucket:
115
158
  if missing == set() and map_krs != set():
116
159
  maps_finished.add(map_id)
117
-
118
160
  elif map_krs == set():
119
161
  maps_noreac.add(map_id)
120
-
121
162
  elif missing == map_krs:
122
163
  maps_missing.add(map_id)
123
-
124
- if zeroes:
125
- list_coverage.append({
126
- 'map_id': map_id,
127
- 'map_name_short': map_name_short,
128
- 'perc_completeness': 0,
129
- 'perc_completeness_str': ' 0',
130
- 'present': present,
131
- 'missing': missing,
132
- 'md_ids': [j['md_id'] for j in i['mds']],
133
- })
134
-
135
164
  elif len(missing) < len(map_krs):
136
165
  maps_partial.add(map_id)
137
166
 
138
- # get '%' of completeness:
139
- perc_completeness = len(present)/len(map_krs)*100
140
- perc_completeness_str = str(round(perc_completeness)) # version to be printed
141
- if len(perc_completeness_str)==1:
142
- perc_completeness_str = ' ' + perc_completeness_str
143
-
144
- list_coverage.append({
145
- 'map_id': map_id,
146
- 'map_name_short': map_name_short,
147
- 'perc_completeness': perc_completeness,
148
- 'perc_completeness_str': perc_completeness_str,
149
- 'present': present,
150
- 'missing': missing,
151
- 'md_ids': [j['md_id'] for j in i['mds']],
152
- })
153
167
 
168
+ # get '%' of completeness:
169
+ if len(map_krs) != 0: perc_completeness = len(present)/len(map_krs)*100
170
+ else: perc_completeness = 100 # for maps_noreac
171
+ perc_completeness_str = str(round(perc_completeness)) # version to be printed
172
+ if len(perc_completeness_str)==1:
173
+ perc_completeness_str = ' ' + perc_completeness_str
174
+
154
175
 
155
- # order list by '%' of completness and print:
176
+ # append map to list:
177
+ list_coverage.append({
178
+ 'map_id': map_id,
179
+ 'map_name_short': map_name_short,
180
+ 'perc_completeness': perc_completeness,
181
+ 'perc_completeness_str': perc_completeness_str,
182
+ 'present': present,
183
+ 'missing': missing,
184
+ 'md_ids': [j['md_id'] for j in i['mds']],
185
+ })
186
+
187
+
188
+
189
+ # create coverage dataframe
190
+ if eggnog != '-' and len(eggnog) >= 2:
191
+ df_coverage = {}
192
+ for i in list_coverage:
193
+ for kr in i['present'].union(i['missing']):
194
+ if kr not in df_coverage.keys():
195
+ df_coverage[kr] = {'map_ids': set()}
196
+ df_coverage[kr]['map_ids'].add(i['map_id'])
197
+ df_coverage = pnd.DataFrame.from_records(df_coverage).T
198
+ df_coverage['modeled'] = False
199
+ for kr, row in df_coverage.iterrows():
200
+ if kr in kr_ids_modeled:
201
+ df_coverage.loc[kr, 'modeled'] = True
202
+ # build strain columns all at once
203
+ df_strains = [] # list of small DataFrames
204
+ for eggfile in eggnog:
205
+ strain = Path(eggfile).stem
206
+ eggset = parse_eggnog(model, eggfile, idcollection_dict)
207
+ col = df_coverage.index.to_series().isin(eggset).astype(int) # integer: 0 or 1
208
+ df_strains.append(col.rename(strain))
209
+ df_strains = pnd.concat(df_strains, axis=1)
210
+ # sort rows: upper rows are present in more strains
211
+ #df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index] # commented: now in charge of figures.py
212
+ df_coverage = df_coverage.loc[df_strains.index]
213
+ df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
214
+ # split in 2: modeled above, non-modeled below:
215
+ #df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]]) # commented: now in charge of figures.py
216
+ else: # not interesting in a super-long table without strains in column
217
+ df_coverage = None
218
+
219
+
220
+
221
+ # order list by '%' of completeness and print if needed:
156
222
  list_coverage = sorted(list_coverage, key=lambda x: x['perc_completeness'], reverse=True)
157
223
  for i in list_coverage:
158
224
  if progress:
159
225
  if focus=='-' or focus in i['md_ids'] or focus==i['map_id']:
160
- logger.info(f"{i['map_id']}: {i['map_name_short']} {i['perc_completeness_str']}% completed, {len(i['present'])} added, {len(i['missing'])} missing.")
226
+ if i['map_id'] in maps_missing or i['map_id'] in maps_partial:
227
+ logger.info(f"{i['map_id']}: {i['map_name_short']} {i['perc_completeness_str']}% completed, {len(i['present'])} added, {len(i['missing'])} missing.")
161
228
 
162
229
 
163
230
  # get the correspondent pathway element of the 'summary_dict'
@@ -199,50 +266,43 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
199
266
  missing_logger = (md_id, missing)
200
267
 
201
268
 
269
+ # put the map in the right bucket:
202
270
  if missing == set() and md_krs != set():
203
271
  mds_completed.add(md_id)
204
-
205
272
  elif md_krs == set():
206
273
  mds_noreac.add(md_id)
207
-
208
274
  elif missing == md_krs:
209
275
  mds_missing.add(md_id)
210
-
211
- if zeroes:
212
- list_coverage_md.append({
213
- 'md_id': md_id,
214
- 'md_name_short': md_name_short,
215
- 'perc_completeness': 0,
216
- 'perc_completeness_str': ' 0',
217
- 'present': present,
218
- 'missing': missing,
219
- })
220
-
221
276
  elif len(missing) < len(md_krs):
222
277
  mds_partial.add(md_id)
223
278
 
224
- # get '%' of completeness:
225
- perc_completeness = len(present)/len(md_krs)*100
226
- perc_completeness_str = str(round(perc_completeness)) # version to be printed
227
- if len(perc_completeness_str)==1:
228
- perc_completeness_str = ' ' + perc_completeness_str
279
+
280
+ # get '%' of completeness:
281
+ if len(md_krs) != 0: perc_completeness = len(present)/len(md_krs)*100
282
+ else: perc_completeness = 100 # for mds_noreac
283
+ perc_completeness_str = str(round(perc_completeness)) # version to be printed
284
+ if len(perc_completeness_str)==1:
285
+ perc_completeness_str = ' ' + perc_completeness_str
229
286
 
230
- list_coverage_md.append({
231
- 'md_id': md_id,
232
- 'md_name_short': md_name_short,
233
- 'perc_completeness': perc_completeness,
234
- 'perc_completeness_str': perc_completeness_str,
235
- 'present': present,
236
- 'missing': missing,
237
- })
287
+
288
+ # append md to list:
289
+ list_coverage_md.append({
290
+ 'md_id': md_id,
291
+ 'md_name_short': md_name_short,
292
+ 'perc_completeness': perc_completeness,
293
+ 'perc_completeness_str': perc_completeness_str,
294
+ 'present': present,
295
+ 'missing': missing,
296
+ })
238
297
 
239
298
 
240
- # order list by '%' of completness and print:
299
+ # order list by '%' of completeness and print if needed:
241
300
  list_coverage_md = sorted(list_coverage_md, key=lambda x: x['perc_completeness'], reverse=True)
242
301
  for z in list_coverage_md:
243
302
  if module:
244
303
  if focus=='-' or focus==z['md_id']:
245
- logger.info(f"{spacer}{z['md_id']}: {z['md_name_short']} {z['perc_completeness_str']}% completed, {len(z['present'])} added, {len(z['missing'])} missing.")
304
+ if z['md_id'] in mds_missing or z['md_id'] in mds_partial:
305
+ logger.info(f"{spacer}{z['md_id']}: {z['md_name_short']} {z['perc_completeness_str']}% completed, {len(z['present'])} added, {len(z['missing'])} missing.")
246
306
 
247
307
 
248
308
  # print summary:
@@ -254,6 +314,6 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
254
314
  logger.info(f"Maps: finished {len(maps_finished)} - partial {len(maps_partial)} - missing {len(maps_missing)} - noreac {len(maps_noreac)}")
255
315
 
256
316
 
257
- return 0
317
+ return df_coverage
258
318
 
259
319
 
@@ -143,6 +143,14 @@ def introduce_metabolites(logger, db, model, idcollection_dict, kegg_compound_to
143
143
  m.annotation[ankey] = list(m.annotation[ankey])
144
144
 
145
145
 
146
+ # replace inchikey with manually-curated
147
+ if m.annotation['inchikey'] != [] and m.annotation['inchikey'] != [row['inchikey']]:
148
+ logger.debug(f"Metabolite '{pure_mid}': manual-curated inchikey ({[row['inchikey']]}) is diferent from the one derived from MNX ({m.annotation['inchikey']}).")
149
+ m.annotation['inchikey'] = [row['inchikey']] # force the manual-curated version
150
+ if m.annotation['inchikey'] == ['XXXXXXXXXXXXXX-XXXXXXXXXX-X']:
151
+ m.annotation['inchikey'] = []
152
+
153
+
146
154
  # add SBO annotation
147
155
  m.annotation['sbo'] = ['SBO:0000247'] # generic metabolite
148
156