gsrap 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsrap/.ipynb_checkpoints/__init__-checkpoint.py +5 -1
- gsrap/__init__.py +5 -1
- gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
- gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +94 -37
- gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +119 -0
- gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
- gsrap/commons/__init__.py +1 -0
- gsrap/commons/downloads.py +1 -1
- gsrap/commons/escherutils.py +1 -1
- gsrap/commons/excelhub.py +94 -37
- gsrap/commons/figures.py +119 -0
- gsrap/commons/keggutils.py +145 -0
- gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +64 -20
- gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
- gsrap/mkmodel/mkmodel.py +64 -20
- gsrap/mkmodel/pruner.py +72 -7
- gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +124 -64
- gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +8 -0
- gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +12 -5
- gsrap/parsedb/completeness.py +124 -64
- gsrap/parsedb/introduce.py +8 -0
- gsrap/parsedb/parsedb.py +12 -5
- gsrap/runsims/.ipynb_checkpoints/simplegrowth-checkpoint.py +2 -2
- gsrap/runsims/simplegrowth.py +2 -2
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/METADATA +3 -1
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/RECORD +31 -27
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/LICENSE.txt +0 -0
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/WHEEL +0 -0
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,7 @@
 import os
 import warnings
 import logging
+import pickle


 import pandas as pnd
@@ -43,22 +44,57 @@ def load_input_eggnog(logger, eggnog):


     # load eggnog annotations
-
-
-
+    df_eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
+    df_eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
+    df_eggnog = df_eggnog.set_index('query', drop=True, verify_integrity=True)


-    return
+    return df_eggnog



-def
+def load_keggorg_like_eggnog(logger, keggorg, outdir):
+
+
+    # load raw data, downloaded form kegg:
+    df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
+    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
+
+
+    # create an eggnog-like dataframe:
+    df_eggnog_like = []  # list of dict future df
+    for gid in df_keggorg.index:
+        row_dict = {}
+
+        row_dict['query'] = gid
+        row_dict['PFAMs'] = ','.join(df_keggorg.loc[gid, 'Pfam']) if type(df_keggorg.loc[gid, 'Pfam'])==list else '-'
+        row_dict['KEGG_ko'] = df_keggorg.loc[gid, 'ko'] if type(df_keggorg.loc[gid, 'ko'])==str else '-'
+
+        df_eggnog_like.append(row_dict)
+    df_eggnog_like = pnd.DataFrame.from_records(df_eggnog_like)
+
+
+    # appen missing coluns and sort
+    eggnog_columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
+    for c in eggnog_columns:
+        if c not in df_eggnog_like.columns:
+            df_eggnog_like[c] = '-'
+    df_eggnog_like = df_eggnog_like[eggnog_columns]
+
+
+    # set the index like in eggnog
+    df_eggnog_like = df_eggnog_like.set_index('query', drop=True, verify_integrity=True)
+    return df_eggnog_like
+
+
+
+def parse_eggnog(df_eggnog):


     # PART 1. get KO codes available
     gid_to_kos = {}
     ko_to_gids = {}
-    for gid, kos in
+    for gid, kos in df_eggnog['KEGG_ko'].items():
         if kos == '-':
             continue

@@ -229,8 +265,37 @@ def restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos):
             # collect names
             names.append(uni_g.name)
         g.name = '; '.join(names)
+
+
+
+def append_keggorg_gene_annots(logger, model, keggorg, outdir):
+

-
+    # load raw data, downloaded form kegg:
+    logger.info("Adding gene annotations retrieved from KEGG...")
+    df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
+    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
+
+
+    # KEGG can provide some useful (ie, used in Memote) gene annotations:
+    for g in model.genes:
+        if g.id in df_keggorg.index:
+
+            g.annotation['kegg.genes'] = [keggorg + ':' + g.id]
+
+            if 'NCBI-GeneID' in df_keggorg.columns:
+                g.annotation['ncbigene'] = df_keggorg.loc[g.id, 'NCBI-GeneID'] if type(df_keggorg.loc[g.id, 'NCBI-GeneID'])==list else []
+            if 'NCBI-ProteinID' in df_keggorg.columns:
+                g.annotation['ncbiprotein'] = df_keggorg.loc[g.id, 'NCBI-ProteinID'] if type(df_keggorg.loc[g.id, 'NCBI-ProteinID'])==list else []
+            if 'ASAP' in df_keggorg.columns:
+                g.annotation['asap'] = df_keggorg.loc[g.id, 'ASAP'] if type(df_keggorg.loc[g.id, 'ASAP'])==list else []
+            if 'UniProt' in df_keggorg.columns:
+                g.annotation['uniprot'] = df_keggorg.loc[g.id, 'UniProt'] if type(df_keggorg.loc[g.id, 'UniProt'])==list else []
+
+
+
+
+


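The hunks above keep everything downstream of annotation loading source-agnostic: the new `load_keggorg_like_eggnog` rebuilds a `query`-indexed dataframe with the usual eggnog-mapper columns (`KEGG_ko`, `PFAMs`, ...) from the pickled KEGG gene table. A minimal standalone sketch of that reshaping pattern, using an invented two-gene table instead of the package's real '<org>.keggorg' pickle:

import pandas as pd

# invented stand-in for the pickled '<org>.keggorg' gene table (not gsrap's real loader)
df_keggorg = pd.DataFrame({
    'gid':  ['b0001', 'b0002'],
    'ko':   ['ko:K00001', None],
    'Pfam': [['PF00001', 'PF00002'], None],
}).set_index('gid')

# a trimmed subset of the eggnog-mapper header, just for the illustration
eggnog_columns = ['query', 'seed_ortholog', 'evalue', 'score', 'KEGG_ko', 'PFAMs']

rows = []
for gid, row in df_keggorg.iterrows():
    rows.append({
        'query':   gid,
        'KEGG_ko': row['ko'] if isinstance(row['ko'], str) else '-',
        'PFAMs':   ','.join(row['Pfam']) if isinstance(row['Pfam'], list) else '-',
    })
df_like = pd.DataFrame.from_records(rows)

# pad the columns we could not fill, enforce the eggnog column order, index by query
for c in eggnog_columns:
    if c not in df_like.columns:
        df_like[c] = '-'
df_like = df_like[eggnog_columns].set_index('query', verify_integrity=True)
print(df_like)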
gsrap/mkmodel/mkmodel.py  CHANGED

@@ -12,10 +12,12 @@ import gempipe

 from .pruner import load_input_universe
 from .pruner import load_input_eggnog
+from .pruner import load_keggorg_like_eggnog
 from .pruner import parse_eggnog
 from .pruner import subtract_kos
 from .pruner import translate_remaining_kos
 from .pruner import restore_gene_annotations
+from .pruner import append_keggorg_gene_annots

 from .gapfillutils import include_forced

@@ -38,26 +40,37 @@ from ..commons import log_metrics
 from ..commons import log_unbalances
 from ..commons import format_expansion
 from ..commons import comparative_table
+from ..commons import download_keggorg

 from ..runsims.biosynth import biosynthesis_on_media



 def create_model_incore(params):
-    universe, eggpath, dbexp, args, multistrain = params
+    annotation_source, universe, eggpath, dbexp, args, multistrain = params
+
+    # get the logger:
     logger = get_logger('gsrap_queued', args.verbose) # loggers can't be pickled!
+
+
+    # only errors will be recorded if multistrain mode
     if multistrain:
-        # only errors will be recorded
         logger.setLevel(logging.ERROR)


     # load the annotation
-
+    if annotation_source == 'keggorg':
+        eggnog_style_table = load_keggorg_like_eggnog(logger, args.keggorg, args.outdir)
+    elif annotation_source == 'eggnog':
+        eggnog_style_table = load_input_eggnog(logger, eggpath)


-    # create a copy of the universe
+    # create a copy of the universe and define the model ID
     model = universe.copy()
-
+    if annotation_source == 'keggorg':
+        model.id = args.keggorg
+    elif annotation_source == 'eggnog':
+        model.id = Path(eggpath).stem


     ###### POLISHING 1
@@ -67,9 +80,10 @@ def create_model_incore(params):


     ###### PRUNING
-    logger.info("Reading
+    if annotation_source == 'keggorg': logger.info(f"Reading annotation for organism code '{args.keggorg}'...")
+    elif annotation_source == 'eggnog': logger.info("Reading provided eggnog-mapper annotation...")
     # get important dictionaries: 'eggnog_ko_to_gids' and 'eggonog_gid_to_kos'
-    eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(
+    eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog_style_table)

     # prune reactions
     subtract_kos(logger, model, eggnog_ko_to_gids)
@@ -77,6 +91,10 @@ def create_model_incore(params):
     # translate KOs to the actual genes
     translate_remaining_kos(logger, model, eggnog_ko_to_gids)
     restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)
+
+    # insert gene annotation if starting from kegg organisms:
+    if annotation_source == 'keggorg':
+        append_keggorg_gene_annots(logger, model, args.keggorg, args.outdir)



@@ -141,7 +159,7 @@ def create_model_incore(params):
     cobra.io.write_sbml_model(model, f'{args.outdir}/{model.id}.xml') # SBML # groups are saved only to SBML
     logger.info(f"'{args.outdir}/{model.id}.xml' created!")
     force_id_on_sbml(f'{args.outdir}/{model.id}.xml', model.id) # force introduction of the 'id=""' field
-    sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', None, df_B, df_P, df_S)
+    sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', args.nofigs, None, df_B, df_P, df_S)
     logger.info(f"'{args.outdir}/{model.id}.mkmodel.xlsx' created!")


@@ -171,13 +189,28 @@ def main(args, logger):


     # format the --eggnog param
-    args.eggnog = format_expansion(logger, args.eggnog)
-
-
+    args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
+
+    # get the kegg organism if requested
+    if args.keggorg != '-':
+        response = download_keggorg(logger, args.keggorg, args.outdir)
+        if response == 1: return 1
+
+
+
+    # determine the source of functional annotation:
+    annotation_source = None
+    if args.keggorg != '-': # keggorg has precedence
+        annotation_source = 'keggorg'
+    elif args.eggnog != '-':
+        annotation_source = 'eggnog'
+        if args.cores > len(args.eggnog):
+            logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
+            args.cores = len(args.eggnog)
+    else:
+        logger.error("No valid functional annotations provided: please use '--keggorg' or '--eggnog'.")
         return 1
-
-        logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
-        args.cores = len(args.eggnog)
+


     # check compatibility of input parameters:
@@ -201,17 +234,26 @@ def main(args, logger):


     # disable logging (swith to txt) if strains are more than 1:
-
-
-
-
-
+    if annotation_source == 'keggorg':
+        multistrain = False
+    elif annotation_source == 'eggnog':
+        multistrain = len(args.eggnog) > 1
+        if multistrain:
+            logger.info(f"Number of provided strains is >1: logging will be disabled.")
+            logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
+            # actualy this is done inside child processess!
+

     # create strain-specific GSMMs using multi-core
     error_raised = False
     sheets_dicts = []
     executor = confu.ProcessPoolExecutor(max_workers=args.cores)
-
+
+    if annotation_source == 'keggorg':
+        futures = [executor.submit(create_model_incore, (annotation_source, universe, None, dbexp, args, multistrain))]
+    elif annotation_source == 'eggnog':
+        futures = [executor.submit(create_model_incore, (annotation_source, universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
+
     for f in confu.as_completed(futures):
         sheets_dict = f.result()

@@ -226,12 +268,14 @@ def main(args, logger):
             sheets_dicts.append(sheets_dict)
             print(f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!", end='\r', file=sys.stderr)

+
     # hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
     if multistrain and sheets_dicts != []:
         last_trace = f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!"
         whitewash = ''.join([' ' for i in range(len(last_trace))])
         print(whitewash, end='\r', file=sys.stderr)

+
     # multiproces part terminated: safely shut down the executor
     executor.shutdown(wait=True)

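In the reworked `main`, `--keggorg` takes precedence over `--eggnog`, forces a single-strain run, and submits exactly one job to the process pool, while the eggnog path submits one job per annotation file. A rough sketch of that dispatch under invented helper and argument names (`choose_annotation_source`, `toy_worker`), separate from gsrap's actual CLI plumbing:

import concurrent.futures as confu

def choose_annotation_source(keggorg, eggnog):
    # '-' treated as "not provided", mirroring the diff above (an assumption of this sketch)
    if keggorg != '-':
        return 'keggorg'
    if eggnog != '-':
        return 'eggnog'
    return None

def submit_jobs(executor, source, eggnog_files, worker, payload):
    if source == 'keggorg':
        return [executor.submit(worker, (source, None, payload))]   # single reconstruction
    return [executor.submit(worker, (source, path, payload)) for path in eggnog_files]

def toy_worker(params):
    source, path, payload = params
    return f"{source}:{path}:{payload}"

if __name__ == '__main__':
    source = choose_annotation_source('-', 'strains')   # eggnog wins here
    with confu.ProcessPoolExecutor(max_workers=2) as executor:
        futures = submit_jobs(executor, source, ['s1.tsv', 's2.tsv'], toy_worker, 'db')
        for f in confu.as_completed(futures):
            print(f.result())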
gsrap/mkmodel/pruner.py  CHANGED

@@ -1,6 +1,7 @@
 import os
 import warnings
 import logging
+import pickle


 import pandas as pnd
@@ -43,22 +44,57 @@ def load_input_eggnog(logger, eggnog):


     # load eggnog annotations
-
-
-
+    df_eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
+    df_eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
+    df_eggnog = df_eggnog.set_index('query', drop=True, verify_integrity=True)


-    return
+    return df_eggnog



-def
+def load_keggorg_like_eggnog(logger, keggorg, outdir):
+
+
+    # load raw data, downloaded form kegg:
+    df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
+    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
+
+
+    # create an eggnog-like dataframe:
+    df_eggnog_like = []  # list of dict future df
+    for gid in df_keggorg.index:
+        row_dict = {}
+
+        row_dict['query'] = gid
+        row_dict['PFAMs'] = ','.join(df_keggorg.loc[gid, 'Pfam']) if type(df_keggorg.loc[gid, 'Pfam'])==list else '-'
+        row_dict['KEGG_ko'] = df_keggorg.loc[gid, 'ko'] if type(df_keggorg.loc[gid, 'ko'])==str else '-'
+
+        df_eggnog_like.append(row_dict)
+    df_eggnog_like = pnd.DataFrame.from_records(df_eggnog_like)
+
+
+    # appen missing coluns and sort
+    eggnog_columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
+    for c in eggnog_columns:
+        if c not in df_eggnog_like.columns:
+            df_eggnog_like[c] = '-'
+    df_eggnog_like = df_eggnog_like[eggnog_columns]
+
+
+    # set the index like in eggnog
+    df_eggnog_like = df_eggnog_like.set_index('query', drop=True, verify_integrity=True)
+    return df_eggnog_like
+
+
+
+def parse_eggnog(df_eggnog):


     # PART 1. get KO codes available
     gid_to_kos = {}
     ko_to_gids = {}
-    for gid, kos in
+    for gid, kos in df_eggnog['KEGG_ko'].items():
         if kos == '-':
             continue

@@ -229,8 +265,37 @@ def restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos):
             # collect names
             names.append(uni_g.name)
         g.name = '; '.join(names)
+
+
+
+def append_keggorg_gene_annots(logger, model, keggorg, outdir):
+

-
+    # load raw data, downloaded form kegg:
+    logger.info("Adding gene annotations retrieved from KEGG...")
+    df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
+    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
+
+
+    # KEGG can provide some useful (ie, used in Memote) gene annotations:
+    for g in model.genes:
+        if g.id in df_keggorg.index:
+
+            g.annotation['kegg.genes'] = [keggorg + ':' + g.id]
+
+            if 'NCBI-GeneID' in df_keggorg.columns:
+                g.annotation['ncbigene'] = df_keggorg.loc[g.id, 'NCBI-GeneID'] if type(df_keggorg.loc[g.id, 'NCBI-GeneID'])==list else []
+            if 'NCBI-ProteinID' in df_keggorg.columns:
+                g.annotation['ncbiprotein'] = df_keggorg.loc[g.id, 'NCBI-ProteinID'] if type(df_keggorg.loc[g.id, 'NCBI-ProteinID'])==list else []
+            if 'ASAP' in df_keggorg.columns:
+                g.annotation['asap'] = df_keggorg.loc[g.id, 'ASAP'] if type(df_keggorg.loc[g.id, 'ASAP'])==list else []
+            if 'UniProt' in df_keggorg.columns:
+                g.annotation['uniprot'] = df_keggorg.loc[g.id, 'UniProt'] if type(df_keggorg.loc[g.id, 'UniProt'])==list else []
+
+
+
+
+


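The added `append_keggorg_gene_annots` fills the per-gene `annotation` dictionaries that downstream checks such as Memote read, keyed by identifiers.org-style namespaces (`kegg.genes`, `ncbigene`, `ncbiprotein`, `asap`, `uniprot`). A small illustration of the resulting annotation shape on a bare COBRApy `Gene`; the organism code and identifiers below are illustrative, not values fetched from KEGG:

from cobra import Gene

keggorg = 'eco'       # illustrative KEGG organism code
g = Gene('b0001')     # illustrative locus tag

# cobra objects expose a plain mapping for cross-references; list values keep
# the SBML writer happy even when a gene maps to several external IDs
g.annotation['kegg.genes'] = [f'{keggorg}:{g.id}']
g.annotation['ncbigene'] = ['944742']   # illustrative example values
g.annotation['uniprot'] = ['P00000']

for key, values in g.annotation.items():
    print(key, values)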
@@ -1,3 +1,8 @@
+from pathlib import Path
+import pickle
+import os
+
+
 import pandas as pnd


@@ -32,14 +37,39 @@ def parse_eggnog(model, eggnog, idcollection_dict):
     return krs_org


+
+def parse_keggorg(keggorg, outdir, idcollection_dict):
+
+    df_keggorg = pickle.load(open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb'))
+    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)
+
+
+    # PART 1. get KO codes available
+    kos_org = set([i for i in df_keggorg['ko'] if pnd.isna(i)==False])
+
+
+    # PART 2. get reactions in the organism (even the GPR is not complete)
+    kr_to_kos = idcollection_dict['kr_to_kos']
+    krs_org = set()
+    for kr, kos in kr_to_kos.items():
+        if any([ko in kos_org for ko in kos]):
+            krs_org.add(kr)

-
+
+    return krs_org
+
+
+
+def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
     # check KEGG annotations in the universe model to get '%' of completeness per pathway/module.


     # get the reference set of kr codes (all kegg or organism specific):
     kr_uni = set()
-    if
+    if keggorg != '-': # keggorg has precedence
+        kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
+        kr_uni_label = f"organism code '{keggorg}'"
+    elif eggnog != '-':
         for eggfile in eggnog:
             eggset = parse_eggnog(model, eggfile, idcollection_dict)
             kr_uni = kr_uni.union(eggset)
@@ -55,10 +85,22 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
         if 'kegg.reaction' in r.annotation.keys():
             for kr_id in r.annotation['kegg.reaction']:
                 kr_ids_modeled.add(kr_id)
-    kr_uni_missing =
+    kr_uni_missing = kr_uni - kr_ids_modeled
     kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
-    logger.info(f"Coverage for
+    logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
+

+    # define the map?????, containing krs not included in maps
+    krs_in_maps = set()
+    for i in summary_dict: krs_in_maps = krs_in_maps.union(i['kr_ids'])
+    krs_not_in_maps = idcollection_dict['kr'] - krs_in_maps
+    summary_dict.append({
+        'map_id': 'map?????',
+        'map_name': 'Not included in maps',
+        'kr_ids': krs_not_in_maps,
+        'cnt_r': len(krs_not_in_maps),
+        'mds': []
+    })


     # get all the map / md codes:
@@ -112,52 +154,77 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
             missing_logger = (map_id, missing)


+        # put the map in the right bucket:
         if missing == set() and map_krs != set():
             maps_finished.add(map_id)
-
         elif map_krs == set():
             maps_noreac.add(map_id)
-
         elif missing == map_krs:
             maps_missing.add(map_id)
-
-            if zeroes:
-                list_coverage.append({
-                    'map_id': map_id,
-                    'map_name_short': map_name_short,
-                    'perc_completeness': 0,
-                    'perc_completeness_str': ' 0',
-                    'present': present,
-                    'missing': missing,
-                    'md_ids': [j['md_id'] for j in i['mds']],
-                })
-
         elif len(missing) < len(map_krs):
             maps_partial.add(map_id)

-            # get '%' of completeness:
-            perc_completeness = len(present)/len(map_krs)*100
-            perc_completeness_str = str(round(perc_completeness)) # version to be printed
-            if len(perc_completeness_str)==1:
-                perc_completeness_str = ' ' + perc_completeness_str
-
-            list_coverage.append({
-                'map_id': map_id,
-                'map_name_short': map_name_short,
-                'perc_completeness': perc_completeness,
-                'perc_completeness_str': perc_completeness_str,
-                'present': present,
-                'missing': missing,
-                'md_ids': [j['md_id'] for j in i['mds']],
-            })

+        # get '%' of completeness:
+        if len(map_krs) != 0: perc_completeness = len(present)/len(map_krs)*100
+        else: perc_completeness = 100 # for maps_noreac
+        perc_completeness_str = str(round(perc_completeness)) # version to be printed
+        if len(perc_completeness_str)==1:
+            perc_completeness_str = ' ' + perc_completeness_str
+

-
+        # append map to list:
+        list_coverage.append({
+            'map_id': map_id,
+            'map_name_short': map_name_short,
+            'perc_completeness': perc_completeness,
+            'perc_completeness_str': perc_completeness_str,
+            'present': present,
+            'missing': missing,
+            'md_ids': [j['md_id'] for j in i['mds']],
+        })
+
+
+
+    # create coverage dataframe
+    if eggnog != '-' and len(eggnog) >= 2:
+        df_coverage = {}
+        for i in list_coverage:
+            for kr in i['present'].union(i['missing']):
+                if kr not in df_coverage.keys():
+                    df_coverage[kr] = {'map_ids': set()}
+                df_coverage[kr]['map_ids'].add(i['map_id'])
+        df_coverage = pnd.DataFrame.from_records(df_coverage).T
+        df_coverage['modeled'] = False
+        for kr, row in df_coverage.iterrows():
+            if kr in kr_ids_modeled:
+                df_coverage.loc[kr, 'modeled'] = True
+        # build strain columns all at once
+        df_strains = [] # list of small DataFrames
+        for eggfile in eggnog:
+            strain = Path(eggfile).stem
+            eggset = parse_eggnog(model, eggfile, idcollection_dict)
+            col = df_coverage.index.to_series().isin(eggset).astype(int) # integer: 0 or 1
+            df_strains.append(col.rename(strain))
+        df_strains = pnd.concat(df_strains, axis=1)
+        # sort rows: upper rows are present in more strains
+        #df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index] # commented: now in charge of figures.py
+        df_coverage = df_coverage.loc[df_strains.index]
+        df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
+        # split in 2: modeled above, non-modeled below:
+        #df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]]) # commented: now in charge of figures.py
+    else: # not interesting in a super-long table without strains in column
+        df_coverage = None
+
+
+
+    # order list by '%' of completness and print if needed:
     list_coverage = sorted(list_coverage, key=lambda x: x['perc_completeness'], reverse=True)
     for i in list_coverage:
         if progress:
             if focus=='-' or focus in i['md_ids'] or focus==i['map_id']:
-
+                if i['map_id'] in maps_missing or i['map_id'] in maps_partial:
+                    logger.info(f"{i['map_id']}: {i['map_name_short']} {i['perc_completeness_str']}% completed, {len(i['present'])} added, {len(i['missing'])} missing.")


     # get the correspondent pathway element of the 'summary_dict'
@@ -199,50 +266,43 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
             missing_logger = (md_id, missing)


+        # put the map in the right bucket:
         if missing == set() and md_krs != set():
             mds_completed.add(md_id)
-
         elif md_krs == set():
             mds_noreac.add(md_id)
-
         elif missing == md_krs:
             mds_missing.add(md_id)
-
-            if zeroes:
-                list_coverage_md.append({
-                    'md_id': md_id,
-                    'md_name_short': md_name_short,
-                    'perc_completeness': 0,
-                    'perc_completeness_str': ' 0',
-                    'present': present,
-                    'missing': missing,
-                })
-
         elif len(missing) < len(md_krs):
             mds_partial.add(md_id)

-
-
-
-
-
+
+        # get '%' of completeness:
+        if len(md_krs) != 0: perc_completeness = len(present)/len(md_krs)*100
+        else: perc_completeness = 100 # for mds_noreac
+        perc_completeness_str = str(round(perc_completeness)) # version to be printed
+        if len(perc_completeness_str)==1:
+            perc_completeness_str = ' ' + perc_completeness_str

-
-
-
-
-
-
-
-
+
+        # append md to list:
+        list_coverage_md.append({
+            'md_id': md_id,
+            'md_name_short': md_name_short,
+            'perc_completeness': perc_completeness,
+            'perc_completeness_str': perc_completeness_str,
+            'present': present,
+            'missing': missing,
+        })


-    # order list by '%' of completness and print:
+    # order list by '%' of completness and print if needed:
     list_coverage_md = sorted(list_coverage_md, key=lambda x: x['perc_completeness'], reverse=True)
     for z in list_coverage_md:
         if module:
             if focus=='-' or focus==z['md_id']:
-
+                if z['md_id'] in mds_missing or z['md_id'] in mds_partial:
+                    logger.info(f"{spacer}{z['md_id']}: {z['md_name_short']} {z['perc_completeness_str']}% completed, {len(z['present'])} added, {len(z['missing'])} missing.")


     # print summary:
@@ -254,6 +314,6 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
     logger.info(f"Maps: finished {len(maps_finished)} - partial {len(maps_partial)} - missing {len(maps_missing)} - noreac {len(maps_noreac)}")


-    return
+    return df_coverage


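`parse_keggorg` and the reworked coverage bookkeeping above reduce to set algebra: expand the organism's KO set to KEGG reaction IDs through the `kr_to_kos` lookup, then compare that reference set with the reactions already in the model. The same arithmetic on toy identifiers (the real `idcollection_dict` and pathway maps come from the gsrap database, so everything below is illustrative):

# toy inputs (invented IDs)
kos_org = {'K00001', 'K00002'}
kr_to_kos = {'R00001': {'K00001'}, 'R00002': {'K00009'}, 'R00003': {'K00002', 'K00010'}}
kr_ids_modeled = {'R00001'}

# reactions supported by at least one organism KO
kr_uni = {kr for kr, kos in kr_to_kos.items() if any(ko in kos_org for ko in kos)}

kr_uni_missing = kr_uni - kr_ids_modeled
coverage = len(kr_ids_modeled & kr_uni) / len(kr_uni) * 100 if kr_uni else 100.0
print(f"reference reactions: {sorted(kr_uni)}")
print(f"coverage: {coverage:.0f}% ({len(kr_uni_missing)} missing)")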
@@ -143,6 +143,14 @@ def introduce_metabolites(logger, db, model, idcollection_dict, kegg_compound_to
             m.annotation[ankey] = list(m.annotation[ankey])


+        # replace inchikey with manually-curated
+        if m.annotation['inchikey'] != [] and m.annotation['inchikey'] != [row['inchikey']]:
+            logger.debug(f"Metabolite '{pure_mid}': manual-curated inchikey ({[row['inchikey']]}) is diferent from the one derived from MNX ({m.annotation['inchikey']}).")
+            m.annotation['inchikey'] = [row['inchikey']] # force the manual-curated version
+        if m.annotation['inchikey'] == ['XXXXXXXXXXXXXX-XXXXXXXXXX-X']:
+            m.annotation['inchikey'] = []
+
+
         # add SBO annotation
         m.annotation['sbo'] = ['SBO:0000247'] # generic metabolite

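This last hunk prefers a manually curated InChIKey over the one derived from MetaNetX and clears the all-X placeholder. The same guard, shown on a plain annotation dict instead of a COBRApy metabolite; the placeholder string is copied from the diff, while the example keys are only illustrative:

PLACEHOLDER = 'XXXXXXXXXXXXXX-XXXXXXXXXX-X'

def apply_curated_inchikey(annotation, curated):
    """Force the curated InChIKey when it disagrees with the derived one; drop placeholders."""
    current = annotation.get('inchikey', [])
    if current != [] and current != [curated]:
        annotation['inchikey'] = [curated]           # curated value wins
    if annotation.get('inchikey') == [PLACEHOLDER]:
        annotation['inchikey'] = []                  # no usable key at all
    return annotation

print(apply_curated_inchikey({'inchikey': ['OLD-KEY-X']}, 'QNAYBMKLOCPYGJ-REOHCLBHSA-N'))
print(apply_curated_inchikey({'inchikey': [PLACEHOLDER]}, PLACEHOLDER))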