gsrap 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the versions exactly as they appear in that public registry.
- gsrap/.ipynb_checkpoints/__init__-checkpoint.py +34 -5
- gsrap/__init__.py +34 -5
- gsrap/commons/.ipynb_checkpoints/biomass-checkpoint.py +4 -0
- gsrap/commons/.ipynb_checkpoints/coeffs-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +27 -3
- gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +105 -0
- gsrap/commons/.ipynb_checkpoints/fluxbal-checkpoint.py +1 -1
- gsrap/commons/biomass.py +4 -0
- gsrap/commons/coeffs.py +1 -1
- gsrap/commons/excelhub.py +27 -3
- gsrap/commons/figures.py +105 -0
- gsrap/commons/fluxbal.py +1 -1
- gsrap/mkmodel/.ipynb_checkpoints/gapfillutils-checkpoint.py +3 -0
- gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +11 -4
- gsrap/mkmodel/gapfillutils.py +3 -0
- gsrap/mkmodel/mkmodel.py +11 -4
- gsrap/parsedb/.ipynb_checkpoints/annotation-checkpoint.py +3 -0
- gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +101 -65
- gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +16 -1
- gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +4 -5
- gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +7 -0
- gsrap/parsedb/annotation.py +3 -0
- gsrap/parsedb/completeness.py +101 -65
- gsrap/parsedb/introduce.py +16 -1
- gsrap/parsedb/parsedb.py +4 -5
- gsrap/parsedb/repeating.py +7 -0
- gsrap/runsims/.ipynb_checkpoints/simplegrowth-checkpoint.py +6 -7
- gsrap/runsims/simplegrowth.py +6 -7
- {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/METADATA +3 -1
- {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/RECORD +33 -31
- {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/LICENSE.txt +0 -0
- {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/WHEEL +0 -0
- {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/entry_points.txt +0 -0
gsrap/mkmodel/mkmodel.py
CHANGED
@@ -64,6 +64,7 @@ def create_model_incore(params):
 # remove universal orphans
 model = remove_universal_orphans(logger, model)

+

 ###### PRUNING
 logger.info("Reading provided eggnog-mapper annotation...")
@@ -77,6 +78,7 @@ def create_model_incore(params):
 translate_remaining_kos(logger, model, eggnog_ko_to_gids)
 restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)

+

 ###### GAPFILLING
 # force inclusion of reactions:
@@ -103,30 +105,35 @@ def create_model_incore(params):
 if type(df_P)==int: return 1


- ###### POLISHING 2
- # remove disconnected metabolites
- model = remove_disconnected(logger, model)

+ ###### POLISHING 2
 # remove unsed sinks and demands
 model = remove_sinks_demands(logger, model)
+
+ # remove disconnected metabolites
+ model = remove_disconnected(logger, model)

+

 # # # # # DERIVATION ENDS HERE # # # # #
 log_metrics(logger, model)
 log_unbalances(logger, model)


+
 ###### CHECKS
 # check blocked metabolites / dead-ends
 df_S = biosynthesis_on_media(logger, model, dbexp, args.gap_fill, args.biosynth)
 if type(df_S)==int: return 1


+
 ###### POLISHING 3
 # reset growth environment befor saving the model
 gempipe.reset_growth_env(model)


+
 # output the model:
 logger.info("Writing strain-specific model...")
 cobra.io.save_json_model(model, f'{args.outdir}/{model.id}.json') # JSON
@@ -134,7 +141,7 @@ def create_model_incore(params):
 cobra.io.write_sbml_model(model, f'{args.outdir}/{model.id}.xml') # SBML # groups are saved only to SBML
 logger.info(f"'{args.outdir}/{model.id}.xml' created!")
 force_id_on_sbml(f'{args.outdir}/{model.id}.xml', model.id) # force introduction of the 'id=""' field
- sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', None, df_B, df_P, df_S)
+ sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', args.nofigs, None, df_B, df_P, df_S)
 logger.info(f"'{args.outdir}/{model.id}.mkmodel.xlsx' created!")


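The reordering in the POLISHING 2 block (unused sinks/demands are now dropped before the disconnected-metabolite sweep) matters because a metabolite whose only reaction is a sink does not count as disconnected until that sink is gone. Below is a minimal cobrapy illustration of that ordering; the model, metabolite and reaction IDs are toy examples and not gsrap code or gsrap helper functions.

import cobra

# toy model: one metabolite kept alive only by a sink reaction
model = cobra.Model('toy')
x = cobra.Metabolite('x_c', compartment='c')
sk = cobra.Reaction('SK_x_c')
sk.add_metabolites({x: -1.0})
sk.bounds = (-1000, 1000)
model.add_reactions([sk])

# analogous to remove_sinks_demands(): the unused sink goes first...
model.remove_reactions([sk])

# ...which leaves 'x_c' with no reactions, so a disconnected-metabolite
# sweep (analogous to remove_disconnected()) can now catch it
orphans = [m for m in model.metabolites if len(m.reactions) == 0]
model.remove_metabolites(orphans)
print(len(model.metabolites))  # 0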
gsrap/parsedb/introduce.py
CHANGED
@@ -141,6 +141,10 @@ def introduce_metabolites(logger, db, model, idcollection_dict, kegg_compound_to
 # save as list:
 for ankey in ankeys:
 m.annotation[ankey] = list(m.annotation[ankey])
+
+
+ # add SBO annotation
+ m.annotation['sbo'] = ['SBO:0000247'] # generic metabolite



@@ -264,7 +268,8 @@ def introduce_transporters(logger, db, model, idcollection_dict, kegg_reaction_t
 m_e.name = m_c.name
 m_e.formula = m_c.formula
 m_e.charge = m_c.charge
-
+
+ m_e.annotation = m_c.annotation # transfer all annotations, including SBO!


 def add_exchange_reaction(model, mid_e):
@@ -283,6 +288,10 @@ def introduce_transporters(logger, db, model, idcollection_dict, kegg_reaction_t
 r.bounds = (-1000, 1000)
 else:
 r.bounds = (0, 1000)
+
+ # add SBO annotation
+ r.annotation['sbo'] = ['SBO:0000627'] # exchange reaction
+



@@ -418,6 +427,9 @@ def introduce_sinks_demands(logger, model):
 r.name = f"Sink for {model.metabolites.get_by_id(f'{puremid}_c').name}"
 r.build_reaction_from_string(f'{puremid}_c <=> ')
 r.bounds = (-1000, 1000)
+
+ # add SBO annotation
+ r.annotation['sbo'] = ['SBO:0000632'] # sink reaction


 for puremid in demands:
@@ -427,6 +439,9 @@ def introduce_sinks_demands(logger, model):
 r.name = f"Demand for {model.metabolites.get_by_id(f'{puremid}_c').name}"
 r.build_reaction_from_string(f'{puremid}_c --> ')
 r.bounds = (0, 1000)
+
+ # add SBO annotation
+ r.annotation['sbo'] = ['SBO:0000628'] # demand reaction


 return model
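The pattern added throughout introduce.py is the same in every hunk: each newly built boundary reaction gets an SBO term stored in its annotation dict. A short cobrapy sketch of that pattern follows; the metabolite and reaction IDs are made up and the code is an illustration, not gsrap's implementation.

import cobra

model = cobra.Model('toy')
glc = cobra.Metabolite('glc__D_c', name='D-glucose', compartment='c')

# sink: reversible boundary reaction, tagged with the SBO term used in the diff
sk = cobra.Reaction('SK_glc__D_c', name=f"Sink for {glc.name}")
sk.add_metabolites({glc: -1.0})
sk.bounds = (-1000, 1000)
sk.annotation['sbo'] = ['SBO:0000632']  # sink reaction

# demand: irreversible drain, tagged with its own SBO term
dm = cobra.Reaction('DM_glc__D_c', name=f"Demand for {glc.name}")
dm.add_metabolites({glc: -1.0})
dm.bounds = (0, 1000)
dm.annotation['sbo'] = ['SBO:0000628']  # demand reaction

model.add_reactions([sk, dm])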
gsrap/parsedb/parsedb.py
CHANGED
@@ -113,7 +113,7 @@ def main(args, logger):

 ###### RECONSTRUCTION
 # create the model
- universe = cobra.Model('
+ universe = cobra.Model('universe')
 logger.info("Parsing gsrap database...")

 # introduce M / R / T
@@ -153,9 +153,8 @@

 ###### CHECKS 1
 # check universe completness
-
-
- if response==1: return 1
+ df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, idcollection_dict, summary_dict)
+ if type(df_C)==int: return 1



@@ -194,7 +193,7 @@
 cobra.io.write_sbml_model(universe, f'{args.outdir}/universe.xml') # groups are saved only to SBML
 logger.info(f"'{args.outdir}/universe.xml' created!")
 force_id_on_sbml(f'{args.outdir}/universe.xml', 'universe') # force introduction of the 'id=""' field
- sheets_dict = write_excel_model(universe, f'{args.outdir}/universe.parsedb.xlsx', df_E, None, None, df_S)
+ sheets_dict = write_excel_model(universe, f'{args.outdir}/universe.parsedb.xlsx', args.nofigs, df_E, None, None, df_S, df_C)
 logger.info(f"'{args.outdir}/universe.parsedb.xlsx' created!")

gsrap/parsedb/repeating.py
CHANGED
@@ -125,6 +125,13 @@ def add_reaction(logger, model, rid, row, kr_ids, kegg_reaction_to_others, addty
 r.annotation[ankey] = list(r.annotation[ankey])


+ # add SBO annotation
+ if addtype=='R':
+ r.annotation['sbo'] = ['SBO:0000176'] # metabolic reaction
+ else:
+ r.annotation['sbo'] = ['SBO:0000185'] # transport reaction
+
+
 # check if unbalanced
 if r.check_mass_balance() != {}:
 logger.error(f"{itemtype} '{r.id}' is unbalanced: {r.check_mass_balance()}.")
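For context on the unchanged check_mass_balance() guard shown above: in cobrapy it returns an empty dict for an elementally balanced reaction and a dict of per-element (and charge) imbalances otherwise. The toy example below is not taken from gsrap; the metabolite formulas and IDs are invented to show the two outcomes.

import cobra

a = cobra.Metabolite('a_c', formula='C6H12O6', charge=0, compartment='c')
b = cobra.Metabolite('b_c', formula='C6H12O6', charge=0, compartment='c')

r = cobra.Reaction('R_iso')
r.add_metabolites({a: -1.0, b: 1.0})
print(r.check_mass_balance())  # {} -> balanced, no error would be logged

b.formula = 'C5H12O6'
print(r.check_mass_balance())  # e.g. {'C': -1.0} -> unbalanced, the logger.error branch fires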
gsrap/parsedb/annotation.py
CHANGED
@@ -66,6 +66,9 @@ def translate_annotate_genes(logger, model, idcollection_dict):
 g.annotation['ec'] = list(ko_to_ecs[ko])
 g.annotation['cog'] = list(ko_to_cogs[ko])
 g.annotation['go'] = list(ko_to_gos[ko])
+
+ # add SBO annotation
+ g.annotation['sbo'] = ['SBO:0000243'] # demand reaction



gsrap/parsedb/completeness.py
CHANGED
@@ -1,3 +1,6 @@
+ from pathlib import Path
+
+
 import pandas as pnd


@@ -33,7 +36,7 @@ def parse_eggnog(model, eggnog, idcollection_dict):



- def check_completeness(logger, model, progress, module, focus, eggnog,
+ def check_completeness(logger, model, progress, module, focus, eggnog, idcollection_dict, summary_dict):
 # check KEGG annotations in the universe model to get '%' of completeness per pathway/module.


@@ -43,10 +46,10 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
 for eggfile in eggnog:
 eggset = parse_eggnog(model, eggfile, idcollection_dict)
 kr_uni = kr_uni.union(eggset)
- kr_uni_label = f"
+ kr_uni_label = f"{len(eggnog)} eggnog annotations"
 else:
 kr_uni = idcollection_dict['kr']
- kr_uni_label = "
+ kr_uni_label = "whole KEGG"


 # get all the 'kr' annotations in the model
@@ -55,7 +58,22 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
 if 'kegg.reaction' in r.annotation.keys():
 for kr_id in r.annotation['kegg.reaction']:
 kr_ids_modeled.add(kr_id)
-
+ kr_uni_missing = kr_uni - kr_ids_modeled
+ kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
+ logger.info(f"Coverage for '{kr_uni_label}': {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
+
+
+ # define the map?????, containing krs not included in maps
+ krs_in_maps = set()
+ for i in summary_dict: krs_in_maps = krs_in_maps.union(i['kr_ids'])
+ krs_not_in_maps = idcollection_dict['kr'] - krs_in_maps
+ summary_dict.append({
+ 'map_id': 'map?????',
+ 'map_name': 'Not included in maps',
+ 'kr_ids': krs_not_in_maps,
+ 'cnt_r': len(krs_not_in_maps),
+ 'mds': []
+ })


 # get all the map / md codes:
@@ -109,52 +127,77 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
 missing_logger = (map_id, missing)


+ # put the map in the right bucket:
 if missing == set() and map_krs != set():
 maps_finished.add(map_id)
-
 elif map_krs == set():
 maps_noreac.add(map_id)
-
 elif missing == map_krs:
 maps_missing.add(map_id)
-
- if zeroes:
- list_coverage.append({
- 'map_id': map_id,
- 'map_name_short': map_name_short,
- 'perc_completeness': 0,
- 'perc_completeness_str': ' 0',
- 'present': present,
- 'missing': missing,
- 'md_ids': [j['md_id'] for j in i['mds']],
- })
-
 elif len(missing) < len(map_krs):
 maps_partial.add(map_id)

- # get '%' of completeness:
- perc_completeness = len(present)/len(map_krs)*100
- perc_completeness_str = str(round(perc_completeness)) # version to be printed
- if len(perc_completeness_str)==1:
- perc_completeness_str = ' ' + perc_completeness_str
-
- list_coverage.append({
- 'map_id': map_id,
- 'map_name_short': map_name_short,
- 'perc_completeness': perc_completeness,
- 'perc_completeness_str': perc_completeness_str,
- 'present': present,
- 'missing': missing,
- 'md_ids': [j['md_id'] for j in i['mds']],
- })
-

-
+ # get '%' of completeness:
+ if len(map_krs) != 0: perc_completeness = len(present)/len(map_krs)*100
+ else: perc_completeness = 100 # for maps_noreac
+ perc_completeness_str = str(round(perc_completeness)) # version to be printed
+ if len(perc_completeness_str)==1:
+ perc_completeness_str = ' ' + perc_completeness_str
+
+
+ # append map to list:
+ list_coverage.append({
+ 'map_id': map_id,
+ 'map_name_short': map_name_short,
+ 'perc_completeness': perc_completeness,
+ 'perc_completeness_str': perc_completeness_str,
+ 'present': present,
+ 'missing': missing,
+ 'md_ids': [j['md_id'] for j in i['mds']],
+ })
+
+
+
+ # create coverage dataframe
+ if eggnog != '-' and len(eggnog) >= 2:
+ df_coverage = {}
+ for i in list_coverage:
+ for kr in i['present'].union(i['missing']):
+ if kr not in df_coverage.keys():
+ df_coverage[kr] = {'map_ids': set()}
+ df_coverage[kr]['map_ids'].add(i['map_id'])
+ df_coverage = pnd.DataFrame.from_records(df_coverage).T
+ df_coverage['modeled'] = False
+ for kr, row in df_coverage.iterrows():
+ if kr in kr_ids_modeled:
+ df_coverage.loc[kr, 'modeled'] = True
+ # build strain columns all at once
+ df_strains = [] # list of small DataFrames
+ for eggfile in eggnog:
+ strain = Path(eggfile).stem
+ eggset = parse_eggnog(model, eggfile, idcollection_dict)
+ col = df_coverage.index.to_series().isin(eggset).astype(int)
+ df_strains.append(col.rename(strain))
+ df_strains = pnd.concat(df_strains, axis=1)
+ # sort rows: upper rows are present in more strains
+ df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index]
+ df_coverage = df_coverage.loc[df_strains.index]
+ df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
+ # split in 2: modeled above, non-modeled below:
+ df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]])
+ else: # not interesting in a super-long table without strains in column
+ df_coverage = None
+
+
+
+ # order list by '%' of completness and print if needed:
 list_coverage = sorted(list_coverage, key=lambda x: x['perc_completeness'], reverse=True)
 for i in list_coverage:
 if progress:
 if focus=='-' or focus in i['md_ids'] or focus==i['map_id']:
-
+ if i['map_id'] in maps_missing or i['map_id'] in maps_partial:
+ logger.info(f"{i['map_id']}: {i['map_name_short']} {i['perc_completeness_str']}% completed, {len(i['present'])} added, {len(i['missing'])} missing.")


 # get the correspondent pathway element of the 'summary_dict'
@@ -196,50 +239,43 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
 missing_logger = (md_id, missing)


+ # put the map in the right bucket:
 if missing == set() and md_krs != set():
 mds_completed.add(md_id)
-
 elif md_krs == set():
 mds_noreac.add(md_id)
-
 elif missing == md_krs:
 mds_missing.add(md_id)
-
- if zeroes:
- list_coverage_md.append({
- 'md_id': md_id,
- 'md_name_short': md_name_short,
- 'perc_completeness': 0,
- 'perc_completeness_str': ' 0',
- 'present': present,
- 'missing': missing,
- })
-
 elif len(missing) < len(md_krs):
 mds_partial.add(md_id)

-
-
-
-
-
+
+ # get '%' of completeness:
+ if len(md_krs) != 0: perc_completeness = len(present)/len(md_krs)*100
+ else: perc_completeness = 100 # for mds_noreac
+ perc_completeness_str = str(round(perc_completeness)) # version to be printed
+ if len(perc_completeness_str)==1:
+ perc_completeness_str = ' ' + perc_completeness_str

-
-
-
-
-
-
-
-
+
+ # append md to list:
+ list_coverage_md.append({
+ 'md_id': md_id,
+ 'md_name_short': md_name_short,
+ 'perc_completeness': perc_completeness,
+ 'perc_completeness_str': perc_completeness_str,
+ 'present': present,
+ 'missing': missing,
+ })


- # order list by '%' of completness and print:
+ # order list by '%' of completness and print if needed:
 list_coverage_md = sorted(list_coverage_md, key=lambda x: x['perc_completeness'], reverse=True)
 for z in list_coverage_md:
 if module:
 if focus=='-' or focus==z['md_id']:
-
+ if z['md_id'] in mds_missing or z['md_id'] in mds_partial:
+ logger.info(f"{spacer}{z['md_id']}: {z['md_name_short']} {z['perc_completeness_str']}% completed, {len(z['present'])} added, {len(z['missing'])} missing.")


 # print summary:
@@ -251,6 +287,6 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
 logger.info(f"Maps: finished {len(maps_finished)} - partial {len(maps_partial)} - missing {len(maps_missing)} - noreac {len(maps_noreac)}")


- return
+ return df_coverage

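The new df_coverage block above builds a per-strain presence/absence matrix from the eggNOG hits. Below is a standalone pandas sketch of the same pattern; the KEGG reaction IDs and strain names are made up, and the snippet stands in for gsrap's real inputs (the parsed eggNOG sets) rather than reproducing them.

import pandas as pnd

# toy inputs: KEGG reaction IDs collected from the maps, and per-strain hit sets
kr_index = pnd.Index(['R00001', 'R00002', 'R00003', 'R00004'])
strain_hits = {'strainA': {'R00001', 'R00003'}, 'strainB': {'R00001', 'R00004'}}

# one 0/1 column per strain, as in check_completeness()
cols = []
for strain, eggset in strain_hits.items():
    col = kr_index.to_series().isin(eggset).astype(int)
    cols.append(col.rename(strain))
df_strains = pnd.concat(cols, axis=1)

# reactions hit in more strains float to the top, as in the sorted df_strains above
df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index]
print(df_strains)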