gsrap 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. gsrap/.ipynb_checkpoints/__init__-checkpoint.py +34 -5
  2. gsrap/__init__.py +34 -5
  3. gsrap/commons/.ipynb_checkpoints/biomass-checkpoint.py +4 -0
  4. gsrap/commons/.ipynb_checkpoints/coeffs-checkpoint.py +1 -1
  5. gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +27 -3
  6. gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +105 -0
  7. gsrap/commons/.ipynb_checkpoints/fluxbal-checkpoint.py +1 -1
  8. gsrap/commons/biomass.py +4 -0
  9. gsrap/commons/coeffs.py +1 -1
  10. gsrap/commons/excelhub.py +27 -3
  11. gsrap/commons/figures.py +105 -0
  12. gsrap/commons/fluxbal.py +1 -1
  13. gsrap/mkmodel/.ipynb_checkpoints/gapfillutils-checkpoint.py +3 -0
  14. gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +11 -4
  15. gsrap/mkmodel/gapfillutils.py +3 -0
  16. gsrap/mkmodel/mkmodel.py +11 -4
  17. gsrap/parsedb/.ipynb_checkpoints/annotation-checkpoint.py +3 -0
  18. gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +101 -65
  19. gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +16 -1
  20. gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +4 -5
  21. gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +7 -0
  22. gsrap/parsedb/annotation.py +3 -0
  23. gsrap/parsedb/completeness.py +101 -65
  24. gsrap/parsedb/introduce.py +16 -1
  25. gsrap/parsedb/parsedb.py +4 -5
  26. gsrap/parsedb/repeating.py +7 -0
  27. gsrap/runsims/.ipynb_checkpoints/simplegrowth-checkpoint.py +6 -7
  28. gsrap/runsims/simplegrowth.py +6 -7
  29. {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/METADATA +3 -1
  30. {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/RECORD +33 -31
  31. {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/LICENSE.txt +0 -0
  32. {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/WHEEL +0 -0
  33. {gsrap-0.7.0.dist-info → gsrap-0.7.2.dist-info}/entry_points.txt +0 -0
gsrap/mkmodel/mkmodel.py CHANGED
@@ -64,6 +64,7 @@ def create_model_incore(params):
64
64
  # remove universal orphans
65
65
  model = remove_universal_orphans(logger, model)
66
66
 
67
+
67
68
 
68
69
  ###### PRUNING
69
70
  logger.info("Reading provided eggnog-mapper annotation...")
@@ -77,6 +78,7 @@ def create_model_incore(params):
77
78
  translate_remaining_kos(logger, model, eggnog_ko_to_gids)
78
79
  restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)
79
80
 
81
+
80
82
 
81
83
  ###### GAPFILLING
82
84
  # force inclusion of reactions:
@@ -103,30 +105,35 @@ def create_model_incore(params):
103
105
  if type(df_P)==int: return 1
104
106
 
105
107
 
106
- ###### POLISHING 2
107
- # remove disconnected metabolites
108
- model = remove_disconnected(logger, model)
109
108
 
109
+ ###### POLISHING 2
110
110
  # remove unsed sinks and demands
111
111
  model = remove_sinks_demands(logger, model)
112
+
113
+ # remove disconnected metabolites
114
+ model = remove_disconnected(logger, model)
112
115
 
116
+
113
117
 
114
118
  # # # # # DERIVATION ENDS HERE # # # # #
115
119
  log_metrics(logger, model)
116
120
  log_unbalances(logger, model)
117
121
 
118
122
 
123
+
119
124
  ###### CHECKS
120
125
  # check blocked metabolites / dead-ends
121
126
  df_S = biosynthesis_on_media(logger, model, dbexp, args.gap_fill, args.biosynth)
122
127
  if type(df_S)==int: return 1
123
128
 
124
129
 
130
+
125
131
  ###### POLISHING 3
126
132
  # reset growth environment befor saving the model
127
133
  gempipe.reset_growth_env(model)
128
134
 
129
135
 
136
+
130
137
  # output the model:
131
138
  logger.info("Writing strain-specific model...")
132
139
  cobra.io.save_json_model(model, f'{args.outdir}/{model.id}.json') # JSON
@@ -134,7 +141,7 @@ def create_model_incore(params):
134
141
  cobra.io.write_sbml_model(model, f'{args.outdir}/{model.id}.xml') # SBML # groups are saved only to SBML
135
142
  logger.info(f"'{args.outdir}/{model.id}.xml' created!")
136
143
  force_id_on_sbml(f'{args.outdir}/{model.id}.xml', model.id) # force introduction of the 'id=""' field
137
- sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', None, df_B, df_P, df_S)
144
+ sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', args.nofigs, None, df_B, df_P, df_S)
138
145
  logger.info(f"'{args.outdir}/{model.id}.mkmodel.xlsx' created!")
139
146
 
140
147
 
@@ -66,6 +66,9 @@ def translate_annotate_genes(logger, model, idcollection_dict):
66
66
  g.annotation['ec'] = list(ko_to_ecs[ko])
67
67
  g.annotation['cog'] = list(ko_to_cogs[ko])
68
68
  g.annotation['go'] = list(ko_to_gos[ko])
69
+
70
+ # add SBO annotation
71
+ g.annotation['sbo'] = ['SBO:0000243'] # demand reaction
69
72
 
70
73
 
71
74
 
@@ -1,3 +1,6 @@
1
+ from pathlib import Path
2
+
3
+
1
4
  import pandas as pnd
2
5
 
3
6
 
@@ -33,7 +36,7 @@ def parse_eggnog(model, eggnog, idcollection_dict):
33
36
 
34
37
 
35
38
 
36
- def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, idcollection_dict, summary_dict):
39
+ def check_completeness(logger, model, progress, module, focus, eggnog, idcollection_dict, summary_dict):
37
40
  # check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
38
41
 
39
42
 
@@ -43,10 +46,10 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
43
46
  for eggfile in eggnog:
44
47
  eggset = parse_eggnog(model, eggfile, idcollection_dict)
45
48
  kr_uni = kr_uni.union(eggset)
46
- kr_uni_label = f"'{len(eggnog)} eggnog annotations'"
49
+ kr_uni_label = f"{len(eggnog)} eggnog annotations"
47
50
  else:
48
51
  kr_uni = idcollection_dict['kr']
49
- kr_uni_label = "'whole KEGG'"
52
+ kr_uni_label = "whole KEGG"
50
53
 
51
54
 
52
55
  # get all the 'kr' annotations in the model
@@ -55,7 +58,22 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
55
58
  if 'kegg.reaction' in r.annotation.keys():
56
59
  for kr_id in r.annotation['kegg.reaction']:
57
60
  kr_ids_modeled.add(kr_id)
58
- logger.info(f"Universe coverage for {kr_uni_label}: {round(len(kr_ids_modeled.intersection(kr_uni))/len(kr_uni)*100, 0)}%!")
61
+ kr_uni_missing = kr_uni - kr_ids_modeled
62
+ kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
63
+ logger.info(f"Coverage for '{kr_uni_label}': {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
64
+
65
+
66
+ # define the map?????, containing krs not included in maps
67
+ krs_in_maps = set()
68
+ for i in summary_dict: krs_in_maps = krs_in_maps.union(i['kr_ids'])
69
+ krs_not_in_maps = idcollection_dict['kr'] - krs_in_maps
70
+ summary_dict.append({
71
+ 'map_id': 'map?????',
72
+ 'map_name': 'Not included in maps',
73
+ 'kr_ids': krs_not_in_maps,
74
+ 'cnt_r': len(krs_not_in_maps),
75
+ 'mds': []
76
+ })
59
77
 
60
78
 
61
79
  # get all the map / md codes:
@@ -109,52 +127,77 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
109
127
  missing_logger = (map_id, missing)
110
128
 
111
129
 
130
+ # put the map in the right bucket:
112
131
  if missing == set() and map_krs != set():
113
132
  maps_finished.add(map_id)
114
-
115
133
  elif map_krs == set():
116
134
  maps_noreac.add(map_id)
117
-
118
135
  elif missing == map_krs:
119
136
  maps_missing.add(map_id)
120
-
121
- if zeroes:
122
- list_coverage.append({
123
- 'map_id': map_id,
124
- 'map_name_short': map_name_short,
125
- 'perc_completeness': 0,
126
- 'perc_completeness_str': ' 0',
127
- 'present': present,
128
- 'missing': missing,
129
- 'md_ids': [j['md_id'] for j in i['mds']],
130
- })
131
-
132
137
  elif len(missing) < len(map_krs):
133
138
  maps_partial.add(map_id)
134
139
 
135
- # get '%' of completeness:
136
- perc_completeness = len(present)/len(map_krs)*100
137
- perc_completeness_str = str(round(perc_completeness)) # version to be printed
138
- if len(perc_completeness_str)==1:
139
- perc_completeness_str = ' ' + perc_completeness_str
140
-
141
- list_coverage.append({
142
- 'map_id': map_id,
143
- 'map_name_short': map_name_short,
144
- 'perc_completeness': perc_completeness,
145
- 'perc_completeness_str': perc_completeness_str,
146
- 'present': present,
147
- 'missing': missing,
148
- 'md_ids': [j['md_id'] for j in i['mds']],
149
- })
150
-
151
140
 
152
- # order list by '%' of completness and print:
141
+ # get '%' of completeness:
142
+ if len(map_krs) != 0: perc_completeness = len(present)/len(map_krs)*100
143
+ else: perc_completeness = 100 # for maps_noreac
144
+ perc_completeness_str = str(round(perc_completeness)) # version to be printed
145
+ if len(perc_completeness_str)==1:
146
+ perc_completeness_str = ' ' + perc_completeness_str
147
+
148
+
149
+ # append map to list:
150
+ list_coverage.append({
151
+ 'map_id': map_id,
152
+ 'map_name_short': map_name_short,
153
+ 'perc_completeness': perc_completeness,
154
+ 'perc_completeness_str': perc_completeness_str,
155
+ 'present': present,
156
+ 'missing': missing,
157
+ 'md_ids': [j['md_id'] for j in i['mds']],
158
+ })
159
+
160
+
161
+
162
+ # create coverage dataframe
163
+ if eggnog != '-' and len(eggnog) >= 2:
164
+ df_coverage = {}
165
+ for i in list_coverage:
166
+ for kr in i['present'].union(i['missing']):
167
+ if kr not in df_coverage.keys():
168
+ df_coverage[kr] = {'map_ids': set()}
169
+ df_coverage[kr]['map_ids'].add(i['map_id'])
170
+ df_coverage = pnd.DataFrame.from_records(df_coverage).T
171
+ df_coverage['modeled'] = False
172
+ for kr, row in df_coverage.iterrows():
173
+ if kr in kr_ids_modeled:
174
+ df_coverage.loc[kr, 'modeled'] = True
175
+ # build strain columns all at once
176
+ df_strains = [] # list of small DataFrames
177
+ for eggfile in eggnog:
178
+ strain = Path(eggfile).stem
179
+ eggset = parse_eggnog(model, eggfile, idcollection_dict)
180
+ col = df_coverage.index.to_series().isin(eggset).astype(int)
181
+ df_strains.append(col.rename(strain))
182
+ df_strains = pnd.concat(df_strains, axis=1)
183
+ # sort rows: upper rows are present in more strains
184
+ df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index]
185
+ df_coverage = df_coverage.loc[df_strains.index]
186
+ df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
187
+ # split in 2: modeled above, non-modeled below:
188
+ df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]])
189
+ else: # not interesting in a super-long table without strains in column
190
+ df_coverage = None
191
+
192
+
193
+
194
+ # order list by '%' of completness and print if needed:
153
195
  list_coverage = sorted(list_coverage, key=lambda x: x['perc_completeness'], reverse=True)
154
196
  for i in list_coverage:
155
197
  if progress:
156
198
  if focus=='-' or focus in i['md_ids'] or focus==i['map_id']:
157
- logger.info(f"{i['map_id']}: {i['map_name_short']} {i['perc_completeness_str']}% completed, {len(i['present'])} added, {len(i['missing'])} missing.")
199
+ if i['map_id'] in maps_missing or i['map_id'] in maps_partial:
200
+ logger.info(f"{i['map_id']}: {i['map_name_short']} {i['perc_completeness_str']}% completed, {len(i['present'])} added, {len(i['missing'])} missing.")
158
201
 
159
202
 
160
203
  # get the correspondent pathway element of the 'summary_dict'
@@ -196,50 +239,43 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
196
239
  missing_logger = (md_id, missing)
197
240
 
198
241
 
242
+ # put the map in the right bucket:
199
243
  if missing == set() and md_krs != set():
200
244
  mds_completed.add(md_id)
201
-
202
245
  elif md_krs == set():
203
246
  mds_noreac.add(md_id)
204
-
205
247
  elif missing == md_krs:
206
248
  mds_missing.add(md_id)
207
-
208
- if zeroes:
209
- list_coverage_md.append({
210
- 'md_id': md_id,
211
- 'md_name_short': md_name_short,
212
- 'perc_completeness': 0,
213
- 'perc_completeness_str': ' 0',
214
- 'present': present,
215
- 'missing': missing,
216
- })
217
-
218
249
  elif len(missing) < len(md_krs):
219
250
  mds_partial.add(md_id)
220
251
 
221
- # get '%' of completeness:
222
- perc_completeness = len(present)/len(md_krs)*100
223
- perc_completeness_str = str(round(perc_completeness)) # version to be printed
224
- if len(perc_completeness_str)==1:
225
- perc_completeness_str = ' ' + perc_completeness_str
252
+
253
+ # get '%' of completeness:
254
+ if len(md_krs) != 0: perc_completeness = len(present)/len(md_krs)*100
255
+ else: perc_completeness = 100 # for mds_noreac
256
+ perc_completeness_str = str(round(perc_completeness)) # version to be printed
257
+ if len(perc_completeness_str)==1:
258
+ perc_completeness_str = ' ' + perc_completeness_str
226
259
 
227
- list_coverage_md.append({
228
- 'md_id': md_id,
229
- 'md_name_short': md_name_short,
230
- 'perc_completeness': perc_completeness,
231
- 'perc_completeness_str': perc_completeness_str,
232
- 'present': present,
233
- 'missing': missing,
234
- })
260
+
261
+ # append md to list:
262
+ list_coverage_md.append({
263
+ 'md_id': md_id,
264
+ 'md_name_short': md_name_short,
265
+ 'perc_completeness': perc_completeness,
266
+ 'perc_completeness_str': perc_completeness_str,
267
+ 'present': present,
268
+ 'missing': missing,
269
+ })
235
270
 
236
271
 
237
- # order list by '%' of completness and print:
272
+ # order list by '%' of completness and print if needed:
238
273
  list_coverage_md = sorted(list_coverage_md, key=lambda x: x['perc_completeness'], reverse=True)
239
274
  for z in list_coverage_md:
240
275
  if module:
241
276
  if focus=='-' or focus==z['md_id']:
242
- logger.info(f"{spacer}{z['md_id']}: {z['md_name_short']} {z['perc_completeness_str']}% completed, {len(z['present'])} added, {len(z['missing'])} missing.")
277
+ if z['md_id'] in mds_missing or z['md_id'] in mds_partial:
278
+ logger.info(f"{spacer}{z['md_id']}: {z['md_name_short']} {z['perc_completeness_str']}% completed, {len(z['present'])} added, {len(z['missing'])} missing.")
243
279
 
244
280
 
245
281
  # print summary:
@@ -251,6 +287,6 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
251
287
  logger.info(f"Maps: finished {len(maps_finished)} - partial {len(maps_partial)} - missing {len(maps_missing)} - noreac {len(maps_noreac)}")
252
288
 
253
289
 
254
- return 0
290
+ return df_coverage
255
291
 
256
292
 
@@ -141,6 +141,10 @@ def introduce_metabolites(logger, db, model, idcollection_dict, kegg_compound_to
141
141
  # save as list:
142
142
  for ankey in ankeys:
143
143
  m.annotation[ankey] = list(m.annotation[ankey])
144
+
145
+
146
+ # add SBO annotation
147
+ m.annotation['sbo'] = ['SBO:0000247'] # generic metabolite
144
148
 
145
149
 
146
150
 
@@ -264,7 +268,8 @@ def introduce_transporters(logger, db, model, idcollection_dict, kegg_reaction_t
264
268
  m_e.name = m_c.name
265
269
  m_e.formula = m_c.formula
266
270
  m_e.charge = m_c.charge
267
- m_e.annotation = m_c.annotation
271
+
272
+ m_e.annotation = m_c.annotation # transfer all annotations, including SBO!
268
273
 
269
274
 
270
275
  def add_exchange_reaction(model, mid_e):
@@ -283,6 +288,10 @@ def introduce_transporters(logger, db, model, idcollection_dict, kegg_reaction_t
283
288
  r.bounds = (-1000, 1000)
284
289
  else:
285
290
  r.bounds = (0, 1000)
291
+
292
+ # add SBO annotation
293
+ r.annotation['sbo'] = ['SBO:0000627'] # exchange reaction
294
+
286
295
 
287
296
 
288
297
 
@@ -418,6 +427,9 @@ def introduce_sinks_demands(logger, model):
418
427
  r.name = f"Sink for {model.metabolites.get_by_id(f'{puremid}_c').name}"
419
428
  r.build_reaction_from_string(f'{puremid}_c <=> ')
420
429
  r.bounds = (-1000, 1000)
430
+
431
+ # add SBO annotation
432
+ r.annotation['sbo'] = ['SBO:0000632'] # sink reaction
421
433
 
422
434
 
423
435
  for puremid in demands:
@@ -427,6 +439,9 @@ def introduce_sinks_demands(logger, model):
427
439
  r.name = f"Demand for {model.metabolites.get_by_id(f'{puremid}_c').name}"
428
440
  r.build_reaction_from_string(f'{puremid}_c --> ')
429
441
  r.bounds = (0, 1000)
442
+
443
+ # add SBO annotation
444
+ r.annotation['sbo'] = ['SBO:0000628'] # demand reaction
430
445
 
431
446
 
432
447
  return model
@@ -113,7 +113,7 @@ def main(args, logger):
113
113
 
114
114
  ###### RECONSTRUCTION
115
115
  # create the model
116
- universe = cobra.Model('newuni')
116
+ universe = cobra.Model('universe')
117
117
  logger.info("Parsing gsrap database...")
118
118
 
119
119
  # introduce M / R / T
@@ -153,9 +153,8 @@ def main(args, logger):
153
153
 
154
154
  ###### CHECKS 1
155
155
  # check universe completness
156
- setattr(args, 'zeroes', True) # old parameter, forced to True from v0.6.1
157
- response = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, args.zeroes, idcollection_dict, summary_dict)
158
- if response==1: return 1
156
+ df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, idcollection_dict, summary_dict)
157
+ if type(df_C)==int: return 1
159
158
 
160
159
 
161
160
 
@@ -194,7 +193,7 @@ def main(args, logger):
194
193
  cobra.io.write_sbml_model(universe, f'{args.outdir}/universe.xml') # groups are saved only to SBML
195
194
  logger.info(f"'{args.outdir}/universe.xml' created!")
196
195
  force_id_on_sbml(f'{args.outdir}/universe.xml', 'universe') # force introduction of the 'id=""' field
197
- sheets_dict = write_excel_model(universe, f'{args.outdir}/universe.parsedb.xlsx', df_E, None, None, df_S)
196
+ sheets_dict = write_excel_model(universe, f'{args.outdir}/universe.parsedb.xlsx', args.nofigs, df_E, None, None, df_S, df_C)
198
197
  logger.info(f"'{args.outdir}/universe.parsedb.xlsx' created!")
199
198
 
200
199
 
@@ -125,6 +125,13 @@ def add_reaction(logger, model, rid, row, kr_ids, kegg_reaction_to_others, addty
125
125
  r.annotation[ankey] = list(r.annotation[ankey])
126
126
 
127
127
 
128
+ # add SBO annotation
129
+ if addtype=='R':
130
+ r.annotation['sbo'] = ['SBO:0000176'] # metabolic reaction
131
+ else:
132
+ r.annotation['sbo'] = ['SBO:0000185'] # transport reaction
133
+
134
+
128
135
  # check if unbalanced
129
136
  if r.check_mass_balance() != {}:
130
137
  logger.error(f"{itemtype} '{r.id}' is unbalanced: {r.check_mass_balance()}.")
@@ -66,6 +66,9 @@ def translate_annotate_genes(logger, model, idcollection_dict):
66
66
  g.annotation['ec'] = list(ko_to_ecs[ko])
67
67
  g.annotation['cog'] = list(ko_to_cogs[ko])
68
68
  g.annotation['go'] = list(ko_to_gos[ko])
69
+
70
+ # add SBO annotation
71
+ g.annotation['sbo'] = ['SBO:0000243'] # demand reaction
69
72
 
70
73
 
71
74
 
@@ -1,3 +1,6 @@
1
+ from pathlib import Path
2
+
3
+
1
4
  import pandas as pnd
2
5
 
3
6
 
@@ -33,7 +36,7 @@ def parse_eggnog(model, eggnog, idcollection_dict):
33
36
 
34
37
 
35
38
 
36
- def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, idcollection_dict, summary_dict):
39
+ def check_completeness(logger, model, progress, module, focus, eggnog, idcollection_dict, summary_dict):
37
40
  # check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
38
41
 
39
42
 
@@ -43,10 +46,10 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
43
46
  for eggfile in eggnog:
44
47
  eggset = parse_eggnog(model, eggfile, idcollection_dict)
45
48
  kr_uni = kr_uni.union(eggset)
46
- kr_uni_label = f"'{len(eggnog)} eggnog annotations'"
49
+ kr_uni_label = f"{len(eggnog)} eggnog annotations"
47
50
  else:
48
51
  kr_uni = idcollection_dict['kr']
49
- kr_uni_label = "'whole KEGG'"
52
+ kr_uni_label = "whole KEGG"
50
53
 
51
54
 
52
55
  # get all the 'kr' annotations in the model
@@ -55,7 +58,22 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
55
58
  if 'kegg.reaction' in r.annotation.keys():
56
59
  for kr_id in r.annotation['kegg.reaction']:
57
60
  kr_ids_modeled.add(kr_id)
58
- logger.info(f"Universe coverage for {kr_uni_label}: {round(len(kr_ids_modeled.intersection(kr_uni))/len(kr_uni)*100, 0)}%!")
61
+ kr_uni_missing = kr_uni - kr_ids_modeled
62
+ kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
63
+ logger.info(f"Coverage for '{kr_uni_label}': {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
64
+
65
+
66
+ # define the map?????, containing krs not included in maps
67
+ krs_in_maps = set()
68
+ for i in summary_dict: krs_in_maps = krs_in_maps.union(i['kr_ids'])
69
+ krs_not_in_maps = idcollection_dict['kr'] - krs_in_maps
70
+ summary_dict.append({
71
+ 'map_id': 'map?????',
72
+ 'map_name': 'Not included in maps',
73
+ 'kr_ids': krs_not_in_maps,
74
+ 'cnt_r': len(krs_not_in_maps),
75
+ 'mds': []
76
+ })
59
77
 
60
78
 
61
79
  # get all the map / md codes:
@@ -109,52 +127,77 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
109
127
  missing_logger = (map_id, missing)
110
128
 
111
129
 
130
+ # put the map in the right bucket:
112
131
  if missing == set() and map_krs != set():
113
132
  maps_finished.add(map_id)
114
-
115
133
  elif map_krs == set():
116
134
  maps_noreac.add(map_id)
117
-
118
135
  elif missing == map_krs:
119
136
  maps_missing.add(map_id)
120
-
121
- if zeroes:
122
- list_coverage.append({
123
- 'map_id': map_id,
124
- 'map_name_short': map_name_short,
125
- 'perc_completeness': 0,
126
- 'perc_completeness_str': ' 0',
127
- 'present': present,
128
- 'missing': missing,
129
- 'md_ids': [j['md_id'] for j in i['mds']],
130
- })
131
-
132
137
  elif len(missing) < len(map_krs):
133
138
  maps_partial.add(map_id)
134
139
 
135
- # get '%' of completeness:
136
- perc_completeness = len(present)/len(map_krs)*100
137
- perc_completeness_str = str(round(perc_completeness)) # version to be printed
138
- if len(perc_completeness_str)==1:
139
- perc_completeness_str = ' ' + perc_completeness_str
140
-
141
- list_coverage.append({
142
- 'map_id': map_id,
143
- 'map_name_short': map_name_short,
144
- 'perc_completeness': perc_completeness,
145
- 'perc_completeness_str': perc_completeness_str,
146
- 'present': present,
147
- 'missing': missing,
148
- 'md_ids': [j['md_id'] for j in i['mds']],
149
- })
150
-
151
140
 
152
- # order list by '%' of completness and print:
141
+ # get '%' of completeness:
142
+ if len(map_krs) != 0: perc_completeness = len(present)/len(map_krs)*100
143
+ else: perc_completeness = 100 # for maps_noreac
144
+ perc_completeness_str = str(round(perc_completeness)) # version to be printed
145
+ if len(perc_completeness_str)==1:
146
+ perc_completeness_str = ' ' + perc_completeness_str
147
+
148
+
149
+ # append map to list:
150
+ list_coverage.append({
151
+ 'map_id': map_id,
152
+ 'map_name_short': map_name_short,
153
+ 'perc_completeness': perc_completeness,
154
+ 'perc_completeness_str': perc_completeness_str,
155
+ 'present': present,
156
+ 'missing': missing,
157
+ 'md_ids': [j['md_id'] for j in i['mds']],
158
+ })
159
+
160
+
161
+
162
+ # create coverage dataframe
163
+ if eggnog != '-' and len(eggnog) >= 2:
164
+ df_coverage = {}
165
+ for i in list_coverage:
166
+ for kr in i['present'].union(i['missing']):
167
+ if kr not in df_coverage.keys():
168
+ df_coverage[kr] = {'map_ids': set()}
169
+ df_coverage[kr]['map_ids'].add(i['map_id'])
170
+ df_coverage = pnd.DataFrame.from_records(df_coverage).T
171
+ df_coverage['modeled'] = False
172
+ for kr, row in df_coverage.iterrows():
173
+ if kr in kr_ids_modeled:
174
+ df_coverage.loc[kr, 'modeled'] = True
175
+ # build strain columns all at once
176
+ df_strains = [] # list of small DataFrames
177
+ for eggfile in eggnog:
178
+ strain = Path(eggfile).stem
179
+ eggset = parse_eggnog(model, eggfile, idcollection_dict)
180
+ col = df_coverage.index.to_series().isin(eggset).astype(int)
181
+ df_strains.append(col.rename(strain))
182
+ df_strains = pnd.concat(df_strains, axis=1)
183
+ # sort rows: upper rows are present in more strains
184
+ df_strains = df_strains.loc[df_strains.sum(axis=1).sort_values(ascending=False).index]
185
+ df_coverage = df_coverage.loc[df_strains.index]
186
+ df_coverage = pnd.concat([df_coverage, df_strains], axis=1)
187
+ # split in 2: modeled above, non-modeled below:
188
+ df_coverage = pnd.concat([df_coverage[df_coverage['modeled']==True], df_coverage[df_coverage['modeled']==False]])
189
+ else: # not interesting in a super-long table without strains in column
190
+ df_coverage = None
191
+
192
+
193
+
194
+ # order list by '%' of completness and print if needed:
153
195
  list_coverage = sorted(list_coverage, key=lambda x: x['perc_completeness'], reverse=True)
154
196
  for i in list_coverage:
155
197
  if progress:
156
198
  if focus=='-' or focus in i['md_ids'] or focus==i['map_id']:
157
- logger.info(f"{i['map_id']}: {i['map_name_short']} {i['perc_completeness_str']}% completed, {len(i['present'])} added, {len(i['missing'])} missing.")
199
+ if i['map_id'] in maps_missing or i['map_id'] in maps_partial:
200
+ logger.info(f"{i['map_id']}: {i['map_name_short']} {i['perc_completeness_str']}% completed, {len(i['present'])} added, {len(i['missing'])} missing.")
158
201
 
159
202
 
160
203
  # get the correspondent pathway element of the 'summary_dict'
@@ -196,50 +239,43 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
196
239
  missing_logger = (md_id, missing)
197
240
 
198
241
 
242
+ # put the map in the right bucket:
199
243
  if missing == set() and md_krs != set():
200
244
  mds_completed.add(md_id)
201
-
202
245
  elif md_krs == set():
203
246
  mds_noreac.add(md_id)
204
-
205
247
  elif missing == md_krs:
206
248
  mds_missing.add(md_id)
207
-
208
- if zeroes:
209
- list_coverage_md.append({
210
- 'md_id': md_id,
211
- 'md_name_short': md_name_short,
212
- 'perc_completeness': 0,
213
- 'perc_completeness_str': ' 0',
214
- 'present': present,
215
- 'missing': missing,
216
- })
217
-
218
249
  elif len(missing) < len(md_krs):
219
250
  mds_partial.add(md_id)
220
251
 
221
- # get '%' of completeness:
222
- perc_completeness = len(present)/len(md_krs)*100
223
- perc_completeness_str = str(round(perc_completeness)) # version to be printed
224
- if len(perc_completeness_str)==1:
225
- perc_completeness_str = ' ' + perc_completeness_str
252
+
253
+ # get '%' of completeness:
254
+ if len(md_krs) != 0: perc_completeness = len(present)/len(md_krs)*100
255
+ else: perc_completeness = 100 # for mds_noreac
256
+ perc_completeness_str = str(round(perc_completeness)) # version to be printed
257
+ if len(perc_completeness_str)==1:
258
+ perc_completeness_str = ' ' + perc_completeness_str
226
259
 
227
- list_coverage_md.append({
228
- 'md_id': md_id,
229
- 'md_name_short': md_name_short,
230
- 'perc_completeness': perc_completeness,
231
- 'perc_completeness_str': perc_completeness_str,
232
- 'present': present,
233
- 'missing': missing,
234
- })
260
+
261
+ # append md to list:
262
+ list_coverage_md.append({
263
+ 'md_id': md_id,
264
+ 'md_name_short': md_name_short,
265
+ 'perc_completeness': perc_completeness,
266
+ 'perc_completeness_str': perc_completeness_str,
267
+ 'present': present,
268
+ 'missing': missing,
269
+ })
235
270
 
236
271
 
237
- # order list by '%' of completness and print:
272
+ # order list by '%' of completness and print if needed:
238
273
  list_coverage_md = sorted(list_coverage_md, key=lambda x: x['perc_completeness'], reverse=True)
239
274
  for z in list_coverage_md:
240
275
  if module:
241
276
  if focus=='-' or focus==z['md_id']:
242
- logger.info(f"{spacer}{z['md_id']}: {z['md_name_short']} {z['perc_completeness_str']}% completed, {len(z['present'])} added, {len(z['missing'])} missing.")
277
+ if z['md_id'] in mds_missing or z['md_id'] in mds_partial:
278
+ logger.info(f"{spacer}{z['md_id']}: {z['md_name_short']} {z['perc_completeness_str']}% completed, {len(z['present'])} added, {len(z['missing'])} missing.")
243
279
 
244
280
 
245
281
  # print summary:
@@ -251,6 +287,6 @@ def check_completeness(logger, model, progress, module, focus, eggnog, zeroes, i
251
287
  logger.info(f"Maps: finished {len(maps_finished)} - partial {len(maps_partial)} - missing {len(maps_missing)} - noreac {len(maps_noreac)}")
252
288
 
253
289
 
254
- return 0
290
+ return df_coverage
255
291
 
256
292