gsrap 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. gsrap/.ipynb_checkpoints/__init__-checkpoint.py +5 -1
  2. gsrap/__init__.py +5 -1
  3. gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
  4. gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
  5. gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
  6. gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +94 -37
  7. gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +119 -0
  8. gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
  9. gsrap/commons/__init__.py +1 -0
  10. gsrap/commons/downloads.py +1 -1
  11. gsrap/commons/escherutils.py +1 -1
  12. gsrap/commons/excelhub.py +94 -37
  13. gsrap/commons/figures.py +119 -0
  14. gsrap/commons/keggutils.py +145 -0
  15. gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +64 -20
  16. gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
  17. gsrap/mkmodel/mkmodel.py +64 -20
  18. gsrap/mkmodel/pruner.py +72 -7
  19. gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +124 -64
  20. gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +8 -0
  21. gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +12 -5
  22. gsrap/parsedb/completeness.py +124 -64
  23. gsrap/parsedb/introduce.py +8 -0
  24. gsrap/parsedb/parsedb.py +12 -5
  25. gsrap/runsims/.ipynb_checkpoints/simplegrowth-checkpoint.py +2 -2
  26. gsrap/runsims/simplegrowth.py +2 -2
  27. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/METADATA +3 -1
  28. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/RECORD +31 -27
  29. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/LICENSE.txt +0 -0
  30. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/WHEEL +0 -0
  31. {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/entry_points.txt +0 -0
@@ -72,9 +72,10 @@ def main():
72
72
  parsedb_parser.add_argument("--precursors", action='store_true', help="Verify biosynthesis of biomass precursors and show blocked ones.")
73
73
  parsedb_parser.add_argument("--biosynth", action='store_true', help="Check biosynthesis of all metabolites and detect dead-ends.")
74
74
  parsedb_parser.add_argument("-e", "--eggnog", nargs='+', metavar='', type=str, default='-', help="Path to the optional eggnog-mapper annotation table(s).")
75
- #parsedb_parser.add_argument("-z", "--zeroes", action='store_true', help="Show maps/modules with 0%% coverage, in addition to partials (use only with --progress).")
75
+ parsedb_parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code.")
76
76
  parsedb_parser.add_argument("--goodbefore", metavar='', type=str, default='-', help="Syntax is {pure_mid}-{rid1}-{rid2}. From top to bottom, build the universe until reaction {rid1}, transport {rid2} and metabolite {pure_mid} are reached.")
77
77
  parsedb_parser.add_argument("--onlyauthor", metavar='', type=str, default='-', help="Build the universe by parsing contents of the specified author ID only. Contents affected by --goodbefore are parsed anyway.")
78
+ parsedb_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")
78
79
 
79
80
 
80
81
  # add arguments for the 'mkmodel' command
@@ -84,6 +85,7 @@ def main():
84
85
  mkmodel_parser.add_argument("-c", "--cores", metavar='', type=int, default=0, help="Number of cores to use (if 0, use all available cores).")
85
86
  mkmodel_parser.add_argument("-o", "--outdir", metavar='', type=str, default='./', help="Main output directory (will be created if not existing).")
86
87
  mkmodel_parser.add_argument("-e", "--eggnog", nargs='+', metavar='', type=str, default='-', help="Path to the eggnog-mapper annotation table(s).")
88
+ mkmodel_parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code.")
87
89
  mkmodel_parser.add_argument("-u", "--universe", metavar='', type=str, default='-', help="Path to the universe model (SBML format).")
88
90
  mkmodel_parser.add_argument("-i", "--force_inclusion", metavar='', type=str, default='-', help="Force the inclusion of the specified reactions (comma-separated IDs).")
89
91
  mkmodel_parser.add_argument("-f", "--gap_fill", metavar='', type=str, default='-', help="Media to use during gap-filling (comma-separated IDs); if not provided, gap-filling will be skipped.")
@@ -94,6 +96,7 @@ def main():
94
96
  mkmodel_parser.add_argument("--conditional", metavar='', type=float, default=0.5, help="Expected minimum fraction of reactions in a biosynthetic pathway for an actually present conditional biomass precursor.")
95
97
  mkmodel_parser.add_argument("--biosynth", action='store_true', help="Check biosynthesis of all metabolites and detect dead-ends.")
96
98
  mkmodel_parser.add_argument("-b", "--biomass", metavar='', type=str, default='-', help="Strain ID associated to experimental biomass data.")
99
+ mkmodel_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")
97
100
 
98
101
 
99
102
  # add arguments for the 'runsims' command
@@ -110,6 +113,7 @@ def main():
110
113
  runsims_parser.add_argument("--omission", action='store_true', help="Perform single omission experiments to study auxotrophies.")
111
114
  runsims_parser.add_argument("--essential", action='store_true', help="Predict essential genes (single-gene knock-out simulations).")
112
115
  runsims_parser.add_argument("--factors", action='store_true', help="Predict putative growth factors.")
116
+ runsims_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")
113
117
 
114
118
 
115
119
  # check the inputted subcommand, automatic sys.exit(1) if a bad subprogram was specied.
gsrap/__init__.py CHANGED
@@ -72,9 +72,10 @@ def main():
72
72
  parsedb_parser.add_argument("--precursors", action='store_true', help="Verify biosynthesis of biomass precursors and show blocked ones.")
73
73
  parsedb_parser.add_argument("--biosynth", action='store_true', help="Check biosynthesis of all metabolites and detect dead-ends.")
74
74
  parsedb_parser.add_argument("-e", "--eggnog", nargs='+', metavar='', type=str, default='-', help="Path to the optional eggnog-mapper annotation table(s).")
75
- #parsedb_parser.add_argument("-z", "--zeroes", action='store_true', help="Show maps/modules with 0%% coverage, in addition to partials (use only with --progress).")
75
+ parsedb_parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code.")
76
76
  parsedb_parser.add_argument("--goodbefore", metavar='', type=str, default='-', help="Syntax is {pure_mid}-{rid1}-{rid2}. From top to bottom, build the universe until reaction {rid1}, transport {rid2} and metabolite {pure_mid} are reached.")
77
77
  parsedb_parser.add_argument("--onlyauthor", metavar='', type=str, default='-', help="Build the universe by parsing contents of the specified author ID only. Contents affected by --goodbefore are parsed anyway.")
78
+ parsedb_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")
78
79
 
79
80
 
80
81
  # add arguments for the 'mkmodel' command
@@ -84,6 +85,7 @@ def main():
84
85
  mkmodel_parser.add_argument("-c", "--cores", metavar='', type=int, default=0, help="Number of cores to use (if 0, use all available cores).")
85
86
  mkmodel_parser.add_argument("-o", "--outdir", metavar='', type=str, default='./', help="Main output directory (will be created if not existing).")
86
87
  mkmodel_parser.add_argument("-e", "--eggnog", nargs='+', metavar='', type=str, default='-', help="Path to the eggnog-mapper annotation table(s).")
88
+ mkmodel_parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code.")
87
89
  mkmodel_parser.add_argument("-u", "--universe", metavar='', type=str, default='-', help="Path to the universe model (SBML format).")
88
90
  mkmodel_parser.add_argument("-i", "--force_inclusion", metavar='', type=str, default='-', help="Force the inclusion of the specified reactions (comma-separated IDs).")
89
91
  mkmodel_parser.add_argument("-f", "--gap_fill", metavar='', type=str, default='-', help="Media to use during gap-filling (comma-separated IDs); if not provided, gap-filling will be skipped.")
@@ -94,6 +96,7 @@ def main():
94
96
  mkmodel_parser.add_argument("--conditional", metavar='', type=float, default=0.5, help="Expected minimum fraction of reactions in a biosynthetic pathway for an actually present conditional biomass precursor.")
95
97
  mkmodel_parser.add_argument("--biosynth", action='store_true', help="Check biosynthesis of all metabolites and detect dead-ends.")
96
98
  mkmodel_parser.add_argument("-b", "--biomass", metavar='', type=str, default='-', help="Strain ID associated to experimental biomass data.")
99
+ mkmodel_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")
97
100
 
98
101
 
99
102
  # add arguments for the 'runsims' command
@@ -110,6 +113,7 @@ def main():
110
113
  runsims_parser.add_argument("--omission", action='store_true', help="Perform single omission experiments to study auxotrophies.")
111
114
  runsims_parser.add_argument("--essential", action='store_true', help="Predict essential genes (single-gene knock-out simulations).")
112
115
  runsims_parser.add_argument("--factors", action='store_true', help="Predict putative growth factors.")
116
+ runsims_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")
113
117
 
114
118
 
115
119
  # check the inputted subcommand, automatic sys.exit(1) if a bad subprogram was specied.
@@ -7,3 +7,4 @@ from .metrics import *
7
7
  from .sbmlutils import *
8
8
  from .escherutils import *
9
9
  from .logutils import *
10
+ from .keggutils import *
@@ -236,7 +236,7 @@ def format_expansion(logger, eggnog):
236
236
 
237
237
 
238
238
  if eggnog == [] or eggnog == ['-']:
239
- eggnog = '-' # return always a list except for ths case
239
+ eggnog = '-' # return always a list except for this case
240
240
 
241
241
 
242
242
  return eggnog
@@ -31,7 +31,7 @@ def count_undrawn_rids(logger, universe, lastmap):
31
31
  filename = lastmap['filename']
32
32
  logger.debug(f"Last universal map version detected: '{filename}'.")
33
33
  if len(remainings) > 0:
34
- logger.info(f"Our universal map is {len(remainings)} reactions behind. Please draw!")
34
+ logger.warning(f"Our universal map is {len(remainings)} reactions behind. Please draw!")
35
35
  else:
36
36
  logger.info(f"Our universal map is {len(remainings)} reactions behind. Thank you ♥")
37
37
 
@@ -1,14 +1,20 @@
1
1
  import pandas as pnd
2
2
 
3
3
 
4
+ from .figures import figure_df_C_F1
4
5
 
5
- def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
6
+
7
+
8
+ def write_excel_model(model, filepath, nofigs, df_E, df_B, df_P, df_S, df_C=None):
6
9
 
7
- df_M = []
8
- df_R = []
9
- df_T = []
10
- df_A = []
11
10
 
11
+ # generate figures
12
+ if nofigs == False:
13
+
14
+ if df_C is not None:
15
+ df_C_F1 = figure_df_C_F1(df_C)
16
+
17
+
12
18
 
13
19
  # format df_E: # biomass precursors biosynthesis
14
20
  if df_E is not None:
@@ -33,64 +39,112 @@ def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
33
39
  df_S.insert(0, 'mid', '') # new columns as first
34
40
  df_S['mid'] = df_S.index
35
41
  df_S = df_S.reset_index(drop=True)
42
+
43
+ # format df_C: universal reaction coverage
44
+ if df_C is not None:
45
+ df_C.insert(0, 'kr', '') # new columns as first
46
+ df_C['kr'] = df_C.index
47
+ df_C = df_C.reset_index(drop=True)
36
48
 
37
49
 
50
+
51
+ # define dict-lists, future dataframes
52
+ df_M = []
53
+ df_R = []
54
+ df_T = []
55
+ df_G = []
56
+ df_A = []
57
+
38
58
  for m in model.metabolites:
59
+ row_dict = {'mid': m.id, 'name': m.name, 'formula': m.formula, 'charge': m.charge,}
39
60
 
40
- # get kc codes:
41
- if 'kegg.compound' not in m.annotation.keys(): kc_ids = ''
42
- else:
43
- kc_ids = m.annotation['kegg.compound']
44
- if type(kc_ids) == str: kc_ids = [kc_ids]
45
- kc_ids = '; '.join([i for i in kc_ids if i!='CXXXXX'])
46
-
47
- df_M.append({'mid': m.id, 'formula': m.formula, 'charge': m.charge, 'kc': kc_ids, 'name': m.name})
48
-
61
+ for db in m.annotation.keys():
62
+ annots = m.annotation[db]
63
+ if type(annots) == str: annots = [annots]
64
+ annots = '; '.join([i for i in annots])
65
+ row_dict[db] = annots
66
+ df_M.append(row_dict)
49
67
 
50
68
  for r in model.reactions:
69
+ row_dict = {'rid': r.id, 'name': r.name, 'rstring': r.reaction, 'gpr': "Not applicable", 'bounds': r.bounds}
70
+
71
+ for db in r.annotation.keys():
72
+ annots = r.annotation[db]
73
+ if type(annots) == str: annots = [annots]
74
+ annots = '; '.join([i for i in annots])
75
+ row_dict[db] = annots
51
76
 
52
77
  # handle artificial reactions
53
78
  if r.id == 'Biomass':
54
- df_A.append({'rid': r.id, 'rstring': r.reaction, 'type': 'biomass', 'name': r.name})
79
+ # commented as the type is inplicit in the ID
80
+ #row_dict['type'] = 'biomass'
81
+ df_A.append(row_dict)
55
82
 
56
83
  elif len(r.metabolites) == 1:
84
+ # commented as the type is inplicit in the ID
85
+ """
57
86
  if len(r.metabolites)==1 and list(r.metabolites)[0].id.rsplit('_',1)[-1] == 'e':
58
- df_A.append({'rid': r.id, 'rstring': r.reaction, 'type': 'exchange', 'name': r.name})
87
+ row_dict['type'] = 'exchange'
59
88
  elif r.lower_bound < 0 and r.upper_bound > 0:
60
- df_A.append({'rid': r.id, 'rstring': r.reaction, 'type': 'sink', 'name': r.name})
89
+ row_dict['type'] = 'sink'
61
90
  elif r.lower_bound == 0 and r.upper_bound > 0:
62
- df_A.append({'rid': r.id, 'rstring': r.reaction, 'type': 'demand', 'name': r.name})
91
+ row_dict['type'] = 'demand'
92
+ """
93
+ df_A.append(row_dict)
63
94
 
64
95
  else: # more than 1 metabolite involved
96
+ row_dict['gpr'] = r.gene_reaction_rule
65
97
 
66
- # get kr codes:
67
- if 'kegg.reaction' not in r.annotation.keys(): kr_ids = ''
68
- else:
69
- kr_ids = r.annotation['kegg.reaction']
70
- if type(kr_ids) == str: kr_ids = [kr_ids]
71
- kr_ids = '; '.join([i for i in kr_ids if i!='RXXXXX'])
72
-
73
98
  # introduce reaction in the correct table:
74
- r_dict = {'rid': r.id, 'rstring': r.reaction, 'kr': kr_ids, 'gpr': r.gene_reaction_rule, 'name': r.name}
75
99
  if len(set([m.id.rsplit('_',1)[-1] for m in r.metabolites])) == 1:
76
- df_R.append(r_dict)
77
- else: df_T.append(r_dict)
78
-
100
+ df_R.append(row_dict)
101
+ else: df_T.append(row_dict)
102
+
103
+ for g in model.genes:
104
+ row_dict = {'gid': g.id, 'involved_in': '; '.join([r.id for r in g.reactions])}
105
+
106
+ for db in g.annotation.keys():
107
+ annots = g.annotation[db]
108
+ if type(annots) == str: annots = [annots]
109
+ annots = '; '.join([i for i in annots])
110
+ row_dict[db] = annots
111
+ df_G.append(row_dict)
79
112
 
113
+ # create dataframes from dict-lists
80
114
  df_M = pnd.DataFrame.from_records(df_M)
81
115
  df_R = pnd.DataFrame.from_records(df_R)
82
116
  df_T = pnd.DataFrame.from_records(df_T)
83
117
  df_A = pnd.DataFrame.from_records(df_A)
84
- with pnd.ExcelWriter(filepath) as writer:
118
+ df_G = pnd.DataFrame.from_records(df_G)
119
+
120
+ # sort columns
121
+ df_M_first_cols = ['mid', 'name', 'formula', 'charge']
122
+ df_M = df_M[df_M_first_cols + sorted([c for c in df_M.columns if c not in df_M_first_cols])]
123
+ df_R_first_cols = ['rid', 'name', 'rstring', 'gpr', 'bounds']
124
+ df_R = df_R[df_R_first_cols + sorted([c for c in df_R.columns if c not in df_R_first_cols])]
125
+ df_T = df_T[df_R_first_cols + sorted([c for c in df_T.columns if c not in df_R_first_cols])]
126
+ df_A = df_A[df_R_first_cols + sorted([c for c in df_A.columns if c not in df_R_first_cols])]
127
+ df_G_first_cols = ['gid', 'involved_in']
128
+ df_G = df_G[df_G_first_cols + sorted([c for c in df_G.columns if c not in df_G_first_cols])]
129
+
130
+
131
+
132
+ with pnd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
85
133
  df_M.to_excel(writer, sheet_name='Metabolites', index=False)
86
134
  df_R.to_excel(writer, sheet_name='Reactions', index=False)
87
135
  df_T.to_excel(writer, sheet_name='Transporters', index=False)
136
+ df_G.to_excel(writer, sheet_name='Genes', index=False)
88
137
  df_A.to_excel(writer, sheet_name='Artificials', index=False)
89
138
  if df_E is not None and len(df_E)!=0: df_E.to_excel(writer, sheet_name='Precursors', index=False)
90
139
  if df_B is not None: df_B.to_excel(writer, sheet_name='Biomass', index=False)
91
140
  if df_P is not None and len(df_P)!=0: df_P.to_excel(writer, sheet_name='Biolog®', index=False)
92
141
  if df_S is not None and len(df_S.columns)>2: df_S.to_excel(writer, sheet_name='Biosynth', index=False)
93
-
142
+ if df_C is not None:
143
+ df_C.to_excel(writer, sheet_name='Coverage', index=False)
144
+ if nofigs == False:
145
+ worksheet = writer.sheets['Coverage']
146
+ worksheet.insert_image('E3', 'df_C_F1.png', {'image_data': df_C_F1})
147
+
94
148
 
95
149
  sheets_dict = {
96
150
  'model_id': model.id,
@@ -102,6 +156,7 @@ def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
102
156
  'Biomass': df_B,
103
157
  'Biolog': df_P,
104
158
  'Biosynth': df_S,
159
+ 'Coverage': df_C,
105
160
  }
106
161
  return sheets_dict
107
162
 
@@ -115,9 +170,10 @@ def comparative_table(logger, outdir, sheets_dicts):
115
170
  for sheets_dict in sheets_dicts:
116
171
  for index, row in sheets_dict['Reactions'].iterrows():
117
172
  if row['rid'] not in df_topology.index:
118
- df_topology.loc[row['rid'], 'rstring'] = row['rstring']
119
- df_topology.loc[row['rid'], 'kr'] = row['kr']
120
- df_topology.loc[row['rid'], 'name'] = row['name']
173
+ df_topology.loc[row['rid'], 'rid'] = row['rid']
174
+ for key, value in row.to_dict().items():
175
+ # force string to avoid errors with bounds
176
+ df_topology.loc[row['rid'], key] = '' if pnd.isna(value) else str(value)
121
177
  df_topology.loc[row['rid'], sheets_dict['model_id']] = 1
122
178
  for sheets_dict in sheets_dicts: # replace missing values:
123
179
  df_topology = df_topology.fillna({sheets_dict['model_id']: 0})
@@ -128,9 +184,10 @@ def comparative_table(logger, outdir, sheets_dicts):
128
184
  for sheets_dict in sheets_dicts:
129
185
  for index, row in sheets_dict['Reactions'].iterrows():
130
186
  if row['rid'] not in df_gprs.index:
131
- df_gprs.loc[row['rid'], 'rstring'] = row['rstring']
132
- df_gprs.loc[row['rid'], 'kr'] = row['kr']
133
- df_gprs.loc[row['rid'], 'name'] = row['name']
187
+ df_gprs.loc[row['rid'], 'rid'] = row['rid']
188
+ for key, value in row.to_dict().items():
189
+ # force string to avoid errors with bounds
190
+ df_gprs.loc[row['rid'], key] = '' if pnd.isna(value) else str(value)
134
191
  df_gprs.loc[row['rid'], sheets_dict['model_id']] = row['gpr']
135
192
  for sheets_dict in sheets_dicts: # replace missing values:
136
193
  df_gprs = df_gprs.fillna({sheets_dict['model_id']: 'missing'})
@@ -0,0 +1,119 @@
1
+ from io import BytesIO
2
+
3
+ import numpy as np
4
+ import pandas as pnd
5
+
6
+ from scipy.spatial.distance import pdist
7
+ from scipy.cluster.hierarchy import linkage, cut_tree, dendrogram, leaves_list
8
+
9
+ import matplotlib.pyplot as plt
10
+ from matplotlib.patches import Patch
11
+
12
+
13
+
14
def figure_df_C_F1(df_coverage):
    """Draw the reaction-coverage figure and return it as an in-memory image.

    The figure has a dendrogram of the strains (left), a heatmap of the
    strain-by-reaction matrix (right) and, on top of the heatmap, a
    one-row bar showing which reactions are flagged as modeled.

    Parameters
    ----------
    df_coverage : pandas.DataFrame
        One row per reaction. Must contain a 'modeled' column; the column
        'map_ids' is ignored if present; every remaining column is treated
        as one strain.
        NOTE(review): the strain columns are presumably 0/1 presence
        flags (they are clustered with the Jaccard metric) -- confirm
        against the caller.

    Returns
    -------
    io.BytesIO
        Buffer holding the rendered figure, rewound to position 0.
    """

    # prepare the binary matrix:
    modeled_rs = df_coverage.index[df_coverage['modeled'].eq(True)]
    unmodeled_rs = df_coverage.index[df_coverage['modeled'].eq(False)]
    # remove useless columns
    strain_cols = [c for c in df_coverage.columns if c not in ['map_ids', 'modeled']]
    bin_matrix = df_coverage[strain_cols]
    # sort rows: upper rows are present in more strains
    by_abundance = bin_matrix.sum(axis=1).sort_values(ascending=False).index
    bin_matrix = bin_matrix.loc[by_abundance]
    # split in 2: modeled above, non-modeled below (relative order is kept):
    bin_matrix = pnd.concat([
        bin_matrix.loc[bin_matrix.index.isin(modeled_rs)],
        bin_matrix.loc[bin_matrix.index.isin(unmodeled_rs)],
    ])
    strains = bin_matrix.columns
    bin_matrix = bin_matrix.T  # features in column

    # Calculate the linkage matrix using Ward clustering and Jaccard dissimilarity.
    # With fewer than 2 strains there are no pairwise distances and linkage()
    # raises on the empty distance vector, so clustering is skipped.
    # NOTE(review): Ward linkage is formally defined for Euclidean distances;
    # the Jaccard/Ward combination is kept as originally written.
    linkage_matrix = None
    if bin_matrix.shape[0] >= 2:
        distances = pdist(bin_matrix, 'jaccard')
        linkage_matrix = linkage(distances, method='ward')

    # PART 0: create the frame
    fig, axs = plt.subplots(
        nrows=2, ncols=2,
        figsize=(15, 10),
        gridspec_kw={  # subplots width/height proportions.
            'width_ratios': [0.5, 1.0],
            'height_ratios': [0.015, 0.985],
        },
    )

    if linkage_matrix is not None:
        # PART 1: dendrogram
        dendrogram(
            linkage_matrix, ax=axs[1, 0],
            orientation='left',
            color_threshold=0, above_threshold_color='black',
        )
        # leaves are returned in the inverse sense with respect to the
        # drawn dendrogram, hence the flip; rows are reordered by position.
        ord_leaves = np.flip(leaves_list(linkage_matrix))
        bin_matrix = bin_matrix.iloc[ord_leaves, :]

    ### PART 2: heatmap
    axs[1, 1].matshow(
        bin_matrix,
        cmap='viridis',
        aspect='auto',  # non-squared pixels to fit the axis
    )

    ### PART 3: coverage bar (same reaction order as the heatmap columns)
    axs[0, 1].matshow(
        df_coverage.loc[bin_matrix.columns, ['modeled']].T,
        cmap='cool_r',
        aspect='auto',  # non-squared pixels to fit the axis
    )

    ### PART 4: legends
    cmap_feat = plt.colormaps.get_cmap('viridis')
    cmap_cov = plt.colormaps.get_cmap('cool_r')
    legend_feat = [
        Patch(facecolor=cmap_feat(0.0), edgecolor='black', label='Absent'),
        Patch(facecolor=cmap_feat(1.0), edgecolor='black', label='Probably present'),
    ]
    legend_cov = [
        Patch(facecolor=cmap_cov(0.0), edgecolor='black', label='Not modeled'),
        Patch(facecolor=cmap_cov(1.0), edgecolor='black', label='Modeled'),
    ]
    l1 = axs[1, 0].legend(handles=legend_cov, title='Universe coverage', loc='upper left')
    # a second legend() call replaces the first one on the axis...
    axs[1, 0].legend(handles=legend_feat, title='KEGG reaction in strain', loc='lower left')
    axs[1, 0].add_artist(l1)  # ...so re-add the first to keep both legends visible

    ### PART 5: aesthetics
    plt.subplots_adjust(wspace=0, hspace=0)  # adjust the space between subplots
    axs[0, 0].axis('off')  # remove frame and axis
    axs[1, 0].axis('off')  # remove frame and axis

    axs[0, 1].yaxis.set_visible(False)  # remove ticks, tick labels, axis label

    axs[1, 1].xaxis.set_ticks([])  # remove ticks
    axs[1, 1].set_xticklabels([])  # remove tick labels
    axs[1, 1].xaxis.set_label_position("bottom")
    axs[1, 1].set_xlabel("KEGG reactions")

    axs[1, 1].yaxis.set_ticks([])  # remove ticks
    axs[1, 1].set_yticklabels([])  # remove tick labels
    axs[1, 1].yaxis.set_label_position("right")
    axs[1, 1].set_ylabel(f"{len(strains)} strains", rotation=270, labelpad=13)  # labelpad is in points (1 point = 1/72 inch)

    ### PART 6: save fig
    buf = BytesIO()
    fig.savefig(buf, dpi=300, bbox_inches='tight')  # 'tight' crops the surrounding whitespace
    plt.close(fig)  # release the figure
    buf.seek(0)  # rewind the buffer to the beginning

    return buf
@@ -0,0 +1,145 @@
1
+ import time
2
+ import os
3
+ import sys
4
+ import pickle
5
+
6
+
7
+ import pandas as pnd
8
+ from Bio.KEGG import REST
9
+
10
+
11
+
12
def download_keggorg(logger, keggorg='lpl', outdir='./', ):
    """Download every gene entry of a KEGG organism and cache it as a pickle.

    The gene list of `keggorg` is requested with `REST.kegg_list`, the
    entries are then fetched in batches with `REST.kegg_get`, parsed into
    a pandas DataFrame (one row per entry) and pickled to
    '<outdir>/<keggorg>.keggorg'. If that file already exists nothing is
    downloaded.

    Parameters
    ----------
    logger : object exposing info() / warning() / error()
    keggorg : str
        KEGG organism code.
    outdir : str
        Directory receiving the '<keggorg>.keggorg' file.

    Returns
    -------
    int
        0 on success (or when the file already exists); 1 when the gene
        list cannot be retrieved or a batch fails 5 times in a row.
    """

    # check if already downloaded
    outfile = os.path.join(outdir, f'{keggorg}.keggorg')
    if os.path.exists(outfile):
        logger.info(f"Organism code '{keggorg}' already downloaded ('{outfile}').")
        return 0

    # download entire txt:
    logger.info(f"Verifying existence of organism code '{keggorg}' on KEGG...")
    time.sleep(0.5)  # be respectful
    try:
        response = REST.kegg_list(keggorg).read()
    except Exception:  # not a bare except: Ctrl-C must still interrupt
        logger.error(f"Organism code '{keggorg}' not found in KEGG database.")
        return 1
    # response is now a string similar to:
    # lpl:lp_0026   CDS   31317..32084               hydrolase, HAD superfamily, Cof family
    # lpl:lp_0027   CDS   complement(32236..32907)   pgmB1; beta-phosphoglucomutase

    # extract the gene IDs list:
    gene_ids = [line.split('\t')[0] for line in response.strip().split('\n')]
    # example of gene_id: "lpl:lp_0005"
    logger.info(f"Respectfully downloading {len(gene_ids)} genes from KEGG...")

    # respectfully download in batch
    # 10 is the max number of elements that can be downloaded
    batch_size = 10
    n_batches = -(-len(gene_ids) // batch_size)  # ceiling division

    n_attempts = 5  # consecutive failures tolerated for a single batch
    attempts_left = n_attempts
    default_sleep = 0.5
    sleep_time = default_sleep

    completed_batches = 0
    completed_genes = 0
    res_string_list = []
    while completed_batches < n_batches:

        # be respectful
        time.sleep(sleep_time)

        # extract batch (slicing already clamps to the end of the list)
        start_index = completed_batches * batch_size
        curr_batch = gene_ids[start_index: start_index + batch_size]

        # download batch
        try:
            res_string = REST.kegg_get(curr_batch).read()
        except Exception:
            attempts_left -= 1
            sleep_time = default_sleep * 4  # increase sleep time to be more respectful
            logger.warning(f"An error occurred during kegg_get() of batch {curr_batch}. Remaining attempts: {attempts_left}.")
            if attempts_left == 0:
                logger.error("No attemps left! Shutting down...")
                return 1
            continue  # retry the same batch

        res_string_list.extend(_split_kegg_entries(res_string))
        completed_batches += 1
        completed_genes += len(curr_batch)

        print(f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!", end='\r', file=sys.stderr)

        # a success resets the retry budget and the delay
        attempts_left = n_attempts
        sleep_time = default_sleep

    # hide last progress trace by overwriting it with blanks:
    last_trace = f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!"
    whitewash = ' ' * len(last_trace)
    print(whitewash, end='\r', file=sys.stderr)

    # extract info into a formatted df:
    df = pnd.DataFrame.from_records([_parse_kegg_entry(entry) for entry in res_string_list])

    # save dataframe in the output dir:
    with open(outfile, 'wb') as wb_handler:
        pickle.dump(df, wb_handler)
    logger.info(f"'{outfile}' created!")

    return 0


def _split_kegg_entries(res_string):
    """Split a KEGG flat-file response into one text chunk per entry.

    Entries are delimited by a line holding only '///'. Blank lines are
    dropped, so the result is the same whether or not the terminator is
    followed by an empty line.
    """
    entries = []
    current = []
    for line in res_string.split('\n'):
        if line.strip() == '///':
            if current:
                entries.append('\n'.join(current))
            current = []
        elif line != '':
            current.append(line)
    if current:  # tolerate a missing final terminator
        entries.append('\n'.join(current))
    return entries


def _parse_kegg_entry(entry):
    """Parse one KEGG gene entry into a dict ('gid', 'pos', 'ko', plus one list per MOTIF/DBLINKS database)."""
    entry_dict = {}
    curr_field = None  # field name of the last line that carried one
    curr_db = None     # last database seen inside MOTIF / DBLINKS

    for line in entry.split('\n'):
        if line == '':
            continue

        # the first 12 columns hold the field name; they are blank on continuation lines
        field = line[:12].strip()
        content = line[12:]
        if field:
            curr_field = field
            curr_db = None

        if curr_field == 'ENTRY':
            entry_dict['gid'] = content.split(' ', 1)[0]

        elif curr_field == 'POSITION':
            entry_dict['pos'] = content.strip()

        elif curr_field == 'ORTHOLOGY':
            entry_dict['ko'] = content.split(' ', 1)[0]

        elif curr_field in ('MOTIF', 'DBLINKS'):
            text = content.strip()
            if ': ' in text:
                curr_db, value = text.split(': ', 1)
                entry_dict[curr_db] = value.split(' ')
            elif curr_db is not None:
                # wrapped line without 'db: ' prefix: it continues the previous database
                entry_dict[curr_db].extend(text.split(' '))

    return entry_dict
gsrap/commons/__init__.py CHANGED
@@ -7,3 +7,4 @@ from .metrics import *
7
7
  from .sbmlutils import *
8
8
  from .escherutils import *
9
9
  from .logutils import *
10
+ from .keggutils import *
@@ -236,7 +236,7 @@ def format_expansion(logger, eggnog):
236
236
 
237
237
 
238
238
  if eggnog == [] or eggnog == ['-']:
239
- eggnog = '-' # return always a list except for ths case
239
+ eggnog = '-' # return always a list except for this case
240
240
 
241
241
 
242
242
  return eggnog
@@ -31,7 +31,7 @@ def count_undrawn_rids(logger, universe, lastmap):
31
31
  filename = lastmap['filename']
32
32
  logger.debug(f"Last universal map version detected: '{filename}'.")
33
33
  if len(remainings) > 0:
34
- logger.info(f"Our universal map is {len(remainings)} reactions behind. Please draw!")
34
+ logger.warning(f"Our universal map is {len(remainings)} reactions behind. Please draw!")
35
35
  else:
36
36
  logger.info(f"Our universal map is {len(remainings)} reactions behind. Thank you ♥")
37
37