gsrap 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
- gsrap/.ipynb_checkpoints/__init__-checkpoint.py +5 -1
- gsrap/__init__.py +5 -1
- gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
- gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +94 -37
- gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +119 -0
- gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
- gsrap/commons/__init__.py +1 -0
- gsrap/commons/downloads.py +1 -1
- gsrap/commons/escherutils.py +1 -1
- gsrap/commons/excelhub.py +94 -37
- gsrap/commons/figures.py +119 -0
- gsrap/commons/keggutils.py +145 -0
- gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +64 -20
- gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
- gsrap/mkmodel/mkmodel.py +64 -20
- gsrap/mkmodel/pruner.py +72 -7
- gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +124 -64
- gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +8 -0
- gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +12 -5
- gsrap/parsedb/completeness.py +124 -64
- gsrap/parsedb/introduce.py +8 -0
- gsrap/parsedb/parsedb.py +12 -5
- gsrap/runsims/.ipynb_checkpoints/simplegrowth-checkpoint.py +2 -2
- gsrap/runsims/simplegrowth.py +2 -2
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/METADATA +3 -1
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/RECORD +31 -27
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/LICENSE.txt +0 -0
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/WHEEL +0 -0
- {gsrap-0.7.1.dist-info → gsrap-0.8.0.dist-info}/entry_points.txt +0 -0
gsrap/commons/excelhub.py
CHANGED
@@ -1,14 +1,20 @@
 import pandas as pnd


+from .figures import figure_df_C_F1

-def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
+
+
+def write_excel_model(model, filepath, nofigs, df_E, df_B, df_P, df_S, df_C=None):

-    df_M = []
-    df_R = []
-    df_T = []
-    df_A = []

+    # generate figures
+    if nofigs == False:
+
+        if df_C is not None:
+            df_C_F1 = figure_df_C_F1(df_C)
+
+

     # format df_E: # biomass precursors biosynthesis
     if df_E is not None:
@@ -33,64 +39,112 @@ def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
         df_S.insert(0, 'mid', '') # new columns as first
         df_S['mid'] = df_S.index
         df_S = df_S.reset_index(drop=True)
+
+    # format df_C: universal reaction coverage
+    if df_C is not None:
+        df_C.insert(0, 'kr', '') # new columns as first
+        df_C['kr'] = df_C.index
+        df_C = df_C.reset_index(drop=True)


+
+    # define dict-lists, future dataframes
+    df_M = []
+    df_R = []
+    df_T = []
+    df_G = []
+    df_A = []
+
     for m in model.metabolites:
+        row_dict = {'mid': m.id, 'name': m.name, 'formula': m.formula, 'charge': m.charge,}

-
-
-
-
-
-
-
-        df_M.append({'mid': m.id, 'formula': m.formula, 'charge': m.charge, 'kc': kc_ids, 'name': m.name})
-
+        for db in m.annotation.keys():
+            annots = m.annotation[db]
+            if type(annots) == str: annots = [annots]
+            annots = '; '.join([i for i in annots])
+            row_dict[db] = annots
+        df_M.append(row_dict)

     for r in model.reactions:
+        row_dict = {'rid': r.id, 'name': r.name, 'rstring': r.reaction, 'gpr': "Not applicable", 'bounds': r.bounds}
+
+        for db in r.annotation.keys():
+            annots = r.annotation[db]
+            if type(annots) == str: annots = [annots]
+            annots = '; '.join([i for i in annots])
+            row_dict[db] = annots

         # handle artificial reactions
         if r.id == 'Biomass':
-
+            # commented as the type is inplicit in the ID
+            #row_dict['type'] = 'biomass'
+            df_A.append(row_dict)

         elif len(r.metabolites) == 1:
+            # commented as the type is inplicit in the ID
+            """
             if len(r.metabolites)==1 and list(r.metabolites)[0].id.rsplit('_',1)[-1] == 'e':
-
+                row_dict['type'] = 'exchange'
             elif r.lower_bound < 0 and r.upper_bound > 0:
-
+                row_dict['type'] = 'sink'
             elif r.lower_bound == 0 and r.upper_bound > 0:
-
+                row_dict['type'] = 'demand'
+            """
+            df_A.append(row_dict)

         else: # more than 1 metabolite involved
+            row_dict['gpr'] = r.gene_reaction_rule

-            # get kr codes:
-            if 'kegg.reaction' not in r.annotation.keys(): kr_ids = ''
-            else:
-                kr_ids = r.annotation['kegg.reaction']
-                if type(kr_ids) == str: kr_ids = [kr_ids]
-                kr_ids = '; '.join([i for i in kr_ids if i!='RXXXXX'])
-
             # introduce reaction in the correct table:
-            r_dict = {'rid': r.id, 'rstring': r.reaction, 'kr': kr_ids, 'gpr': r.gene_reaction_rule, 'name': r.name}
             if len(set([m.id.rsplit('_',1)[-1] for m in r.metabolites])) == 1:
-                df_R.append(r_dict)
-            else: df_T.append(r_dict)
-
+                df_R.append(row_dict)
+            else: df_T.append(row_dict)
+
+    for g in model.genes:
+        row_dict = {'gid': g.id, 'involved_in': '; '.join([r.id for r in g.reactions])}
+
+        for db in g.annotation.keys():
+            annots = g.annotation[db]
+            if type(annots) == str: annots = [annots]
+            annots = '; '.join([i for i in annots])
+            row_dict[db] = annots
+        df_G.append(row_dict)

+    # create dataframes from dict-lists
     df_M = pnd.DataFrame.from_records(df_M)
     df_R = pnd.DataFrame.from_records(df_R)
     df_T = pnd.DataFrame.from_records(df_T)
     df_A = pnd.DataFrame.from_records(df_A)
-
+    df_G = pnd.DataFrame.from_records(df_G)
+
+    # sort columns
+    df_M_first_cols = ['mid', 'name', 'formula', 'charge']
+    df_M = df_M[df_M_first_cols + sorted([c for c in df_M.columns if c not in df_M_first_cols])]
+    df_R_first_cols = ['rid', 'name', 'rstring', 'gpr', 'bounds']
+    df_R = df_R[df_R_first_cols + sorted([c for c in df_R.columns if c not in df_R_first_cols])]
+    df_T = df_T[df_R_first_cols + sorted([c for c in df_T.columns if c not in df_R_first_cols])]
+    df_A = df_A[df_R_first_cols + sorted([c for c in df_A.columns if c not in df_R_first_cols])]
+    df_G_first_cols = ['gid', 'involved_in']
+    df_G = df_G[df_G_first_cols + sorted([c for c in df_G.columns if c not in df_G_first_cols])]
+
+
+
+    with pnd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
         df_M.to_excel(writer, sheet_name='Metabolites', index=False)
         df_R.to_excel(writer, sheet_name='Reactions', index=False)
         df_T.to_excel(writer, sheet_name='Transporters', index=False)
+        df_G.to_excel(writer, sheet_name='Genes', index=False)
         df_A.to_excel(writer, sheet_name='Artificials', index=False)
         if df_E is not None and len(df_E)!=0: df_E.to_excel(writer, sheet_name='Precursors', index=False)
         if df_B is not None: df_B.to_excel(writer, sheet_name='Biomass', index=False)
         if df_P is not None and len(df_P)!=0: df_P.to_excel(writer, sheet_name='Biolog®', index=False)
         if df_S is not None and len(df_S.columns)>2: df_S.to_excel(writer, sheet_name='Biosynth', index=False)
-
+        if df_C is not None:
+            df_C.to_excel(writer, sheet_name='Coverage', index=False)
+            if nofigs == False:
+                worksheet = writer.sheets['Coverage']
+                worksheet.insert_image('E3', 'df_C_F1.png', {'image_data': df_C_F1})
+

     sheets_dict = {
         'model_id': model.id,
@@ -102,6 +156,7 @@ def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
         'Biomass': df_B,
         'Biolog': df_P,
         'Biosynth': df_S,
+        'Coverage': df_C,
     }
     return sheets_dict

@@ -115,9 +170,10 @@ def comparative_table(logger, outdir, sheets_dicts):
     for sheets_dict in sheets_dicts:
         for index, row in sheets_dict['Reactions'].iterrows():
             if row['rid'] not in df_topology.index:
-                df_topology.loc[row['rid'], '
-
-
+                df_topology.loc[row['rid'], 'rid'] = row['rid']
+                for key, value in row.to_dict().items():
+                    # force string to avoid errors with bounds
+                    df_topology.loc[row['rid'], key] = '' if pnd.isna(value) else str(value)
             df_topology.loc[row['rid'], sheets_dict['model_id']] = 1
     for sheets_dict in sheets_dicts: # replace missing values:
         df_topology = df_topology.fillna({sheets_dict['model_id']: 0})
@@ -128,9 +184,10 @@ def comparative_table(logger, outdir, sheets_dicts):
     for sheets_dict in sheets_dicts:
         for index, row in sheets_dict['Reactions'].iterrows():
             if row['rid'] not in df_gprs.index:
-                df_gprs.loc[row['rid'], '
-
-
+                df_gprs.loc[row['rid'], 'rid'] = row['rid']
+                for key, value in row.to_dict().items():
+                    # force string to avoid errors with bounds
+                    df_gprs.loc[row['rid'], key] = '' if pnd.isna(value) else str(value)
             df_gprs.loc[row['rid'], sheets_dict['model_id']] = row['gpr']
     for sheets_dict in sheets_dicts: # replace missing values:
         df_gprs = df_gprs.fillna({sheets_dict['model_id']: 'missing'})
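
Note: the 0.8.0 rewrite replaces the per-table dicts (`r_dict`, the old `kc`/`kr` columns) with a generic `row_dict` that flattens every entry of a cobrapy-style `annotation` mapping into its own spreadsheet column. A minimal, standalone sketch of that flattening, using a hypothetical annotation dict rather than a real cobra object:

# Minimal sketch of the annotation flattening used above (hypothetical data,
# not the gsrap API): each annotation database becomes one column, and
# multi-valued entries are joined with '; '.
annotation = {
    'kegg.compound': ['C00031', 'C00221'],  # list-valued entry
    'chebi': 'CHEBI:17234',                 # string-valued entry
}

row_dict = {'mid': 'glc__D_c'}               # hypothetical metabolite ID
for db, annots in annotation.items():
    if isinstance(annots, str):
        annots = [annots]                    # normalize to a list
    row_dict[db] = '; '.join(annots)         # one '; '-joined cell per database

print(row_dict)
# {'mid': 'glc__D_c', 'kegg.compound': 'C00031; C00221', 'chebi': 'CHEBI:17234'}
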
gsrap/commons/figures.py
ADDED
@@ -0,0 +1,119 @@
+from io import BytesIO
+
+import numpy as np
+import pandas as pnd
+
+from scipy.spatial.distance import pdist
+from scipy.cluster.hierarchy import linkage, cut_tree, dendrogram, leaves_list
+
+import matplotlib.pyplot as plt
+from matplotlib.patches import Patch
+
+
+
+def figure_df_C_F1(df_coverage):
+
+
+
+    # prepare the binary matrix:
+    modeled_rs = df_coverage[df_coverage['modeled']==True].index
+    unmodeled_rs = df_coverage[df_coverage['modeled']==False].index
+    # remove useless columns
+    bin_matrix = df_coverage[[i for i in df_coverage.columns if i not in ['map_ids', 'modeled']]]
+    # sort rows: upper rows are present in more strains
+    bin_matrix = bin_matrix.loc[bin_matrix.sum(axis=1).sort_values(ascending=False).index]
+    # split in 2: modeled above, non-modeled below:
+    bin_matrix = pnd.concat([
+        bin_matrix.loc[[i for i in bin_matrix.index if i in modeled_rs], ],
+        bin_matrix.loc[[i for i in bin_matrix.index if i in unmodeled_rs], ]
+    ])
+    strains = bin_matrix.columns
+    bin_matrix = bin_matrix.T # features in column
+
+
+    # pdist() / linkage() will loose the accession information. So here we save a dict:
+    index_to_strain = {i: strain for i, strain in enumerate(bin_matrix.index)}
+
+    # Calculate the linkage matrix using Ward clustering and Jaccard dissimilarity
+    distances = pdist(bin_matrix, 'jaccard')
+    linkage_matrix = linkage(distances, method='ward')
+
+
+    # PART 0: create the frame
+    fig, axs = plt.subplots(
+        nrows=2, ncols=2,
+        figsize=(15, 10),
+        gridspec_kw={ # suplots width proportions.
+            'width_ratios': [0.5, 1.0],
+            'height_ratios': [0.015, 0.985]
+        }
+    )
+
+    # PART 1: dendrogram
+    dn = dendrogram(
+        linkage_matrix, ax=axs[1,0],
+        orientation='left',
+        color_threshold=0, above_threshold_color='black',
+    )
+
+
+    ### PART 2: heatmap
+    ord_leaves = leaves_list(linkage_matrix)
+    ord_leaves = np.flip(ord_leaves) # because leaves are returned in the inverse sense.
+    ord_leaves = [index_to_strain[i] for i in ord_leaves] # convert index as number to index as accession
+    bin_matrix = bin_matrix.loc[ord_leaves, :] # reordered dataframe.
+    axs[1,1].matshow(
+        bin_matrix,
+        cmap='viridis',
+        aspect='auto', # non-squared pixels to fit the axis
+    )
+
+
+    ### PART 3: coverage bar
+    axs[0,1].matshow(
+        df_coverage.loc[bin_matrix.T.index, ['modeled']].T,
+        cmap='cool_r',
+        aspect='auto', # non-squared pixels to fit the axis
+    )
+
+
+    ### PART 4: legends
+    legend_feat = [
+        Patch(facecolor=plt.colormaps.get_cmap('viridis')(0.0), edgecolor='black', label='Absent'),
+        Patch(facecolor=plt.colormaps.get_cmap('viridis')(1.0), edgecolor='black', label='Probably present'),
+    ]
+    legend_cov = [
+        Patch(facecolor=plt.colormaps.get_cmap('cool_r')(0.0), edgecolor='black', label='Not modeled'),
+        Patch(facecolor=plt.colormaps.get_cmap('cool_r')(1.0), edgecolor='black', label='Modeled'),
+    ]
+    l1 = axs[1,0].legend(handles=legend_cov, title='Universe coverage', loc='upper left')
+    l2 = axs[1,0].legend(handles=legend_feat, title='KEGG reaction in strain', loc='lower left')
+    axs[1,0].add_artist(l1) # keep both legends visible
+
+
+    ### PART 5: aesthetics
+    plt.subplots_adjust(wspace=0, hspace=0) # adjust the space between subplots:
+    axs[0,0].axis('off') # remove frame and axis
+    axs[1,0].axis('off') # remove frame and axis
+
+    axs[0,1].yaxis.set_visible(False) # remove ticks, tick labels, axis label
+
+    axs[1,1].xaxis.set_ticks([]) # remove ticks
+    axs[1,1].set_xticklabels([]) # remove tick labels
+    axs[1,1].xaxis.set_label_position("bottom")
+    axs[1,1].set_xlabel("KEGG reactions")
+
+    axs[1,1].yaxis.set_ticks([]) # remove ticks
+    axs[1,1].set_yticklabels([]) # remove tick labels
+    axs[1,1].yaxis.set_label_position("right")
+    axs[1,1].set_ylabel(f"{len(strains)} strains", rotation=270, labelpad=13) # labelpad is in points (1 point = 1/72 inch)
+
+
+    ### PART 6: save fig
+    buf = BytesIO()
+    fig.savefig(buf, dpi=300, bbox_inches='tight') # labelpad is in inches (1 point = 1/72 inch)
+    plt.close(fig)
+    buf.seek(0) # rewind the buffer to the beginning
+
+
+    return buf
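
Note: the new figures.py never touches the filesystem. The plot is serialized into a BytesIO buffer, and excelhub.py embeds that buffer through xlsxwriter's insert_image with the 'image_data' option (the filename argument is then only a label). A reduced sketch of that round trip, with a dummy plot standing in for figure_df_C_F1 and a hypothetical output file 'demo.xlsx'; the headless 'Agg' backend is an assumption for the sketch:

from io import BytesIO

import pandas as pnd
import matplotlib
matplotlib.use('Agg')  # assumption: headless backend for the sketch
import matplotlib.pyplot as plt

# render a dummy figure into an in-memory buffer, as figure_df_C_F1 does
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1])
buf = BytesIO()
fig.savefig(buf, dpi=300, bbox_inches='tight')
plt.close(fig)
buf.seek(0)  # rewind before handing the buffer to xlsxwriter

# embed the buffer into a worksheet, mirroring the 'Coverage' sheet logic
df = pnd.DataFrame({'modeled': [True, False]})
with pnd.ExcelWriter('demo.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Coverage', index=False)
    worksheet = writer.sheets['Coverage']
    # the filename is only a label when 'image_data' is given
    worksheet.insert_image('E3', 'figure.png', {'image_data': buf})
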
gsrap/commons/keggutils.py
ADDED

@@ -0,0 +1,145 @@
+import time
+import os
+import sys
+import pickle
+
+
+import pandas as pnd
+from Bio.KEGG import REST
+
+
+
+def download_keggorg(logger, keggorg='lpl', outdir='./', ):
+
+
+    # check if already downloaded
+    outfile = os.path.join(outdir, f'{keggorg}.keggorg')
+    if os.path.exists(outfile):
+        logger.info(f"Organism code '{keggorg}' already downloaded ('{os.path.join(outdir, f'{keggorg}.keggorg')}').")
+        return 0
+
+
+    # donwload entire txt:
+    logger.info(f"Verifying existence of organism code '{keggorg}' on KEGG...")
+    time.sleep(0.5) # be respectful
+    try: response = REST.kegg_list(keggorg).read()
+    except:
+        logger.error(f"Organism code '{keggorg}' not found in KEGG database.")
+        return 1
+    # response is now a string similar to:
+    """
+    lpl:lp_0026 CDS 31317..32084 hydrolase, HAD superfamily, Cof family
+    lpl:lp_0027 CDS complement(32236..32907) pgmB1; beta-phosphoglucomutase
+    """
+
+
+    # extract the gene IDs list:
+    gene_ids = [line.split('\t')[0] for line in response.strip().split('\n')]
+    # example of gene_id: "lpl:lp_0005"
+    logger.info(f"Respectfully downloading {len(gene_ids)} genes from KEGG...")
+
+
+
+    # respectfully download in batch
+    # 10 is the max number of elements that can be downloaded
+    batch_size = 10
+    n_batches = len(gene_ids) // batch_size + (1 if (len(gene_ids) % batch_size) > 0 else 0)
+
+
+    n_attempts = 5
+    attempts_left = n_attempts
+    default_sleep = 0.5
+    sleep_time = default_sleep
+
+
+    completed_batches = 0
+    completed_genes = 0
+    res_string_list = []
+    while completed_batches < n_batches:
+
+        # be respectful
+        time.sleep(sleep_time)
+
+        # extract batch
+        start_index = completed_batches *batch_size
+        end_index = (completed_batches+1) *batch_size
+        if end_index > len(gene_ids): end_index = len(gene_ids)
+        curr_batch = gene_ids[start_index: end_index]
+
+
+        # download batch
+        try:
+            res_string = REST.kegg_get(curr_batch).read()
+            for item in res_string.split("///\n\n"):
+                res_string_list.append(item.replace('///\n', ''))
+            completed_batches += 1
+            completed_genes += len(curr_batch)
+
+            print(f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!", end='\r', file=sys.stderr)
+
+            attempts_left = n_attempts
+            sleep_time = default_sleep
+        except:
+            attempts_left -= 1
+            sleep_time = default_sleep *4 # increase sleep time to be more respectful
+            logger.warning(f"An error occurred during kegg_get() of batch {curr_batch}. Remaining attempts: {attempts_left}.")
+
+
+        if attempts_left == 0:
+            logger.error("No attemps left! Shutting down...")
+            return 1
+
+
+    # hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
+    last_trace = f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!"
+    whitewash = ''.join([' ' for i in range(len(last_trace))])
+    print(whitewash, end='\r', file=sys.stderr)
+
+
+
+    # extract info into a formatted df:
+    df = [] # list of dicts, future df
+    for entry in res_string_list:
+
+        entry_dict = {}
+        curr_header = None
+
+        for line in entry.split('\n'):
+            if line == '': continue
+
+            header = line[:12]
+            content = line[12:]
+            if header != ' '*12:
+                curr_header = header
+
+            if curr_header == 'ENTRY       ':
+                gid = content.split(' ', 1)[0]
+                entry_dict['gid'] = gid
+
+            if curr_header == 'POSITION    ':
+                entry_dict['pos'] = content.strip()
+
+            if curr_header == 'ORTHOLOGY   ':
+                ko = content.split(' ', 1)[0]
+                entry_dict['ko'] = ko
+
+            if curr_header == 'MOTIF       ':
+                db, value = content.strip().split(': ', 1)
+                entry_dict[db] = value.split(' ')
+
+            if curr_header == 'DBLINKS     ':
+                db, value = content.strip().split(': ', 1)
+                entry_dict[db] = value.split(' ')
+
+        df.append(entry_dict)
+    df = pnd.DataFrame.from_records(df)
+
+
+    # save dataframe in the output dir:
+    with open(outfile, 'wb') as wb_handler:
+        pickle.dump(df, wb_handler)
+    logger.info(f"'{outfile}' created!")
+
+
+
+    return 0
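
Note: download_keggorg throttles its KEGG traffic: at most ten IDs per kegg_get call, a pause before every batch that is quadrupled after a failure, and a retry budget that resets on each success. A condensed sketch of that loop, where 'fetch' is a hypothetical callable standing in for REST.kegg_get(batch).read():

import time

def fetch_in_batches(ids, fetch, batch_size=10, n_attempts=5, default_sleep=0.5):
    """Condensed sketch of the batched, retrying download loop above.
    'fetch' is a hypothetical stand-in for REST.kegg_get(batch).read()."""
    results = []
    attempts_left = n_attempts
    sleep_time = default_sleep
    completed = 0
    n_batches = (len(ids) + batch_size - 1) // batch_size  # ceiling division
    while completed < n_batches:
        time.sleep(sleep_time)  # be respectful to the server
        batch = ids[completed * batch_size:(completed + 1) * batch_size]
        try:
            results.append(fetch(batch))
            completed += 1
            attempts_left = n_attempts      # reset the retry budget on success
            sleep_time = default_sleep      # restore the normal pace
        except Exception:
            attempts_left -= 1
            sleep_time = default_sleep * 4  # back off after a failure
            if attempts_left == 0:
                raise                       # give up, like the 'return 1' above
    return results
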
gsrap/mkmodel/mkmodel.py
CHANGED

@@ -12,10 +12,12 @@ import gempipe

 from .pruner import load_input_universe
 from .pruner import load_input_eggnog
+from .pruner import load_keggorg_like_eggnog
 from .pruner import parse_eggnog
 from .pruner import subtract_kos
 from .pruner import translate_remaining_kos
 from .pruner import restore_gene_annotations
+from .pruner import append_keggorg_gene_annots

 from .gapfillutils import include_forced

@@ -38,26 +40,37 @@ from ..commons import log_metrics
 from ..commons import log_unbalances
 from ..commons import format_expansion
 from ..commons import comparative_table
+from ..commons import download_keggorg

 from ..runsims.biosynth import biosynthesis_on_media



 def create_model_incore(params):
-    universe, eggpath, dbexp, args, multistrain = params
+    annotation_source, universe, eggpath, dbexp, args, multistrain = params
+
+    # get the logger:
     logger = get_logger('gsrap_queued', args.verbose) # loggers can't be pickled!
+
+
+    # only errors will be recorded if multistrain mode
     if multistrain:
-        # only errors will be recorded
         logger.setLevel(logging.ERROR)


     # load the annotation
-
+    if annotation_source == 'keggorg':
+        eggnog_style_table = load_keggorg_like_eggnog(logger, args.keggorg, args.outdir)
+    elif annotation_source == 'eggnog':
+        eggnog_style_table = load_input_eggnog(logger, eggpath)


-    # create a copy of the universe
+    # create a copy of the universe and define the model ID
     model = universe.copy()
-
+    if annotation_source == 'keggorg':
+        model.id = args.keggorg
+    elif annotation_source == 'eggnog':
+        model.id = Path(eggpath).stem


     ###### POLISHING 1
@@ -67,9 +80,10 @@ def create_model_incore(params):


     ###### PRUNING
-    logger.info("Reading
+    if annotation_source == 'keggorg': logger.info(f"Reading annotation for organism code '{args.keggorg}'...")
+    elif annotation_source == 'eggnog': logger.info("Reading provided eggnog-mapper annotation...")
     # get important dictionaries: 'eggnog_ko_to_gids' and 'eggonog_gid_to_kos'
-    eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(
+    eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog_style_table)

     # prune reactions
     subtract_kos(logger, model, eggnog_ko_to_gids)
@@ -77,6 +91,10 @@ def create_model_incore(params):
     # translate KOs to the actual genes
     translate_remaining_kos(logger, model, eggnog_ko_to_gids)
     restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)
+
+    # insert gene annotation if starting from kegg organisms:
+    if annotation_source == 'keggorg':
+        append_keggorg_gene_annots(logger, model, args.keggorg, args.outdir)



@@ -141,7 +159,7 @@ def create_model_incore(params):
     cobra.io.write_sbml_model(model, f'{args.outdir}/{model.id}.xml') # SBML # groups are saved only to SBML
     logger.info(f"'{args.outdir}/{model.id}.xml' created!")
     force_id_on_sbml(f'{args.outdir}/{model.id}.xml', model.id) # force introduction of the 'id=""' field
-    sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', None, df_B, df_P, df_S)
+    sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', args.nofigs, None, df_B, df_P, df_S)
     logger.info(f"'{args.outdir}/{model.id}.mkmodel.xlsx' created!")


@@ -171,13 +189,28 @@ def main(args, logger):


     # format the --eggnog param
-    args.eggnog = format_expansion(logger, args.eggnog)
-
-
+    args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
+
+    # get the kegg organism if requested
+    if args.keggorg != '-':
+        response = download_keggorg(logger, args.keggorg, args.outdir)
+        if response == 1: return 1
+
+
+
+    # determine the source of functional annotation:
+    annotation_source = None
+    if args.keggorg != '-': # keggorg has precedence
+        annotation_source = 'keggorg'
+    elif args.eggnog != '-':
+        annotation_source = 'eggnog'
+        if args.cores > len(args.eggnog):
+            logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
+            args.cores = len(args.eggnog)
+    else:
+        logger.error("No valid functional annotations provided: please use '--keggorg' or '--eggnog'.")
         return 1
-
-    logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
-    args.cores = len(args.eggnog)
+


     # check compatibility of input parameters:
@@ -201,17 +234,26 @@ def main(args, logger):


     # disable logging (swith to txt) if strains are more than 1:
-
-
-
-
-
+    if annotation_source == 'keggorg':
+        multistrain = False
+    elif annotation_source == 'eggnog':
+        multistrain = len(args.eggnog) > 1
+        if multistrain:
+            logger.info(f"Number of provided strains is >1: logging will be disabled.")
+            logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
+            # actualy this is done inside child processess!
+

     # create strain-specific GSMMs using multi-core
     error_raised = False
     sheets_dicts = []
     executor = confu.ProcessPoolExecutor(max_workers=args.cores)
-
+
+    if annotation_source == 'keggorg':
+        futures = [executor.submit(create_model_incore, (annotation_source, universe, None, dbexp, args, multistrain))]
+    elif annotation_source == 'eggnog':
+        futures = [executor.submit(create_model_incore, (annotation_source, universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
+
     for f in confu.as_completed(futures):
         sheets_dict = f.result()

@@ -226,12 +268,14 @@ def main(args, logger):
         sheets_dicts.append(sheets_dict)
         print(f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!", end='\r', file=sys.stderr)

+
     # hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
     if multistrain and sheets_dicts != []:
         last_trace = f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!"
         whitewash = ''.join([' ' for i in range(len(last_trace))])
         print(whitewash, end='\r', file=sys.stderr)

+
     # multiproces part terminated: safely shut down the executor
     executor.shutdown(wait=True)

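
Note: the worker now receives annotation_source as the first element of its pickled params tuple and branches on it for annotation loading, model ID, logging, and gene annotation. The precedence rule established in main() is small enough to sketch in isolation (values below are illustrative, not the gsrap CLI; '-' is the sentinel for "not provided"):

# Schematic of the annotation-source precedence introduced in main()
def pick_annotation_source(keggorg, eggnog):
    if keggorg != '-':        # --keggorg has precedence
        return 'keggorg'
    elif eggnog != '-':
        return 'eggnog'
    return None               # caller logs an error and aborts

assert pick_annotation_source('lpl', '-') == 'keggorg'
assert pick_annotation_source('-', ['strainA.emapper.annotations']) == 'eggnog'
assert pick_annotation_source('-', '-') is None
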