gsrap 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsrap/.ipynb_checkpoints/__init__-checkpoint.py +2 -0
- gsrap/__init__.py +2 -0
- gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
- gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
- gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +70 -37
- gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +15 -1
- gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
- gsrap/commons/__init__.py +1 -0
- gsrap/commons/downloads.py +1 -1
- gsrap/commons/escherutils.py +1 -1
- gsrap/commons/excelhub.py +70 -37
- gsrap/commons/figures.py +15 -1
- gsrap/commons/keggutils.py +145 -0
- gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +63 -19
- gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
- gsrap/mkmodel/mkmodel.py +63 -19
- gsrap/mkmodel/pruner.py +72 -7
- gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +33 -6
- gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +8 -0
- gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +10 -2
- gsrap/parsedb/completeness.py +33 -6
- gsrap/parsedb/introduce.py +8 -0
- gsrap/parsedb/parsedb.py +10 -2
- {gsrap-0.7.2.dist-info → gsrap-0.8.0.dist-info}/METADATA +1 -1
- {gsrap-0.7.2.dist-info → gsrap-0.8.0.dist-info}/RECORD +29 -27
- {gsrap-0.7.2.dist-info → gsrap-0.8.0.dist-info}/LICENSE.txt +0 -0
- {gsrap-0.7.2.dist-info → gsrap-0.8.0.dist-info}/WHEEL +0 -0
- {gsrap-0.7.2.dist-info → gsrap-0.8.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import pickle
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
import pandas as pnd
|
|
8
|
+
from Bio.KEGG import REST
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _split_kegg_entries(flat_text):
    """Split a KEGG flat-file response into one string per record.

    A record ends at a line containing only '///'. Blank lines are dropped,
    so the result is the same whether or not consecutive records are
    separated by an empty line.
    """
    entries = []
    current = []
    for line in flat_text.split('\n'):
        stripped = line.strip()
        if stripped == '///':
            if current:
                entries.append('\n'.join(current))
            current = []
        elif stripped:
            current.append(line)
    if current:  # tolerate a last record lacking its '///' terminator
        entries.append('\n'.join(current))
    return entries


def _parse_kegg_entry(entry):
    """Convert one KEGG gene record into a flat dict.

    Keys produced: 'gid' (from ENTRY), 'pos' (from POSITION), 'ko' (from
    ORTHOLOGY) and one key per database named in MOTIF / DBLINKS lines,
    whose value is the list of identifiers found on that line.
    """
    entry_dict = {}
    curr_header = None
    curr_db = None  # last 'db: ' seen inside the current MOTIF/DBLINKS section

    for line in entry.split('\n'):
        if not line.strip():
            continue

        # the first 12 columns hold the field name; blank means continuation
        header = line[:12].strip()
        content = line[12:]
        if header:
            curr_header = header
            curr_db = None

        if curr_header == 'ENTRY':
            entry_dict['gid'] = content.split(' ', 1)[0]

        elif curr_header == 'POSITION':
            entry_dict['pos'] = content.strip()

        elif curr_header == 'ORTHOLOGY':
            # on multi-line ORTHOLOGY sections the last KO wins
            entry_dict['ko'] = content.split(' ', 1)[0]

        elif curr_header in ('MOTIF', 'DBLINKS'):
            db, sep, value = content.strip().partition(': ')
            if sep:
                curr_db = db
                entry_dict[db] = value.split(' ')
            elif curr_db is not None:
                # NOTE(review): a line without 'db: ' is assumed to be a
                # wrapped continuation of the previous database's values.
                entry_dict[curr_db].extend(content.split())

    return entry_dict


def download_keggorg(logger, keggorg='lpl', outdir='./', ):
    """Download the gene records of a KEGG organism and cache them on disk.

    The gene list is obtained with REST.kegg_list(), then the records are
    fetched with REST.kegg_get() in batches of 10. Selected fields of each
    record are collected into a pandas DataFrame (one row per record), which
    is pickled to '<outdir>/<keggorg>.keggorg'. If that file already exists
    nothing is downloaded.

    Args:
        logger: logger used for info / warning / error messages.
        keggorg: KEGG organism code (eg 'lpl').
        outdir: directory where the '.keggorg' file is searched and written.

    Returns:
        0 if the file was already present or was created successfully,
        1 if the organism could not be listed or a batch failed 5 times in
        a row.
    """

    # check if already downloaded
    outfile = os.path.join(outdir, f'{keggorg}.keggorg')
    if os.path.exists(outfile):
        logger.info(f"Organism code '{keggorg}' already downloaded ('{outfile}').")
        return 0


    # download the entire gene list:
    logger.info(f"Verifying existence of organism code '{keggorg}' on KEGG...")
    time.sleep(0.5)   # be respectful
    try:
        response = REST.kegg_list(keggorg).read()
    except Exception:   # not a bare except: Ctrl+C must still interrupt
        logger.error(f"Organism code '{keggorg}' not found in KEGG database.")
        return 1
    # response is now a string similar to:
    #   lpl:lp_0026<TAB>CDS<TAB>31317..32084<TAB>hydrolase, HAD superfamily, Cof family
    #   lpl:lp_0027<TAB>CDS<TAB>complement(32236..32907)<TAB>pgmB1; beta-phosphoglucomutase


    # extract the gene IDs list (example of gene_id: "lpl:lp_0005"):
    gene_ids = [line.split('\t')[0] for line in response.strip().split('\n') if line.strip()]
    if not gene_ids:
        logger.error(f"No genes listed on KEGG for organism code '{keggorg}'.")
        return 1
    logger.info(f"Respectfully downloading {len(gene_ids)} genes from KEGG...")


    # respectfully download in batch
    # 10 is the max number of elements that can be downloaded
    batch_size = 10
    n_attempts = 5
    default_sleep = 0.5

    completed_genes = 0
    res_string_list = []
    for start_index in range(0, len(gene_ids), batch_size):
        curr_batch = gene_ids[start_index: start_index + batch_size]

        # every batch starts with a full set of attempts and the default pause
        attempts_left = n_attempts
        sleep_time = default_sleep
        while True:

            # be respectful
            time.sleep(sleep_time)

            try:
                res_string = REST.kegg_get(curr_batch).read()
                break
            except Exception:
                attempts_left -= 1
                sleep_time = default_sleep *4   # increase sleep time to be more respectful
                logger.warning(f"An error occurred during kegg_get() of batch {curr_batch}. Remaining attempts: {attempts_left}.")

                if attempts_left == 0:
                    logger.error("No attemps left! Shutting down...")
                    return 1

        res_string_list.extend(_split_kegg_entries(res_string))
        completed_genes += len(curr_batch)
        print(f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!", end='\r', file=sys.stderr)


    # hide last progress trace by overwriting it with blanks:
    last_trace = f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!"
    whitewash = ' ' * len(last_trace)
    print(whitewash, end='\r', file=sys.stderr)


    # extract info into a formatted df:
    df = pnd.DataFrame.from_records([_parse_kegg_entry(entry) for entry in res_string_list])


    # save dataframe in the output dir. Write to a temporary name first, so
    # an interrupted write never leaves a truncated file that a later run
    # would consider 'already downloaded':
    tmpfile = outfile + '.tmp'
    with open(tmpfile, 'wb') as wb_handler:
        pickle.dump(df, wb_handler)
    os.replace(tmpfile, outfile)
    logger.info(f"'{outfile}' created!")


    return 0
|
|
@@ -12,10 +12,12 @@ import gempipe
|
|
|
12
12
|
|
|
13
13
|
from .pruner import load_input_universe
|
|
14
14
|
from .pruner import load_input_eggnog
|
|
15
|
+
from .pruner import load_keggorg_like_eggnog
|
|
15
16
|
from .pruner import parse_eggnog
|
|
16
17
|
from .pruner import subtract_kos
|
|
17
18
|
from .pruner import translate_remaining_kos
|
|
18
19
|
from .pruner import restore_gene_annotations
|
|
20
|
+
from .pruner import append_keggorg_gene_annots
|
|
19
21
|
|
|
20
22
|
from .gapfillutils import include_forced
|
|
21
23
|
|
|
@@ -38,26 +40,37 @@ from ..commons import log_metrics
|
|
|
38
40
|
from ..commons import log_unbalances
|
|
39
41
|
from ..commons import format_expansion
|
|
40
42
|
from ..commons import comparative_table
|
|
43
|
+
from ..commons import download_keggorg
|
|
41
44
|
|
|
42
45
|
from ..runsims.biosynth import biosynthesis_on_media
|
|
43
46
|
|
|
44
47
|
|
|
45
48
|
|
|
46
49
|
def create_model_incore(params):
|
|
47
|
-
universe, eggpath, dbexp, args, multistrain = params
|
|
50
|
+
annotation_source, universe, eggpath, dbexp, args, multistrain = params
|
|
51
|
+
|
|
52
|
+
# get the logger:
|
|
48
53
|
logger = get_logger('gsrap_queued', args.verbose) # loggers can't be pickled!
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# only errors will be recorded if multistrain mode
|
|
49
57
|
if multistrain:
|
|
50
|
-
# only errors will be recorded
|
|
51
58
|
logger.setLevel(logging.ERROR)
|
|
52
59
|
|
|
53
60
|
|
|
54
61
|
# load the annotation
|
|
55
|
-
|
|
62
|
+
if annotation_source == 'keggorg':
|
|
63
|
+
eggnog_style_table = load_keggorg_like_eggnog(logger, args.keggorg, args.outdir)
|
|
64
|
+
elif annotation_source == 'eggnog':
|
|
65
|
+
eggnog_style_table = load_input_eggnog(logger, eggpath)
|
|
56
66
|
|
|
57
67
|
|
|
58
|
-
# create a copy of the universe
|
|
68
|
+
# create a copy of the universe and define the model ID
|
|
59
69
|
model = universe.copy()
|
|
60
|
-
|
|
70
|
+
if annotation_source == 'keggorg':
|
|
71
|
+
model.id = args.keggorg
|
|
72
|
+
elif annotation_source == 'eggnog':
|
|
73
|
+
model.id = Path(eggpath).stem
|
|
61
74
|
|
|
62
75
|
|
|
63
76
|
###### POLISHING 1
|
|
@@ -67,9 +80,10 @@ def create_model_incore(params):
|
|
|
67
80
|
|
|
68
81
|
|
|
69
82
|
###### PRUNING
|
|
70
|
-
logger.info("Reading
|
|
83
|
+
if annotation_source == 'keggorg': logger.info(f"Reading annotation for organism code '{args.keggorg}'...")
|
|
84
|
+
elif annotation_source == 'eggnog': logger.info("Reading provided eggnog-mapper annotation...")
|
|
71
85
|
# get important dictionaries: 'eggnog_ko_to_gids' and 'eggonog_gid_to_kos'
|
|
72
|
-
eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(
|
|
86
|
+
eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog_style_table)
|
|
73
87
|
|
|
74
88
|
# prune reactions
|
|
75
89
|
subtract_kos(logger, model, eggnog_ko_to_gids)
|
|
@@ -77,6 +91,10 @@ def create_model_incore(params):
|
|
|
77
91
|
# translate KOs to the actual genes
|
|
78
92
|
translate_remaining_kos(logger, model, eggnog_ko_to_gids)
|
|
79
93
|
restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)
|
|
94
|
+
|
|
95
|
+
# insert gene annotation if starting from kegg organisms:
|
|
96
|
+
if annotation_source == 'keggorg':
|
|
97
|
+
append_keggorg_gene_annots(logger, model, args.keggorg, args.outdir)
|
|
80
98
|
|
|
81
99
|
|
|
82
100
|
|
|
@@ -171,13 +189,28 @@ def main(args, logger):
|
|
|
171
189
|
|
|
172
190
|
|
|
173
191
|
# format the --eggnog param
|
|
174
|
-
args.eggnog = format_expansion(logger, args.eggnog)
|
|
175
|
-
|
|
176
|
-
|
|
192
|
+
args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
|
|
193
|
+
|
|
194
|
+
# get the kegg organism if requested
|
|
195
|
+
if args.keggorg != '-':
|
|
196
|
+
response = download_keggorg(logger, args.keggorg, args.outdir)
|
|
197
|
+
if response == 1: return 1
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# determine the source of functional annotation:
|
|
202
|
+
annotation_source = None
|
|
203
|
+
if args.keggorg != '-': # keggorg has precedence
|
|
204
|
+
annotation_source = 'keggorg'
|
|
205
|
+
elif args.eggnog != '-':
|
|
206
|
+
annotation_source = 'eggnog'
|
|
207
|
+
if args.cores > len(args.eggnog):
|
|
208
|
+
logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
|
|
209
|
+
args.cores = len(args.eggnog)
|
|
210
|
+
else:
|
|
211
|
+
logger.error("No valid functional annotations provided: please use '--keggorg' or '--eggnog'.")
|
|
177
212
|
return 1
|
|
178
|
-
|
|
179
|
-
logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
|
|
180
|
-
args.cores = len(args.eggnog)
|
|
213
|
+
|
|
181
214
|
|
|
182
215
|
|
|
183
216
|
# check compatibility of input parameters:
|
|
@@ -201,17 +234,26 @@ def main(args, logger):
|
|
|
201
234
|
|
|
202
235
|
|
|
203
236
|
# disable logging (swith to txt) if strains are more than 1:
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
237
|
+
if annotation_source == 'keggorg':
|
|
238
|
+
multistrain = False
|
|
239
|
+
elif annotation_source == 'eggnog':
|
|
240
|
+
multistrain = len(args.eggnog) > 1
|
|
241
|
+
if multistrain:
|
|
242
|
+
logger.info(f"Number of provided strains is >1: logging will be disabled.")
|
|
243
|
+
logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
|
|
244
|
+
# actualy this is done inside child processess!
|
|
245
|
+
|
|
209
246
|
|
|
210
247
|
# create strain-specific GSMMs using multi-core
|
|
211
248
|
error_raised = False
|
|
212
249
|
sheets_dicts = []
|
|
213
250
|
executor = confu.ProcessPoolExecutor(max_workers=args.cores)
|
|
214
|
-
|
|
251
|
+
|
|
252
|
+
if annotation_source == 'keggorg':
|
|
253
|
+
futures = [executor.submit(create_model_incore, (annotation_source, universe, None, dbexp, args, multistrain))]
|
|
254
|
+
elif annotation_source == 'eggnog':
|
|
255
|
+
futures = [executor.submit(create_model_incore, (annotation_source, universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
|
|
256
|
+
|
|
215
257
|
for f in confu.as_completed(futures):
|
|
216
258
|
sheets_dict = f.result()
|
|
217
259
|
|
|
@@ -226,12 +268,14 @@ def main(args, logger):
|
|
|
226
268
|
sheets_dicts.append(sheets_dict)
|
|
227
269
|
print(f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!", end='\r', file=sys.stderr)
|
|
228
270
|
|
|
271
|
+
|
|
229
272
|
# hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
|
|
230
273
|
if multistrain and sheets_dicts != []:
|
|
231
274
|
last_trace = f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!"
|
|
232
275
|
whitewash = ''.join([' ' for i in range(len(last_trace))])
|
|
233
276
|
print(whitewash, end='\r', file=sys.stderr)
|
|
234
277
|
|
|
278
|
+
|
|
235
279
|
# multiproces part terminated: safely shut down the executor
|
|
236
280
|
executor.shutdown(wait=True)
|
|
237
281
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import warnings
|
|
3
3
|
import logging
|
|
4
|
+
import pickle
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
import pandas as pnd
|
|
@@ -43,22 +44,57 @@ def load_input_eggnog(logger, eggnog):
|
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
# load eggnog annotations
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
df_eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
|
|
48
|
+
df_eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
|
|
49
|
+
df_eggnog = df_eggnog.set_index('query', drop=True, verify_integrity=True)
|
|
49
50
|
|
|
50
51
|
|
|
51
|
-
return
|
|
52
|
+
return df_eggnog
|
|
52
53
|
|
|
53
54
|
|
|
54
55
|
|
|
55
|
-
def
|
|
56
|
+
def load_keggorg_like_eggnog(logger, keggorg, outdir):
    """Load a cached '.keggorg' table and reshape it like an eggnog-mapper table.

    The pickled DataFrame at '<outdir>/<keggorg>.keggorg' is read and
    converted to a DataFrame indexed by 'query' (the gene ID) with the
    eggnog-mapper annotation columns. Only 'KEGG_ko' (from column 'ko') and
    'PFAMs' (from column 'Pfam', comma-joined) are filled; every other cell,
    and every missing value, is '-'.

    Args:
        logger: unused here; kept for signature consistency.
        keggorg: KEGG organism code, used to build the file name.
        outdir: directory containing the '.keggorg' file.

    Returns:
        The eggnog-like pandas DataFrame, indexed by 'query'.
    """

    # load raw data, downloaded from kegg.
    # NOTE: pickle can execute arbitrary code on load; 'outdir' must only
    # contain '.keggorg' files written by this program.
    with open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb') as rb_handler:
        df_keggorg = pickle.load(rb_handler)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)


    # whitespace split: works whether the names are separated by tabs or spaces
    eggnog_columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split()


    # create an eggnog-like dataframe, starting every cell at '-':
    rows = []   # list of dicts, future df
    for gid, record in df_keggorg.iterrows():
        row_dict = dict.fromkeys(eggnog_columns, '-')
        row_dict['query'] = gid

        # absent columns and NaN cells both leave the '-' placeholder
        pfams = record.get('Pfam')
        if isinstance(pfams, list):
            row_dict['PFAMs'] = ','.join(pfams)
        ko = record.get('ko')
        if isinstance(ko, str):
            row_dict['KEGG_ko'] = ko

        rows.append(row_dict)
    df_eggnog_like = pnd.DataFrame(rows, columns=eggnog_columns)


    # set the index like in eggnog
    df_eggnog_like = df_eggnog_like.set_index('query', drop=True, verify_integrity=True)
    return df_eggnog_like
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def parse_eggnog(df_eggnog):
|
|
56
92
|
|
|
57
93
|
|
|
58
94
|
# PART 1. get KO codes available
|
|
59
95
|
gid_to_kos = {}
|
|
60
96
|
ko_to_gids = {}
|
|
61
|
-
for gid, kos in
|
|
97
|
+
for gid, kos in df_eggnog['KEGG_ko'].items():
|
|
62
98
|
if kos == '-':
|
|
63
99
|
continue
|
|
64
100
|
|
|
@@ -229,8 +265,37 @@ def restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos):
|
|
|
229
265
|
# collect names
|
|
230
266
|
names.append(uni_g.name)
|
|
231
267
|
g.name = '; '.join(names)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def append_keggorg_gene_annots(logger, model, keggorg, outdir):
    """Add KEGG-derived cross-references to the genes of a model, in place.

    The pickled DataFrame at '<outdir>/<keggorg>.keggorg' is read. For every
    gene of 'model' whose ID is found in its 'gid' column,
    annotation['kegg.genes'] is set to ['<keggorg>:<gene id>']. For each of
    the columns 'NCBI-GeneID', 'NCBI-ProteinID', 'ASAP' and 'UniProt' that
    exists in the table, the annotation keys 'ncbigene', 'ncbiprotein',
    'asap' and 'uniprot' are set to the gene's list of identifiers, or to an
    empty list when the gene has none.

    Args:
        logger: logger used for the info message.
        model: object exposing 'genes', each with 'id' and an 'annotation' dict.
        keggorg: KEGG organism code, used for the file name and the ID prefix.
        outdir: directory containing the '.keggorg' file.
    """

    # load raw data, downloaded from kegg.
    # NOTE: pickle can execute arbitrary code on load; 'outdir' must only
    # contain '.keggorg' files written by this program.
    logger.info("Adding gene annotations retrieved from KEGG...")
    with open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb') as rb_handler:
        df_keggorg = pickle.load(rb_handler)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)


    # KEGG can provide some useful (ie, used in Memote) gene annotations.
    # (table column, annotation key) pairs, restricted to available columns:
    column_to_key = (
        ('NCBI-GeneID', 'ncbigene'),
        ('NCBI-ProteinID', 'ncbiprotein'),
        ('ASAP', 'asap'),
        ('UniProt', 'uniprot'),
    )
    available = [(column, key) for column, key in column_to_key if column in df_keggorg.columns]

    for g in model.genes:
        if g.id not in df_keggorg.index:
            continue

        g.annotation['kegg.genes'] = [keggorg + ':' + g.id]

        for column, key in available:
            value = df_keggorg.loc[g.id, column]
            # cells without a link are NaN, not lists; copy the list so the
            # model does not share it with the table
            g.annotation[key] = list(value) if isinstance(value, list) else []
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
|
|
234
299
|
|
|
235
300
|
|
|
236
301
|
|
gsrap/mkmodel/mkmodel.py
CHANGED
|
@@ -12,10 +12,12 @@ import gempipe
|
|
|
12
12
|
|
|
13
13
|
from .pruner import load_input_universe
|
|
14
14
|
from .pruner import load_input_eggnog
|
|
15
|
+
from .pruner import load_keggorg_like_eggnog
|
|
15
16
|
from .pruner import parse_eggnog
|
|
16
17
|
from .pruner import subtract_kos
|
|
17
18
|
from .pruner import translate_remaining_kos
|
|
18
19
|
from .pruner import restore_gene_annotations
|
|
20
|
+
from .pruner import append_keggorg_gene_annots
|
|
19
21
|
|
|
20
22
|
from .gapfillutils import include_forced
|
|
21
23
|
|
|
@@ -38,26 +40,37 @@ from ..commons import log_metrics
|
|
|
38
40
|
from ..commons import log_unbalances
|
|
39
41
|
from ..commons import format_expansion
|
|
40
42
|
from ..commons import comparative_table
|
|
43
|
+
from ..commons import download_keggorg
|
|
41
44
|
|
|
42
45
|
from ..runsims.biosynth import biosynthesis_on_media
|
|
43
46
|
|
|
44
47
|
|
|
45
48
|
|
|
46
49
|
def create_model_incore(params):
|
|
47
|
-
universe, eggpath, dbexp, args, multistrain = params
|
|
50
|
+
annotation_source, universe, eggpath, dbexp, args, multistrain = params
|
|
51
|
+
|
|
52
|
+
# get the logger:
|
|
48
53
|
logger = get_logger('gsrap_queued', args.verbose) # loggers can't be pickled!
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# only errors will be recorded if multistrain mode
|
|
49
57
|
if multistrain:
|
|
50
|
-
# only errors will be recorded
|
|
51
58
|
logger.setLevel(logging.ERROR)
|
|
52
59
|
|
|
53
60
|
|
|
54
61
|
# load the annotation
|
|
55
|
-
|
|
62
|
+
if annotation_source == 'keggorg':
|
|
63
|
+
eggnog_style_table = load_keggorg_like_eggnog(logger, args.keggorg, args.outdir)
|
|
64
|
+
elif annotation_source == 'eggnog':
|
|
65
|
+
eggnog_style_table = load_input_eggnog(logger, eggpath)
|
|
56
66
|
|
|
57
67
|
|
|
58
|
-
# create a copy of the universe
|
|
68
|
+
# create a copy of the universe and define the model ID
|
|
59
69
|
model = universe.copy()
|
|
60
|
-
|
|
70
|
+
if annotation_source == 'keggorg':
|
|
71
|
+
model.id = args.keggorg
|
|
72
|
+
elif annotation_source == 'eggnog':
|
|
73
|
+
model.id = Path(eggpath).stem
|
|
61
74
|
|
|
62
75
|
|
|
63
76
|
###### POLISHING 1
|
|
@@ -67,9 +80,10 @@ def create_model_incore(params):
|
|
|
67
80
|
|
|
68
81
|
|
|
69
82
|
###### PRUNING
|
|
70
|
-
logger.info("Reading
|
|
83
|
+
if annotation_source == 'keggorg': logger.info(f"Reading annotation for organism code '{args.keggorg}'...")
|
|
84
|
+
elif annotation_source == 'eggnog': logger.info("Reading provided eggnog-mapper annotation...")
|
|
71
85
|
# get important dictionaries: 'eggnog_ko_to_gids' and 'eggonog_gid_to_kos'
|
|
72
|
-
eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(
|
|
86
|
+
eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog_style_table)
|
|
73
87
|
|
|
74
88
|
# prune reactions
|
|
75
89
|
subtract_kos(logger, model, eggnog_ko_to_gids)
|
|
@@ -77,6 +91,10 @@ def create_model_incore(params):
|
|
|
77
91
|
# translate KOs to the actual genes
|
|
78
92
|
translate_remaining_kos(logger, model, eggnog_ko_to_gids)
|
|
79
93
|
restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)
|
|
94
|
+
|
|
95
|
+
# insert gene annotation if starting from kegg organisms:
|
|
96
|
+
if annotation_source == 'keggorg':
|
|
97
|
+
append_keggorg_gene_annots(logger, model, args.keggorg, args.outdir)
|
|
80
98
|
|
|
81
99
|
|
|
82
100
|
|
|
@@ -171,13 +189,28 @@ def main(args, logger):
|
|
|
171
189
|
|
|
172
190
|
|
|
173
191
|
# format the --eggnog param
|
|
174
|
-
args.eggnog = format_expansion(logger, args.eggnog)
|
|
175
|
-
|
|
176
|
-
|
|
192
|
+
args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
|
|
193
|
+
|
|
194
|
+
# get the kegg organism if requested
|
|
195
|
+
if args.keggorg != '-':
|
|
196
|
+
response = download_keggorg(logger, args.keggorg, args.outdir)
|
|
197
|
+
if response == 1: return 1
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# determine the source of functional annotation:
|
|
202
|
+
annotation_source = None
|
|
203
|
+
if args.keggorg != '-': # keggorg has precedence
|
|
204
|
+
annotation_source = 'keggorg'
|
|
205
|
+
elif args.eggnog != '-':
|
|
206
|
+
annotation_source = 'eggnog'
|
|
207
|
+
if args.cores > len(args.eggnog):
|
|
208
|
+
logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
|
|
209
|
+
args.cores = len(args.eggnog)
|
|
210
|
+
else:
|
|
211
|
+
logger.error("No valid functional annotations provided: please use '--keggorg' or '--eggnog'.")
|
|
177
212
|
return 1
|
|
178
|
-
|
|
179
|
-
logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
|
|
180
|
-
args.cores = len(args.eggnog)
|
|
213
|
+
|
|
181
214
|
|
|
182
215
|
|
|
183
216
|
# check compatibility of input parameters:
|
|
@@ -201,17 +234,26 @@ def main(args, logger):
|
|
|
201
234
|
|
|
202
235
|
|
|
203
236
|
# disable logging (swith to txt) if strains are more than 1:
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
237
|
+
if annotation_source == 'keggorg':
|
|
238
|
+
multistrain = False
|
|
239
|
+
elif annotation_source == 'eggnog':
|
|
240
|
+
multistrain = len(args.eggnog) > 1
|
|
241
|
+
if multistrain:
|
|
242
|
+
logger.info(f"Number of provided strains is >1: logging will be disabled.")
|
|
243
|
+
logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
|
|
244
|
+
# actualy this is done inside child processess!
|
|
245
|
+
|
|
209
246
|
|
|
210
247
|
# create strain-specific GSMMs using multi-core
|
|
211
248
|
error_raised = False
|
|
212
249
|
sheets_dicts = []
|
|
213
250
|
executor = confu.ProcessPoolExecutor(max_workers=args.cores)
|
|
214
|
-
|
|
251
|
+
|
|
252
|
+
if annotation_source == 'keggorg':
|
|
253
|
+
futures = [executor.submit(create_model_incore, (annotation_source, universe, None, dbexp, args, multistrain))]
|
|
254
|
+
elif annotation_source == 'eggnog':
|
|
255
|
+
futures = [executor.submit(create_model_incore, (annotation_source, universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
|
|
256
|
+
|
|
215
257
|
for f in confu.as_completed(futures):
|
|
216
258
|
sheets_dict = f.result()
|
|
217
259
|
|
|
@@ -226,12 +268,14 @@ def main(args, logger):
|
|
|
226
268
|
sheets_dicts.append(sheets_dict)
|
|
227
269
|
print(f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!", end='\r', file=sys.stderr)
|
|
228
270
|
|
|
271
|
+
|
|
229
272
|
# hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
|
|
230
273
|
if multistrain and sheets_dicts != []:
|
|
231
274
|
last_trace = f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!"
|
|
232
275
|
whitewash = ''.join([' ' for i in range(len(last_trace))])
|
|
233
276
|
print(whitewash, end='\r', file=sys.stderr)
|
|
234
277
|
|
|
278
|
+
|
|
235
279
|
# multiproces part terminated: safely shut down the executor
|
|
236
280
|
executor.shutdown(wait=True)
|
|
237
281
|
|