gsrap 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. gsrap/.ipynb_checkpoints/__init__-checkpoint.py +2 -0
  2. gsrap/__init__.py +2 -0
  3. gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
  4. gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
  5. gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
  6. gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +70 -37
  7. gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +15 -1
  8. gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
  9. gsrap/commons/.ipynb_checkpoints/medium-checkpoint.py +3 -4
  10. gsrap/commons/__init__.py +1 -0
  11. gsrap/commons/downloads.py +1 -1
  12. gsrap/commons/escherutils.py +1 -1
  13. gsrap/commons/excelhub.py +70 -37
  14. gsrap/commons/figures.py +15 -1
  15. gsrap/commons/keggutils.py +145 -0
  16. gsrap/commons/medium.py +3 -4
  17. gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +69 -19
  18. gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
  19. gsrap/mkmodel/mkmodel.py +69 -19
  20. gsrap/mkmodel/pruner.py +72 -7
  21. gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +33 -6
  22. gsrap/parsedb/.ipynb_checkpoints/cycles-checkpoint.py +128 -0
  23. gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +9 -9
  24. gsrap/parsedb/.ipynb_checkpoints/manual-checkpoint.py +27 -0
  25. gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +15 -2
  26. gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +9 -0
  27. gsrap/parsedb/completeness.py +33 -6
  28. gsrap/parsedb/cycles.py +128 -0
  29. gsrap/parsedb/introduce.py +9 -9
  30. gsrap/parsedb/manual.py +27 -0
  31. gsrap/parsedb/parsedb.py +15 -2
  32. gsrap/parsedb/repeating.py +9 -0
  33. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/METADATA +1 -1
  34. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/RECORD +37 -33
  35. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/LICENSE.txt +0 -0
  36. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/WHEEL +0 -0
  37. {gsrap-0.7.2.dist-info → gsrap-0.8.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,145 @@
1
+ import time
2
+ import os
3
+ import sys
4
+ import pickle
5
+
6
+
7
+ import pandas as pnd
8
+ from Bio.KEGG import REST
9
+
10
+
11
+
12
def _split_kegg_entries(flat_text):
    """Split a KEGG flat-file response into one text chunk per entry.

    Entries are delimited by a line containing only '///'. Blank lines are
    dropped, so the result does not depend on whether the server emits an
    empty line between consecutive entries.
    """
    entries = []
    current = []
    for line in flat_text.split('\n'):
        if line.rstrip() == '///':
            if current:
                entries.append('\n'.join(current))
            current = []
        elif line.strip():
            current.append(line)
    if current:  # tolerate a missing final terminator
        entries.append('\n'.join(current))
    return entries


def _parse_kegg_entry(entry):
    """Turn the text of a single KEGG gene entry into a flat dict.

    The first 12 characters of each line hold the section name (blank on
    continuation lines); the remainder is the content. Extracted keys:
    'gid' (ENTRY), 'pos' (POSITION), 'ko' (ORTHOLOGY), plus one key per
    database named in the MOTIF and DBLINKS sections (value: list of str).
    """
    record = {}
    section = None
    last_db = None
    for line in entry.split('\n'):
        if not line.strip():
            continue

        key = line[:12].strip()
        content = line[12:]
        if key:  # a new section starts on this line
            section = key
            last_db = None

        if section == 'ENTRY':
            record['gid'] = content.split(' ', 1)[0]

        elif section == 'POSITION':
            record['pos'] = content.strip()

        elif section == 'ORTHOLOGY':
            # as before, if the section spans several lines the last one wins
            record['ko'] = content.split(' ', 1)[0]

        elif section in ('MOTIF', 'DBLINKS'):
            db, sep, value = content.strip().partition(': ')
            if sep:
                record[db] = value.split(' ')
                last_db = db
            elif last_db is not None:
                # NOTE(review): a line without 'db: ' is assumed to be a
                # wrapped continuation of the previous database's values;
                # previously such a line raised ValueError. Confirm on KEGG.
                record[last_db].extend(content.split())

    return record


def download_keggorg(logger, keggorg='lpl', outdir='./', ):
    """Download the gene entries of a KEGG organism and cache them on disk.

    The gene list of `keggorg` is retrieved with `REST.kegg_list()`, then
    every gene entry is fetched with `REST.kegg_get()` in batches of 10.
    Selected fields are collected into a pandas DataFrame (one row per
    gene) which is pickled to '<outdir>/<keggorg>.keggorg'. If that file
    already exists, nothing is downloaded.

    Args:
        logger: logger used for info/warning/error messages.
        keggorg: KEGG organism code (e.g. 'lpl').
        outdir: directory where the '.keggorg' file is read/written.

    Returns:
        0 on success (or if already downloaded), 1 on failure.
    """

    # check if already downloaded
    outfile = os.path.join(outdir, f'{keggorg}.keggorg')
    if os.path.exists(outfile):
        logger.info(f"Organism code '{keggorg}' already downloaded ('{outfile}').")
        return 0


    # download the gene list:
    logger.info(f"Verifying existence of organism code '{keggorg}' on KEGG...")
    time.sleep(0.5)   # be respectful
    try:
        response = REST.kegg_list(keggorg).read()
    except Exception:  # not a bare 'except': Ctrl-C must still interrupt
        logger.error(f"Organism code '{keggorg}' not found in KEGG database.")
        return 1
    # response is now a tab-separated string similar to:
    #   lpl:lp_0026    CDS    31317..32084    hydrolase, HAD superfamily, Cof family
    #   lpl:lp_0027    CDS    complement(32236..32907)    pgmB1; beta-phosphoglucomutase


    # extract the gene IDs list (example of gene_id: "lpl:lp_0005"):
    gene_ids = [line.split('\t')[0] for line in response.strip().split('\n') if line.strip()]
    if not gene_ids:
        logger.error(f"No genes listed on KEGG for organism code '{keggorg}'.")
        return 1
    logger.info(f"Respectfully downloading {len(gene_ids)} genes from KEGG...")


    # respectfully download in batch
    # 10 is the max number of elements that can be downloaded
    batch_size = 10
    n_attempts = 5
    default_sleep = 0.5

    completed_genes = 0
    records = []   # list of dicts, future df
    for start_index in range(0, len(gene_ids), batch_size):
        curr_batch = gene_ids[start_index: start_index + batch_size]

        # each batch gets a fresh budget of attempts
        attempts_left = n_attempts
        sleep_time = default_sleep
        while True:

            # be respectful
            time.sleep(sleep_time)

            try:
                res_string = REST.kegg_get(curr_batch).read()
                break
            except Exception:
                attempts_left -= 1
                sleep_time = default_sleep *4  # increase sleep time to be more respectful
                logger.warning(f"An error occurred during kegg_get() of batch {curr_batch}. Remaining attempts: {attempts_left}.")

            if attempts_left == 0:
                logger.error("No attemps left! Shutting down...")
                return 1

        # parse the batch; chunks without an ENTRY line carry no gene and
        # would otherwise become all-NaN rows with a duplicated index
        for entry in _split_kegg_entries(res_string):
            entry_dict = _parse_kegg_entry(entry)
            if 'gid' in entry_dict:
                records.append(entry_dict)

        completed_genes += len(curr_batch)
        print(f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!", end='\r', file=sys.stderr)


    # hide last progress trace by overwriting it with spaces:
    last_trace = f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!"
    print(' ' * len(last_trace), end='\r', file=sys.stderr)


    # format the collected info as a df:
    df = pnd.DataFrame.from_records(records)


    # save dataframe in the output dir. Write to a temporary file first, so
    # that an interrupted write never leaves a truncated file that a later
    # run would consider "already downloaded":
    tmpfile = outfile + '.tmp'
    with open(tmpfile, 'wb') as wb_handler:
        pickle.dump(df, wb_handler)
    os.replace(tmpfile, outfile)
    logger.info(f"'{outfile}' created!")


    return 0
gsrap/commons/medium.py CHANGED
@@ -17,10 +17,9 @@ def apply_medium_given_column(logger, model, medium, column, is_reference=False)
17
17
  column = column.to_dict()
18
18
 
19
19
 
20
- # add trace elements:
21
- column['fe2'] = 'NL'
22
- column['mobd'] = 'NL'
23
- column['cobalt2'] = 'NL'
20
+ # add default elements (aqueous media)
21
+ column['h2o'] = 'NL'
22
+ column['h'] = '-0.0001' # pH=7
24
23
 
25
24
 
26
25
  # reset exchanges
@@ -12,10 +12,12 @@ import gempipe
12
12
 
13
13
  from .pruner import load_input_universe
14
14
  from .pruner import load_input_eggnog
15
+ from .pruner import load_keggorg_like_eggnog
15
16
  from .pruner import parse_eggnog
16
17
  from .pruner import subtract_kos
17
18
  from .pruner import translate_remaining_kos
18
19
  from .pruner import restore_gene_annotations
20
+ from .pruner import append_keggorg_gene_annots
19
21
 
20
22
  from .gapfillutils import include_forced
21
23
 
@@ -38,26 +40,40 @@ from ..commons import log_metrics
38
40
  from ..commons import log_unbalances
39
41
  from ..commons import format_expansion
40
42
  from ..commons import comparative_table
43
+ from ..commons import download_keggorg
41
44
 
42
45
  from ..runsims.biosynth import biosynthesis_on_media
43
46
 
47
+ from ..parsedb.cycles import verify_egc_all
48
+
49
+
44
50
 
45
51
 
46
52
  def create_model_incore(params):
47
- universe, eggpath, dbexp, args, multistrain = params
53
+ annotation_source, universe, eggpath, dbexp, args, multistrain = params
54
+
55
+ # get the logger:
48
56
  logger = get_logger('gsrap_queued', args.verbose) # loggers can't be pickled!
57
+
58
+
59
+ # only errors will be recorded if multistrain mode
49
60
  if multistrain:
50
- # only errors will be recorded
51
61
  logger.setLevel(logging.ERROR)
52
62
 
53
63
 
54
64
  # load the annotation
55
- eggnog = load_input_eggnog(logger, eggpath)
65
+ if annotation_source == 'keggorg':
66
+ eggnog_style_table = load_keggorg_like_eggnog(logger, args.keggorg, args.outdir)
67
+ elif annotation_source == 'eggnog':
68
+ eggnog_style_table = load_input_eggnog(logger, eggpath)
56
69
 
57
70
 
58
- # create a copy of the universe
71
+ # create a copy of the universe and define the model ID
59
72
  model = universe.copy()
60
- model.id = Path(eggpath).stem
73
+ if annotation_source == 'keggorg':
74
+ model.id = args.keggorg
75
+ elif annotation_source == 'eggnog':
76
+ model.id = Path(eggpath).stem
61
77
 
62
78
 
63
79
  ###### POLISHING 1
@@ -67,9 +83,10 @@ def create_model_incore(params):
67
83
 
68
84
 
69
85
  ###### PRUNING
70
- logger.info("Reading provided eggnog-mapper annotation...")
86
+ if annotation_source == 'keggorg': logger.info(f"Reading annotation for organism code '{args.keggorg}'...")
87
+ elif annotation_source == 'eggnog': logger.info("Reading provided eggnog-mapper annotation...")
71
88
  # get important dictionaries: 'eggnog_ko_to_gids' and 'eggonog_gid_to_kos'
72
- eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog)
89
+ eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog_style_table)
73
90
 
74
91
  # prune reactions
75
92
  subtract_kos(logger, model, eggnog_ko_to_gids)
@@ -77,6 +94,10 @@ def create_model_incore(params):
77
94
  # translate KOs to the actual genes
78
95
  translate_remaining_kos(logger, model, eggnog_ko_to_gids)
79
96
  restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)
97
+
98
+ # insert gene annotation if starting from kegg organisms:
99
+ if annotation_source == 'keggorg':
100
+ append_keggorg_gene_annots(logger, model, args.keggorg, args.outdir)
80
101
 
81
102
 
82
103
 
@@ -122,6 +143,9 @@ def create_model_incore(params):
122
143
 
123
144
 
124
145
  ###### CHECKS
146
+ # check erroneous EGCs
147
+ verify_egc_all(logger, model, args.outdir)
148
+
125
149
  # check blocked metabolites / dead-ends
126
150
  df_S = biosynthesis_on_media(logger, model, dbexp, args.gap_fill, args.biosynth)
127
151
  if type(df_S)==int: return 1
@@ -171,13 +195,28 @@ def main(args, logger):
171
195
 
172
196
 
173
197
  # format the --eggnog param
174
- args.eggnog = format_expansion(logger, args.eggnog)
175
- if args.eggnog == '-':
176
- logger.error("No valid eggnog-mapper annotations provided.")
198
+ args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
199
+
200
+ # get the kegg organism if requested
201
+ if args.keggorg != '-':
202
+ response = download_keggorg(logger, args.keggorg, args.outdir)
203
+ if response == 1: return 1
204
+
205
+
206
+
207
+ # determine the source of functional annotation:
208
+ annotation_source = None
209
+ if args.keggorg != '-': # keggorg has precedence
210
+ annotation_source = 'keggorg'
211
+ elif args.eggnog != '-':
212
+ annotation_source = 'eggnog'
213
+ if args.cores > len(args.eggnog):
214
+ logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
215
+ args.cores = len(args.eggnog)
216
+ else:
217
+ logger.error("No valid functional annotations provided: please use '--keggorg' or '--eggnog'.")
177
218
  return 1
178
- if args.cores > len(args.eggnog):
179
- logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
180
- args.cores = len(args.eggnog)
219
+
181
220
 
182
221
 
183
222
  # check compatibility of input parameters:
@@ -201,17 +240,26 @@ def main(args, logger):
201
240
 
202
241
 
203
242
  # disable logging (swith to txt) if strains are more than 1:
204
- multistrain = len(args.eggnog) > 1
205
- if multistrain:
206
- logger.info(f"Number of provided strains is >1: logging will be disabled.")
207
- logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
208
- # actualy this is done inside child processess!
243
+ if annotation_source == 'keggorg':
244
+ multistrain = False
245
+ elif annotation_source == 'eggnog':
246
+ multistrain = len(args.eggnog) > 1
247
+ if multistrain:
248
+ logger.info(f"Number of provided strains is >1: logging will be disabled.")
249
+ logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
250
+ # actualy this is done inside child processess!
251
+
209
252
 
210
253
  # create strain-specific GSMMs using multi-core
211
254
  error_raised = False
212
255
  sheets_dicts = []
213
256
  executor = confu.ProcessPoolExecutor(max_workers=args.cores)
214
- futures = [executor.submit(create_model_incore, (universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
257
+
258
+ if annotation_source == 'keggorg':
259
+ futures = [executor.submit(create_model_incore, (annotation_source, universe, None, dbexp, args, multistrain))]
260
+ elif annotation_source == 'eggnog':
261
+ futures = [executor.submit(create_model_incore, (annotation_source, universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
262
+
215
263
  for f in confu.as_completed(futures):
216
264
  sheets_dict = f.result()
217
265
 
@@ -226,12 +274,14 @@ def main(args, logger):
226
274
  sheets_dicts.append(sheets_dict)
227
275
  print(f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!", end='\r', file=sys.stderr)
228
276
 
277
+
229
278
  # hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
230
279
  if multistrain and sheets_dicts != []:
231
280
  last_trace = f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!"
232
281
  whitewash = ''.join([' ' for i in range(len(last_trace))])
233
282
  print(whitewash, end='\r', file=sys.stderr)
234
283
 
284
+
235
285
  # multiproces part terminated: safely shut down the executor
236
286
  executor.shutdown(wait=True)
237
287
 
@@ -1,6 +1,7 @@
1
1
  import os
2
2
  import warnings
3
3
  import logging
4
+ import pickle
4
5
 
5
6
 
6
7
  import pandas as pnd
@@ -43,22 +44,57 @@ def load_input_eggnog(logger, eggnog):
43
44
 
44
45
 
45
46
  # load eggnog annotations
46
- eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
47
- eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
48
- eggnog = eggnog.set_index('query', drop=True, verify_integrity=True)
47
+ df_eggnog = pnd.read_csv(eggnog, sep='\t', comment='#', header=None)
48
+ df_eggnog.columns = 'query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs'.split('\t')
49
+ df_eggnog = df_eggnog.set_index('query', drop=True, verify_integrity=True)
49
50
 
50
51
 
51
- return eggnog
52
+ return df_eggnog
52
53
 
53
54
 
54
55
 
55
- def parse_eggnog(eggnog):
56
def load_keggorg_like_eggnog(logger, keggorg, outdir):
    """Load a cached '.keggorg' table and reshape it like an eggnog-mapper table.

    Reads the pickled DataFrame '<outdir>/<keggorg>.keggorg' and builds a
    DataFrame indexed by 'query' (the gene ID) having the eggnog-mapper
    columns. Only 'KEGG_ko' (from the 'ko' column) and 'PFAMs' (from the
    'Pfam' column, comma-joined) are filled; every other cell is '-'.

    Args:
        logger: logger (currently unused).
        keggorg: KEGG organism code, used to build the file name.
        outdir: directory containing the '.keggorg' file.

    Returns:
        The eggnog-like pandas DataFrame.
    """

    eggnog_columns = [
        'query', 'seed_ortholog', 'evalue', 'score', 'eggNOG_OGs',
        'max_annot_lvl', 'COG_category', 'Description', 'Preferred_name',
        'GOs', 'EC', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module',
        'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC', 'CAZy',
        'BiGG_Reaction', 'PFAMs',
    ]


    # load raw data, downloaded from kegg.
    # NOTE: pickle can execute arbitrary code: only load trusted '.keggorg' files.
    with open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb') as rb_handler:
        df_keggorg = pickle.load(rb_handler)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)


    # a column is absent when no gene of the organism carries that field:
    pfams_by_gid = df_keggorg['Pfam'].to_dict() if 'Pfam' in df_keggorg.columns else {}
    ko_by_gid = df_keggorg['ko'].to_dict() if 'ko' in df_keggorg.columns else {}


    # create an eggnog-like dataframe:
    records = []   # list of dicts, future df
    for gid in df_keggorg.index:
        pfams = pfams_by_gid.get(gid)
        ko = ko_by_gid.get(gid)
        records.append({
            'query': gid,
            # missing values are NaN (float), hence the type checks:
            'PFAMs': ','.join(pfams) if isinstance(pfams, list) else '-',
            'KEGG_ko': ko if isinstance(ko, str) else '-',
        })
    df_eggnog_like = pnd.DataFrame.from_records(records)


    # append missing columns and sort
    for c in eggnog_columns:
        if c not in df_eggnog_like.columns:
            df_eggnog_like[c] = '-'
    df_eggnog_like = df_eggnog_like[eggnog_columns]


    # set the index like in eggnog
    df_eggnog_like = df_eggnog_like.set_index('query', drop=True, verify_integrity=True)
    return df_eggnog_like
88
+
89
+
90
+
91
+ def parse_eggnog(df_eggnog):
56
92
 
57
93
 
58
94
  # PART 1. get KO codes available
59
95
  gid_to_kos = {}
60
96
  ko_to_gids = {}
61
- for gid, kos in eggnog['KEGG_ko'].items():
97
+ for gid, kos in df_eggnog['KEGG_ko'].items():
62
98
  if kos == '-':
63
99
  continue
64
100
 
@@ -229,8 +265,37 @@ def restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos):
229
265
  # collect names
230
266
  names.append(uni_g.name)
231
267
  g.name = '; '.join(names)
268
+
269
+
270
+
271
def append_keggorg_gene_annots(logger, model, keggorg, outdir):
    """Add KEGG-derived cross-references to the genes of `model`.

    Reads the pickled DataFrame '<outdir>/<keggorg>.keggorg' and, for every
    gene of `model.genes` whose ID is found in it, sets in `g.annotation`:
    'kegg.genes' (always) and, for each source column present in the table,
    'ncbigene', 'ncbiprotein', 'asap' and 'uniprot' (an empty list when the
    gene has no value for that database). The model is modified in place.

    Args:
        logger: logger used for the info message.
        model: object exposing `genes`, each with `id` and `annotation` (dict).
        keggorg: KEGG organism code, used for the file name and as ID prefix.
        outdir: directory containing the '.keggorg' file.
    """

    # table column -> annotation key.
    # KEGG can provide some useful (ie, used in Memote) gene annotations.
    dblink_to_annot = (
        ('NCBI-GeneID', 'ncbigene'),
        ('NCBI-ProteinID', 'ncbiprotein'),
        ('ASAP', 'asap'),
        ('UniProt', 'uniprot'),
    )


    # load raw data, downloaded from kegg.
    # NOTE: pickle can execute arbitrary code: only load trusted '.keggorg' files.
    logger.info("Adding gene annotations retrieved from KEGG...")
    with open(os.path.join(outdir, f'{keggorg}.keggorg'), 'rb') as rb_handler:
        df_keggorg = pickle.load(rb_handler)
    df_keggorg = df_keggorg.set_index('gid', drop=True, verify_integrity=True)


    # only the databases actually present in the table are considered:
    available = [(col, key) for col, key in dblink_to_annot if col in df_keggorg.columns]


    for g in model.genes:
        if g.id not in df_keggorg.index:
            continue

        g.annotation['kegg.genes'] = [keggorg + ':' + g.id]

        for col, key in available:
            value = df_keggorg.loc[g.id, col]
            # missing values are NaN (float): replace them with an empty list
            g.annotation[key] = value if isinstance(value, list) else []
294
+
295
+
296
+
297
+
298
+
234
299
 
235
300
 
236
301
 
gsrap/mkmodel/mkmodel.py CHANGED
@@ -12,10 +12,12 @@ import gempipe
12
12
 
13
13
  from .pruner import load_input_universe
14
14
  from .pruner import load_input_eggnog
15
+ from .pruner import load_keggorg_like_eggnog
15
16
  from .pruner import parse_eggnog
16
17
  from .pruner import subtract_kos
17
18
  from .pruner import translate_remaining_kos
18
19
  from .pruner import restore_gene_annotations
20
+ from .pruner import append_keggorg_gene_annots
19
21
 
20
22
  from .gapfillutils import include_forced
21
23
 
@@ -38,26 +40,40 @@ from ..commons import log_metrics
38
40
  from ..commons import log_unbalances
39
41
  from ..commons import format_expansion
40
42
  from ..commons import comparative_table
43
+ from ..commons import download_keggorg
41
44
 
42
45
  from ..runsims.biosynth import biosynthesis_on_media
43
46
 
47
+ from ..parsedb.cycles import verify_egc_all
48
+
49
+
44
50
 
45
51
 
46
52
  def create_model_incore(params):
47
- universe, eggpath, dbexp, args, multistrain = params
53
+ annotation_source, universe, eggpath, dbexp, args, multistrain = params
54
+
55
+ # get the logger:
48
56
  logger = get_logger('gsrap_queued', args.verbose) # loggers can't be pickled!
57
+
58
+
59
+ # only errors will be recorded if multistrain mode
49
60
  if multistrain:
50
- # only errors will be recorded
51
61
  logger.setLevel(logging.ERROR)
52
62
 
53
63
 
54
64
  # load the annotation
55
- eggnog = load_input_eggnog(logger, eggpath)
65
+ if annotation_source == 'keggorg':
66
+ eggnog_style_table = load_keggorg_like_eggnog(logger, args.keggorg, args.outdir)
67
+ elif annotation_source == 'eggnog':
68
+ eggnog_style_table = load_input_eggnog(logger, eggpath)
56
69
 
57
70
 
58
- # create a copy of the universe
71
+ # create a copy of the universe and define the model ID
59
72
  model = universe.copy()
60
- model.id = Path(eggpath).stem
73
+ if annotation_source == 'keggorg':
74
+ model.id = args.keggorg
75
+ elif annotation_source == 'eggnog':
76
+ model.id = Path(eggpath).stem
61
77
 
62
78
 
63
79
  ###### POLISHING 1
@@ -67,9 +83,10 @@ def create_model_incore(params):
67
83
 
68
84
 
69
85
  ###### PRUNING
70
- logger.info("Reading provided eggnog-mapper annotation...")
86
+ if annotation_source == 'keggorg': logger.info(f"Reading annotation for organism code '{args.keggorg}'...")
87
+ elif annotation_source == 'eggnog': logger.info("Reading provided eggnog-mapper annotation...")
71
88
  # get important dictionaries: 'eggnog_ko_to_gids' and 'eggonog_gid_to_kos'
72
- eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog)
89
+ eggnog_ko_to_gids, eggonog_gid_to_kos = parse_eggnog(eggnog_style_table)
73
90
 
74
91
  # prune reactions
75
92
  subtract_kos(logger, model, eggnog_ko_to_gids)
@@ -77,6 +94,10 @@ def create_model_incore(params):
77
94
  # translate KOs to the actual genes
78
95
  translate_remaining_kos(logger, model, eggnog_ko_to_gids)
79
96
  restore_gene_annotations(logger, model, universe, eggonog_gid_to_kos)
97
+
98
+ # insert gene annotation if starting from kegg organisms:
99
+ if annotation_source == 'keggorg':
100
+ append_keggorg_gene_annots(logger, model, args.keggorg, args.outdir)
80
101
 
81
102
 
82
103
 
@@ -122,6 +143,9 @@ def create_model_incore(params):
122
143
 
123
144
 
124
145
  ###### CHECKS
146
+ # check erroneous EGCs
147
+ verify_egc_all(logger, model, args.outdir)
148
+
125
149
  # check blocked metabolites / dead-ends
126
150
  df_S = biosynthesis_on_media(logger, model, dbexp, args.gap_fill, args.biosynth)
127
151
  if type(df_S)==int: return 1
@@ -171,13 +195,28 @@ def main(args, logger):
171
195
 
172
196
 
173
197
  # format the --eggnog param
174
- args.eggnog = format_expansion(logger, args.eggnog)
175
- if args.eggnog == '-':
176
- logger.error("No valid eggnog-mapper annotations provided.")
198
+ args.eggnog = format_expansion(logger, args.eggnog) # now 'args.eggnog' could still be '-'
199
+
200
+ # get the kegg organism if requested
201
+ if args.keggorg != '-':
202
+ response = download_keggorg(logger, args.keggorg, args.outdir)
203
+ if response == 1: return 1
204
+
205
+
206
+
207
+ # determine the source of functional annotation:
208
+ annotation_source = None
209
+ if args.keggorg != '-': # keggorg has precedence
210
+ annotation_source = 'keggorg'
211
+ elif args.eggnog != '-':
212
+ annotation_source = 'eggnog'
213
+ if args.cores > len(args.eggnog):
214
+ logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
215
+ args.cores = len(args.eggnog)
216
+ else:
217
+ logger.error("No valid functional annotations provided: please use '--keggorg' or '--eggnog'.")
177
218
  return 1
178
- if args.cores > len(args.eggnog):
179
- logger.debug(f"Parameter --cores {args.cores} is greater than the number of strains ({len(args.eggnog)}): reset to {len(args.eggnog)}.")
180
- args.cores = len(args.eggnog)
219
+
181
220
 
182
221
 
183
222
  # check compatibility of input parameters:
@@ -201,17 +240,26 @@ def main(args, logger):
201
240
 
202
241
 
203
242
  # disable logging (swith to txt) if strains are more than 1:
204
- multistrain = len(args.eggnog) > 1
205
- if multistrain:
206
- logger.info(f"Number of provided strains is >1: logging will be disabled.")
207
- logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
208
- # actualy this is done inside child processess!
243
+ if annotation_source == 'keggorg':
244
+ multistrain = False
245
+ elif annotation_source == 'eggnog':
246
+ multistrain = len(args.eggnog) > 1
247
+ if multistrain:
248
+ logger.info(f"Number of provided strains is >1: logging will be disabled.")
249
+ logger.info(f"Performing {len(args.eggnog)} reconstructions relying on {args.cores} cores... ")
250
+ # actualy this is done inside child processess!
251
+
209
252
 
210
253
  # create strain-specific GSMMs using multi-core
211
254
  error_raised = False
212
255
  sheets_dicts = []
213
256
  executor = confu.ProcessPoolExecutor(max_workers=args.cores)
214
- futures = [executor.submit(create_model_incore, (universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
257
+
258
+ if annotation_source == 'keggorg':
259
+ futures = [executor.submit(create_model_incore, (annotation_source, universe, None, dbexp, args, multistrain))]
260
+ elif annotation_source == 'eggnog':
261
+ futures = [executor.submit(create_model_incore, (annotation_source, universe, eggpath, dbexp, args, multistrain)) for eggpath in args.eggnog]
262
+
215
263
  for f in confu.as_completed(futures):
216
264
  sheets_dict = f.result()
217
265
 
@@ -226,12 +274,14 @@ def main(args, logger):
226
274
  sheets_dicts.append(sheets_dict)
227
275
  print(f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!", end='\r', file=sys.stderr)
228
276
 
277
+
229
278
  # hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
230
279
  if multistrain and sheets_dicts != []:
231
280
  last_trace = f"{len(sheets_dicts)}/{len(args.eggnog)} ({int(len(sheets_dicts)/len(args.eggnog)*100)}%) completed!"
232
281
  whitewash = ''.join([' ' for i in range(len(last_trace))])
233
282
  print(whitewash, end='\r', file=sys.stderr)
234
283
 
284
+
235
285
  # multiproces part terminated: safely shut down the executor
236
286
  executor.shutdown(wait=True)
237
287