gsrap 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. gsrap/.ipynb_checkpoints/__init__-checkpoint.py +2 -0
  2. gsrap/__init__.py +2 -0
  3. gsrap/assets/kegg_compound_to_others.pickle +0 -0
  4. gsrap/assets/kegg_reaction_to_others.pickle +0 -0
  5. gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +96 -4
  6. gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +72 -1
  7. gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +2 -2
  8. gsrap/commons/downloads.py +96 -4
  9. gsrap/commons/escherutils.py +72 -1
  10. gsrap/commons/excelhub.py +2 -2
  11. gsrap/getmaps/.ipynb_checkpoints/getmaps-checkpoint.py +14 -5
  12. gsrap/getmaps/.ipynb_checkpoints/kdown-checkpoint.py +75 -4
  13. gsrap/getmaps/getmaps.py +14 -5
  14. gsrap/getmaps/kdown.py +75 -4
  15. gsrap/parsedb/.ipynb_checkpoints/annotation-checkpoint.py +9 -0
  16. gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +45 -11
  17. gsrap/parsedb/.ipynb_checkpoints/manual-checkpoint.py +10 -0
  18. gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +40 -19
  19. gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +2 -2
  20. gsrap/parsedb/annotation.py +9 -0
  21. gsrap/parsedb/completeness.py +45 -11
  22. gsrap/parsedb/manual.py +10 -0
  23. gsrap/parsedb/parsedb.py +40 -19
  24. gsrap/parsedb/repeating.py +2 -2
  25. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/METADATA +1 -1
  26. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/RECORD +29 -29
  27. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/LICENSE.txt +0 -0
  28. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/WHEEL +0 -0
  29. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/entry_points.txt +0 -0
gsrap/getmaps/getmaps.py CHANGED
@@ -4,6 +4,7 @@ import pickle
 
 
 from .kdown import download_raw_txtfiles
+from .kdown import create_dict_keggorg
 from .kdown import create_dict_ko
 from .kdown import create_dict_c
 from .kdown import create_dict_r
@@ -20,13 +21,19 @@ def do_kdown(logger, outdir, usecache, keeptmp):
     logger.info(f"Respectfully retrieving metabolic information from KEGG. Raw data are being saved into '{outdir}/kdown/'. Be patient, could take a couple of days...")
     os.makedirs(f'{outdir}/kdown/', exist_ok=True)
 
+
     response = download_raw_txtfiles(logger, outdir, usecache)
     if type(response) == int: return 1
     else: RELEASE_kegg = response
 
+
 
     logger.info("Parsing downloaded KEGG information...")
-
+
+    response = create_dict_keggorg(logger, outdir)
+    if type(response) == int: return 1
+    else: dict_keggorg = response
+
     response = create_dict_ko(logger, outdir)
     if type(response) == int: return 1
     else: dict_ko = response
@@ -49,7 +56,7 @@ def do_kdown(logger, outdir, usecache, keeptmp):
 
 
     # create 'idcollection_dict' and 'summary_dict' dictionaries
-    idcollection_dict = create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md)
+    idcollection_dict = create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md)
     summary_dict = create_summary_dict(dict_c, dict_r, dict_map, dict_md)
 
 
@@ -57,7 +64,6 @@ def do_kdown(logger, outdir, usecache, keeptmp):
 
 
 
-
 def main(args, logger):
 
 
@@ -67,7 +73,7 @@ def main(args, logger):
     os.makedirs(f'{args.outdir}/', exist_ok=True)
 
 
-    # KEGG
+    # KEGG download
     response = do_kdown(logger, args.outdir, args.usecache, args.keeptmp)
     if type(response) == int: return 1
     else: RELEASE_kegg, idcollection_dict, summary_dict = response[0], response[1], response[2]
@@ -76,7 +82,9 @@ def main(args, logger):
     # create 'gsrap.maps':
     with open(f'{args.outdir}/gsrap.maps', 'wb') as wb_handler:
         pickle.dump({
-            'RELEASE_kegg': RELEASE_kegg, 'idcollection_dict': idcollection_dict, 'summary_dict': summary_dict,
+            'RELEASE_kegg': RELEASE_kegg,
+            'idcollection_dict': idcollection_dict,
+            'summary_dict': summary_dict,
         }, wb_handler)
     logger.info(f"'{args.outdir}/gsrap.maps' created!")
 
@@ -87,4 +95,5 @@ def main(args, logger):
     logger.info(f"Temporary raw files deleted!")
 
 
+
     return 0
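
Note on the hunk above: main() now writes 'gsrap.maps' as a plain pickle holding exactly the three keys shown. A minimal read-back sketch (the 'output/' path is a placeholder for whatever --outdir was used; this is not package code):

import pickle

# Minimal sketch (not package code): load the 'gsrap.maps' archive written by main().
with open('output/gsrap.maps', 'rb') as rb_handler:
    maps = pickle.load(rb_handler)

RELEASE_kegg = maps['RELEASE_kegg']              # KEGG release string recorded at download time
idcollection_dict = maps['idcollection_dict']    # ID cross-references, now including 'ko_to_taxa'
summary_dict = maps['summary_dict']              # per-pathway/module summaries
print(RELEASE_kegg, len(idcollection_dict))
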
gsrap/getmaps/kdown.py CHANGED
@@ -34,6 +34,7 @@ def download_raw_txtfiles(logger, outdir, usecache):
         'orthology',
         'module',
         'pathway',
+        'organism',
     ]
     for db in databases:
         time.sleep(0.5)
@@ -45,8 +46,9 @@ def download_raw_txtfiles(logger, outdir, usecache):
 
     # mix the items to download to be respectful/compliant
     items_to_download = []
-
     for db in databases:
+        if db == 'organism':
+            continue  # here we just need the list
         with open(f"{outdir}/kdown/{db}.txt", 'r') as file:
             res_string = file.read()
             rows = res_string.split('\n')
@@ -54,7 +56,6 @@ def download_raw_txtfiles(logger, outdir, usecache):
                 item_id = row.split('\t', 1)[0]
                 if item_id == '': continue
                 items_to_download.append({'db': db, 'id': item_id})
-
     random.shuffle(items_to_download)
 
 
@@ -79,6 +80,51 @@ def download_raw_txtfiles(logger, outdir, usecache):
 
 
 
+def create_dict_keggorg(logger, outdir):
+
+    organisms_raw = open(f'{outdir}/kdown/organism.txt', 'r').read()
+
+    # create a dataframe listing all organisms in KEGG;
+    # columns are [tnumber, name, domain, kingdom, phylum, classification]
+    df = []  # list of dicts
+    for line in organisms_raw.strip().split("\n"):
+        fields = line.split("\t")
+        if len(fields) == 4:
+            tnumber, keggorg, name, classification = fields
+            levels = classification.split(";")
+            domain = levels[0]
+            kingdom = levels[1]
+            phylum = levels[2]
+            df.append({
+                'tnumber': tnumber,
+                'keggorg': keggorg,
+                'name': name,
+                'domain': domain,
+                'kingdom': kingdom,
+                'phylum': phylum,
+                'classification': classification
+            })
+        else:
+            # never verified during tests!
+            logger.warning(f'Strange number of fields found in this line of "organism.txt": """{line}""".')
+    df = pnd.DataFrame.from_records(df)
+    df = df.set_index('keggorg', drop=True, verify_integrity=True)
+
+
+    # convert dataframe to dict
+    dict_keggorg = {}
+    for keggorg, row in df.iterrows():
+        dict_keggorg[keggorg] = {
+            'kingdom': row['kingdom'],
+            'phylum': row['phylum'],
+            #'name': row['name'],  # not strictly needed. Commented to save disk space.
+        }
+
+    if logger != None: logger.info(f'Number of unique items (org): {len(dict_keggorg.keys())}.')
+    return dict_keggorg
+
+
+
 def create_dict_ko(logger, outdir):
 
     dict_ko = {}  # main output
@@ -98,6 +144,7 @@ def create_dict_ko(logger, outdir):
                 'ecs': set(),
                 'cogs': set(),
                 'gos': set(),
+                'keggorgs': set(),
             }
         else:
             logger.error(f"{ko_id} already included!")
@@ -175,7 +222,13 @@ def create_dict_ko(logger, outdir):
             gos = content[len('GO: '):].strip().split(' ')
             for go in gos:
                 dict_ko[ko_id]['gos'].add(go)
-
+
+
+        # parse the organism-specific genes
+        if curr_header == 'GENES ':
+            keggorg = content.split(': ', 1)[0]
+            dict_ko[ko_id]['keggorgs'].add(keggorg.lower())  # organism.txt has IDs in lowercase
+
 
         # parse the reactions
         if curr_header == 'REACTION ':
@@ -547,7 +600,7 @@ def create_dict_md(logger, outdir):
 
 
 
-def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
+def create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md):
 
     idcollection_dict = {}
 
@@ -620,6 +673,24 @@ def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
         for go in dict_ko[ko_id]['gos']:
             idcollection_dict['ko_to_gos'][ko_id].add(go)
 
+
+    # creation of 'ko_to_keggorgs' skipped as it takes too much disk space. Replaced with 'ko_to_taxa'.
+    idcollection_dict['ko_to_taxa'] = {}
+    missing_keggorgs = set()
+    for ko_id in dict_ko.keys():
+        idcollection_dict['ko_to_taxa'][ko_id] = {'kingdom': set(), 'phylum': set()}
+        for keggorg in dict_ko[ko_id]['keggorgs']:
+            try:
+                kingdom = dict_keggorg[keggorg]['kingdom']
+                phylum = dict_keggorg[keggorg]['phylum']
+            except:
+                if keggorg not in missing_keggorgs:
+                    missing_keggorgs.add(keggorg)
+                    #print(f"Organism '{keggorg}' appears in 'orthology/' but not in 'organism.txt'.")
+                continue
+            idcollection_dict['ko_to_taxa'][ko_id]['kingdom'].add(kingdom)
+            idcollection_dict['ko_to_taxa'][ko_id]['phylum'].add(phylum)
+
 
     idcollection_dict['map_to_name'] = {}
     for map_id in dict_map.keys():
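
For reference, create_dict_keggorg() above expects KEGG's organism list as tab-separated lines of T number, organism code, name and a semicolon-separated lineage, and keeps only the second and third lineage levels. A standalone sketch of that parsing on a single made-up line (illustrative only, not package code; the lineage string is invented):

# One made-up line in the shape create_dict_keggorg() parses;
# the real input is '{outdir}/kdown/organism.txt' downloaded from KEGG.
line = "T00007\teco\tEscherichia coli K-12 MG1655\tProkaryotes;Bacteria;Gammaproteobacteria;Escherichia coli"

tnumber, keggorg, name, classification = line.split("\t")
levels = classification.split(";")
entry = {'kingdom': levels[1], 'phylum': levels[2]}   # the same two keys kept in dict_keggorg
print(keggorg, entry)   # eco {'kingdom': 'Bacteria', 'phylum': 'Gammaproteobacteria'}
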
@@ -138,6 +138,15 @@ def set_up_groups(logger, model, idcollection_dict):
 
     # insert custom groups:
     custom_groups = get_custom_groups()
+    #
+    # create a group for transporters on-the-fly
+    custom_groups['transport'] = []
+    for r in model.reactions:
+        if len(r.metabolites) == 1:  # exchanges / sinks / demands
+            custom_groups['transport'].append(r.id)
+        if len(set([m.id.rsplit('_', 1)[-1] for m in r.metabolites])) > 1:  # transport reactions
+            custom_groups['transport'].append(r.id)
+    #
     for group_id in custom_groups.keys():
         actual_group = cobra.core.Group(
             group_id,
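
The on-the-fly 'transport' group above tags a reaction when it has a single metabolite (exchange, sink or demand) or when its metabolite IDs end in more than one compartment suffix. A toy sketch of the same heuristic on plain ID lists, assuming the '_<compartment>' suffix convention the code relies on (not package code):

def looks_like_transport(metabolite_ids):
    # one metabolite only: exchange / sink / demand
    if len(metabolite_ids) == 1:
        return True
    # metabolites living in more than one compartment: transport reaction
    compartments = {mid.rsplit('_', 1)[-1] for mid in metabolite_ids}
    return len(compartments) > 1

print(looks_like_transport(['glc__D_e']))              # True  (exchange-like)
print(looks_like_transport(['glc__D_e', 'glc__D_c']))  # True  (crosses e and c)
print(looks_like_transport(['g6p_c', 'f6p_c']))        # False (single compartment)
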
@@ -6,6 +6,9 @@ import os
 import pandas as pnd
 
 
+from .manual import get_krs_to_exclude
+
+
 
 def parse_eggnog(model, eggnog, idcollection_dict):
 
@@ -27,9 +30,8 @@ def parse_eggnog(model, eggnog, idcollection_dict):
 
 
     # PART 2. get reactions in the organism (even the GPR is not complete)
-    kr_to_kos = idcollection_dict['kr_to_kos']
     krs_org = set()
-    for kr, kos in kr_to_kos.items():
+    for kr, kos in idcollection_dict['kr_to_kos'].items():
         if any([ko in kos_org for ko in kos]):
             krs_org.add(kr)
 
@@ -49,9 +51,34 @@ def parse_keggorg(keggorg, outdir, idcollection_dict):
 
 
     # PART 2. get reactions in the organism (even the GPR is not complete)
-    kr_to_kos = idcollection_dict['kr_to_kos']
     krs_org = set()
-    for kr, kos in kr_to_kos.items():
+    for kr, kos in idcollection_dict['kr_to_kos'].items():
+        if any([ko in kos_org for ko in kos]):
+            krs_org.add(kr)
+
+
+    return krs_org
+
+
+
+def parse_taxon(taxon, idcollection_dict):
+
+
+    # formatting of --taxon was already verified at startup.
+    # also the presence of 'ko_to_taxa' in idcollection_dict was verified at startup.
+    level, name = taxon.split(':')
+
+
+    # PART 1. get KO codes available
+    kos_org = set()
+    for ko in idcollection_dict['ko_to_taxa'].keys():
+        if name in idcollection_dict['ko_to_taxa'][ko][level]:
+            kos_org.add(ko)
+
+
+    # PART 2. get reactions in the organism (even the GPR is not complete)
+    krs_org = set()
+    for kr, kos in idcollection_dict['kr_to_kos'].items():
         if any([ko in kos_org for ko in kos]):
             krs_org.add(kr)
 
@@ -60,7 +87,7 @@ def parse_keggorg(keggorg, outdir, idcollection_dict):
 
 
 
-def check_completeness(logger, model, progress, module, focus, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
+def check_completeness(logger, model, progress, module, focus, taxon, eggnog, keggorg, idcollection_dict, summary_dict, outdir):
     # check KEGG annotations in the universe model to get '%' of completeness per pathway/module.
 
 
@@ -69,6 +96,9 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
     if keggorg != '-':  # keggorg has precedence
         kr_uni = parse_keggorg(keggorg, outdir, idcollection_dict)
         kr_uni_label = f"organism code '{keggorg}'"
+    elif taxon != '-':
+        kr_uni = parse_taxon(taxon, idcollection_dict)
+        kr_uni_label = f"taxon '{taxon}'"
     elif eggnog != '-':
         for eggfile in eggnog:
             eggset = parse_eggnog(model, eggfile, idcollection_dict)
@@ -85,7 +115,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
         if 'kegg.reaction' in r.annotation.keys():
             for kr_id in r.annotation['kegg.reaction']:
                 kr_ids_modeled.add(kr_id)
-    kr_uni_missing = kr_uni - kr_ids_modeled
+    kr_uni_missing = (kr_uni - kr_ids_modeled) - get_krs_to_exclude()
     kr_uni_coverage = len(kr_ids_modeled.intersection(kr_uni)) / len(kr_uni) * 100
     logger.info(f"Coverage for {kr_uni_label}: {round(kr_uni_coverage, 0)}% ({len(kr_uni_missing)} missing).")
 
@@ -114,8 +144,12 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
 
     # check if 'focus' exists
     if focus != '-' and focus not in map_ids and focus not in md_ids:
-        logger.error(f"The ID provided with --focus does not exist: {focus}.")
-        return 1
+        if focus == 'transport':
+            df_coverage = None
+            return df_coverage  # just the generation of 'transport.json' for Escher drawing is needed here
+        else:
+            logger.error(f"The ID provided with --focus does not exist: {focus}.")
+            return 1
     if focus.startswith('map'):
         logger.debug(f"With --focus {focus}, --module will switch to False.")
         module = False
@@ -148,7 +182,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
 
         # check if this map was (at least partially) covered:
         map_krs = set([kr for kr in i['kr_ids'] if kr in kr_uni])
-        missing = map_krs - kr_ids_modeled
+        missing = (map_krs - kr_ids_modeled) - get_krs_to_exclude()
         present = kr_ids_modeled.intersection(map_krs)
         if focus == map_id:
             missing_logger = (map_id, missing)
@@ -260,7 +294,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
 
             # check if this module was (at least partially) covered:
             md_krs = set([kr for kr in z['kr_ids_md'] if kr in kr_uni])
-            missing = md_krs - kr_ids_modeled
+            missing = (md_krs - kr_ids_modeled) - get_krs_to_exclude()
             present = kr_ids_modeled.intersection(md_krs)
             if focus == md_id:
                 missing_logger = (md_id, missing)
@@ -309,7 +343,7 @@ def check_completeness(logger, model, progress, module, focus, eggnog, keggorg,
     if module and focus=='-':
         logger.info(f"{spacer}Modules of {right_item['map_id']}: completed {len(mds_completed)} - partial {len(mds_partial)} - missing {len(mds_missing)} - noreac {len(mds_noreac)}")
     if focus != '-':
-        logger.info(f"Missing reactions focusing on {missing_logger[0]}: {' '.join(list(missing_logger[1]))}.")
+        logger.info(f"Missing reactions focusing on '{missing_logger[0]}': {' '.join(list(missing_logger[1]))}.")
     if progress:
         logger.info(f"Maps: finished {len(maps_finished)} - partial {len(maps_partial)} - missing {len(maps_missing)} - noreac {len(maps_noreac)}")
 
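The new parse_taxon() above expects --taxon in 'level:name' form, where level is one of the two keys stored in 'ko_to_taxa' ('kingdom' or 'phylum'). A toy run of the same two-step filtering against a hand-made idcollection_dict (illustrative data only, not gsrap data):

# Hand-made, minimal stand-in for the real idcollection_dict (illustrative only).
idcollection_dict = {
    'ko_to_taxa': {
        'K00001': {'kingdom': {'Bacteria'}, 'phylum': {'Gammaproteobacteria'}},
        'K00002': {'kingdom': {'Fungi'}, 'phylum': {'Ascomycetes'}},
    },
    'kr_to_kos': {
        'R00001': {'K00001'},
        'R00002': {'K00002'},
    },
}

# same logic as parse_taxon('phylum:Gammaproteobacteria', idcollection_dict)
level, name = 'phylum:Gammaproteobacteria'.split(':')
kos_org = {ko for ko, taxa in idcollection_dict['ko_to_taxa'].items() if name in taxa[level]}
krs_org = {kr for kr, kos in idcollection_dict['kr_to_kos'].items() if any(ko in kos_org for ko in kos)}
print(krs_org)   # {'R00001'}
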
@@ -5,11 +5,21 @@ def get_deprecated_kos():
     deprecated_kos = [
         'K11189',  # should be K02784
         'K07011',  # linked to lp_1215(cps3A) and lp_1216(cps3B) during 2018 and not replaced
+        #'K24301',  # to be introduced in GPRs
     ]
     return deprecated_kos
 
 
 
+def get_krs_to_exclude():
+    return set([
+        'R12328', 'R05190',  # general forms of fatty acid biosynthesis
+        'R01347', 'R04121',  # general forms of fatty acid degradation
+    ])
+
+
+
+
 def get_rids_with_mancheck_gpr():
     rids_mancheck_gpr = [  # reactions with manually checked GPRs
         'SUCD1', 'ALKP', 'PFK_3', 'TCMPTS', 'PPA', 'APSR',
@@ -16,7 +16,10 @@ from ..commons import write_excel_model
 from ..commons import show_contributions
 from ..commons import adjust_biomass_precursors
 from ..commons import count_undrawn_rids
+from ..commons import count_undrawn_rids_focus
+
 from ..commons import format_expansion
+from ..commons import check_taxon
 from ..commons import download_keggorg
 from ..commons import initialize_model
 from ..commons import get_memote_results_dict
@@ -46,6 +49,7 @@ from .cycles import verify_egc_all
 def main(args, logger):
 
 
+    ###### FORMAT ARGS NOT REQUIRING RESOURCES
     # adjust out folder path
     while args.outdir.endswith('/'):
         args.outdir = args.outdir[:-1]
@@ -77,17 +81,8 @@ def main(args, logger):
     if args.onlyauthor == '-': args.onlyauthor = None
 
 
-    # format the --eggnog param
-    args.eggnog = format_expansion(logger, args.eggnog)  # now 'args.eggnog' could still be '-'
-
-    # get the kegg organism if requested
-    if args.keggorg != '-':
-        response = download_keggorg(logger, args.keggorg, args.outdir)
-        if response == 1: return 1
-
-
-
 
+    ###### LOAD LOCAL RESOURCES
     # check and extract the required 'gsrap.maps' file
     if os.path.exists(f'{args.inmaps}') == False:
         logger.error(f"File 'gsrap.maps' not found at {args.inmaps}.")
@@ -108,9 +103,27 @@ def main(args, logger):
             kegg_compound_to_others = pickle.load(handle)
     with resources.path("gsrap.assets", f"kegg_reaction_to_others.pickle") as asset_path:
         with open(asset_path, 'rb') as handle:
-            kegg_reaction_to_others = pickle.load(handle)
+            kegg_reaction_to_others = pickle.load(handle)
+
+
+
+    ###### FORMAT/CHECK FOCUSING ARGS
+    # format the --eggnog param
+    args.eggnog = format_expansion(logger, args.eggnog)  # now 'args.eggnog' could still be '-'
 
+    # check the --taxon param
+    if args.taxon != '-':
+        response = check_taxon(logger, args.taxon, idcollection_dict)
+        if response == 1: return 1
 
+    # get the kegg organism if requested
+    if args.keggorg != '-':
+        response = download_keggorg(logger, args.keggorg, args.outdir)
+        if response == 1: return 1
+
+
+
+    # DOWNLOAD ONLINE RESOURCES
     # get dbuni and dbexp:
     logger.info("Downloading gsrap database...")
     response = get_databases(logger)
@@ -166,14 +179,15 @@
 
     ###### CHECKS 1
     # check universe completeness
-    df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.eggnog, args.keggorg, idcollection_dict, summary_dict, args.outdir)
+    df_C = check_completeness(logger, universe, args.progress, args.module, args.focus, args.taxon, args.eggnog, args.keggorg, idcollection_dict, summary_dict, args.outdir)
     if type(df_C)==int: return 1
 
 
 
     ###### POLISHING 1
     # remove disconnected metabolites
-    universe = remove_disconnected(logger, universe)
+    if args.keepdisconn == False:
+        universe = remove_disconnected(logger, universe)  # can be commented when using booster.py
 
 
 
@@ -182,9 +196,9 @@
     verify_egc_all(logger, universe, args.outdir)
 
 
+
     if not args.justparse:
 
-
         ###### CHECKS 3
         # check growth on minimal media
         df_G = grow_on_media(logger, universe, dbexp, args.media, '-', True)
@@ -217,10 +231,15 @@
 
 
 
-        # output the universe
-        logger.info("Writing universal model...")
-        cobra.io.save_json_model(universe, f'{args.outdir}/universe.json')
-        logger.info(f"'{args.outdir}/universe.json' created!")
+    # output the universe (even when --justparse)
+    logger.info("Writing universal model...")
+    cobra.io.save_json_model(universe, f'{args.outdir}/universe.json')
+    logger.info(f"'{args.outdir}/universe.json' created!")
+
+
+    if not args.justparse:
+
+        # output in the remaining formats:
         cobra.io.write_sbml_model(universe, f'{args.outdir}/universe.xml')  # groups are saved only to SBML
         logger.info(f"'{args.outdir}/universe.xml' created!")
         force_id_on_sbml(f'{args.outdir}/universe.xml', 'universe')  # force introduction of the 'id=""' field
@@ -231,7 +250,9 @@
 
         ###### CHECKS 4
         # check if universal escher map is updated:
-        count_undrawn_rids(logger, universe, lastmap)
+        count_undrawn_rids(logger, universe, lastmap, args.focus)
+        if args.focus != '-':
+            count_undrawn_rids_focus(logger, universe, lastmap, args.focus, args.outdir)
 
 
     return 0
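
The reorganised main() above reads several options that are not defined in this diff: --taxon, --keggorg, --eggnog, --focus and the new --keepdisconn. A hypothetical argparse wiring consistent with how the code checks them (the '-' defaults mirror the `!= '-'` tests; the real gsrap CLI may define these differently):

import argparse

# Hypothetical CLI sketch, not the actual gsrap entry point.
parser = argparse.ArgumentParser(prog='gsrap parsedb')
parser.add_argument('--taxon', default='-', help="taxonomic filter, e.g. 'phylum:NAME' or 'kingdom:NAME'")
parser.add_argument('--keggorg', default='-', help="KEGG organism code (takes precedence over --taxon)")
parser.add_argument('--eggnog', default='-', help="eggNOG annotation file(s)")
parser.add_argument('--focus', default='-', help="map/module ID, or 'transport' for the on-the-fly group")
parser.add_argument('--keepdisconn', action='store_true', help="keep disconnected metabolites in the universe")
args = parser.parse_args(['--taxon', 'phylum:Gammaproteobacteria'])
print(args.taxon, args.keepdisconn)   # phylum:Gammaproteobacteria False
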
@@ -45,7 +45,7 @@ def check_gpr(logger, rid, row, kr_ids, idcollection_dict, addtype='R'):
             pass
         elif ko_id not in idcollection_dict['ko'] and ko_id != 'spontaneous' and ko_id != 'orphan':
             logger.error(f"{itemtype} '{rid}' has an invalid KEGG Ortholog: '{ko_id}'.")
-            return 1
+            return 1  # can be commented when migrating to new kegg release
 
 
     # check if these ko_ids are really assigned to this reaction:
@@ -61,7 +61,7 @@ def check_gpr(logger, rid, row, kr_ids, idcollection_dict, addtype='R'):
     missing_ko_ids = ko_for_rid - (set(ko_ids_parsed) - set(['spontaneous', 'orphan']))
     if len(missing_ko_ids) > 0:
         logger.error(f"Orthologs {missing_ko_ids} are missing from reaction '{rid}' ({kr_ids}).")
-        return 1
+        return 1  # can be commented when migrating to new kegg release
 
 
     return 0
gsrap/parsedb/manual.py CHANGED
@@ -5,11 +5,21 @@ def get_deprecated_kos():
     deprecated_kos = [
         'K11189',  # should be K02784
         'K07011',  # linked to lp_1215(cps3A) and lp_1216(cps3B) during 2018 and not replaced
+        #'K24301',  # to be introduced in GPRs
     ]
     return deprecated_kos
 
 
 
+def get_krs_to_exclude():
+    return set([
+        'R12328', 'R05190',  # general forms of fatty acid biosynthesis
+        'R01347', 'R01348', 'R04121',  # general forms of fatty acid degradation
+    ])
+
+
+
+
 def get_rids_with_mancheck_gpr():
     rids_mancheck_gpr = [  # reactions with manually checked GPRs
         'SUCD1', 'ALKP', 'PFK_3', 'TCMPTS', 'PPA', 'APSR',
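
As the completeness.py hunks earlier in this diff show, get_krs_to_exclude() is subtracted from every 'missing reactions' set, so these generic fatty-acid reactions are never reported as gaps. A tiny illustration of that set arithmetic with made-up reaction sets (not gsrap data):

def get_krs_to_exclude():
    return set(['R12328', 'R05190', 'R01347', 'R01348', 'R04121'])

map_krs = {'R00200', 'R01347', 'R05190'}   # reactions expected for a map (made up)
kr_ids_modeled = {'R00200'}                # reactions already in the model (made up)

missing = (map_krs - kr_ids_modeled) - get_krs_to_exclude()
print(missing)   # set(): the only absent reactions were on the exclusion list
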