gsrap 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsrap/.ipynb_checkpoints/__init__-checkpoint.py +21 -4
- gsrap/__init__.py +21 -4
- gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
- gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +34 -3
- gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +51 -3
- gsrap/commons/.ipynb_checkpoints/medium-checkpoint.py +36 -0
- gsrap/commons/.ipynb_checkpoints/memoteutils-checkpoint.py +132 -0
- gsrap/commons/__init__.py +1 -0
- gsrap/commons/downloads.py +34 -3
- gsrap/commons/excelhub.py +51 -3
- gsrap/commons/medium.py +36 -0
- gsrap/commons/memoteutils.py +132 -0
- gsrap/getmaps/.ipynb_checkpoints/getmaps-checkpoint.py +14 -5
- gsrap/getmaps/.ipynb_checkpoints/kdown-checkpoint.py +75 -4
- gsrap/getmaps/getmaps.py +14 -5
- gsrap/getmaps/kdown.py +75 -4
- gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +22 -8
- gsrap/mkmodel/mkmodel.py +22 -8
- gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +32 -5
- gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +66 -37
- gsrap/parsedb/completeness.py +32 -5
- gsrap/parsedb/parsedb.py +66 -37
- {gsrap-0.8.1.dist-info → gsrap-0.8.3.dist-info}/METADATA +1 -1
- {gsrap-0.8.1.dist-info → gsrap-0.8.3.dist-info}/RECORD +27 -25
- {gsrap-0.8.1.dist-info → gsrap-0.8.3.dist-info}/LICENSE.txt +0 -0
- {gsrap-0.8.1.dist-info → gsrap-0.8.3.dist-info}/WHEEL +0 -0
- {gsrap-0.8.1.dist-info → gsrap-0.8.3.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import contextlib
|
|
3
|
+
import importlib.metadata
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
import memote
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_memote_results_dict(logger, model):
    """Run a selection of MEMOTE test modules on ``model`` and summarise the scores.

    Each metric reported by MEMOTE is converted to a percentage with
    ``(1 - metric) * 100``. Metrics are combined into a weighted average per
    section ('sub_total'), and the sections are combined into a weighted
    overall score ('total').

    Args:
        logger: logger used to report progress (``info``) and failures (``error``).
        model: the model handed to ``memote.suite.api.test_model``.

    Returns:
        dict with the keys:
            'version': installed MEMOTE version string;
            one key per section ('consistency', 'annotation_M', 'annotation_R',
            'annotation_G', 'annotation_SBO'), each mapping metric names to a
            rounded percentage (or to a dict of rounded percentages when the
            metric has subcategories) plus an integer 'sub_total';
            'total': integer overall score.

    Raises:
        ValueError: re-raised unchanged when MEMOTE fails with a ValueError.
            Previously this was swallowed and surfaced later as an unrelated
            TypeError when indexing ``None``.
    """

    # look the version up once, so the log line and the results always agree
    memote_version = importlib.metadata.version("memote")
    logger.info(f"Running selected modules of MEMOTE v{memote_version}...")

    # launch memote (only relevant modules); its console output is discarded
    try:
        with open(os.devnull, 'w') as devnull, \
                contextlib.redirect_stdout(devnull), \
                contextlib.redirect_stderr(devnull):
            memote_report = memote.suite.api.test_model(model, exclusive=[
                'test_annotation',
                'test_sbo',
                'test_stoichiometric_consistency',
                'test_reaction_mass_balance',
                'test_reaction_charge_balance',
                'test_find_disconnected',
                'test_find_reactions_unbounded_flux_default_condition'], results=True)
    except ValueError:
        # without a report there is nothing to score: surface the real cause
        logger.error("MEMOTE raised a ValueError while testing the model: no score can be computed.")
        raise

    # parse memote's results
    results_dict = {}
    results_dict['version'] = memote_version
    # element 1 of the value returned by test_model() is the results object
    test_results = dict(memote_report[1])['tests']

    # (metric name, weight of the metric inside its section)
    sections = {
        'consistency': [
            ('test_stoichiometric_consistency', 3),
            ('test_reaction_mass_balance', 1),
            ('test_reaction_charge_balance', 1),
            ('test_find_disconnected', 1),
            ('test_find_reactions_unbounded_flux_default_condition', 1)
        ],
        'annotation_M': [
            ('test_metabolite_annotation_presence', 1),
            ('test_metabolite_annotation_overview', 1),
            ('test_metabolite_annotation_wrong_ids', 1),
            ('test_metabolite_id_namespace_consistency', 1),
        ],
        'annotation_R': [
            ('test_reaction_annotation_presence', 1),
            ('test_reaction_annotation_overview', 1),
            ('test_reaction_annotation_wrong_ids', 1),
            ('test_reaction_id_namespace_consistency', 1),
        ],
        'annotation_G': [
            ('test_gene_product_annotation_presence', 1),
            ('test_gene_product_annotation_overview', 1),
            ('test_gene_product_annotation_wrong_ids', 1),
        ],
        'annotation_SBO': [
            ('test_metabolite_sbo_presence', 1),
            ('test_metabolite_specific_sbo_presence', 1),
            ('test_reaction_sbo_presence', 1),
            ('test_metabolic_reaction_specific_sbo_presence', 1),
            ('test_transport_reaction_specific_sbo_presence', 1),
            ('test_exchange_specific_sbo_presence', 1),
            ('test_demand_specific_sbo_presence', 1),
            ('test_sink_specific_sbo_presence', 1),
            ('test_gene_sbo_presence', 1),
            ('test_gene_specific_sbo_presence', 1),
            ('test_biomass_specific_sbo_presence', 1),
        ],
    }
    # weight of each section inside the overall score
    section_multipliers = {
        'consistency': 3,
        'annotation_M': 1,
        'annotation_R': 1,
        'annotation_G': 1,
        'annotation_SBO': 2,
    }

    numerator_total = 0
    denominator_total = 0
    for section, metrics in sections.items():
        numerator = 0
        denominator = 0
        results_dict[section] = {}

        # iterate metrics of this section:
        for metric, metric_multiplier in metrics:
            metric_raw = test_results[metric]['metric']

            # there are subcategories (like in the case of M/R/G/SBO annots):
            # the weight of the metric is shared equally among them
            if isinstance(metric_raw, dict):
                results_dict[section][metric] = {}
                n_subcategories = len(metric_raw)
                multiplier_corrected = metric_multiplier / n_subcategories if n_subcategories else 0
                for key, value in metric_raw.items():
                    metric_percentage = (1 - value) * 100
                    numerator = numerator + (metric_percentage * multiplier_corrected)
                    denominator = denominator + multiplier_corrected
                    results_dict[section][metric][key] = round(metric_percentage, 1)

            # no subcategories here: any plain number (not only 'float') is accepted
            else:
                metric_percentage = (1 - metric_raw) * 100
                numerator = numerator + (metric_percentage * metric_multiplier)
                denominator = denominator + metric_multiplier
                results_dict[section][metric] = round(metric_percentage, 1)

        # compute the subtotal (0 when the section carries no weight at all,
        # e.g. every metric came back as an empty set of subcategories):
        sub_total = numerator / denominator if denominator else 0
        results_dict[section]['sub_total'] = int(round(sub_total, 0))

        # accumulate for the total:
        denominator_total = denominator_total + section_multipliers[section] * denominator
        numerator_total = numerator_total + section_multipliers[section] * numerator

    # compute the total:
    total = numerator_total / denominator_total if denominator_total else 0
    results_dict['total'] = int(round(total, 0))

    logger.info(f"Done! MEMOTE Total Score: {results_dict['total']}%.")

    return results_dict
|
|
@@ -4,6 +4,7 @@ import pickle
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
from .kdown import download_raw_txtfiles
|
|
7
|
+
from .kdown import create_dict_keggorg
|
|
7
8
|
from .kdown import create_dict_ko
|
|
8
9
|
from .kdown import create_dict_c
|
|
9
10
|
from .kdown import create_dict_r
|
|
@@ -20,13 +21,19 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
20
21
|
logger.info(f"Respectfully retrieving metabolic information from KEGG. Raw data are being saved into '{outdir}/kdown/'. Be patient, could take a couple of days...")
|
|
21
22
|
os.makedirs(f'{outdir}/kdown/', exist_ok=True)
|
|
22
23
|
|
|
24
|
+
|
|
23
25
|
response = download_raw_txtfiles(logger, outdir, usecache)
|
|
24
26
|
if type(response) == int: return 1
|
|
25
27
|
else: RELEASE_kegg = response
|
|
26
28
|
|
|
29
|
+
|
|
27
30
|
|
|
28
31
|
logger.info("Parsing downloaded KEGG information...")
|
|
29
|
-
|
|
32
|
+
|
|
33
|
+
response = create_dict_keggorg(logger, outdir)
|
|
34
|
+
if type(response) == int: return 1
|
|
35
|
+
else: dict_keggorg = response
|
|
36
|
+
|
|
30
37
|
response = create_dict_ko(logger, outdir)
|
|
31
38
|
if type(response) == int: return 1
|
|
32
39
|
else: dict_ko = response
|
|
@@ -49,7 +56,7 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
49
56
|
|
|
50
57
|
|
|
51
58
|
# create 'idcollection_dict' and 'summary_dict' dictionaries
|
|
52
|
-
idcollection_dict = create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md)
|
|
59
|
+
idcollection_dict = create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md)
|
|
53
60
|
summary_dict = create_summary_dict(dict_c, dict_r, dict_map, dict_md)
|
|
54
61
|
|
|
55
62
|
|
|
@@ -57,7 +64,6 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
57
64
|
|
|
58
65
|
|
|
59
66
|
|
|
60
|
-
|
|
61
67
|
def main(args, logger):
|
|
62
68
|
|
|
63
69
|
|
|
@@ -67,7 +73,7 @@ def main(args, logger):
|
|
|
67
73
|
os.makedirs(f'{args.outdir}/', exist_ok=True)
|
|
68
74
|
|
|
69
75
|
|
|
70
|
-
# KEGG
|
|
76
|
+
# KEGG download
|
|
71
77
|
response = do_kdown(logger, args.outdir, args.usecache, args.keeptmp)
|
|
72
78
|
if type(response) == int: return 1
|
|
73
79
|
else: RELEASE_kegg, idcollection_dict, summary_dict = response[0], response[1], response[2]
|
|
@@ -76,7 +82,9 @@ def main(args, logger):
|
|
|
76
82
|
# create 'gsrap.maps':
|
|
77
83
|
with open(f'{args.outdir}/gsrap.maps', 'wb') as wb_handler:
|
|
78
84
|
pickle.dump({
|
|
79
|
-
'RELEASE_kegg': RELEASE_kegg,
|
|
85
|
+
'RELEASE_kegg': RELEASE_kegg,
|
|
86
|
+
'idcollection_dict': idcollection_dict,
|
|
87
|
+
'summary_dict': summary_dict,
|
|
80
88
|
}, wb_handler)
|
|
81
89
|
logger.info(f"'{args.outdir}/gsrap.maps' created!")
|
|
82
90
|
|
|
@@ -87,4 +95,5 @@ def main(args, logger):
|
|
|
87
95
|
logger.info(f"Temporary raw files deleted!")
|
|
88
96
|
|
|
89
97
|
|
|
98
|
+
|
|
90
99
|
return 0
|
|
@@ -34,6 +34,7 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
34
34
|
'orthology',
|
|
35
35
|
'module',
|
|
36
36
|
'pathway',
|
|
37
|
+
'organism',
|
|
37
38
|
]
|
|
38
39
|
for db in databases:
|
|
39
40
|
time.sleep(0.5)
|
|
@@ -45,8 +46,9 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
45
46
|
|
|
46
47
|
# mix the items to download to be respectful/compliant
|
|
47
48
|
items_to_download = []
|
|
48
|
-
|
|
49
49
|
for db in databases:
|
|
50
|
+
if db == 'organism':
|
|
51
|
+
continue # here we just need the list
|
|
50
52
|
with open(f"{outdir}/kdown/{db}.txt", 'r') as file:
|
|
51
53
|
res_string = file.read()
|
|
52
54
|
rows = res_string.split('\n')
|
|
@@ -54,7 +56,6 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
54
56
|
item_id = row.split('\t', 1)[0]
|
|
55
57
|
if item_id == '': continue
|
|
56
58
|
items_to_download.append({'db': db, 'id': item_id})
|
|
57
|
-
|
|
58
59
|
random.shuffle(items_to_download)
|
|
59
60
|
|
|
60
61
|
|
|
@@ -79,6 +80,51 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
79
80
|
|
|
80
81
|
|
|
81
82
|
|
|
83
|
+
def create_dict_keggorg(logger, outdir):
    """Parse '{outdir}/kdown/organism.txt' into a dict keyed by KEGG organism code.

    Every valid line holds 4 tab-separated fields: tnumber, organism code,
    name and a ';'-separated classification. The second and third levels of
    the classification are stored as 'kingdom' and 'phylum' respectively.
    Lines that do not match this layout are reported (when a logger is given)
    and skipped.

    Args:
        logger: logger used for warnings/info, or None to stay silent.
        outdir: directory containing the 'kdown/organism.txt' file.

    Returns:
        dict: {keggorg: {'kingdom': str, 'phylum': str}}, in file order.
        Empty if the file holds no valid line.

    Raises:
        ValueError: if the same organism code appears on more than one line.
    """

    # read through a context manager so the file handle is always released
    with open(f'{outdir}/kdown/organism.txt', 'r') as handle:
        organisms_raw = handle.read()

    dict_keggorg = {}
    for line in organisms_raw.strip().split("\n"):
        fields = line.split("\t")
        if len(fields) != 4:
            # never verified during tests!
            if logger is not None:
                logger.warning(f'Strange number of fields found in this line of "organism.txt": """{line}""".')
            continue

        _tnumber, keggorg, _name, classification = fields
        levels = classification.split(";")
        if len(levels) < 3:
            # classification too short to provide both kingdom and phylum
            if logger is not None:
                logger.warning(f'Strange classification found in this line of "organism.txt": """{line}""".')
            continue

        # organism codes must be unique, otherwise entries would silently overwrite each other
        if keggorg in dict_keggorg:
            raise ValueError(f"Index has duplicate keys: {keggorg!r}")

        # levels[0] is the domain; the organism name is not stored, to save disk space
        dict_keggorg[keggorg] = {
            'kingdom': levels[1],
            'phylum': levels[2],
        }

    if logger is not None:
        logger.info(f'Number of unique items (org): {len(dict_keggorg.keys())}.')
    return dict_keggorg
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
|
|
82
128
|
def create_dict_ko(logger, outdir):
|
|
83
129
|
|
|
84
130
|
dict_ko = {} # main output
|
|
@@ -98,6 +144,7 @@ def create_dict_ko(logger, outdir):
|
|
|
98
144
|
'ecs': set(),
|
|
99
145
|
'cogs': set(),
|
|
100
146
|
'gos': set(),
|
|
147
|
+
'keggorgs': set(),
|
|
101
148
|
}
|
|
102
149
|
else:
|
|
103
150
|
logger.error(f"{ko_id} already included!")
|
|
@@ -175,7 +222,13 @@ def create_dict_ko(logger, outdir):
|
|
|
175
222
|
gos = content[len('GO: '):].strip().split(' ')
|
|
176
223
|
for go in gos:
|
|
177
224
|
dict_ko[ko_id]['gos'].add(go)
|
|
178
|
-
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# parse the organism-specific genes
|
|
228
|
+
if curr_header == 'GENES ':
|
|
229
|
+
keggorg = content.split(': ',1)[0]
|
|
230
|
+
dict_ko[ko_id]['keggorgs'].add(keggorg.lower()) # organism.txt has IDs in lowercase
|
|
231
|
+
|
|
179
232
|
|
|
180
233
|
# parse the reactions
|
|
181
234
|
if curr_header == 'REACTION ':
|
|
@@ -547,7 +600,7 @@ def create_dict_md(logger, outdir):
|
|
|
547
600
|
|
|
548
601
|
|
|
549
602
|
|
|
550
|
-
def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
603
|
+
def create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
551
604
|
|
|
552
605
|
idcollection_dict = {}
|
|
553
606
|
|
|
@@ -620,6 +673,24 @@ def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
|
620
673
|
for go in dict_ko[ko_id]['gos']:
|
|
621
674
|
idcollection_dict['ko_to_gos'][ko_id].add(go)
|
|
622
675
|
|
|
676
|
+
|
|
677
|
+
# creation of 'ko_to_keggorgs' skipped as it takes too much disk space. Replaced with 'ko_to_taxa'.
|
|
678
|
+
idcollection_dict['ko_to_taxa'] = {}
|
|
679
|
+
missing_keggorgs = set()
|
|
680
|
+
for ko_id in dict_ko.keys():
|
|
681
|
+
idcollection_dict['ko_to_taxa'][ko_id] = {'kingdom': set(), 'phylum': set()}
|
|
682
|
+
for keggorg in dict_ko[ko_id]['keggorgs']:
|
|
683
|
+
try:
|
|
684
|
+
kingdom = dict_keggorg[keggorg]['kingdom']
|
|
685
|
+
phylum = dict_keggorg[keggorg]['phylum']
|
|
686
|
+
except:
|
|
687
|
+
if keggorg not in missing_keggorgs:
|
|
688
|
+
missing_keggorgs.add(keggorg)
|
|
689
|
+
#print(f"Organism '{keggorg}' appears in 'orthology/' but not in 'organism.txt'.")
|
|
690
|
+
continue
|
|
691
|
+
idcollection_dict['ko_to_taxa'][ko_id]['kingdom'].add(kingdom)
|
|
692
|
+
idcollection_dict['ko_to_taxa'][ko_id]['phylum'].add(phylum)
|
|
693
|
+
|
|
623
694
|
|
|
624
695
|
idcollection_dict['map_to_name'] = {}
|
|
625
696
|
for map_id in dict_map.keys():
|
gsrap/getmaps/getmaps.py
CHANGED
|
@@ -4,6 +4,7 @@ import pickle
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
from .kdown import download_raw_txtfiles
|
|
7
|
+
from .kdown import create_dict_keggorg
|
|
7
8
|
from .kdown import create_dict_ko
|
|
8
9
|
from .kdown import create_dict_c
|
|
9
10
|
from .kdown import create_dict_r
|
|
@@ -20,13 +21,19 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
20
21
|
logger.info(f"Respectfully retrieving metabolic information from KEGG. Raw data are being saved into '{outdir}/kdown/'. Be patient, could take a couple of days...")
|
|
21
22
|
os.makedirs(f'{outdir}/kdown/', exist_ok=True)
|
|
22
23
|
|
|
24
|
+
|
|
23
25
|
response = download_raw_txtfiles(logger, outdir, usecache)
|
|
24
26
|
if type(response) == int: return 1
|
|
25
27
|
else: RELEASE_kegg = response
|
|
26
28
|
|
|
29
|
+
|
|
27
30
|
|
|
28
31
|
logger.info("Parsing downloaded KEGG information...")
|
|
29
|
-
|
|
32
|
+
|
|
33
|
+
response = create_dict_keggorg(logger, outdir)
|
|
34
|
+
if type(response) == int: return 1
|
|
35
|
+
else: dict_keggorg = response
|
|
36
|
+
|
|
30
37
|
response = create_dict_ko(logger, outdir)
|
|
31
38
|
if type(response) == int: return 1
|
|
32
39
|
else: dict_ko = response
|
|
@@ -49,7 +56,7 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
49
56
|
|
|
50
57
|
|
|
51
58
|
# create 'idcollection_dict' and 'summary_dict' dictionaries
|
|
52
|
-
idcollection_dict = create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md)
|
|
59
|
+
idcollection_dict = create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md)
|
|
53
60
|
summary_dict = create_summary_dict(dict_c, dict_r, dict_map, dict_md)
|
|
54
61
|
|
|
55
62
|
|
|
@@ -57,7 +64,6 @@ def do_kdown(logger, outdir, usecache, keeptmp):
|
|
|
57
64
|
|
|
58
65
|
|
|
59
66
|
|
|
60
|
-
|
|
61
67
|
def main(args, logger):
|
|
62
68
|
|
|
63
69
|
|
|
@@ -67,7 +73,7 @@ def main(args, logger):
|
|
|
67
73
|
os.makedirs(f'{args.outdir}/', exist_ok=True)
|
|
68
74
|
|
|
69
75
|
|
|
70
|
-
# KEGG
|
|
76
|
+
# KEGG download
|
|
71
77
|
response = do_kdown(logger, args.outdir, args.usecache, args.keeptmp)
|
|
72
78
|
if type(response) == int: return 1
|
|
73
79
|
else: RELEASE_kegg, idcollection_dict, summary_dict = response[0], response[1], response[2]
|
|
@@ -76,7 +82,9 @@ def main(args, logger):
|
|
|
76
82
|
# create 'gsrap.maps':
|
|
77
83
|
with open(f'{args.outdir}/gsrap.maps', 'wb') as wb_handler:
|
|
78
84
|
pickle.dump({
|
|
79
|
-
'RELEASE_kegg': RELEASE_kegg,
|
|
85
|
+
'RELEASE_kegg': RELEASE_kegg,
|
|
86
|
+
'idcollection_dict': idcollection_dict,
|
|
87
|
+
'summary_dict': summary_dict,
|
|
80
88
|
}, wb_handler)
|
|
81
89
|
logger.info(f"'{args.outdir}/gsrap.maps' created!")
|
|
82
90
|
|
|
@@ -87,4 +95,5 @@ def main(args, logger):
|
|
|
87
95
|
logger.info(f"Temporary raw files deleted!")
|
|
88
96
|
|
|
89
97
|
|
|
98
|
+
|
|
90
99
|
return 0
|
gsrap/getmaps/kdown.py
CHANGED
|
@@ -34,6 +34,7 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
34
34
|
'orthology',
|
|
35
35
|
'module',
|
|
36
36
|
'pathway',
|
|
37
|
+
'organism',
|
|
37
38
|
]
|
|
38
39
|
for db in databases:
|
|
39
40
|
time.sleep(0.5)
|
|
@@ -45,8 +46,9 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
45
46
|
|
|
46
47
|
# mix the items to download to be respectful/compliant
|
|
47
48
|
items_to_download = []
|
|
48
|
-
|
|
49
49
|
for db in databases:
|
|
50
|
+
if db == 'organism':
|
|
51
|
+
continue # here we just need the list
|
|
50
52
|
with open(f"{outdir}/kdown/{db}.txt", 'r') as file:
|
|
51
53
|
res_string = file.read()
|
|
52
54
|
rows = res_string.split('\n')
|
|
@@ -54,7 +56,6 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
54
56
|
item_id = row.split('\t', 1)[0]
|
|
55
57
|
if item_id == '': continue
|
|
56
58
|
items_to_download.append({'db': db, 'id': item_id})
|
|
57
|
-
|
|
58
59
|
random.shuffle(items_to_download)
|
|
59
60
|
|
|
60
61
|
|
|
@@ -79,6 +80,51 @@ def download_raw_txtfiles(logger, outdir, usecache):
|
|
|
79
80
|
|
|
80
81
|
|
|
81
82
|
|
|
83
|
+
def create_dict_keggorg(logger, outdir):
    """Parse '{outdir}/kdown/organism.txt' into a dict keyed by KEGG organism code.

    Every valid line holds 4 tab-separated fields: tnumber, organism code,
    name and a ';'-separated classification. The second and third levels of
    the classification are stored as 'kingdom' and 'phylum' respectively.
    Lines that do not match this layout are reported (when a logger is given)
    and skipped.

    Args:
        logger: logger used for warnings/info, or None to stay silent.
        outdir: directory containing the 'kdown/organism.txt' file.

    Returns:
        dict: {keggorg: {'kingdom': str, 'phylum': str}}, in file order.
        Empty if the file holds no valid line.

    Raises:
        ValueError: if the same organism code appears on more than one line.
    """

    # read through a context manager so the file handle is always released
    with open(f'{outdir}/kdown/organism.txt', 'r') as handle:
        organisms_raw = handle.read()

    dict_keggorg = {}
    for line in organisms_raw.strip().split("\n"):
        fields = line.split("\t")
        if len(fields) != 4:
            # never verified during tests!
            if logger is not None:
                logger.warning(f'Strange number of fields found in this line of "organism.txt": """{line}""".')
            continue

        _tnumber, keggorg, _name, classification = fields
        levels = classification.split(";")
        if len(levels) < 3:
            # classification too short to provide both kingdom and phylum
            if logger is not None:
                logger.warning(f'Strange classification found in this line of "organism.txt": """{line}""".')
            continue

        # organism codes must be unique, otherwise entries would silently overwrite each other
        if keggorg in dict_keggorg:
            raise ValueError(f"Index has duplicate keys: {keggorg!r}")

        # levels[0] is the domain; the organism name is not stored, to save disk space
        dict_keggorg[keggorg] = {
            'kingdom': levels[1],
            'phylum': levels[2],
        }

    if logger is not None:
        logger.info(f'Number of unique items (org): {len(dict_keggorg.keys())}.')
    return dict_keggorg
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
|
|
82
128
|
def create_dict_ko(logger, outdir):
|
|
83
129
|
|
|
84
130
|
dict_ko = {} # main output
|
|
@@ -98,6 +144,7 @@ def create_dict_ko(logger, outdir):
|
|
|
98
144
|
'ecs': set(),
|
|
99
145
|
'cogs': set(),
|
|
100
146
|
'gos': set(),
|
|
147
|
+
'keggorgs': set(),
|
|
101
148
|
}
|
|
102
149
|
else:
|
|
103
150
|
logger.error(f"{ko_id} already included!")
|
|
@@ -175,7 +222,13 @@ def create_dict_ko(logger, outdir):
|
|
|
175
222
|
gos = content[len('GO: '):].strip().split(' ')
|
|
176
223
|
for go in gos:
|
|
177
224
|
dict_ko[ko_id]['gos'].add(go)
|
|
178
|
-
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# parse the organism-specific genes
|
|
228
|
+
if curr_header == 'GENES ':
|
|
229
|
+
keggorg = content.split(': ',1)[0]
|
|
230
|
+
dict_ko[ko_id]['keggorgs'].add(keggorg.lower()) # organism.txt has IDs in lowercase
|
|
231
|
+
|
|
179
232
|
|
|
180
233
|
# parse the reactions
|
|
181
234
|
if curr_header == 'REACTION ':
|
|
@@ -547,7 +600,7 @@ def create_dict_md(logger, outdir):
|
|
|
547
600
|
|
|
548
601
|
|
|
549
602
|
|
|
550
|
-
def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
603
|
+
def create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
551
604
|
|
|
552
605
|
idcollection_dict = {}
|
|
553
606
|
|
|
@@ -620,6 +673,24 @@ def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
|
|
|
620
673
|
for go in dict_ko[ko_id]['gos']:
|
|
621
674
|
idcollection_dict['ko_to_gos'][ko_id].add(go)
|
|
622
675
|
|
|
676
|
+
|
|
677
|
+
# creation of 'ko_to_keggorgs' skipped as it takes too much disk space. Replaced with 'ko_to_taxa'.
|
|
678
|
+
idcollection_dict['ko_to_taxa'] = {}
|
|
679
|
+
missing_keggorgs = set()
|
|
680
|
+
for ko_id in dict_ko.keys():
|
|
681
|
+
idcollection_dict['ko_to_taxa'][ko_id] = {'kingdom': set(), 'phylum': set()}
|
|
682
|
+
for keggorg in dict_ko[ko_id]['keggorgs']:
|
|
683
|
+
try:
|
|
684
|
+
kingdom = dict_keggorg[keggorg]['kingdom']
|
|
685
|
+
phylum = dict_keggorg[keggorg]['phylum']
|
|
686
|
+
except:
|
|
687
|
+
if keggorg not in missing_keggorgs:
|
|
688
|
+
missing_keggorgs.add(keggorg)
|
|
689
|
+
#print(f"Organism '{keggorg}' appears in 'orthology/' but not in 'organism.txt'.")
|
|
690
|
+
continue
|
|
691
|
+
idcollection_dict['ko_to_taxa'][ko_id]['kingdom'].add(kingdom)
|
|
692
|
+
idcollection_dict['ko_to_taxa'][ko_id]['phylum'].add(phylum)
|
|
693
|
+
|
|
623
694
|
|
|
624
695
|
idcollection_dict['map_to_name'] = {}
|
|
625
696
|
for map_id in dict_map.keys():
|
|
@@ -41,8 +41,12 @@ from ..commons import log_unbalances
|
|
|
41
41
|
from ..commons import format_expansion
|
|
42
42
|
from ..commons import comparative_table
|
|
43
43
|
from ..commons import download_keggorg
|
|
44
|
+
from ..commons import initialize_model
|
|
45
|
+
from ..commons import get_memote_results_dict
|
|
46
|
+
|
|
44
47
|
|
|
45
48
|
from ..runsims.biosynth import biosynthesis_on_media
|
|
49
|
+
from ..runsims.simplegrowth import grow_on_media
|
|
46
50
|
|
|
47
51
|
from ..parsedb.cycles import verify_egc_all
|
|
48
52
|
|
|
@@ -103,7 +107,7 @@ def create_model_incore(params):
|
|
|
103
107
|
|
|
104
108
|
###### GAPFILLING
|
|
105
109
|
# force inclusion of reactions:
|
|
106
|
-
include_forced(logger, model, universe, args.
|
|
110
|
+
include_forced(logger, model, universe, args.include)
|
|
107
111
|
|
|
108
112
|
# remove missing conditional precursors + get the 'cond_col_dict' dict.
|
|
109
113
|
# 'cond_col_dict' is str-to-str: {'pheme_c': 'M00868: 1/8; M00121: 2/12;', 'hemeO_c': 'gr_HemeO: 0/1'}
|
|
@@ -114,15 +118,15 @@ def create_model_incore(params):
|
|
|
114
118
|
if response == 1: return 1
|
|
115
119
|
|
|
116
120
|
# gap-fill based on media:
|
|
117
|
-
df_B = gapfill_on_media(logger, model, universe, dbexp, args.
|
|
121
|
+
df_B = gapfill_on_media(logger, model, universe, dbexp, args.gapfill, cond_col_dict, args.excludeorp)
|
|
118
122
|
if type(df_B)==int: return 1
|
|
119
123
|
|
|
120
124
|
# force removal of reactions
|
|
121
|
-
setattr(args, '
|
|
122
|
-
remove_forced(logger, model, universe, args.
|
|
125
|
+
setattr(args, 'remove', '-') # experimental feature, not public. It's main purpose was to test gap-filling in biolog_on_media().
|
|
126
|
+
remove_forced(logger, model, universe, args.remove)
|
|
123
127
|
|
|
124
128
|
# perform Biolog(R) curation based on media
|
|
125
|
-
df_P = biolog_on_media(logger, model, universe, dbexp, args.
|
|
129
|
+
df_P = biolog_on_media(logger, model, universe, dbexp, args.gapfill, args.biolog, args.excludeorp, args.cnps)
|
|
126
130
|
if type(df_P)==int: return 1
|
|
127
131
|
|
|
128
132
|
|
|
@@ -142,12 +146,12 @@ def create_model_incore(params):
|
|
|
142
146
|
|
|
143
147
|
|
|
144
148
|
|
|
145
|
-
###### CHECKS
|
|
149
|
+
###### CHECKS 1
|
|
146
150
|
# check erroneous EGCs
|
|
147
151
|
verify_egc_all(logger, model, args.outdir)
|
|
148
152
|
|
|
149
153
|
# check blocked metabolites / dead-ends
|
|
150
|
-
df_S = biosynthesis_on_media(logger, model, dbexp, args.
|
|
154
|
+
df_S = biosynthesis_on_media(logger, model, dbexp, args.gapfill, args.biosynth)
|
|
151
155
|
if type(df_S)==int: return 1
|
|
152
156
|
|
|
153
157
|
|
|
@@ -155,6 +159,16 @@ def create_model_incore(params):
|
|
|
155
159
|
###### POLISHING 3
|
|
156
160
|
# reset growth environment befor saving the model
|
|
157
161
|
gempipe.reset_growth_env(model)
|
|
162
|
+
|
|
163
|
+
# initialize model
|
|
164
|
+
response = initialize_model(logger, model, dbexp, args.initialize, args.gapfill)
|
|
165
|
+
if response==1: return 1
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
###### CHECKS 2
|
|
170
|
+
# compute Memote metrics
|
|
171
|
+
memote_results_dict = get_memote_results_dict(logger, model)
|
|
158
172
|
|
|
159
173
|
|
|
160
174
|
|
|
@@ -165,7 +179,7 @@ def create_model_incore(params):
|
|
|
165
179
|
cobra.io.write_sbml_model(model, f'{args.outdir}/{model.id}.xml') # SBML # groups are saved only to SBML
|
|
166
180
|
logger.info(f"'{args.outdir}/{model.id}.xml' created!")
|
|
167
181
|
force_id_on_sbml(f'{args.outdir}/{model.id}.xml', model.id) # force introduction of the 'id=""' field
|
|
168
|
-
sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', args.nofigs, None, df_B, df_P, df_S)
|
|
182
|
+
sheets_dict = write_excel_model(model, f'{args.outdir}/{model.id}.mkmodel.xlsx', args.nofigs, memote_results_dict, None, df_B, df_P, df_S)
|
|
169
183
|
logger.info(f"'{args.outdir}/{model.id}.mkmodel.xlsx' created!")
|
|
170
184
|
|
|
171
185
|
|