napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
@@ -0,0 +1,597 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
|
5
|
+
import igraph as ig
|
6
|
+
import pandas as pd
|
7
|
+
from napistu import sbml_dfs_core
|
8
|
+
from napistu import utils
|
9
|
+
from napistu.constants import SBML_DFS
|
10
|
+
from napistu.constants import CPR_EDGELIST
|
11
|
+
from napistu.constants import CPR_EDGELIST_REQ_VARS
|
12
|
+
from napistu.constants import IDENTIFIERS
|
13
|
+
from napistu.constants import IDENTIFIER_EDGELIST_REQ_VARS
|
14
|
+
from napistu.constants import SPECIES_IDENTIFIERS_REQUIRED_VARS
|
15
|
+
from napistu.network.constants import CPR_GRAPH_EDGES
|
16
|
+
from napistu.network import paths
|
17
|
+
|
18
|
+
logger = logging.getLogger(__name__)
|
19
|
+
|
20
|
+
|
21
|
+
def features_to_pathway_species(
    feature_identifiers: pd.DataFrame,
    species_identifiers: pd.DataFrame,
    ontologies: set,
    feature_id_var: str,
) -> pd.DataFrame:
    """
    Features to Pathway Species

    Match a table of molecular species to their corresponding species in a pathway representation.

    Parameters:
    feature_identifiers: pd.DataFrame
        pd.Dataframe containing a "feature_id_var" variable used to match entries
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
        generally using sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species
    feature_id_var: str
        Variable in "feature_identifiers" containing identifiers

    Returns:
    pathway_species: pd.DataFrame
        species_identifiers joined to feature_identifiers based on shared identifiers.
        If no identifier matches, a warning is logged and the empty join is returned.

    Raises:
    ValueError
        If "feature_id_var" is not a column of feature_identifiers, if
        species_identifiers fails _check_species_identifiers_table(), or if any
        requested ontology is absent from species_identifiers.
    """

    def _format_labels(labels) -> str:
        # str() + sorted() keeps the message buildable for non-string labels
        # (e.g. integer column names, NaN ontologies) and makes set output stable
        return ", ".join(sorted(map(str, labels)))

    # map features to molecular features in the pathway
    if feature_id_var not in feature_identifiers.columns.to_list():
        raise ValueError(
            f"{feature_id_var} must be a variable in 'feature_identifiers', "
            f"possible variables are {', '.join(map(str, feature_identifiers.columns.tolist()))}"
        )

    # check identifiers table
    _check_species_identifiers_table(species_identifiers)

    available_ontologies = set(species_identifiers[IDENTIFIERS.ONTOLOGY].tolist())
    unavailable_ontologies = ontologies.difference(available_ontologies)

    # no ontologies present
    if len(unavailable_ontologies) == len(ontologies):
        raise ValueError(
            f"None of the requested ontologies ({_format_labels(ontologies)}) "
            "were used to annotate pathway species. Available ontologies are: "
            f"{_format_labels(available_ontologies)}"
        )

    # 1+ desired ontologies are not present
    if len(unavailable_ontologies) > 0:
        raise ValueError(
            f"Some of the requested ontologies ({_format_labels(unavailable_ontologies)}) "
            "were NOT used to annotate pathway species. Available ontologies are: "
            f"{_format_labels(available_ontologies)}"
        )

    relevant_identifiers = species_identifiers[
        species_identifiers[IDENTIFIERS.ONTOLOGY].isin(ontologies)
    ]

    # map features to pathway species (merge defaults to an inner join, so
    # features without a matching identifier are dropped)
    pathway_species = feature_identifiers.merge(
        relevant_identifiers, left_on=feature_id_var, right_on=IDENTIFIERS.IDENTIFIER
    )

    if pathway_species.shape[0] == 0:
        # The empty table is returned (not None) so the annotated return type
        # holds and callers that index the result, such as
        # edgelist_to_pathway_species(), keep working.
        logger.warning(
            "None of the provided species identifiers matched entries of the pathway; "
            "returning an empty table"
        )

    return pathway_species
|
95
|
+
|
96
|
+
|
97
|
+
def edgelist_to_pathway_species(
    formatted_edgelist: pd.DataFrame, species_identifiers: pd.DataFrame, ontologies: set
) -> pd.DataFrame:
    """
    Edgelist to Pathway Species

    Map both ends of each edge in an edgelist onto species of a pathway representation.

    Parameters:
    formatted_edgelist: pd.DataFrame
        pd.Dataframe containing "identifier_upstream" and "identifier_downstream"
        variables used to match entries
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers produced from
        sbml_dfs.get_identifiers("species") generally using
        sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species

    Returns:
    edges_on_pathway: pd.DataFrame
        formatted_edgelist with upstream features mapped to "s_id_upstream"
        and downstream species mapped to "s_id_downstream"

    Raises:
    ValueError
        If either identifier variable is missing from formatted_edgelist
    """

    identifier_vars = {
        CPR_EDGELIST.IDENTIFIER_UPSTREAM,
        CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
    }
    absent_vars = identifier_vars.difference(set(formatted_edgelist.columns.tolist()))

    if len(absent_vars) > 0:
        raise ValueError(
            f"{len(absent_vars)} required variables were "
            "missing from 'formatted_edgelist': "
            f"{', '.join(absent_vars)}"
        )

    # stack upstream and downstream identifiers into a single de-duplicated column;
    # the two series have different names, so the stacked series is unnamed and
    # to_frame() labels its column 0
    stacked_identifiers = pd.concat(
        [
            formatted_edgelist[CPR_EDGELIST.IDENTIFIER_UPSTREAM],
            formatted_edgelist[CPR_EDGELIST.IDENTIFIER_DOWNSTREAM],
        ]
    )
    distinct_identifiers = (
        stacked_identifiers.drop_duplicates()
        .reset_index(drop=True)
        .to_frame()
        .rename(columns={0: "feature_id"})
    )

    # look up the s_id of every distinct identifier once
    features_on_pathway = features_to_pathway_species(
        feature_identifiers=distinct_identifiers,
        species_identifiers=species_identifiers,
        ontologies=ontologies,
        feature_id_var="feature_id",
    )
    s_id_lookup = features_on_pathway[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]]

    upstream_lookup = s_id_lookup.rename(
        columns={
            SBML_DFS.S_ID: CPR_EDGELIST.S_ID_UPSTREAM,
            IDENTIFIERS.IDENTIFIER: CPR_EDGELIST.IDENTIFIER_UPSTREAM,
        }
    )
    downstream_lookup = s_id_lookup.rename(
        columns={
            SBML_DFS.S_ID: CPR_EDGELIST.S_ID_DOWNSTREAM,
            IDENTIFIERS.IDENTIFIER: CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
        }
    )

    # attach s_ids for the upstream end, then for the downstream end, of each edge
    edges_on_pathway = formatted_edgelist.merge(upstream_lookup).merge(
        downstream_lookup
    )

    return edges_on_pathway
|
180
|
+
|
181
|
+
|
182
|
+
def edgelist_to_scids(
    formatted_edgelist: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_identifiers: pd.DataFrame,
    ontologies: set,
):
    """
    Edgelist to Compartmentalized Species IDs

    Map an edgelist of possible mechanistic interactions onto the
    compartmentalized species of a pathway model.

    Parameters:
    formatted_edgelist: pd.DataFrame
        pd.Dataframe containing "identifier_upstream" and
        "identifier_downstream" variables used to match entries
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers produced from
        sbml_dfs.get_identifiers("species") generally using sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species

    Returns:
    edgelist_w_scids: pd.DataFrame
        formatted_edgelist with upstream features mapped to "sc_id_upstream" and
        downstream species mapped to "sc_id_downstream"
    """

    _check_species_identifiers_table(species_identifiers)

    # first resolve identifiers to species ids (s_ids)
    edges_on_pathway = edgelist_to_pathway_species(
        formatted_edgelist=formatted_edgelist,
        species_identifiers=species_identifiers,
        ontologies=ontologies,
    )

    s_id_pairs = edges_on_pathway[
        [CPR_EDGELIST.S_ID_UPSTREAM, CPR_EDGELIST.S_ID_DOWNSTREAM]
    ].drop_duplicates()

    # s_id -> sc_id lookup; reset_index() turns the table's index into a column
    # (presumably the sc_id, since it is renamed below - confirm against SBML_dfs)
    cspecies_lookup = sbml_dfs.compartmentalized_species[
        [SBML_DFS.S_ID]
    ].reset_index()
    upstream_cspecies = cspecies_lookup.rename(
        columns={
            SBML_DFS.S_ID: CPR_EDGELIST.S_ID_UPSTREAM,
            SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM,
        }
    )
    downstream_cspecies = cspecies_lookup.rename(
        columns={
            SBML_DFS.S_ID: CPR_EDGELIST.S_ID_DOWNSTREAM,
            SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM,
        }
    )

    # expand every s_id pair to all pairs of compartmentalized species
    sc_id_pairs = s_id_pairs.merge(upstream_cspecies).merge(downstream_cspecies)

    # join the (s_id_upstream, s_id_downstream) -> sc_ids lookup back onto the edges
    edgelist_w_scids = edges_on_pathway.merge(sc_id_pairs)

    n_mapped = edgelist_w_scids.shape[0]
    logger_msg = (
        f"{n_mapped} interactions mapped "
        "onto pairs of compartmentalized species in the mechanistic model"
    )
    # an empty mapping is worth a warning; otherwise this is informational
    log = logger.warning if n_mapped == 0 else logger.info
    log(logger_msg)

    return edgelist_w_scids
|
262
|
+
|
263
|
+
|
264
|
+
def _pair_reaction_species(
    reaction_species: pd.DataFrame,
    is_upstream: pd.Series,
    is_downstream: pd.Series,
) -> pd.DataFrame:
    """Pair the cspecies selected by two boolean masks when they share a reaction."""

    id_vars = [SBML_DFS.R_ID, SBML_DFS.SC_ID]
    upstream = reaction_species.loc[is_upstream, id_vars].rename(
        columns={SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM}
    )
    downstream = reaction_species.loc[is_downstream, id_vars].rename(
        columns={SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM}
    )
    # merge() joins on the columns the two tables share; after the renames
    # that is expected to be the reaction id only
    return upstream.merge(downstream)


def filter_to_direct_mechanistic_interactions(
    formatted_edgelist: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_identifiers: pd.DataFrame,
    ontologies: set,
) -> pd.DataFrame:
    """
    Filter to Direct Mechanistic Interactions

    Filter an edgelist to direct mechanistic interactions, i.e. pairs of
    compartmentalized species which participate in the same reaction as
    modifier -> substrate, substrate -> product, or modifier -> product.

    Parameters:
    formatted_edgelist: pd.DataFrame
        pd.Dataframe containing "identifier_upstream" and "identifier_downstream"
        variables used to match entries
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers
        produced from sbml_dfs.get_identifiers("species") generally
        using sbml_dfs_core.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species

    Returns:
    edgelist_w_direct_mechanistic_interactions: pd.DataFrame
        formatted_edgelist filtered to mechanistic reactions present in the pathway
        representation, annotated with the reaction id and name and the names
        of the upstream and downstream species
    """

    edgelist_w_scids = _edgelist_to_scids_if_needed(
        formatted_edgelist, sbml_dfs, species_identifiers, ontologies
    )

    # reduce to distinct sc_id pairs; list() because pandas >= 2.0 rejects a
    # set as a column indexer (CPR_EDGELIST_REQ_VARS may be a set - it is
    # defined outside this module)
    sc_id_pairs = edgelist_w_scids[list(CPR_EDGELIST_REQ_VARS)].drop_duplicates()

    # classify reaction species by the sign of their stoichiometry
    reaction_species = sbml_dfs.reaction_species
    stoichiometry = reaction_species[SBML_DFS.STOICHIOMETRY]
    is_modifier = stoichiometry == 0
    is_substrate = stoichiometry < 0
    is_product = stoichiometry > 0

    # define all existing direct regulatory interactions
    pathway_interactions = pd.concat(
        [
            # pair 0 -> <0 # modifiers affect substrates
            _pair_reaction_species(reaction_species, is_modifier, is_substrate),
            # pair <0 -> >0 # substrates affect products
            _pair_reaction_species(reaction_species, is_substrate, is_product),
            # pair 0 -> >0 # modifiers affect products
            _pair_reaction_species(reaction_species, is_modifier, is_product),
        ]
    ).reset_index(drop=True)

    species_names = sbml_dfs.species[SBML_DFS.S_NAME].to_frame()

    direct_edge_interactions = (
        # filter pathway interactions based on matches to sc_id_pairs
        sc_id_pairs.merge(pathway_interactions)
        # add species metadata for matches
        .merge(
            species_names.rename(
                columns={SBML_DFS.S_NAME: CPR_EDGELIST.S_NAME_UPSTREAM}
            ),
            left_on=CPR_EDGELIST.S_ID_UPSTREAM,
            right_index=True,
        )
        .merge(
            species_names.rename(
                columns={SBML_DFS.S_NAME: CPR_EDGELIST.S_NAME_DOWNSTREAM}
            ),
            left_on=CPR_EDGELIST.S_ID_DOWNSTREAM,
            right_index=True,
        )
        # add metadata for reactions where interaction occurs
        .merge(
            sbml_dfs.reactions[SBML_DFS.R_NAME].to_frame(),
            left_on=SBML_DFS.R_ID,
            right_index=True,
        )
    )

    edgelist_w_direct_mechanistic_interactions = edgelist_w_scids.merge(
        direct_edge_interactions[
            [
                CPR_EDGELIST.SC_ID_UPSTREAM,
                CPR_EDGELIST.SC_ID_DOWNSTREAM,
                SBML_DFS.R_ID,
                CPR_EDGELIST.S_NAME_UPSTREAM,
                CPR_EDGELIST.S_NAME_DOWNSTREAM,
                SBML_DFS.R_NAME,
            ]
        ]
    )

    return edgelist_w_direct_mechanistic_interactions
|
381
|
+
|
382
|
+
|
383
|
+
def filter_to_indirect_mechanistic_interactions(
    formatted_edgelist: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_identifiers: pd.DataFrame,
    cpr_graph: ig.Graph,
    ontologies: set,
    precomputed_distances=None,
    max_path_length=10,
):
    """
    Filter to Indirect Mechanistic Interactions

    Keep the edges of an edgelist whose upstream species can reach its
    downstream species through a path in the network.

    Parameters:
    formatted_edgelist: pd.DataFrame
        pd.Dataframe containing "identifier_upstream" and
        "identifier_downstream" variables used to match entries
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A mechanistic model
    species_identifiers: pandas.DataFrame
        A table of molecular species identifiers produced from
        sbml_dfs.get_identifiers("species") generally using sbml_dfs_core.export_sbml_dfs()
    cpr_graph: igraph.Graph
        A network representation of the sbml_dfs model
    ontologies: set
        A set of ontologies used to match features to pathway species
    precomputed_distances: None or a pd.DataFrame containing path lengths and weights
        between pairs of cspecies. When provided, the edgelist is pre-filtered
        with paths._filter_paths_by_precomputed_distances() before paths are searched.
    max_path_length: int
        Maximum number of steps to consider; longer paths are discarded.

    Returns:
    edgelist_w_indirect_mechanistic_interactions: pd.DataFrame or None
        formatted_edgelist filtered to mechanistic reactions which can be described
        by an indirect mechanism. The mechanism is described by a path weight, length,
        and a vpath and epath list of vertices and edges which were traversed to create
        the path. None is returned (with a warning) if no pair could be linked.

    Raises:
    ValueError
        If the destination of a path cannot be identified from its last edges
    """

    edgelist_w_scids = _edgelist_to_scids_if_needed(
        formatted_edgelist, sbml_dfs, species_identifiers, ontologies
    )

    if precomputed_distances is not None:
        # switch to the origin/dest naming used by precomputed_distances,
        # filter, then switch back to upstream/downstream
        with_distance_names = edgelist_w_scids.rename(
            columns={
                CPR_EDGELIST.SC_ID_UPSTREAM: CPR_EDGELIST.SC_ID_ORIGIN,
                CPR_EDGELIST.SC_ID_DOWNSTREAM: CPR_EDGELIST.SC_ID_DEST,
            }
        )
        reachable_pairs = paths._filter_paths_by_precomputed_distances(
            with_distance_names,
            precomputed_distances,
        )
        edgelist_w_scids = reachable_pairs.rename(
            columns={
                CPR_EDGELIST.SC_ID_ORIGIN: CPR_EDGELIST.SC_ID_UPSTREAM,
                CPR_EDGELIST.SC_ID_DEST: CPR_EDGELIST.SC_ID_DOWNSTREAM,
            }
        )

    # paths are searched from one upstream sc_id to all of its downstream
    # sc_ids at once (this is the convention with igraph)
    edges_by_origin = edgelist_w_scids.set_index(CPR_EDGELIST.SC_ID_UPSTREAM)

    paths_by_origin = {}
    for origin_sc_id in edges_by_origin.index.unique():
        # selecting a single row yields a pd.Series; convert back to a
        # DataFrame for consistency
        origin_targets = utils.ensure_pd_df(edges_by_origin.loc[origin_sc_id])

        # log entry for debugging
        logger.debug(
            f"finding paths from {origin_sc_id} to "
            f"{origin_targets.shape[0]} target vertices"
        )

        # unique destinations, as a list for compatibility with igraph dest
        destinations = origin_targets[CPR_EDGELIST.SC_ID_DOWNSTREAM].unique().tolist()
        shortest_paths = paths.find_shortest_reaction_paths(
            cpr_graph,
            sbml_dfs,
            origin=origin_sc_id,
            dest=destinations,
            weight_var=CPR_GRAPH_EDGES.WEIGHTS,
        )

        if shortest_paths is None:
            continue

        vertices, edges = shortest_paths
        indexed_edges = edges.set_index("path")
        indexed_vertices = vertices.set_index("path")

        origin_path_summaries = []
        for path_id in indexed_edges.index.unique():
            one_path = indexed_edges.loc[path_id]

            # a single-edge path comes back as a Series; make it a one-row DF
            if type(one_path) is pd.Series:
                one_path = one_path.to_frame().T

            n_steps = one_path.shape[0]
            if n_steps > max_path_length:
                continue

            # Identify the destination node. For an undirected graph it is not
            # clear whether the "from" or "to" end of the last edge is the
            # destination, and that information is lost when several
            # destinations are looked up at once. So take whichever end of the
            # terminal edge is neither the origin nor on the penultimate edge.
            ancestor_species = {origin_sc_id}
            if n_steps > 1:
                penultimate_edge = one_path.iloc[n_steps - 2]
                ancestor_species |= {
                    penultimate_edge[CPR_GRAPH_EDGES.FROM],
                    penultimate_edge[CPR_GRAPH_EDGES.TO],
                }

            terminal_edge = one_path.iloc[n_steps - 1]
            candidate_endpoints = {
                terminal_edge[CPR_GRAPH_EDGES.FROM],
                terminal_edge[CPR_GRAPH_EDGES.TO],
            } - ancestor_species

            if len(candidate_endpoints) != 1:
                raise ValueError(
                    "The terminal edge could not be determined when summarizing paths"
                )
            (ending_cspecies,) = candidate_endpoints

            path_summary = {
                CPR_GRAPH_EDGES.FROM: origin_sc_id,
                CPR_GRAPH_EDGES.TO: ending_cspecies,
                "weight": sum(one_path[CPR_GRAPH_EDGES.WEIGHTS]),
                "path_length": n_steps,
                "vpath": indexed_vertices.loc[path_id],
                "epath": one_path,
            }
            origin_path_summaries.append(pd.Series(path_summary))

        if origin_path_summaries:
            paths_by_origin[origin_sc_id] = pd.DataFrame(origin_path_summaries)

    if not paths_by_origin:
        logger.warning(
            "None of the provide molecular pairs could be mechanistically linked with a network path"
        )
        return None

    all_shortest_paths = pd.concat(paths_by_origin.values())

    # keep only the edges for which a path was found
    indirect_shortest_paths = edgelist_w_scids.merge(
        all_shortest_paths,
        left_on=[CPR_EDGELIST.SC_ID_UPSTREAM, CPR_EDGELIST.SC_ID_DOWNSTREAM],
        right_on=[CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO],
    )

    return indirect_shortest_paths
|
553
|
+
|
554
|
+
|
555
|
+
def _edgelist_to_scids_if_needed(
    edgelist: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_identifiers: pd.DataFrame,
    ontologies: set,
) -> pd.DataFrame:
    """
    Return an edgelist carrying compartmentalized species ids.

    If the edgelist already has every variable in CPR_EDGELIST_REQ_VARS it is
    returned untouched; otherwise its identifiers are matched to cspecies
    with edgelist_to_scids().
    """

    if utils.match_pd_vars(edgelist, CPR_EDGELIST_REQ_VARS).are_present:
        provided_vars = ", ".join(CPR_EDGELIST_REQ_VARS)
        logger.info(
            f"An edgelist with {provided_vars} was provided; identifier matching will be skipped"
        )
        return edgelist

    # identifier matching is needed: validate both inputs before doing the work
    utils.match_pd_vars(edgelist, IDENTIFIER_EDGELIST_REQ_VARS).assert_present()
    _check_species_identifiers_table(species_identifiers)

    return edgelist_to_scids(
        edgelist,
        sbml_dfs=sbml_dfs,
        species_identifiers=species_identifiers,
        ontologies=ontologies,
    )
|
581
|
+
|
582
|
+
|
583
|
+
def _check_species_identifiers_table(
    species_identifiers: pd.DataFrame,
    required_vars: set = SPECIES_IDENTIFIERS_REQUIRED_VARS,
):
    """
    Verify that a species identifiers table has all required variables.

    Parameters:
    species_identifiers: pd.DataFrame
        The table whose columns are checked
    required_vars: set
        Column names which must be present

    Returns:
    None

    Raises:
    ValueError
        If one or more required variables are not columns of species_identifiers
    """

    present_vars = set(species_identifiers.columns.tolist())
    missing_required_vars = required_vars.difference(present_vars)

    if not missing_required_vars:
        return None

    raise ValueError(
        f"{len(missing_required_vars)} required variables "
        "were missing from the species_identifiers table: "
        f"{', '.join(missing_required_vars)}"
    )
|
@@ -0,0 +1,86 @@
|
|
1
|
+
"""Module to contain constants for the modify submodule"""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
|
7
|
+
# Names of the annotation types recognised by the modify submodule.
# NOTE(review): presumably validated against by the curation code - confirm against callers.
VALID_ANNOTATION_TYPES = [
    "foci",
    "reactions",
    "species",
    "compartments",
    "compartmentalized_species",
    "reaction_species",
    "remove",
]
|
16
|
+
|
17
|
+
# Each entry maps a filter name to the rule that triggers it:
# if_all defines reaction species which must all be present for a filter to occur
# except_any defines reaction species which will override "if_all"
# as_substrate defines reaction species which must be present as a substrate for filtering to occur
# Species are referred to by the cofactor labels used in COFACTOR_CHEBI_IDS.
COFACTOR_SCHEMA = {
    "ATP PO4 donation": {"if_all": ["ATP", "ADP"], "except_any": ["AMP"]},
    "GTP PO4 donation": {"if_all": ["GTP", "GDP"]},
    "ATP PPi donation": {"if_all": ["ATP", "AMP"], "except_any": ["ADP"]},
    "NADH H- donation": {"if_all": ["NADH", "NAD+"], "as_substrate": ["NADH"]},
    "NADPH H- donation": {"if_all": ["NADPH", "NADP+"], "as_substrate": ["NADPH"]},
    "SAH methyltransferase": {"if_all": ["SAH", "SAM"]},
    "Glutathione oxidation": {"if_all": ["GSSG", "GSH"], "except_any": ["NADPH"]},
    # "Glutamine aminotransferase" :
    # {"if_all" : ["Gln", "Glu"],
    # "except_any" : ["ATP"]},
    "Water": {"if_all": ["water"]},
    "PO4": {"if_all": ["PO4"]},
    "PPi": {"if_all": ["PPi"]},
    "H+": {"if_all": ["H+"]},
    "O2": {"if_all": ["O2"]},
    "CO2": {"if_all": ["CO2"]},
    "Na+": {"if_all": ["Na+"]},
    "Cl-": {"if_all": ["Cl-"]},
    "CoA": {"if_all": ["CoA"]},
    "HCO3-": {"if_all": ["HCO3"]},
}
|
42
|
+
|
43
|
+
# Long-format table of (cofactor label, ChEBI id) pairs. A cofactor with
# several ChEBI ids contributes one row per id, in the order listed here.
COFACTOR_CHEBI_IDS = pd.DataFrame(
    [
        (cofactor, chebi_id)
        for cofactor, chebi_ids in {
            "ADP": (
                456216,  # ADP(3−)
                16761,
            ),
            "AMP": (16027,),
            "ATP": (
                30616,  # ATP(4-)
                15422,
            ),
            "CO2": (16526,),
            "HCO3": (17544,),
            "H2CO3": (28976,),
            "GDP": (17552,),
            "GSH": (16856,),
            "GSSG": (17858,),
            "GTP": (15996,),
            "Glu": (29985,),
            "Gln": (58359,),
            "H+": (15378, 24636),
            "O2": (15379,),
            "NADH": (
                57945,  # NADH(2−)
                16908,  # NADH
            ),
            "NAD+": (
                57540,  # NAD(1-)
                15846,  # NAD(+)
            ),
            "NADPH": (16474,),
            "NADP+": (
                18009,
                58349,  # NADP(3−)
            ),
            "PO4": (18367,),
            "PPi": (
                29888,  # H2PO4
                18361,  # PPi4-
            ),
            "SAH": (16680,),
            "SAM": (15414,),
            "water": (
                15377,
                16234,  # HO-
            ),
            "Na+": (29101,),
            "Cl-": (29311,),
            "CoA": (1146900, 57287),
            "acetyl-CoA": (15351,),
            "FAD": (16238,),
            "FADH2": (17877,),
            "UDP": (17659,),
        }.items()
        for chebi_id in chebi_ids
    ],
    columns=["cofactor", "chebi"],
)
|