napistu-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/network/net_create.py
@@ -0,0 +1,1647 @@
from __future__ import annotations

import copy
import logging
import random
from typing import Optional

import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pydantic import BaseModel

from napistu import sbml_dfs_core
from napistu import utils

from napistu.constants import DEFAULT_WT_TRANS
from napistu.constants import DEFINED_WEIGHT_TRANSFORMATION
from napistu.constants import MINI_SBO_FROM_NAME
from napistu.constants import MINI_SBO_TO_NAME
from napistu.constants import SBML_DFS
from napistu.constants import SBO_MODIFIER_NAMES
from napistu.constants import SCORE_CALIBRATION_POINTS_DICT
from napistu.constants import ENTITIES_W_DATA
from napistu.constants import SOURCE_VARS_DICT

from napistu.network.constants import CPR_GRAPH_NODES
from napistu.network.constants import CPR_GRAPH_EDGES
from napistu.network.constants import CPR_GRAPH_EDGE_DIRECTIONS
from napistu.network.constants import CPR_GRAPH_REQUIRED_EDGE_VARS
from napistu.network.constants import CPR_GRAPH_NODE_TYPES
from napistu.network.constants import CPR_GRAPH_TYPES
from napistu.network.constants import CPR_WEIGHTING_STRATEGIES
from napistu.network.constants import SBOTERM_NAMES
from napistu.network.constants import REGULATORY_GRAPH_HIERARCHY
from napistu.network.constants import SURROGATE_GRAPH_HIERARCHY
from napistu.network.constants import VALID_CPR_GRAPH_TYPES
from napistu.network.constants import VALID_WEIGHTING_STRATEGIES

logger = logging.getLogger(__name__)


def create_cpr_graph(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reaction_graph_attrs: dict = dict(),
    directed: bool = True,
    edge_reversed: bool = False,
    graph_type: str = CPR_GRAPH_TYPES.BIPARTITE,
    verbose: bool = False,
) -> ig.Graph:
    """
    Create CPR Graph

    Create an igraph network from a mechanistic network using one of a set of graph_types.

    Parameters
    ----------
    sbml_dfs : SBML_dfs
        A model formed by aggregating pathways
    reaction_graph_attrs : dict
        Dictionary containing attributes to pull out of reaction_data and
        a weighting scheme for the graph
    directed : bool
        Should a directed (True) or undirected (False) graph be made
    edge_reversed : bool
        Should the directions of edges be reversed (False by default)
    graph_type : str
        Type of graph to create; valid values are:
        - bipartite: substrates and modifiers point to the reaction they drive, this reaction points to products
        - regulatory: non-enzymatic modifiers point to enzymes, enzymes point to substrates and products
        - surrogate: non-enzymatic modifiers -> substrates -> enzymes -> reaction -> products.
          In this representation enzymes effectively stand in for their reaction (even though the enzyme is
          not modified by a substrate per se).
    verbose : bool
        Extra reporting

    Returns
    -------
    An igraph network
    """

    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
        raise TypeError(
            f"sbml_dfs must be a sbml_dfs_core.SBML_dfs, but was {type(sbml_dfs)}"
        )

    if not isinstance(reaction_graph_attrs, dict):
        raise TypeError(
            f"reaction_graph_attrs must be a dict, but was {type(reaction_graph_attrs)}"
        )

    if not isinstance(directed, bool):
        raise TypeError(f"directed must be a bool, but was {type(directed)}")

    if not isinstance(edge_reversed, bool):
        raise TypeError(f"edge_reversed must be a bool, but was {type(edge_reversed)}")

    if not isinstance(graph_type, str):
        raise TypeError(f"graph_type must be a str, but was {type(graph_type)}")

    if graph_type not in VALID_CPR_GRAPH_TYPES:
        raise ValueError(
            f"graph_type is not a valid value ({graph_type}), valid values are {','.join(VALID_CPR_GRAPH_TYPES)}"
        )

    if not isinstance(verbose, bool):
        raise TypeError(f"verbose must be a bool, but was {type(verbose)}")

    # fail fast if reaction_graph_attrs is not properly formatted
    for k in reaction_graph_attrs.keys():
        _validate_entity_attrs(reaction_graph_attrs[k])

    working_sbml_dfs = copy.deepcopy(sbml_dfs)
    reaction_species_counts = working_sbml_dfs.reaction_species.value_counts(
        SBML_DFS.R_ID
    )
    valid_reactions = reaction_species_counts[reaction_species_counts > 1].index
    # due to autoregulation reactions and removal of cofactors, some
    # reactions may have one (or even zero) species; drop these.

    n_dropped_reactions = working_sbml_dfs.reactions.shape[0] - len(valid_reactions)
    if n_dropped_reactions != 0:
        logger.info(
            f"Dropping {n_dropped_reactions} reactions with <= 1 reaction species; "
            "these underspecified reactions may be due to either unrepresented "
            "autoregulation and/or removal of cofactors."
        )

    working_sbml_dfs.reactions = working_sbml_dfs.reactions[
        working_sbml_dfs.reactions.index.isin(valid_reactions)
    ]
    working_sbml_dfs.reaction_species = working_sbml_dfs.reaction_species[
        working_sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(valid_reactions)
    ]

    logger.info(
        "Organizing all network nodes (compartmentalized species and reactions)"
    )

    network_nodes = list()
    network_nodes.append(
        working_sbml_dfs.compartmentalized_species.reset_index()[
            [SBML_DFS.SC_ID, SBML_DFS.SC_NAME]
        ]
        .rename(columns={SBML_DFS.SC_ID: "node_id", SBML_DFS.SC_NAME: "node_name"})
        .assign(node_type=CPR_GRAPH_NODE_TYPES.SPECIES)
    )
    network_nodes.append(
        working_sbml_dfs.reactions.reset_index()[[SBML_DFS.R_ID, SBML_DFS.R_NAME]]
        .rename(columns={SBML_DFS.R_ID: "node_id", SBML_DFS.R_NAME: "node_name"})
        .assign(node_type=CPR_GRAPH_NODE_TYPES.REACTION)
    )

    # rename nodes to "name" since it is treated specially
    network_nodes_df = pd.concat(network_nodes).rename(
        columns={"node_id": CPR_GRAPH_NODES.NAME}
    )

    logger.info(f"Formatting edges as a {graph_type} graph")

    if graph_type == CPR_GRAPH_TYPES.BIPARTITE:
        network_edges = _create_cpr_graph_bipartite(working_sbml_dfs)
    elif graph_type in [CPR_GRAPH_TYPES.REGULATORY, CPR_GRAPH_TYPES.SURROGATE]:
        # pass graph_type so that an appropriate tiered schema can be used.
        network_edges = _create_cpr_graph_tiered(working_sbml_dfs, graph_type)
    else:
        raise NotImplementedError("Invalid graph_type")

    logger.info("Adding reversibility and other meta-data from reactions_data")
    augmented_network_edges = _augment_network_edges(
        network_edges, working_sbml_dfs, reaction_graph_attrs
    )

    logger.info(
        "Creating reverse reactions for reversible reactions on a directed graph"
    )
    if directed:
        directed_network_edges = pd.concat(
            [
                # assign forward edges
                augmented_network_edges.assign(
                    direction=CPR_GRAPH_EDGE_DIRECTIONS.FORWARD
                ),
                # create reverse edges for reversible reactions
                _reverse_network_edges(augmented_network_edges),
            ]
        )
    else:
        directed_network_edges = augmented_network_edges.assign(
            direction=CPR_GRAPH_EDGE_DIRECTIONS.UNDIRECTED
        )

    # de-duplicate edges
    unique_edges = (
        directed_network_edges.groupby([CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO])
        .first()
        .reset_index()
    )

    if unique_edges.shape[0] != directed_network_edges.shape[0]:
        logger.warning(
            f"{directed_network_edges.shape[0] - unique_edges.shape[0]} edges were dropped "
            "due to duplicated origin -> target relationships; use verbose for "
            "more information"
        )

        if verbose:
            # report duplicated edges
            grouped_edges = directed_network_edges.groupby(
                [CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO]
            )
            duplicated_edges = [
                grouped_edges.get_group(x)
                for x in grouped_edges.groups
                if grouped_edges.get_group(x).shape[0] > 1
            ]
            example_duplicates = pd.concat(
                random.sample(duplicated_edges, min(5, len(duplicated_edges)))
            )

            logger.warning(utils.style_df(example_duplicates, headers="keys"))

    # reverse edge directions if edge_reversed is True:

    if edge_reversed:
        rev_unique_edges_df = unique_edges.copy()
        rev_unique_edges_df[CPR_GRAPH_EDGES.FROM] = unique_edges[CPR_GRAPH_EDGES.TO]
        rev_unique_edges_df[CPR_GRAPH_EDGES.TO] = unique_edges[CPR_GRAPH_EDGES.FROM]
        rev_unique_edges_df[CPR_GRAPH_EDGES.SC_PARENTS] = unique_edges[
            CPR_GRAPH_EDGES.SC_CHILDREN
        ]
        rev_unique_edges_df[CPR_GRAPH_EDGES.SC_CHILDREN] = unique_edges[
            CPR_GRAPH_EDGES.SC_PARENTS
        ]
        rev_unique_edges_df[CPR_GRAPH_EDGES.STOICHOMETRY] = unique_edges[
            CPR_GRAPH_EDGES.STOICHOMETRY
        ] * (-1)

        rev_unique_edges_df[CPR_GRAPH_EDGES.DIRECTION] = unique_edges[
            CPR_GRAPH_EDGES.DIRECTION
        ].replace(
            {
                CPR_GRAPH_EDGE_DIRECTIONS.REVERSE: CPR_GRAPH_EDGE_DIRECTIONS.FORWARD,
                CPR_GRAPH_EDGE_DIRECTIONS.FORWARD: CPR_GRAPH_EDGE_DIRECTIONS.REVERSE,
            }
        )
    else:
        # unchanged if edge_reversed is False:
        rev_unique_edges_df = unique_edges

    # convert nodes and edgelist into an igraph network

    logger.info("Formatting cpr_graph output")
    cpr_graph = ig.Graph.DictList(
        vertices=network_nodes_df.to_dict("records"),
        edges=rev_unique_edges_df.to_dict("records"),
        directed=directed,
        vertex_name_attr=CPR_GRAPH_NODES.NAME,
        edge_foreign_keys=(CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO),
    )

    return cpr_graph
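
# --- Illustrative usage sketch (not part of the napistu 0.1.0 source) ---
# A minimal call pattern for create_cpr_graph(), assuming `sbml_dfs` is an existing
# sbml_dfs_core.SBML_dfs object built elsewhere (e.g., via the ingestion modules listed
# above). The empty reaction_graph_attrs dict is an assumption for illustration.
#
#   from napistu.network import net_create
#   from napistu.network.constants import CPR_GRAPH_TYPES
#
#   graph = net_create.create_cpr_graph(
#       sbml_dfs,                                # an existing SBML_dfs model
#       reaction_graph_attrs={},                 # no reaction_data attributes pulled in
#       directed=True,
#       graph_type=CPR_GRAPH_TYPES.BIPARTITE,
#   )
#   print(graph.summary())                       # igraph summary of nodes and edges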


def process_cpr_graph(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reaction_graph_attrs: dict = dict(),
    directed: bool = True,
    edge_reversed: bool = False,
    graph_type: str = CPR_GRAPH_TYPES.BIPARTITE,
    weighting_strategy: str = CPR_WEIGHTING_STRATEGIES.UNWEIGHTED,
    verbose: bool = False,
) -> ig.Graph:
    """
    Process Consensus Graph

    Set up an igraph network and then add weights and other malleable attributes.

    Args:
        sbml_dfs (SBML_dfs): A model formed by aggregating pathways
        reaction_graph_attrs (dict): Dictionary containing attributes to pull out of reaction_data and
            a weighting scheme for the graph
        directed (bool): Should a directed (True) or undirected (False) graph be made
        edge_reversed (bool): Should directions of edges be reversed (False by default)
        graph_type (str): Type of graph to create; valid values are:
            - bipartite: substrates and modifiers point to the reaction they drive, this reaction points to products
            - regulatory: non-enzymatic modifiers point to enzymes, enzymes point to substrates and products
        weighting_strategy (str): a network weighting strategy with options:
            - unweighted: all weights (and upstream_weights for directed graphs) are set to 1.
            - topology: weight edges by the degree of the source nodes favoring nodes emerging from nodes
              with few connections.
            - mixed: transform edges with a quantitative score based on reaction_attrs; and set edges
              without a quantitative score to a source-specific weight.
            - calibrated: transform edges with a quantitative score based on reaction_attrs and combine them
              with topology scores to generate a consensus.
        verbose (bool): Extra reporting

    Returns:
        weighted_graph (ig.Graph): An igraph network
    """

    logging.info("Constructing network")
    cpr_graph = create_cpr_graph(
        sbml_dfs,
        reaction_graph_attrs,
        directed=directed,
        edge_reversed=edge_reversed,
        graph_type=graph_type,
        verbose=verbose,
    )

    if "reactions" in reaction_graph_attrs.keys():
        reaction_attrs = reaction_graph_attrs["reactions"]
    else:
        reaction_attrs = dict()

    logging.info(f"Adding edge weights with an {weighting_strategy} strategy")

    weighted_cpr_graph = add_graph_weights(
        cpr_graph=cpr_graph,
        reaction_attrs=reaction_attrs,
        weighting_strategy=weighting_strategy,
    )

    return weighted_cpr_graph
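
# --- Illustrative usage sketch (not part of the napistu 0.1.0 source) ---
# process_cpr_graph() wraps create_cpr_graph() and then weights the edges. Based on how
# pluck_entity_data() and apply_weight_transformations() read this dict, each entry under
# "reactions" maps a new edge attribute name to a {"table", "variable", "trans"}
# specification. The "string"/"combined_score" names and the "string" trans alias below
# are hypothetical placeholders, not values shipped with the package.
#
#   reaction_graph_attrs = {
#       "reactions": {
#           "string_wt": {
#               "table": "string",             # a reactions_data table (hypothetical)
#               "variable": "combined_score",  # a column in that table (hypothetical)
#               "trans": "string",             # alias into DEFINED_WEIGHT_TRANSFORMATION
#           }
#       }
#   }
#
#   weighted_graph = net_create.process_cpr_graph(
#       sbml_dfs,
#       reaction_graph_attrs,
#       graph_type=CPR_GRAPH_TYPES.REGULATORY,
#       weighting_strategy=CPR_WEIGHTING_STRATEGIES.CALIBRATED,
#   )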


def pluck_entity_data(
    sbml_dfs: sbml_dfs_core.SBML_dfs, graph_attrs: dict[str, dict], data_type: str
) -> pd.DataFrame | None:
    """
    Pluck Entity Attributes

    Pull species or reaction attributes out of an sbml_dfs based on a set of
    tables and variables to look for.

    Parameters:
        sbml_dfs: sbml_dfs_core.SBML_dfs
            A mechanistic model
        graph_attrs: dict
            A dictionary of species/reaction attributes to pull out
        data_type: str
            "species" or "reactions" to pull out species_data or reactions_data

    Returns:
        A table where all extracted attributes are merged based on a common index, or None
        if no attributes were extracted.
    """

    if data_type not in ENTITIES_W_DATA:
        raise ValueError(
            f'"data_type" was {data_type} and must be in {", ".join(ENTITIES_W_DATA)}'
        )

    if data_type not in graph_attrs.keys():
        logger.info(
            f'No {data_type} annotations provided in "graph_attrs"; returning None'
        )
        return None

    entity_attrs = graph_attrs[data_type]
    # validate the dict
    _validate_entity_attrs(entity_attrs)

    data_type_attr = data_type + "_data"
    entity_data_tbls = getattr(sbml_dfs, data_type_attr)

    data_list = list()
    for k, v in entity_attrs.items():
        if v["table"] is not None:
            # does the data table exist?
            if v["table"] not in entity_data_tbls.keys():
                raise ValueError(
                    f"{v['table']} was defined as a table in \"graph_attrs\" but "
                    f'it is not present in the "{data_type_attr}" of the sbml_dfs'
                )

            if v["variable"] not in entity_data_tbls[v["table"]].columns.tolist():
                raise ValueError(
                    f"{v['variable']} was defined as a variable in \"graph_attrs\" but "
                    f"it is not present in the {v['table']} of the \"{data_type_attr}\" of "
                    "the sbml_dfs"
                )

            entity_series = entity_data_tbls[v["table"]][v["variable"]].rename(k)
            data_list.append(entity_series)

    if len(data_list) == 0:
        return None

    return pd.concat(data_list, axis=1)


def apply_weight_transformations(edges_df: pd.DataFrame, reaction_attrs: dict):
    """
    Apply Weight Transformations

    Args:
        edges_df (pd.DataFrame): a table of edges and their attributes extracted
            from a cpr_graph.
        reaction_attrs (dict):
            A dictionary of attributes identifying weighting attributes within
            an sbml_dfs' reaction_data, how they will be named in edges_df (the keys),
            and how they should be transformed (the "trans" aliases)

    Returns:
        transformed_edges_df (pd.DataFrame): edges_df with weight variables transformed.
    """

    _validate_entity_attrs(reaction_attrs)

    transformed_edges_df = copy.deepcopy(edges_df)
    for k, v in reaction_attrs.items():
        if k not in transformed_edges_df.columns:
            raise ValueError(f"A weighting variable {k} was missing from edges_df")

        trans_fxn = DEFINED_WEIGHT_TRANSFORMATION[v["trans"]]

        transformed_edges_df[k] = transformed_edges_df[k].apply(globals()[trans_fxn])

    return transformed_edges_df
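
# --- Illustrative note (not part of the napistu 0.1.0 source) ---
# DEFINED_WEIGHT_TRANSFORMATION maps a "trans" alias to the *name* of one of the
# _wt_transformation_* functions defined later in this module, and globals() resolves
# that name to the callable. A minimal sketch of the same lookup pattern, using a
# hypothetical alias table in place of the real constant:
#
#   DEFINED_WEIGHT_TRANSFORMATION = {"identity": "_wt_transformation_identity"}
#   trans_fxn = DEFINED_WEIGHT_TRANSFORMATION["identity"]
#   edges_df["score"] = edges_df["score"].apply(globals()[trans_fxn])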


def summarize_weight_calibration(cpr_graph: ig.Graph, reaction_attrs: dict) -> None:
    """
    Summarize Weight Calibration

    For a network with multiple sources of edge weights, summarize the alignment of
    different weighting schemes and how they map onto our notion of "good" versus
    "dubious" weights.

    Args:
        cpr_graph (ig.Graph): A graph where edge weights have already been calibrated.
        reaction_attrs (dict): a dictionary summarizing the types of weights that
            exist and how they are transformed for calibration.

    Returns:
        None
    """

    score_calibration_df = pd.DataFrame(SCORE_CALIBRATION_POINTS_DICT)
    score_calibration_df_calibrated = apply_weight_transformations(
        score_calibration_df, reaction_attrs
    )

    calibrated_edges = cpr_graph.get_edge_dataframe()

    _summarize_weight_calibration_table(
        calibrated_edges, score_calibration_df, score_calibration_df_calibrated
    )

    _summarize_weight_calibration_plots(
        calibrated_edges, score_calibration_df_calibrated
    )

    return None


def add_graph_weights(
    cpr_graph: ig.Graph,
    reaction_attrs: dict,
    weighting_strategy: str = CPR_WEIGHTING_STRATEGIES.UNWEIGHTED,
) -> ig.Graph:
    """
    Add Graph Weights

    Apply a weighting strategy to generate edge weights on a graph. For directed graphs "upstream_weights" will
    be generated as well; these should be used when searching for a node's ancestors.

    Args:
        cpr_graph (ig.Graph): a graphical network of molecules/reactions (nodes) and edges linking them.
        reaction_attrs (dict): an optional dict
        weighting_strategy: a network weighting strategy with options:
            - unweighted: all weights (and upstream_weights for directed graphs) are set to 1.
            - topology: weight edges by the degree of the source nodes favoring nodes emerging from nodes
              with few connections.
            - mixed: transform edges with a quantitative score based on reaction_attrs; and set edges
              without a quantitative score to a source-specific weight.
            - calibrated: transform edges with a quantitative score based on reaction_attrs and combine them
              with topology scores to generate a consensus.
    """

    cpr_graph_updated = copy.deepcopy(cpr_graph)

    _validate_entity_attrs(reaction_attrs)

    if weighting_strategy not in VALID_WEIGHTING_STRATEGIES:
        raise ValueError(
            f"weighting_strategy was {weighting_strategy} and must be one of: "
            f"{', '.join(VALID_WEIGHTING_STRATEGIES)}"
        )

    # count parents and children and create weights based on them
    topology_weighted_graph = _create_topology_weights(cpr_graph_updated)

    if weighting_strategy == CPR_WEIGHTING_STRATEGIES.TOPOLOGY:
        topology_weighted_graph.es[CPR_GRAPH_EDGES.WEIGHTS] = (
            topology_weighted_graph.es["topo_weights"]
        )
        if cpr_graph_updated.is_directed():
            topology_weighted_graph.es[CPR_GRAPH_EDGES.UPSTREAM_WEIGHTS] = (
                topology_weighted_graph.es["upstream_topo_weights"]
            )

        return topology_weighted_graph

    if weighting_strategy == CPR_WEIGHTING_STRATEGIES.UNWEIGHTED:
        # set weights as a constant
        topology_weighted_graph.es[CPR_GRAPH_EDGES.WEIGHTS] = 1
        if cpr_graph_updated.is_directed():
            topology_weighted_graph.es[CPR_GRAPH_EDGES.UPSTREAM_WEIGHTS] = 1
        return topology_weighted_graph

    if weighting_strategy == CPR_WEIGHTING_STRATEGIES.MIXED:
        return _add_graph_weights_mixed(topology_weighted_graph, reaction_attrs)

    if weighting_strategy == CPR_WEIGHTING_STRATEGIES.CALIBRATED:
        return _add_graph_weights_calibration(topology_weighted_graph, reaction_attrs)

    raise ValueError(f"No logic implemented for {weighting_strategy}")


def _create_cpr_graph_bipartite(sbml_dfs: sbml_dfs_core.SBML_dfs) -> pd.DataFrame:
    """Turn an sbml_dfs model into a bipartite graph linking molecules to reactions."""

    # setup edges
    network_edges = (
        sbml_dfs.reaction_species.reset_index()[
            [SBML_DFS.R_ID, SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY, SBML_DFS.SBO_TERM]
        ]
        # rename species and reactions to reflect from -> to edges
        .rename(
            columns={
                SBML_DFS.SC_ID: CPR_GRAPH_NODE_TYPES.SPECIES,
                SBML_DFS.R_ID: CPR_GRAPH_NODE_TYPES.REACTION,
            }
        )
    )
    # add back an r_id variable so that each edge is annotated by a reaction
    network_edges[CPR_GRAPH_EDGES.R_ID] = network_edges[CPR_GRAPH_NODE_TYPES.REACTION]

    # add edge weights
    cspecies_features = sbml_dfs.get_cspecies_features()
    network_edges = network_edges.merge(
        cspecies_features, left_on=CPR_GRAPH_NODE_TYPES.SPECIES, right_index=True
    )

    # if directed then flip substrates and modifiers to the origin edge
    edge_vars = network_edges.columns.tolist()

    origins = network_edges[network_edges[SBML_DFS.STOICHIOMETRY] <= 0]
    origin_edges = origins.loc[:, [edge_vars[1], edge_vars[0]] + edge_vars[2:]].rename(
        columns={
            CPR_GRAPH_NODE_TYPES.SPECIES: CPR_GRAPH_EDGES.FROM,
            CPR_GRAPH_NODE_TYPES.REACTION: CPR_GRAPH_EDGES.TO,
        }
    )

    dests = network_edges[network_edges[SBML_DFS.STOICHIOMETRY] > 0]
    dest_edges = dests.rename(
        columns={
            CPR_GRAPH_NODE_TYPES.REACTION: CPR_GRAPH_EDGES.FROM,
            CPR_GRAPH_NODE_TYPES.SPECIES: CPR_GRAPH_EDGES.TO,
        }
    )

    network_edges = pd.concat([origin_edges, dest_edges])

    return network_edges


def _create_cpr_graph_tiered(
    sbml_dfs: sbml_dfs_core.SBML_dfs, graph_type: str
) -> pd.DataFrame:
    """Turn an sbml_dfs model into a tiered graph which links upstream entities to downstream ones."""

    # check whether all expected SBO terms are present
    invalid_sbo_terms = sbml_dfs.reaction_species[
        ~sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
    ]
    assert isinstance(invalid_sbo_terms, pd.DataFrame)

    if invalid_sbo_terms.shape[0] != 0:
        invalid_counts = invalid_sbo_terms.value_counts(SBML_DFS.SBO_TERM).to_frame("N")
        assert isinstance(invalid_counts, pd.DataFrame)

        logger.warning(utils.style_df(invalid_counts, headers="keys"))  # type: ignore
        raise ValueError("Some reaction species have unusable SBO terms")

    # load and validate the schema of graph_type
    graph_hierarchy_df = _create_graph_hierarchy_df(graph_type)

    # organize reaction species for defining connections
    sorted_reaction_species = sbml_dfs.reaction_species.set_index(
        [SBML_DFS.R_ID, SBML_DFS.SBO_TERM]
    ).sort_index()

    logger.info(
        f"Formatting {sorted_reaction_species.shape[0]} reaction species as "
        "tiered edges."
    )

    # infer tiered edges in each reaction
    all_reaction_edges = [
        _format_tiered_reaction_species(
            r, sorted_reaction_species, sbml_dfs, graph_hierarchy_df
        )
        for r in sorted_reaction_species.index.get_level_values(SBML_DFS.R_ID).unique()
    ]
    all_reaction_edges_df = pd.concat(all_reaction_edges).reset_index(drop=True)

    # test for reactions missing substrates
    r_id_list = sorted_reaction_species.index.get_level_values(0).unique()
    r_id_reactant_only = [
        x for x in r_id_list if len(sorted_reaction_species.loc[x]) == 1
    ]

    if len(r_id_reactant_only) > 0:
        logger.warning(f"{len(r_id_reactant_only)} reactions are missing substrates")
        all_reaction_edges_df_pre = all_reaction_edges_df.copy()
        all_reaction_edges_df = all_reaction_edges_df_pre[
            ~all_reaction_edges_df_pre[SBML_DFS.R_ID].isin(r_id_reactant_only)
        ]

    logger.info(
        "Adding additional attributes to edges, e.g., # of children and parents."
    )

    # add compartmentalized species summaries to weight edges
    cspecies_features = sbml_dfs.get_cspecies_features()

    # calculate undirected and directed degrees (i.e., # of parents and children)
    # based on a network's edgelist. this is used when the network representation is
    # not the bipartite network which can be trivially obtained from the pathway
    # specification
    unique_edges = (
        all_reaction_edges_df.groupby([CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO])
        .first()
        .reset_index()
    )

    # children
    n_children = (
        unique_edges[CPR_GRAPH_EDGES.FROM]
        .value_counts()
        .to_frame()
        .reset_index()
        .rename(
            {
                "index": SBML_DFS.SC_ID,
                CPR_GRAPH_EDGES.FROM: CPR_GRAPH_EDGES.SC_CHILDREN,
            },
            axis=1,
        )
    )
    # parents
    n_parents = (
        unique_edges[CPR_GRAPH_EDGES.TO]
        .value_counts()
        .to_frame()
        .reset_index()
        .rename(
            {"index": SBML_DFS.SC_ID, CPR_GRAPH_EDGES.TO: CPR_GRAPH_EDGES.SC_PARENTS},
            axis=1,
        )
    )
    graph_degree_by_edgelist = n_children.merge(n_parents, how="outer").fillna(0)

    graph_degree_by_edgelist[CPR_GRAPH_EDGES.SC_DEGREE] = (
        graph_degree_by_edgelist[CPR_GRAPH_EDGES.SC_CHILDREN]
        + graph_degree_by_edgelist[CPR_GRAPH_EDGES.SC_PARENTS]
    )
    graph_degree_by_edgelist = (
        graph_degree_by_edgelist[
            ~graph_degree_by_edgelist[SBML_DFS.SC_ID].str.contains("R[0-9]{8}")
        ]
        .set_index(SBML_DFS.SC_ID)
        .sort_index()
    )

    cspecies_features = (
        cspecies_features.drop(
            [
                CPR_GRAPH_EDGES.SC_DEGREE,
                CPR_GRAPH_EDGES.SC_CHILDREN,
                CPR_GRAPH_EDGES.SC_PARENTS,
            ],
            axis=1,
        )
        .join(graph_degree_by_edgelist)
        .fillna(0)
    )

    is_from_reaction = all_reaction_edges_df[CPR_GRAPH_EDGES.FROM].isin(
        sbml_dfs.reactions.index.tolist()
    )
    is_from_reaction = all_reaction_edges_df[CPR_GRAPH_EDGES.FROM].isin(
        sbml_dfs.reactions.index
    )
    # add substrate weight whenever "from" edge is a molecule
    # and product weight when the "from" edge is a reaction
    decorated_all_reaction_edges_df = pd.concat(
        [
            all_reaction_edges_df[~is_from_reaction].merge(
                cspecies_features, left_on=CPR_GRAPH_EDGES.FROM, right_index=True
            ),
            all_reaction_edges_df[is_from_reaction].merge(
                cspecies_features, left_on=CPR_GRAPH_EDGES.TO, right_index=True
            ),
        ]
    ).sort_index()

    if all_reaction_edges_df.shape[0] != decorated_all_reaction_edges_df.shape[0]:
        msg = (
            "'decorated_all_reaction_edges_df' and 'all_reaction_edges_df' should\n"
            "have the same number of rows but they did not"
        )

        raise ValueError(msg)

    logger.info(f"Done preparing {graph_type} graph")

    return decorated_all_reaction_edges_df


def _format_tiered_reaction_species(
    r_id: str,
    sorted_reaction_species: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    graph_hierarchy_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Format Tiered Reaction Species

    Refactor a reaction's species into tiered edges between substrates, products, enzymes and allosteric regulators.
    """

    rxn_species = sorted_reaction_species.loc[r_id]
    assert isinstance(rxn_species, pd.DataFrame)
    assert list(rxn_species.index.names) == [SBML_DFS.SBO_TERM]
    assert rxn_species.columns.tolist() == [SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY]

    rxn_sbo_terms = set(rxn_species.index.unique())
    # map to common names
    rxn_sbo_names = {MINI_SBO_TO_NAME[x] for x in rxn_sbo_terms}

    # is the reaction a general purpose interaction?
    if len(rxn_sbo_names) == 1:
        if list(rxn_sbo_names)[0] == SBOTERM_NAMES.INTERACTOR:
            # further validation happens in the function - e.g., exactly two interactors
            return _format_interactors_for_tiered_graph(r_id, rxn_species, sbml_dfs)

    if SBOTERM_NAMES.INTERACTOR in rxn_sbo_names:
        logger.warning(
            f"Invalid combinations of SBO_terms in {str(r_id)} : {sbml_dfs.reactions.loc[r_id][SBML_DFS.R_NAME]}. "
            "If interactors are present then there can't be any other types of reaction species. "
            f"The following roles were defined: {', '.join(rxn_sbo_names)}"
        )

    # reorganize molecules and the reaction itself into tiers
    entities_ordered_by_tier = (
        pd.concat(
            [
                (
                    rxn_species.reset_index()
                    .rename({SBML_DFS.SC_ID: "entity_id"}, axis=1)
                    .merge(graph_hierarchy_df)
                ),
                graph_hierarchy_df[
                    graph_hierarchy_df[CPR_GRAPH_EDGES.SBO_NAME]
                    == CPR_GRAPH_NODE_TYPES.REACTION
                ].assign(entity_id=r_id, r_id=r_id),
            ]
        )
        .sort_values(["tier"])
        .set_index("tier")
    )
    ordered_tiers = entities_ordered_by_tier.index.get_level_values("tier").unique()

    assert len(ordered_tiers) > 1

    # which tier is the reaction?
    reaction_tier = graph_hierarchy_df["tier"][
        graph_hierarchy_df[CPR_GRAPH_EDGES.SBO_NAME] == CPR_GRAPH_NODE_TYPES.REACTION
    ].tolist()[0]

    rxn_edges = list()
    past_reaction = False
    for i in range(0, len(ordered_tiers) - 1):
        formatted_tier_combo = _format_tier_combo(
            entities_ordered_by_tier.loc[[ordered_tiers[i]]],
            entities_ordered_by_tier.loc[[ordered_tiers[i + 1]]],
            past_reaction,
        )

        if ordered_tiers[i + 1] == reaction_tier:
            past_reaction = True

        rxn_edges.append(formatted_tier_combo)

    rxn_edges_df = (
        pd.concat(rxn_edges)[
            [
                CPR_GRAPH_EDGES.FROM,
                CPR_GRAPH_EDGES.TO,
                CPR_GRAPH_EDGES.STOICHIOMETRY,
                CPR_GRAPH_EDGES.SBO_TERM,
            ]
        ]
        .reset_index(drop=True)
        .assign(r_id=r_id)
    )

    return rxn_edges_df


def _format_tier_combo(
    upstream_tier: pd.DataFrame, downstream_tier: pd.DataFrame, past_reaction: bool
) -> pd.DataFrame:
    """
    Format Tier Combo

    Create a set of edges crossing two tiers of a tiered graph. This will involve an
    all x all combination of entries. Tiers form an ordering along the molecular entities
    in a reaction plus a tier for the reaction itself. Attributes such as stoichiometry
    and sbo_term will be passed from the tier which is furthest from the reaction tier
    to ensure that each tier of molecular data applies its attributes to a single set of
    edges while the "reaction" tier does not. Reaction entities have neither a
    stoichiometry nor an sbo_term annotation.

    Args:
        upstream_tier (pd.DataFrame): A table containing upstream entities in a reaction,
            e.g., regulators.
        downstream_tier (pd.DataFrame): A table containing downstream entities in a reaction,
            e.g., catalysts.
        past_reaction (bool): if True then attributes will be taken from downstream_tier and
            if False they will come from upstream_tier.

    Returns:
        formatted_tier_combo (pd.DataFrame): A table of edges containing (from, to, stoichiometry, sbo_term, r_id). The
            number of edges is the product of the number of entities in the upstream tier
            times the number in the downstream tier.
    """

    upstream_fields = ["entity_id", SBML_DFS.STOICHIOMETRY, SBML_DFS.SBO_TERM]
    downstream_fields = ["entity_id"]

    if past_reaction:
        # swap fields
        upstream_fields, downstream_fields = downstream_fields, upstream_fields

    formatted_tier_combo = (
        upstream_tier[upstream_fields]
        .rename({"entity_id": CPR_GRAPH_EDGES.FROM}, axis=1)
        .assign(_joiner=1)
    ).merge(
        (
            downstream_tier[downstream_fields]
            .rename({"entity_id": CPR_GRAPH_EDGES.TO}, axis=1)
            .assign(_joiner=1)
        ),
        left_on="_joiner",
        right_on="_joiner",
    )

    return formatted_tier_combo
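
# --- Illustrative note (not part of the napistu 0.1.0 source) ---
# The merge above is a cross join implemented via a constant "_joiner" key: every row of
# the upstream tier is paired with every row of the downstream tier. A toy sketch of the
# same mechanics with hypothetical entities:
#
#   import pandas as pd
#   up = pd.DataFrame({"from": ["A", "B"]}).assign(_joiner=1)
#   down = pd.DataFrame({"to": ["R1"]}).assign(_joiner=1)
#   up.merge(down, on="_joiner")
#   #   from  _joiner  to
#   # 0    A        1  R1
#   # 1    B        1  R1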


def _create_graph_hierarchy_df(graph_type: str) -> pd.DataFrame:
    """
    Create Graph Hierarchy DataFrame

    Format a graph hierarchy list of lists as a pd.DataFrame

    Args:
        graph_type (str):
            The type of tiered graph to work with. Each type has its own specification in constants.py.

    Returns:
        A pandas DataFrame with sbo_name, tier, and sbo_term.
    """

    if graph_type == CPR_GRAPH_TYPES.REGULATORY:
        sbo_names_hierarchy = REGULATORY_GRAPH_HIERARCHY
    elif graph_type == CPR_GRAPH_TYPES.SURROGATE:
        sbo_names_hierarchy = SURROGATE_GRAPH_HIERARCHY
    else:
        raise NotImplementedError(f"{graph_type} is not a valid graph_type")

    # format as a DF
    graph_hierarchy_df = pd.concat(
        [
            pd.DataFrame({"sbo_name": sbo_names_hierarchy[i]}).assign(tier=i)
            for i in range(0, len(sbo_names_hierarchy))
        ]
    ).reset_index(drop=True)
    graph_hierarchy_df[SBML_DFS.SBO_TERM] = graph_hierarchy_df["sbo_name"].apply(
        lambda x: MINI_SBO_FROM_NAME[x] if x != CPR_GRAPH_NODE_TYPES.REACTION else ""
    )

    # ensure that the output is as expected
    utils.match_pd_vars(
        graph_hierarchy_df,
        req_vars={CPR_GRAPH_EDGES.SBO_NAME, "tier", SBML_DFS.SBO_TERM},
        allow_series=False,
    ).assert_present()

    return graph_hierarchy_df


def _add_graph_weights_mixed(cpr_graph: ig.Graph, reaction_attrs: dict) -> ig.Graph:
    """Weight a graph using a mixed approach combining source-specific weights and existing edge weights."""

    edges_df = cpr_graph.get_edge_dataframe()

    calibrated_edges = apply_weight_transformations(edges_df, reaction_attrs)
    calibrated_edges = _create_source_weights(calibrated_edges, "source_wt")

    score_vars = list(reaction_attrs.keys())
    score_vars.append("source_wt")

    logger.info(f"Creating mixed scores based on {', '.join(score_vars)}")

    calibrated_edges["weights"] = calibrated_edges[score_vars].min(axis=1)

    cpr_graph.es[CPR_GRAPH_EDGES.WEIGHTS] = calibrated_edges[CPR_GRAPH_EDGES.WEIGHTS]
    if cpr_graph.is_directed():
        cpr_graph.es[CPR_GRAPH_EDGES.UPSTREAM_WEIGHTS] = calibrated_edges[
            CPR_GRAPH_EDGES.WEIGHTS
        ]

    # add other attributes and update transformed attributes
    cpr_graph.es["source_wt"] = calibrated_edges["source_wt"]
    for k in reaction_attrs.keys():
        cpr_graph.es[k] = calibrated_edges[k]

    return cpr_graph


def _add_graph_weights_calibration(
    cpr_graph: ig.Graph, reaction_attrs: dict
) -> ig.Graph:
    """Weight a graph using a calibrated strategy which aims to roughly align qualitatively similar weights from different sources."""

    edges_df = cpr_graph.get_edge_dataframe()

    calibrated_edges = apply_weight_transformations(edges_df, reaction_attrs)

    score_vars = list(reaction_attrs.keys())
    score_vars.append("topo_weights")

    logger.info(f"Creating calibrated scores based on {', '.join(score_vars)}")
    cpr_graph.es["weights"] = calibrated_edges[score_vars].min(axis=1)

    if cpr_graph.is_directed():
        score_vars = list(reaction_attrs.keys())
        score_vars.append("upstream_topo_weights")
        cpr_graph.es["upstream_weights"] = calibrated_edges[score_vars].min(axis=1)

    # add other attributes and update transformed attributes
    for k in reaction_attrs.keys():
        cpr_graph.es[k] = calibrated_edges[k]

    return cpr_graph


def _add_edge_attr_to_vertex_graph(
    cpr_graph: ig.Graph,
    edge_attr_list: list,
    shared_node_key: str = "r_id",
) -> ig.Graph:
    """
    Merge edge attribute(s) from edge_attr_list onto the vertices of an igraph

    Parameters
    ----------
    cpr_graph : ig.Graph
        A graph generated by create_cpr_graph()
    edge_attr_list : list
        A list containing attributes to pull out of edges, then to add to vertices
    shared_node_key : str
        key in edge that is shared with vertex, to map edge ids to corresponding vertex ids

    Returns
    -------
    An igraph network
    """

    if len(edge_attr_list) == 0:
        logger.warning(
            "No edge attributes were passed, thus returning the input graph."
        )
        return cpr_graph

    graph_vertex_df = cpr_graph.get_vertex_dataframe()
    graph_edge_df = cpr_graph.get_edge_dataframe()

    if shared_node_key not in graph_edge_df.columns.to_list():
        logger.warning(
            f"{shared_node_key} is not in the current edge attributes. "
            "shared_node_key must be an existing edge attribute"
        )
        return cpr_graph

    graph_edge_df_sub = graph_edge_df.loc[:, [shared_node_key] + edge_attr_list].copy()

    # check whether duplicated edge ids by shared_node_key have the same attribute values.
    # If not, give a warning and keep the first value (which can be improved later).
    check_edgeid_attr_unique = (
        graph_edge_df_sub.groupby(shared_node_key)[edge_attr_list].nunique() == 1
    )

    # check for any False in check_edgeid_attr_unique's columns; if so, get the column names
    bool_edgeid_attr_unique = (check_edgeid_attr_unique.isin([False])).any()  # type: ignore

    non_unique_indices = [
        i for i, value in enumerate(bool_edgeid_attr_unique.to_list()) if value
    ]

    # if edge ids with duplicated shared_node_key have more than 1 unique value
    # for attributes of interest
    non_unique_edge_attr = bool_edgeid_attr_unique.index[non_unique_indices].to_list()

    if len(non_unique_edge_attr) == 0:
        logger.info("Per duplicated edge ids, attributes have only 1 unique value.")
    else:
        logger.warning(
            f"Per duplicated edge ids, attributes: {non_unique_edge_attr} "
            "contain more than 1 unique value"
        )

    # remove duplicated edge attribute values
    graph_edge_df_sub_no_duplicate = graph_edge_df_sub.drop_duplicates(
        subset=shared_node_key, keep="first"
    )

    # rename shared_node_key to vertex key 'name'
    # as in net_create.create_cpr_graph(), vertex_name_attr is set to 'name'
    graph_edge_df_sub_no_duplicate = graph_edge_df_sub_no_duplicate.rename(
        columns={shared_node_key: "name"},
    )

    # merge edge attributes in graph_edge_df_sub_no_duplicate to vertex_df,
    # by shared key 'name'
    graph_vertex_df_w_edge_attr = pd.merge(
        graph_vertex_df,
        graph_edge_df_sub_no_duplicate,
        on="name",
        how="outer",
    )

    logger.info(f"Adding {edge_attr_list} to vertex attributes")
    # Warning for NaN values in vertex attributes:
    if graph_vertex_df_w_edge_attr.isnull().values.any():
        logger.warning(
            "NaN values are present in the newly added vertex attributes. "
            "Please assign proper values to those vertex attributes."
        )

    # assign the edge_attrs from edge_attr_list to cpr_graph's vertices:
    # keep the same edge attribute names:
    for col_name in edge_attr_list:
        cpr_graph.vs[col_name] = graph_vertex_df_w_edge_attr[col_name]

    return cpr_graph


def _summarize_weight_calibration_table(
    calibrated_edges: pd.DataFrame,
    score_calibration_df: pd.DataFrame,
    score_calibration_df_calibrated: pd.DataFrame,
):
    """Create a table comparing edge weights from multiple sources."""

    # generate a table summarizing different scoring measures
    #
    # a set of calibration points defined in DEFINED_WEIGHT_TRANSFORMATION which map
    # onto what we might consider strong versus dubious edges are compared to the
    # observed scores to see whether these calibration points generally map onto
    # the expected quantiles of the score distribution.
    #
    # different scores are also compared to see whether their calibrations are generally
    # aligned. that is to say, a strong weight based on one scoring measure would receive
    # a similar quantitative score to a strong score for another measure.

    score_calibration_long_raw = (
        score_calibration_df.reset_index()
        .rename({"index": "edge_strength"}, axis=1)
        .melt(
            id_vars="edge_strength", var_name="weight_measure", value_name="raw_weight"
        )
    )

    score_calibration_long_calibrated = (
        score_calibration_df_calibrated.reset_index()
        .rename({"index": "edge_strength"}, axis=1)
        .melt(
            id_vars="edge_strength",
            var_name="weight_measure",
            value_name="trans_weight",
        )
    )

    score_calibration_table_long = score_calibration_long_raw.merge(
        score_calibration_long_calibrated
    )

    # compare calibration points to the quantiles of the observed score distributions
    score_quantiles = list()
    for ind, row in score_calibration_table_long.iterrows():
        score_quantiles.append(
            1
            - np.mean(
                calibrated_edges[row["weight_measure"]].dropna() >= row["trans_weight"]
            )
        )
    score_calibration_table_long["quantile_of_score_dist"] = score_quantiles

    return utils.style_df(score_calibration_table_long, headers="keys")


def _summarize_weight_calibration_plots(
    calibrated_edges: pd.DataFrame, score_calibration_df_calibrated: pd.DataFrame
) -> None:
    """Create a couple of plots summarizing the relationships between different scoring measures."""

    # set up a 1 x 2 plot
    f, (ax1, ax2) = plt.subplots(1, 2)

    calibrated_edges[["topo_weights", "string_wt"]].plot(
        kind="hist", bins=50, alpha=0.5, ax=ax1
    )
    ax1.set_title("Distribution of scores\npost calibration")

    score_calibration_df_calibrated.plot("weights", "string_wt", kind="scatter", ax=ax2)

    for k, v in score_calibration_df_calibrated.iterrows():
        ax2.annotate(k, v)
    ax2.axline((0, 0), slope=1.0, color="C0", label="by slope")
    ax2.set_title("Comparing STRING and\nTopology calibration points")

    return None


def _create_source_weights(
    edges_df: pd.DataFrame,
    source_wt_var: str = "source_wt",
    source_vars_dict: dict = SOURCE_VARS_DICT,
    source_wt_default: int = 1,
) -> pd.DataFrame:
    """
    Create Source Weights

    Create weights based on an edge's source. This is a simple but crude way of allowing different
    data sources to have different support if we think that some are more trustworthy than others.

    Args:
        edges_df: pd.DataFrame
            The edges dataframe to add the source weights to.
        source_wt_var: str
            The name of the column to store the source weights.
        source_vars_dict: dict
            Dictionary with keys indicating edge attributes and values indicating the weight to assign
            to that attribute. This value is generally the largest weight that can be assigned to an
            edge so that the numeric weight is chosen over the default.
        source_wt_default: int
            The default weight to assign to an edge if no other weight attribute is found.

    Returns:
        pd.DataFrame
            The edges dataframe with the source weights added.
    """

    logger.warning(
        "_create_source_weights should be reimplemented once https://github.com/calico/pathadex-data/issues/95 "
        "is fixed. The current implementation is quite limited."
    )

    # currently, we will look for values of source_indicator_var which are non-NA and set them to
    # source_indicator_match_score, setting entries which are NA to source_indicator_nonmatch_score.
    #
    # this is a simple way of flagging string vs. non-string scores

    included_weight_vars = set(source_vars_dict.keys()).intersection(
        set(edges_df.columns)
    )
    if len(included_weight_vars) == 0:
        logger.warning(
            f"No edge attributes were found which match those in source_vars_dict: {', '.join(source_vars_dict.keys())}"
        )
        edges_df[source_wt_var] = source_wt_default
        return edges_df

    edges_df_source_wts = edges_df[list(included_weight_vars)].copy()
    for wt in list(included_weight_vars):
        edges_df_source_wts[wt] = [
            source_wt_default if x is True else source_vars_dict[wt]
            for x in edges_df[wt].isna()
        ]

    source_wt_edges_df = edges_df.join(
        edges_df_source_wts.max(axis=1).rename(source_wt_var)
    )

    return source_wt_edges_df
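
# --- Illustrative note (not part of the napistu 0.1.0 source) ---
# A toy walk-through of _create_source_weights() with a hypothetical source_vars_dict:
# edges that carry a value for a recognized source attribute (here "string_wt") get that
# source's weight; edges where the attribute is NaN fall back to source_wt_default.
#
#   import numpy as np
#   import pandas as pd
#   toy_edges = pd.DataFrame({"string_wt": [900.0, np.nan]})
#   _create_source_weights(toy_edges, "source_wt", {"string_wt": 10}, 1)
#   # row 0 (has a STRING score) -> source_wt = 10
#   # row 1 (no STRING score)    -> source_wt = 1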


def _wt_transformation_identity(x):
    """Identity"""
    return x


def _wt_transformation_string(x):
    """Map STRING scores to a similar scale as topology weights."""

    return 250000 / np.power(x, 1.7)


def _wt_transformation_string_inv(x):
    """Map STRING scores so they work with source weights."""

    # string scores are bounded on [0, 1000]
    # and score/1000 is roughly a probability that
    # there is a real interaction (physical, genetic, ...)
    # reported string scores are currently on [150, 1000]
    # so this transformation will map these onto {6.67, 1}

    return 1 / (x / 1000)
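
# --- Illustrative note (not part of the napistu 0.1.0 source) ---
# Rough magnitudes of the two STRING transformations above (values approximate):
#
#   _wt_transformation_string(1000) = 250000 / 1000**1.7  ~ 2.0   (strong scores -> small weights)
#   _wt_transformation_string(150)  = 250000 / 150**1.7   ~ 50    (weak scores -> large weights)
#   _wt_transformation_string_inv(1000) = 1 / (1000/1000) = 1.0
#   _wt_transformation_string_inv(150)  = 1 / (150/1000)  ~ 6.67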


def _format_interactors_for_tiered_graph(
    r_id: str, rxn_species: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
) -> pd.DataFrame:
    """Format undirected interactions for a tiered graph so interactors are linked even though they would be on the same tier."""

    interactor_data = rxn_species.loc[MINI_SBO_FROM_NAME["interactor"]]
    if interactor_data.shape[0] != 2:
        raise ValueError(
            f"{interactor_data.shape[0]} interactors present for {str(r_id)} : "
            f"{sbml_dfs.reactions.loc[r_id]['r_name']}. "
            "Reactions with interactors must have exactly two interactors"
        )

    if not (interactor_data["stoichiometry"] == 0).any():
        raise ValueError(
            f"Interactors had non-zero stoichiometry for {str(r_id)} : {sbml_dfs.reactions.loc[r_id]['r_name']}. "
            "If stoichiometry is important for this reaction then it should use other SBO terms "
            "(e.g., substrate and product)."
        )

    # set the first entry as "from" and the second as "to" if stoichiometry is zero.
    # the reverse reaction will generally be added later because these
    # reactions should be reversible

    return pd.DataFrame(
        {
            "from": interactor_data["sc_id"].iloc[0],
            "to": interactor_data["sc_id"].iloc[1],
            "sbo_term": MINI_SBO_FROM_NAME["interactor"],
            "stoichiometry": 0,
            "r_id": r_id,
        },
        index=[0],
    )


def _add_graph_species_attribute(
    cpr_graph: ig.Graph,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_graph_attrs: dict,
) -> ig.Graph:
    """Add meta-data from species_data to an existing igraph's vertices."""

    if not isinstance(species_graph_attrs, dict):
        raise TypeError(
            f"species_graph_attrs must be a dict, but was {type(species_graph_attrs)}"
        )

    # fail fast if species_graph_attrs is not properly formatted
    # also flatten attribute list to be added to vertex nodes
    sp_graph_key_list = []
    sp_node_attr_list = []
    for k in species_graph_attrs.keys():
        _validate_entity_attrs(species_graph_attrs[k])

        sp_graph_key_list.append(k)
        sp_node_attr_list.append(list(species_graph_attrs[k].keys()))

    # flatten sp_node_attr_list
    flat_sp_node_attr_list = [item for items in sp_node_attr_list for item in items]

    logger.info("Adding meta-data from species_data")

    curr_network_nodes_df = cpr_graph.get_vertex_dataframe()

    # add species-level attributes to nodes dataframe
    augmented_network_nodes_df = _augment_network_nodes(
        curr_network_nodes_df,
        sbml_dfs,
        species_graph_attrs,
    )

    for vs_attr in flat_sp_node_attr_list:
        # in case more than one vs_attr in the flat_sp_node_attr_list
        logger.info(f"Adding new attribute {vs_attr} to vertices")
        cpr_graph.vs[vs_attr] = augmented_network_nodes_df[vs_attr].values

    return cpr_graph
|
1316
|
+
|
1317
|
+
|
1318
|
+
def _augment_network_nodes(
    network_nodes: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_graph_attrs: dict = dict(),
) -> pd.DataFrame:
    """Add species-level attributes: expand network_nodes with s_id and c_id and then map to species-level attributes by s_id."""

    REQUIRED_NETWORK_NODE_ATTRS = {
        "name",
        "node_name",
        "node_type",
    }

    missing_required_network_nodes_attrs = REQUIRED_NETWORK_NODE_ATTRS.difference(
        set(network_nodes.columns.tolist())
    )
    if len(missing_required_network_nodes_attrs) > 0:
        raise ValueError(
            f"{len(missing_required_network_nodes_attrs)} required attributes were missing "
            "from network_nodes: "
            f"{', '.join(missing_required_network_nodes_attrs)}"
        )

    # include the matching s_ids and c_ids of sc_ids
    # (the index of the network_nodes df) in the network_nodes df
    network_nodes_sid = pd.merge(
        network_nodes,
        sbml_dfs.compartmentalized_species[["s_id", "c_id"]],
        left_on="name",
        right_index=True,
        how="left",
    )

    # assign species_data related attributes by s_id
    species_graph_data = pluck_entity_data(sbml_dfs, species_graph_attrs, "species")

    if species_graph_data is not None:
        # add species_graph_data to the network_nodes df, based on s_id
        network_nodes_wdata = network_nodes_sid.merge(
            species_graph_data, left_on="s_id", right_index=True, how="left"
        )

        # Note: multiple sc_ids with the same s_id will be assigned the same species_graph_data

        network_nodes_wdata.fillna(0, inplace=True)
        network_nodes_wdata.drop(columns=["s_id", "c_id"], inplace=True)
    else:
        # no species_data attributes were requested; return the nodes without the merge helper columns
        network_nodes_wdata = network_nodes_sid.drop(columns=["s_id", "c_id"])

    return network_nodes_wdata

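# Illustrative behavior, assuming a species "S1" with two compartmentalized forms
# "SC1" and "SC2": both vertices receive the same species-level attribute values
# from species_data (the merge is keyed on s_id), and any vertex whose species is
# absent from species_data ends up with 0 for those attributes via fillna(0).
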
def _augment_network_edges(
    network_edges: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reaction_graph_attrs: dict = dict(),
) -> pd.DataFrame:
    """Add reversibility and other metadata from reactions."""

    REQUIRED_NETWORK_EDGE_ATTRS = {
        "from",
        "to",
        "stoichiometry",
        "sbo_term",
        "sc_degree",
        "sc_children",
        "sc_parents",
        "species_type",
        "r_id",
    }

    missing_required_network_edges_attrs = REQUIRED_NETWORK_EDGE_ATTRS.difference(
        set(network_edges.columns.tolist())
    )
    if len(missing_required_network_edges_attrs) > 0:
        raise ValueError(
            f"{len(missing_required_network_edges_attrs)} required attributes were missing "
            "from network_edges: "
            f"{', '.join(missing_required_network_edges_attrs)}"
        )

    network_edges = (
        network_edges[list(REQUIRED_NETWORK_EDGE_ATTRS)]
        # add reaction-level attributes
        .merge(
            sbml_dfs.reactions[SBML_DFS.R_ISREVERSIBLE],
            left_on=SBML_DFS.R_ID,
            right_index=True,
        )
    )

    # add other attributes based on reactions data
    reaction_graph_data = pluck_entity_data(
        sbml_dfs, reaction_graph_attrs, SBML_DFS.REACTIONS
    )
    if reaction_graph_data is not None:
        network_edges = network_edges.merge(
            reaction_graph_data, left_on=SBML_DFS.R_ID, right_index=True, how="left"
        )

    return network_edges

def _reverse_network_edges(augmented_network_edges: pd.DataFrame) -> pd.DataFrame:
    """Flip reversible reactions to derive the reverse reaction."""

    # validate inputs
    missing_required_vars = CPR_GRAPH_REQUIRED_EDGE_VARS.difference(
        set(augmented_network_edges.columns.tolist())
    )

    if len(missing_required_vars) > 0:
        raise ValueError(
            "augmented_network_edges is missing required variables: "
            f"{', '.join(missing_required_vars)}"
        )

    # select all edges derived from reversible reactions
    reversible_reaction_edges = augmented_network_edges[
        augmented_network_edges[CPR_GRAPH_EDGES.R_ISREVERSIBLE]
    ]

    r_reaction_edges = (
        # ignore edges which start in a regulator or catalyst; even for a reversible reaction it
        # doesn't make sense for a regulator to be impacted by a target
        reversible_reaction_edges[
            ~reversible_reaction_edges[CPR_GRAPH_EDGES.SBO_TERM].isin(
                [
                    MINI_SBO_FROM_NAME[x]
                    for x in SBO_MODIFIER_NAMES.union({SBOTERM_NAMES.CATALYST})
                ]
            )
        ]
        # flip parent and child attributes
        .rename(
            {
                CPR_GRAPH_EDGES.FROM: CPR_GRAPH_EDGES.TO,
                CPR_GRAPH_EDGES.TO: CPR_GRAPH_EDGES.FROM,
                CPR_GRAPH_EDGES.SC_CHILDREN: CPR_GRAPH_EDGES.SC_PARENTS,
                CPR_GRAPH_EDGES.SC_PARENTS: CPR_GRAPH_EDGES.SC_CHILDREN,
            },
            axis=1,
        )
    )

    # switch substrates and products
    r_reaction_edges[CPR_GRAPH_EDGES.STOICHIOMETRY] = r_reaction_edges[
        CPR_GRAPH_EDGES.STOICHIOMETRY
    ].apply(
        # the ifelse statement prevents 0 being converted to -0 ...
        lambda x: -1 * x if x != 0 else 0
    )

    transformed_r_reaction_edges = pd.concat(
        [
            (
                r_reaction_edges[
                    r_reaction_edges[CPR_GRAPH_EDGES.SBO_TERM]
                    == MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
                ].assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT])
            ),
            (
                r_reaction_edges[
                    r_reaction_edges[CPR_GRAPH_EDGES.SBO_TERM]
                    == MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
                ].assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT])
            ),
            r_reaction_edges[
                ~r_reaction_edges[CPR_GRAPH_EDGES.SBO_TERM].isin(
                    [
                        MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT],
                        MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT],
                    ]
                )
            ],
        ]
    )

    assert transformed_r_reaction_edges.shape[0] == r_reaction_edges.shape[0]

    return transformed_r_reaction_edges.assign(
        direction=CPR_GRAPH_EDGE_DIRECTIONS.REVERSE
    )

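# Sketch of the transformation applied to a single reversible reactant edge
# (placeholder ids and stoichiometry):
#
#   forward:  from="SC1", to="R1",  sbo_term=reactant, stoichiometry=-1
#   reverse:  from="R1",  to="SC1", sbo_term=product,  stoichiometry=1,
#             direction=CPR_GRAPH_EDGE_DIRECTIONS.REVERSE
#
# sc_children and sc_parents are swapped along with from/to, and edges whose sbo_term
# marks a modifier or catalyst are dropped rather than reversed.
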
def _create_topology_weights(
    cpr_graph: ig.Graph,
    base_score: float = 2,
    protein_multiplier: int = 1,
    metabolite_multiplier: int = 3,
    unknown_multiplier: int = 10,
    scale_multiplier_by_meandegree: bool = True,
) -> ig.Graph:
    """
    Create Topology Weights

    Add weights to a network based on its topology. Edges downstream of nodes
    with many connections receive a higher weight suggesting that any one
    of them is less likely to be regulatory. This is a simple and clearly
    flawed heuristic which can be combined with more principled weighting
    schemes.

    Args:
        cpr_graph (ig.Graph): a graph containing connections between molecules, proteins, and reactions.
        base_score (float): offset which will be added to all weights.
        protein_multiplier (int): multiplier for non-metabolite species (lower weight paths will tend to be selected).
        metabolite_multiplier (int): multiplier for metabolites (defined as species with a ChEBI ID).
        unknown_multiplier (int): multiplier for species without any identifier. See sbml_dfs_core.species_type_types.
        scale_multiplier_by_meandegree (bool): if True then multipliers will be rescaled by the average number of
            connections a node has (i.e., its degree) so that weights will be relatively similar regardless of network
            size and sparsity.

    Returns:
        cpr_graph (ig.Graph): graph with added topology weights

    """

    # check for required attributes before proceeding

    required_attrs = {
        CPR_GRAPH_EDGES.SC_DEGREE,
        CPR_GRAPH_EDGES.SC_CHILDREN,
        CPR_GRAPH_EDGES.SC_PARENTS,
        CPR_GRAPH_EDGES.SPECIES_TYPE,
    }

    missing_required_attrs = required_attrs.difference(set(cpr_graph.es.attributes()))
    if len(missing_required_attrs) != 0:
        raise ValueError(
            f"model is missing {len(missing_required_attrs)} required attributes: {', '.join(missing_required_attrs)}"
        )

    if base_score < 0:
        raise ValueError(f"base_score was {base_score} and must be non-negative")
    if protein_multiplier > unknown_multiplier:
        raise ValueError(
            f"protein_multiplier was {protein_multiplier} and unknown_multiplier "
            f"was {unknown_multiplier}. unknown_multiplier must be greater than "
            "protein_multiplier"
        )
    if metabolite_multiplier > unknown_multiplier:
        raise ValueError(
            f"metabolite_multiplier was {metabolite_multiplier} and unknown_multiplier "
            f"was {unknown_multiplier}. unknown_multiplier must be greater than "
            "metabolite_multiplier"
        )

    # create a new weight variable

    weight_table = pd.DataFrame(
        {
            CPR_GRAPH_EDGES.SC_DEGREE: cpr_graph.es[CPR_GRAPH_EDGES.SC_DEGREE],
            CPR_GRAPH_EDGES.SC_CHILDREN: cpr_graph.es[CPR_GRAPH_EDGES.SC_CHILDREN],
            CPR_GRAPH_EDGES.SC_PARENTS: cpr_graph.es[CPR_GRAPH_EDGES.SC_PARENTS],
            CPR_GRAPH_EDGES.SPECIES_TYPE: cpr_graph.es[CPR_GRAPH_EDGES.SPECIES_TYPE],
        }
    )

    lookup_multiplier_dict = {
        "protein": protein_multiplier,
        "metabolite": metabolite_multiplier,
        "unknown": unknown_multiplier,
    }
    weight_table["multiplier"] = weight_table["species_type"].map(
        lookup_multiplier_dict
    )

    # calculate mean degree
    # since topology weights will differ based on the structure of the network
    # and it would be nice to have a consistent notion of edge weights and path weights
    # for interpretability and filtering, we can rescale topology weights by the
    # average degree of nodes
    if scale_multiplier_by_meandegree:
        mean_degree = len(cpr_graph.es) / len(cpr_graph.vs)
        if not cpr_graph.is_directed():
            # in a directed network in- and out-degree are treated separately while
            # an undirected network's degree is the sum of these two measures
            mean_degree = mean_degree * 2

        weight_table["multiplier"] = weight_table["multiplier"] / mean_degree

    if cpr_graph.is_directed():
        weight_table["connection_weight"] = weight_table[CPR_GRAPH_EDGES.SC_CHILDREN]
    else:
        weight_table["connection_weight"] = weight_table[CPR_GRAPH_EDGES.SC_DEGREE]

    # weight traveling through a species based on
    # - a constant
    # - how plausibly that species type mediates a change
    # - the number of connections that the node can bridge to
    weight_table["topo_weights"] = [
        base_score + (x * y)
        for x, y in zip(weight_table["multiplier"], weight_table["connection_weight"])
    ]
    cpr_graph.es["topo_weights"] = weight_table["topo_weights"]

    # if directed, and we want to travel upstream, define a corresponding weighting scheme
    if cpr_graph.is_directed():
        weight_table["upstream_topo_weights"] = [
            base_score + (x * y)
            for x, y in zip(weight_table["multiplier"], weight_table["sc_parents"])
        ]
        cpr_graph.es["upstream_topo_weights"] = weight_table["upstream_topo_weights"]

    return cpr_graph

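# Worked example (hypothetical numbers): with base_score=2 and protein_multiplier=1,
# a directed edge whose species_type is "protein" and whose sc_children is 4 receives
#
#   topo_weights = 2 + 1 * 4 = 6
#
# before mean-degree rescaling. With scale_multiplier_by_meandegree=True and a mean
# degree of, say, 2.5, the multiplier becomes 1 / 2.5 = 0.4 and the same edge is
# weighted 2 + 0.4 * 4 = 3.6.
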
def _validate_entity_attrs(
    entity_attrs: dict, validate_transformations: bool = True
) -> None:
    """Validate that graph attributes are in a valid format."""

    assert isinstance(entity_attrs, dict)
    for v in entity_attrs.values():
        # check the structure against the pydantic config
        entity_attrs = _EntityAttrValidator(**v).model_dump()

        if validate_transformations:
            if v["trans"] not in DEFINED_WEIGHT_TRANSFORMATION.keys():
                raise ValueError(
                    f"transformation {v['trans']} was not defined as an alias in "
                    "DEFINED_WEIGHT_TRANSFORMATION. The defined transformations "
                    f"are {', '.join(DEFINED_WEIGHT_TRANSFORMATION.keys())}"
                )

    return None

class _EntityAttrValidator(BaseModel):
    table: str
    variable: str
    trans: Optional[str] = DEFAULT_WT_TRANS
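
# A minimal sketch of the structure _validate_entity_attrs() and _EntityAttrValidator
# expect (the table and variable names below are hypothetical placeholders; "trans"
# must be an alias defined in DEFINED_WEIGHT_TRANSFORMATION):
#
#   species_graph_attrs = {
#       "string_wt": {
#           "table": "string",             # entity data table to pull from
#           "variable": "combined_score",  # column within that table
#           "trans": DEFAULT_WT_TRANS,     # weight transformation alias
#       }
#   }
#   _validate_entity_attrs(species_graph_attrs)  # returns None when the format is valid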