napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,1647 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ import logging
5
+ import random
6
+ from typing import Optional
7
+
8
+ import igraph as ig
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+ import pandas as pd
12
+ from pydantic import BaseModel
13
+
14
+ from napistu import sbml_dfs_core
15
+ from napistu import utils
16
+
17
+ from napistu.constants import DEFAULT_WT_TRANS
18
+ from napistu.constants import DEFINED_WEIGHT_TRANSFORMATION
19
+ from napistu.constants import MINI_SBO_FROM_NAME
20
+ from napistu.constants import MINI_SBO_TO_NAME
21
+ from napistu.constants import SBML_DFS
22
+ from napistu.constants import SBO_MODIFIER_NAMES
23
+ from napistu.constants import SCORE_CALIBRATION_POINTS_DICT
24
+ from napistu.constants import ENTITIES_W_DATA
25
+ from napistu.constants import SOURCE_VARS_DICT
26
+
27
+ from napistu.network.constants import CPR_GRAPH_NODES
28
+ from napistu.network.constants import CPR_GRAPH_EDGES
29
+ from napistu.network.constants import CPR_GRAPH_EDGE_DIRECTIONS
30
+ from napistu.network.constants import CPR_GRAPH_REQUIRED_EDGE_VARS
31
+ from napistu.network.constants import CPR_GRAPH_NODE_TYPES
32
+ from napistu.network.constants import CPR_GRAPH_TYPES
33
+ from napistu.network.constants import CPR_WEIGHTING_STRATEGIES
34
+ from napistu.network.constants import SBOTERM_NAMES
35
+ from napistu.network.constants import REGULATORY_GRAPH_HIERARCHY
36
+ from napistu.network.constants import SURROGATE_GRAPH_HIERARCHY
37
+ from napistu.network.constants import VALID_CPR_GRAPH_TYPES
38
+ from napistu.network.constants import VALID_WEIGHTING_STRATEGIES
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
def create_cpr_graph(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reaction_graph_attrs: dict | None = None,
    directed: bool = True,
    edge_reversed: bool = False,
    graph_type: str = CPR_GRAPH_TYPES.BIPARTITE,
    verbose: bool = False,
) -> ig.Graph:
    """
    Create CPR Graph

    Create an igraph network from a mechanistic network using one of a set of graph_types.

    Parameters
    ----------
    sbml_dfs : SBML_dfs
        A model formed by aggregating pathways
    reaction_graph_attrs : dict, optional
        Dictionary containing attributes to pull out of reaction_data and
        a weighting scheme for the graph. Defaults to an empty dict.
    directed : bool
        Should a directed (True) or undirected graph be made (False)
    edge_reversed : bool
        Should the directions of edges be reversed or not (False)
    graph_type : str
        Type of graph to create, valid values are:
            - bipartite: substrates and modifiers point to the reaction they drive, this reaction points to products
            - regulatory: non-enzymatic modifiers point to enzymes, enzymes point to substrates and products
            - surrogate: non-enzymatic modifiers -> substrates -> enzymes -> reaction -> products.
              In this representation enzymes effectively stand in for their reaction (even though the enzyme
              is not modified by a substrate per-se).
    verbose : bool
        Extra reporting

    Returns
    -------
    ig.Graph
        An igraph network

    Raises
    ------
    TypeError
        If any argument has an unexpected type.
    ValueError
        If graph_type is not one of VALID_CPR_GRAPH_TYPES.
    """

    # avoid a shared mutable default argument; the dict was previously a
    # `dict()` default which is a classic Python pitfall
    if reaction_graph_attrs is None:
        reaction_graph_attrs = {}

    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
        raise TypeError(
            f"sbml_dfs must be a sbml_dfs_core.SBML_dfs, but was {type(sbml_dfs)}"
        )

    if not isinstance(reaction_graph_attrs, dict):
        raise TypeError(
            f"reaction_graph_attrs must be a dict, but was {type(reaction_graph_attrs)}"
        )

    if not isinstance(directed, bool):
        raise TypeError(f"directed must be a bool, but was {type(directed)}")

    if not isinstance(edge_reversed, bool):
        raise TypeError(f"edge_reverse must be a bool, but was {type(edge_reversed)}")

    if not isinstance(graph_type, str):
        # BUG FIX: the message previously reported type(verbose) here
        raise TypeError(f"graph_type must be a str, but was {type(graph_type)}")

    if graph_type not in VALID_CPR_GRAPH_TYPES:
        raise ValueError(
            f"graph_type is not a valid value ({graph_type}), valid values are {','.join(VALID_CPR_GRAPH_TYPES)}"
        )

    if not isinstance(verbose, bool):
        raise TypeError(f"verbose must be a bool, but was {type(verbose)}")

    # fail fast if reaction_graph_attrs is not properly formatted
    for k in reaction_graph_attrs.keys():
        _validate_entity_attrs(reaction_graph_attrs[k])

    # work on a copy so the caller's sbml_dfs is never mutated
    working_sbml_dfs = copy.deepcopy(sbml_dfs)
    reaction_species_counts = working_sbml_dfs.reaction_species.value_counts(
        SBML_DFS.R_ID
    )
    valid_reactions = reaction_species_counts[reaction_species_counts > 1].index
    # due to autoregulation reactions, and removal of cofactors some
    # reactions may have 1 (or even zero) species. drop these.

    n_dropped_reactions = working_sbml_dfs.reactions.shape[0] - len(valid_reactions)
    if n_dropped_reactions != 0:
        logger.info(
            f"Dropping {n_dropped_reactions} reactions with <= 1 reaction species "
            "these underspecified reactions may be due to either unrepresented "
            "autoregulation and/or removal of cofactors."
        )

        working_sbml_dfs.reactions = working_sbml_dfs.reactions[
            working_sbml_dfs.reactions.index.isin(valid_reactions)
        ]
        working_sbml_dfs.reaction_species = working_sbml_dfs.reaction_species[
            working_sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(valid_reactions)
        ]

    logger.info(
        "Organizing all network nodes (compartmentalized species and reactions)"
    )

    # vertices are the union of compartmentalized species and reactions
    network_nodes = list()
    network_nodes.append(
        working_sbml_dfs.compartmentalized_species.reset_index()[
            [SBML_DFS.SC_ID, SBML_DFS.SC_NAME]
        ]
        .rename(columns={SBML_DFS.SC_ID: "node_id", SBML_DFS.SC_NAME: "node_name"})
        .assign(node_type=CPR_GRAPH_NODE_TYPES.SPECIES)
    )
    network_nodes.append(
        working_sbml_dfs.reactions.reset_index()[[SBML_DFS.R_ID, SBML_DFS.R_NAME]]
        .rename(columns={SBML_DFS.R_ID: "node_id", SBML_DFS.R_NAME: "node_name"})
        .assign(node_type=CPR_GRAPH_NODE_TYPES.REACTION)
    )

    # rename nodes to name since it is treated specially by igraph
    network_nodes_df = pd.concat(network_nodes).rename(
        columns={"node_id": CPR_GRAPH_NODES.NAME}
    )

    logger.info(f"Formatting edges as a {graph_type} graph")

    if graph_type == CPR_GRAPH_TYPES.BIPARTITE:
        network_edges = _create_cpr_graph_bipartite(working_sbml_dfs)
    elif graph_type in [CPR_GRAPH_TYPES.REGULATORY, CPR_GRAPH_TYPES.SURROGATE]:
        # pass graph_type so that an appropriate tiered schema can be used.
        network_edges = _create_cpr_graph_tiered(working_sbml_dfs, graph_type)
    else:
        raise NotImplementedError("Invalid graph_type")

    logger.info("Adding reversibility and other meta-data from reactions_data")
    augmented_network_edges = _augment_network_edges(
        network_edges, working_sbml_dfs, reaction_graph_attrs
    )

    logger.info(
        "Creating reverse reactions for reversible reactions on a directed graph"
    )
    if directed:
        directed_network_edges = pd.concat(
            [
                # assign forward edges
                augmented_network_edges.assign(
                    direction=CPR_GRAPH_EDGE_DIRECTIONS.FORWARD
                ),
                # create reverse edges for reversibile reactions
                _reverse_network_edges(augmented_network_edges),
            ]
        )
    else:
        directed_network_edges = augmented_network_edges.assign(
            direction=CPR_GRAPH_EDGE_DIRECTIONS.UNDIRECTED
        )

    # de-duplicate edges; groupby().first() keeps the first record for each
    # origin -> target pair
    unique_edges = (
        directed_network_edges.groupby([CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO])
        .first()
        .reset_index()
    )

    if unique_edges.shape[0] != directed_network_edges.shape[0]:
        logger.warning(
            f"{directed_network_edges.shape[0] - unique_edges.shape[0]} edges were dropped "
            "due to duplicated origin -> target relationiships, use verbose for "
            "more information"
        )

        if verbose:
            # report a sample of the duplicated edges to help debugging
            grouped_edges = directed_network_edges.groupby(
                [CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO]
            )
            duplicated_edges = [
                grouped_edges.get_group(x)
                for x in grouped_edges.groups
                if grouped_edges.get_group(x).shape[0] > 1
            ]
            example_duplicates = pd.concat(
                random.sample(duplicated_edges, min(5, len(duplicated_edges)))
            )

            logger.warning(utils.style_df(example_duplicates, headers="keys"))

    # reverse edge directions if edge_reversed is True:

    if edge_reversed:
        rev_unique_edges_df = unique_edges.copy()
        rev_unique_edges_df[CPR_GRAPH_EDGES.FROM] = unique_edges[CPR_GRAPH_EDGES.TO]
        rev_unique_edges_df[CPR_GRAPH_EDGES.TO] = unique_edges[CPR_GRAPH_EDGES.FROM]
        rev_unique_edges_df[CPR_GRAPH_EDGES.SC_PARENTS] = unique_edges[
            CPR_GRAPH_EDGES.SC_CHILDREN
        ]
        rev_unique_edges_df[CPR_GRAPH_EDGES.SC_CHILDREN] = unique_edges[
            CPR_GRAPH_EDGES.SC_PARENTS
        ]
        # flipping an edge negates its stoichiometry
        rev_unique_edges_df[CPR_GRAPH_EDGES.STOICHOMETRY] = unique_edges[
            CPR_GRAPH_EDGES.STOICHOMETRY
        ] * (-1)

        rev_unique_edges_df[CPR_GRAPH_EDGES.DIRECTION] = unique_edges[
            CPR_GRAPH_EDGES.DIRECTION
        ].replace(
            {
                CPR_GRAPH_EDGE_DIRECTIONS.REVERSE: CPR_GRAPH_EDGE_DIRECTIONS.FORWARD,
                CPR_GRAPH_EDGE_DIRECTIONS.FORWARD: CPR_GRAPH_EDGE_DIRECTIONS.REVERSE,
            }
        )
    else:
        # unchanged if edge_reversed is False:
        rev_unique_edges_df = unique_edges

    # convert nodes and edgelist into an igraph network

    logger.info("Formatting cpr_graph output")
    cpr_graph = ig.Graph.DictList(
        vertices=network_nodes_df.to_dict("records"),
        edges=rev_unique_edges_df.to_dict("records"),
        directed=directed,
        vertex_name_attr=CPR_GRAPH_NODES.NAME,
        edge_foreign_keys=(CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO),
    )

    return cpr_graph
263
+
264
+
265
def process_cpr_graph(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reaction_graph_attrs: dict | None = None,
    directed: bool = True,
    edge_reversed: bool = False,
    graph_type: str = CPR_GRAPH_TYPES.BIPARTITE,
    weighting_strategy: str = CPR_WEIGHTING_STRATEGIES.UNWEIGHTED,
    verbose: bool = False,
) -> ig.Graph:
    """
    Process Consensus Graph

    Setup an igraph network and then add weights and other maleable attributes.

    Args:
        sbml_dfs (SBML_dfs): A model formed by aggregating pathways
        reaction_graph_attrs (dict, optional): Dictionary containing attributes to pull out of reaction_data and
            a weighting scheme for the graph. Defaults to an empty dict.
        directed (bool): Should a directed (True) or undirected graph be made (False)
        edge_reversed (bool): Should directions of edges be reversed (False)
        graph_type (str): Type of graph to create, valid values are:
            - bipartite: substrates and modifiers point to the reaction they drive, this reaction points to products
            - regulatory: non-enzymatic modifiers point to enzymes, enzymes point to substrates and products
        weighting_strategy (str): a network weighting strategy with options:
            - unweighted: all weights (and upstream_weights for directed graphs) are set to 1.
            - topology: weight edges by the degree of the source nodes favoring nodes emerging from nodes
              with few connections.
            - mixed: transform edges with a quantitative score based on reaction_attrs; and set edges
              without quantitative score as a source-specific weight.
            - calibrated: transform edges with a quantitative score based on reaction_attrs and combine them
              with topology scores to generate a consensus.
        verbose (bool): Extra reporting

    Returns:
        weighted_graph (ig.Graph): An Igraph network
    """

    # avoid a shared mutable default argument
    if reaction_graph_attrs is None:
        reaction_graph_attrs = {}

    # BUG FIX: use the module-level logger (consistent with the rest of this
    # module) rather than logging on the root logger via logging.info
    logger.info("Constructing network")
    cpr_graph = create_cpr_graph(
        sbml_dfs,
        reaction_graph_attrs,
        directed=directed,
        edge_reversed=edge_reversed,
        graph_type=graph_type,
        verbose=verbose,
    )

    # reaction-level attributes (if any) drive the quantitative weighting schemes
    if "reactions" in reaction_graph_attrs.keys():
        reaction_attrs = reaction_graph_attrs["reactions"]
    else:
        reaction_attrs = dict()

    logger.info(f"Adding edge weights with an {weighting_strategy} strategy")

    weighted_cpr_graph = add_graph_weights(
        cpr_graph=cpr_graph,
        reaction_attrs=reaction_attrs,
        weighting_strategy=weighting_strategy,
    )

    return weighted_cpr_graph
326
+
327
+
328
def pluck_entity_data(
    sbml_dfs: sbml_dfs_core.SBML_dfs, graph_attrs: dict[str, dict], data_type: str
) -> pd.DataFrame | None:
    """
    Pluck Entity Attributes

    Extract species or reaction attributes from an sbml_dfs, guided by a
    mapping of which data tables and variables to read.

    Parameters:
        sbml_dfs: sbml_dfs_core.SBML_dfs
            A mechanistic model
        graph_attrs: dict
            A dictionary of species/reaction attributes to pull out
        data_type: str
            "species" or "reactions" to pull out species_data or reactions_data

    Returns:
        A table where all extracted attributes are merged based on a common
        index, or None if no attributes were extracted.
    """

    if data_type not in ENTITIES_W_DATA:
        raise ValueError(
            f'"data_type" was {data_type} and must be in {", ".join(ENTITIES_W_DATA)}'
        )

    if data_type not in graph_attrs:
        logger.info(
            f'No {data_type} annotations provided in "graph_attrs"; returning None'
        )
        return None

    entity_attrs = graph_attrs[data_type]
    # validate the attribute specification before touching any data
    _validate_entity_attrs(entity_attrs)

    data_type_attr = data_type + "_data"
    entity_data_tbls = getattr(sbml_dfs, data_type_attr)

    extracted_series = []
    for attr_name, spec in entity_attrs.items():
        if spec["table"] is None:
            continue

        # does the data table exist?
        if spec["table"] not in entity_data_tbls.keys():
            raise ValueError(
                f"{spec['table']} was defined as a table in \"graph_attrs\" but "
                f'it is not present in the "{data_type_attr}" of the sbml_dfs'
            )

        # does the requested variable exist within that table?
        if spec["variable"] not in entity_data_tbls[spec["table"]].columns.tolist():
            raise ValueError(
                f"{spec['variable']} was defined as a variable in \"graph_attrs\" but "
                f"it is not present in the {spec['table']} of the \"{data_type_attr}\" of "
                "the sbml_dfs"
            )

        extracted_series.append(
            entity_data_tbls[spec["table"]][spec["variable"]].rename(attr_name)
        )

    if not extracted_series:
        return None

    return pd.concat(extracted_series, axis=1)
393
+
394
+
395
def apply_weight_transformations(edges_df: pd.DataFrame, reaction_attrs: dict):
    """
    Apply Weight Transformations

    Transform each weighting variable in an edge table using its configured
    transformation function.

    Args:
        edges_df (pd.DataFrame): a table of edges and their attributes extracted
            from a cpr_grpah.
        reaction_attrs (dict):
            A dictionary of attributes identifying weighting attributes within
            an sbml_df's reaction_data, how they will be named in edges_df (the keys),
            and how they should be transformed (the "trans" aliases")

    Returns:
        transformed_edges_df (pd.DataFrame): edges_df with weight variables transformed.
    """

    _validate_entity_attrs(reaction_attrs)

    # operate on a deep copy so the caller's frame is untouched
    out_df = copy.deepcopy(edges_df)

    for attr_name, attr_spec in reaction_attrs.items():
        if attr_name not in out_df.columns:
            raise ValueError(f"A weighting variable {attr_name} was missing from edges_df")

        # "trans" aliases map to the names of module-level transformation
        # functions, resolved via globals()
        trans_fxn_name = DEFINED_WEIGHT_TRANSFORMATION[attr_spec["trans"]]
        out_df[attr_name] = out_df[attr_name].apply(globals()[trans_fxn_name])

    return out_df
424
+
425
+
426
def summarize_weight_calibration(cpr_graph: ig.Graph, reaction_attrs: dict) -> None:
    """
    Summarize Weight Calibration

    For a network with multiple sources for edge weights summarize the alignment of
    different weighting schemes and how they map onto our notion of "good" versus
    "dubious" weights.

    Args:
        cpr_graph (ig.Graph): A graph where edge weights have already been calibrated.
        reaction_attrs (dict): a dictionary summarizing the types of weights that
            exist and how they are transformed for calibration.

    Returns:
        None
    """

    # reference calibration points and their transformed counterparts
    calibration_points = pd.DataFrame(SCORE_CALIBRATION_POINTS_DICT)
    calibration_points_trans = apply_weight_transformations(
        calibration_points, reaction_attrs
    )

    edges_df = cpr_graph.get_edge_dataframe()

    # tabular summary followed by diagnostic plots
    _summarize_weight_calibration_table(
        edges_df, calibration_points, calibration_points_trans
    )
    _summarize_weight_calibration_plots(edges_df, calibration_points_trans)

    return None
460
+
461
+
462
def add_graph_weights(
    cpr_graph: ig.Graph,
    reaction_attrs: dict,
    weighting_strategy: str = CPR_WEIGHTING_STRATEGIES.UNWEIGHTED,
) -> ig.Graph:
    """
    Add Graph Weights

    Apply a weighting strategy to generate edge weights on a graph. For directed
    graphs "upstream_weights" will be generated as well which should be used when
    searching for a node's ancestors.

    Args:
        cpr_graph (ig.Graph): a graphical network of molecules/reactions (nodes) and edges linking them.
        reaction_attrs (dict): an optional dict
        weighting_strategy: a network weighting strategy with options:
            - unweighted: all weights (and upstream_weights for directed graphs) are set to 1.
            - topology: weight edges by the degree of the source nodes favoring nodes emerging from nodes
              with few connections.
            - mixed: transform edges with a quantitative score based on reaction_attrs; and set edges
              without quantitative score as a source-specific weight.
            - calibrated: transform edges with a quantitative score based on reaction_attrs and combine them
              with topology scores to generate a consensus.
    """

    # never mutate the caller's graph
    graph_copy = copy.deepcopy(cpr_graph)

    _validate_entity_attrs(reaction_attrs)

    if weighting_strategy not in VALID_WEIGHTING_STRATEGIES:
        raise ValueError(
            f"weighting_strategy was {weighting_strategy} and must be one of: "
            f"{', '.join(VALID_WEIGHTING_STRATEGIES)}"
        )

    # topology weights (based on parent/child counts) are computed up front
    # because the mixed and calibrated strategies build on them
    topology_weighted_graph = _create_topology_weights(graph_copy)

    if weighting_strategy == CPR_WEIGHTING_STRATEGIES.TOPOLOGY:
        # promote the topology scores to the canonical weight attributes
        topology_weighted_graph.es[CPR_GRAPH_EDGES.WEIGHTS] = (
            topology_weighted_graph.es["topo_weights"]
        )
        if graph_copy.is_directed():
            topology_weighted_graph.es[CPR_GRAPH_EDGES.UPSTREAM_WEIGHTS] = (
                topology_weighted_graph.es["upstream_topo_weights"]
            )
        return topology_weighted_graph

    if weighting_strategy == CPR_WEIGHTING_STRATEGIES.UNWEIGHTED:
        # constant unit weights
        topology_weighted_graph.es[CPR_GRAPH_EDGES.WEIGHTS] = 1
        if graph_copy.is_directed():
            topology_weighted_graph.es[CPR_GRAPH_EDGES.UPSTREAM_WEIGHTS] = 1
        return topology_weighted_graph

    if weighting_strategy == CPR_WEIGHTING_STRATEGIES.MIXED:
        return _add_graph_weights_mixed(topology_weighted_graph, reaction_attrs)

    if weighting_strategy == CPR_WEIGHTING_STRATEGIES.CALIBRATED:
        return _add_graph_weights_calibration(topology_weighted_graph, reaction_attrs)

    raise ValueError(f"No logic implemented for {weighting_strategy}")
525
+
526
+
527
def _create_cpr_graph_bipartite(sbml_dfs: sbml_dfs_core.SBML_dfs) -> pd.DataFrame:
    """Turn an sbml_dfs model into a bipartite graph linking molecules to reactions."""

    # one row per (reaction, species) pairing; rename ids so each column reads
    # as the node it represents
    edges = sbml_dfs.reaction_species.reset_index()[
        [SBML_DFS.R_ID, SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY, SBML_DFS.SBO_TERM]
    ].rename(
        columns={
            SBML_DFS.SC_ID: CPR_GRAPH_NODE_TYPES.SPECIES,
            SBML_DFS.R_ID: CPR_GRAPH_NODE_TYPES.REACTION,
        }
    )

    # retain the reaction id as an annotation on every edge
    edges[CPR_GRAPH_EDGES.R_ID] = edges[CPR_GRAPH_NODE_TYPES.REACTION]

    # decorate edges with compartmentalized species features (used for weights)
    edges = edges.merge(
        sbml_dfs.get_cspecies_features(),
        left_on=CPR_GRAPH_NODE_TYPES.SPECIES,
        right_index=True,
    )

    cols = edges.columns.tolist()

    # substrates/modifiers (stoichiometry <= 0) point INTO their reaction;
    # the species/reaction columns are swapped so "from" precedes "to"
    consumed = edges[edges[SBML_DFS.STOICHIOMETRY] <= 0]
    consumed_edges = consumed.loc[:, [cols[1], cols[0], *cols[2:]]].rename(
        columns={
            CPR_GRAPH_NODE_TYPES.SPECIES: CPR_GRAPH_EDGES.FROM,
            CPR_GRAPH_NODE_TYPES.REACTION: CPR_GRAPH_EDGES.TO,
        }
    )

    # products (stoichiometry > 0) are pointed to BY their reaction
    produced = edges[edges[SBML_DFS.STOICHIOMETRY] > 0]
    produced_edges = produced.rename(
        columns={
            CPR_GRAPH_NODE_TYPES.REACTION: CPR_GRAPH_EDGES.FROM,
            CPR_GRAPH_NODE_TYPES.SPECIES: CPR_GRAPH_EDGES.TO,
        }
    )

    return pd.concat([consumed_edges, produced_edges])
574
+
575
+
576
def _create_cpr_graph_tiered(
    sbml_dfs: sbml_dfs_core.SBML_dfs, graph_type: str
) -> pd.DataFrame:
    """
    Turn an sbml_dfs model into a tiered graph which links upstream entities to downstream ones.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        A mechanistic model whose reaction species will be organized into tiers.
    graph_type : str
        The tiered schema to use (e.g., regulatory or surrogate); determines
        the hierarchy returned by _create_graph_hierarchy_df.

    Returns
    -------
    pd.DataFrame
        One row per directed edge, annotated with compartmentalized species
        features (e.g., # of children and parents).

    Raises
    ------
    ValueError
        If any reaction species carries an SBO term outside MINI_SBO_TO_NAME,
        or if decorating edges with species features changes the edge count.
    """

    # check whether all expected SBO terms are present
    invalid_sbo_terms = sbml_dfs.reaction_species[
        ~sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
    ]
    assert isinstance(invalid_sbo_terms, pd.DataFrame)

    if invalid_sbo_terms.shape[0] != 0:
        invalid_counts = invalid_sbo_terms.value_counts(SBML_DFS.SBO_TERM).to_frame("N")
        assert isinstance(invalid_counts, pd.DataFrame)

        logger.warning(utils.style_df(invalid_counts, headers="keys"))  # type: ignore
        raise ValueError("Some reaction species have unusable SBO terms")

    # load and validate the schema of graph_type
    graph_hierarchy_df = _create_graph_hierarchy_df(graph_type)

    # organize reaction species for defining connections
    sorted_reaction_species = sbml_dfs.reaction_species.set_index(
        [SBML_DFS.R_ID, SBML_DFS.SBO_TERM]
    ).sort_index()

    logger.info(
        f"Formatting {sorted_reaction_species.shape[0]} reactions species as "
        "tiered edges."
    )

    # infer tiered edges in each reaction
    all_reaction_edges = [
        _format_tiered_reaction_species(
            r, sorted_reaction_species, sbml_dfs, graph_hierarchy_df
        )
        for r in sorted_reaction_species.index.get_level_values(SBML_DFS.R_ID).unique()
    ]
    all_reaction_edges_df = pd.concat(all_reaction_edges).reset_index(drop=True)

    # test for reactions missing substrates
    r_id_list = sorted_reaction_species.index.get_level_values(0).unique()
    r_id_reactant_only = [
        x for x in r_id_list if len(sorted_reaction_species.loc[x]) == 1
    ]

    if len(r_id_reactant_only) > 0:
        logger.warning(f"{len(r_id_reactant_only)} reactions are missing substrates")
        all_reaction_edges_df_pre = all_reaction_edges_df.copy()
        all_reaction_edges_df = all_reaction_edges_df_pre[
            ~all_reaction_edges_df_pre[SBML_DFS.R_ID].isin(r_id_reactant_only)
        ]

    logger.info(
        "Adding additional attributes to edges, e.g., # of children and parents."
    )

    # add compartmentalized species summaries to weight edges
    cspecies_features = sbml_dfs.get_cspecies_features()

    # calculate undirected and directed degrees (i.e., # of parents and children)
    # based on a network's edgelist. this used when the network representation is
    # not the bipartite network which can be trivially obtained from the pathway
    # specification
    unique_edges = (
        all_reaction_edges_df.groupby([CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO])
        .first()
        .reset_index()
    )

    # children
    n_children = (
        unique_edges[CPR_GRAPH_EDGES.FROM]
        .value_counts()
        .to_frame()
        .reset_index()
        .rename(
            {
                "index": SBML_DFS.SC_ID,
                CPR_GRAPH_EDGES.FROM: CPR_GRAPH_EDGES.SC_CHILDREN,
            },
            axis=1,
        )
    )
    # parents
    n_parents = (
        unique_edges[CPR_GRAPH_EDGES.TO]
        .value_counts()
        .to_frame()
        .reset_index()
        .rename(
            {"index": SBML_DFS.SC_ID, CPR_GRAPH_EDGES.TO: CPR_GRAPH_EDGES.SC_PARENTS},
            axis=1,
        )
    )
    graph_degree_by_edgelist = n_children.merge(n_parents, how="outer").fillna(0)

    graph_degree_by_edgelist[CPR_GRAPH_EDGES.SC_DEGREE] = (
        graph_degree_by_edgelist[CPR_GRAPH_EDGES.SC_CHILDREN]
        + graph_degree_by_edgelist[CPR_GRAPH_EDGES.SC_PARENTS]
    )
    # drop reaction nodes (ids matching the reaction id pattern) so only
    # compartmentalized species remain
    graph_degree_by_edgelist = (
        graph_degree_by_edgelist[
            ~graph_degree_by_edgelist[SBML_DFS.SC_ID].str.contains("R[0-9]{8}")
        ]
        .set_index(SBML_DFS.SC_ID)
        .sort_index()
    )

    # replace the bipartite-derived degree columns with edgelist-derived ones
    cspecies_features = (
        cspecies_features.drop(
            [
                CPR_GRAPH_EDGES.SC_DEGREE,
                CPR_GRAPH_EDGES.SC_CHILDREN,
                CPR_GRAPH_EDGES.SC_PARENTS,
            ],
            axis=1,
        )
        .join(graph_degree_by_edgelist)
        .fillna(0)
    )

    # NOTE: this was previously computed twice back-to-back (once with
    # .index.tolist() and once with .index); the redundant first pass is removed
    is_from_reaction = all_reaction_edges_df[CPR_GRAPH_EDGES.FROM].isin(
        sbml_dfs.reactions.index
    )
    # add substrate weight whenever "from" edge is a molecule
    # and product weight when the "from" edge is a reaction
    decorated_all_reaction_edges_df = pd.concat(
        [
            all_reaction_edges_df[~is_from_reaction].merge(
                cspecies_features, left_on=CPR_GRAPH_EDGES.FROM, right_index=True
            ),
            all_reaction_edges_df[is_from_reaction].merge(
                cspecies_features, left_on=CPR_GRAPH_EDGES.TO, right_index=True
            ),
        ]
    ).sort_index()

    if all_reaction_edges_df.shape[0] != decorated_all_reaction_edges_df.shape[0]:
        msg = (
            "'decorated_all_reaction_edges_df' and 'all_reaction_edges_df' should\n"
            "have the same number of rows but they did not"
        )

        raise ValueError(msg)

    logger.info(f"Done preparing {graph_type} graph")

    return decorated_all_reaction_edges_df
728
+
729
+
730
def _format_tiered_reaction_species(
    r_id: str,
    sorted_reaction_species: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    graph_hierarchy_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Format Tiered Reaction Species

    Refactor a reaction's species into tiered edges between substrates, products, enzymes and allosteric regulators.

    Parameters
    ----------
    r_id : str
        The reaction id to format (a level-0 key of sorted_reaction_species).
    sorted_reaction_species : pd.DataFrame
        Reaction species indexed by (r_id, sbo_term) with sc_id and
        stoichiometry columns.
    sbml_dfs : sbml_dfs_core.SBML_dfs
        The mechanistic model the reaction belongs to.
    graph_hierarchy_df : pd.DataFrame
        A table assigning each SBO role name to a numeric "tier" defining
        the upstream -> downstream ordering.

    Returns
    -------
    pd.DataFrame
        Edges with from, to, stoichiometry, sbo_term, and r_id columns.
    """

    rxn_species = sorted_reaction_species.loc[r_id]
    assert isinstance(rxn_species, pd.DataFrame)
    assert list(rxn_species.index.names) == [SBML_DFS.SBO_TERM]
    assert rxn_species.columns.tolist() == [SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY]

    rxn_sbo_terms = set(rxn_species.index.unique())
    # map to common names
    rxn_sbo_names = {MINI_SBO_TO_NAME[x] for x in rxn_sbo_terms}

    # is the reaction a general purpose interaction
    if len(rxn_sbo_names) == 1:
        if list(rxn_sbo_names)[0] == SBOTERM_NAMES.INTERACTOR:
            # further validation happens in the function - e.g., exactly two interactors
            return _format_interactors_for_tiered_graph(r_id, rxn_species, sbml_dfs)

    # interactors mixed with other roles are inconsistent; warn but proceed
    if SBOTERM_NAMES.INTERACTOR in rxn_sbo_names:
        logger.warning(
            f"Invalid combinations of SBO_terms in {str(r_id)} : {sbml_dfs.reactions.loc[r_id][SBML_DFS.R_NAME]}. "
            "If interactors are present then there can't be any other types of reaction species. "
            f"The following roles were defined: {', '.join(rxn_sbo_names)}"
        )

    # reorganize molecules and the reaction itself into tiers
    # the reaction gets its own row so it participates in the tier ordering
    entities_ordered_by_tier = (
        pd.concat(
            [
                (
                    rxn_species.reset_index()
                    .rename({SBML_DFS.SC_ID: "entity_id"}, axis=1)
                    .merge(graph_hierarchy_df)
                ),
                graph_hierarchy_df[
                    graph_hierarchy_df[CPR_GRAPH_EDGES.SBO_NAME]
                    == CPR_GRAPH_NODE_TYPES.REACTION
                ].assign(entity_id=r_id, r_id=r_id),
            ]
        )
        .sort_values(["tier"])
        .set_index("tier")
    )
    ordered_tiers = entities_ordered_by_tier.index.get_level_values("tier").unique()

    # at least two tiers are needed to form any edge
    assert len(ordered_tiers) > 1

    # which tier is the reaction?
    reaction_tier = graph_hierarchy_df["tier"][
        graph_hierarchy_df[CPR_GRAPH_EDGES.SBO_NAME] == CPR_GRAPH_NODE_TYPES.REACTION
    ].tolist()[0]

    # walk adjacent tier pairs, emitting all-vs-all edges between them;
    # past_reaction flips once the downstream tier passes the reaction tier
    # (it changes how _format_tier_combo assigns edge attributes)
    rxn_edges = list()
    past_reaction = False
    for i in range(0, len(ordered_tiers) - 1):
        # double-bracket .loc keeps a DataFrame even for a single-row tier
        formatted_tier_combo = _format_tier_combo(
            entities_ordered_by_tier.loc[[ordered_tiers[i]]],
            entities_ordered_by_tier.loc[[ordered_tiers[i + 1]]],
            past_reaction,
        )

        if ordered_tiers[i + 1] == reaction_tier:
            past_reaction = True

        rxn_edges.append(formatted_tier_combo)

    rxn_edges_df = (
        pd.concat(rxn_edges)[
            [
                CPR_GRAPH_EDGES.FROM,
                CPR_GRAPH_EDGES.TO,
                CPR_GRAPH_EDGES.STOICHIOMETRY,
                CPR_GRAPH_EDGES.SBO_TERM,
            ]
        ]
        .reset_index(drop=True)
        .assign(r_id=r_id)
    )

    return rxn_edges_df
819
+
820
+
821
def _format_tier_combo(
    upstream_tier: pd.DataFrame, downstream_tier: pd.DataFrame, past_reaction: bool
) -> pd.DataFrame:
    """
    Format Tier Combo

    Build the all-by-all set of edges linking two adjacent tiers of a tiered graph.
    Tiers order the molecular entities of a reaction plus a tier for the reaction
    itself. Stoichiometry and sbo_term attributes always travel with the tier that
    is furthest from the reaction tier, so each molecular tier stamps its attributes
    onto exactly one set of edges while the reaction tier (which has neither
    attribute) stamps none.

    Args:
        upstream_tier (pd.DataFrame): upstream entities in a reaction, e.g., regulators.
        downstream_tier (pd.DataFrame): downstream entities in a reaction, e.g., catalysts.
        past_reaction (bool): if True take attributes from downstream_tier,
            otherwise from upstream_tier.

    Returns:
        pd.DataFrame: one row per (upstream x downstream) pair with from, to and
        the attribute columns contributed by the attribute-bearing side.
    """

    attr_fields = ["entity_id", SBML_DFS.STOICHIOMETRY, SBML_DFS.SBO_TERM]
    bare_fields = ["entity_id"]

    # before the reaction the upstream side carries attributes; after it, the downstream side
    if past_reaction:
        upstream_fields, downstream_fields = bare_fields, attr_fields
    else:
        upstream_fields, downstream_fields = attr_fields, bare_fields

    sources = (
        upstream_tier[upstream_fields]
        .rename({"entity_id": CPR_GRAPH_EDGES.FROM}, axis=1)
        .assign(_joiner=1)
    )
    targets = (
        downstream_tier[downstream_fields]
        .rename({"entity_id": CPR_GRAPH_EDGES.TO}, axis=1)
        .assign(_joiner=1)
    )

    # constant-key merge produces the full cross product of the two tiers
    return sources.merge(targets, left_on="_joiner", right_on="_joiner")
872
+
873
+
874
def _create_graph_hierarchy_df(graph_type: str) -> pd.DataFrame:
    """
    Create Graph Hierarchy DataFrame

    Convert a graph hierarchy (a list of lists of sbo_names) into a pd.DataFrame.

    Args:
        graph_type (str):
            The type of tiered graph to work with. Each type has its own specification in constants.py.

    Returns:
        pd.DataFrame with sbo_name, tier, and sbo_term columns.

    Raises:
        NotImplementedError: if graph_type has no registered hierarchy.
    """

    hierarchies = {
        CPR_GRAPH_TYPES.REGULATORY: REGULATORY_GRAPH_HIERARCHY,
        CPR_GRAPH_TYPES.SURROGATE: SURROGATE_GRAPH_HIERARCHY,
    }
    if graph_type not in hierarchies:
        raise NotImplementedError(f"{graph_type} is not a valid graph_type")
    sbo_names_hierarchy = hierarchies[graph_type]

    # one frame per tier, tagged with its level, stacked into a single table
    graph_hierarchy_df = pd.concat(
        [
            pd.DataFrame({"sbo_name": tier_names}).assign(tier=level)
            for level, tier_names in enumerate(sbo_names_hierarchy)
        ]
    ).reset_index(drop=True)

    # the pseudo-entity "reaction" has no SBO term; everything else maps through MINI_SBO_FROM_NAME
    graph_hierarchy_df[SBML_DFS.SBO_TERM] = graph_hierarchy_df["sbo_name"].apply(
        lambda name: "" if name == CPR_GRAPH_NODE_TYPES.REACTION else MINI_SBO_FROM_NAME[name]
    )

    # ensure that the output is expected
    utils.match_pd_vars(
        graph_hierarchy_df,
        req_vars={CPR_GRAPH_EDGES.SBO_NAME, "tier", SBML_DFS.SBO_TERM},
        allow_series=False,
    ).assert_present()

    return graph_hierarchy_df
915
+
916
+
917
def _add_graph_weights_mixed(cpr_graph: ig.Graph, reaction_attrs: dict) -> ig.Graph:
    """Weight a graph using a mixed approach combining source-specific weights and existing edge weights."""

    edges_df = cpr_graph.get_edge_dataframe()

    calibrated_edges = apply_weight_transformations(edges_df, reaction_attrs)
    calibrated_edges = _create_source_weights(calibrated_edges, "source_wt")

    # combine every transformed reaction attribute with the source-based weight
    score_vars = [*reaction_attrs.keys(), "source_wt"]

    logger.info(f"Creating mixed scores based on {', '.join(score_vars)}")

    # the overall edge weight is the most favorable (smallest) of the candidate scores
    calibrated_edges["weights"] = calibrated_edges[score_vars].min(axis=1)

    cpr_graph.es[CPR_GRAPH_EDGES.WEIGHTS] = calibrated_edges[CPR_GRAPH_EDGES.WEIGHTS]
    if cpr_graph.is_directed():
        # directed graphs get the same weight for upstream traversal
        cpr_graph.es[CPR_GRAPH_EDGES.UPSTREAM_WEIGHTS] = calibrated_edges[
            CPR_GRAPH_EDGES.WEIGHTS
        ]

    # write the source weight and the (possibly transformed) reaction attributes back to the graph
    cpr_graph.es["source_wt"] = calibrated_edges["source_wt"]
    for attr_name in reaction_attrs:
        cpr_graph.es[attr_name] = calibrated_edges[attr_name]

    return cpr_graph
944
+
945
+
946
def _add_graph_weights_calibration(
    cpr_graph: ig.Graph, reaction_attrs: dict
) -> ig.Graph:
    """Weight a graph using a calibrated strategy which aims to roughly align qualitatively similar weights from different sources."""

    edges_df = cpr_graph.get_edge_dataframe()

    calibrated_edges = apply_weight_transformations(edges_df, reaction_attrs)

    # downstream weights: reaction attributes plus the topology-based weight
    score_vars = [*reaction_attrs.keys(), "topo_weights"]

    logger.info(f"Creating calibrated scores based on {', '.join(score_vars)}")
    # keep the most favorable (smallest) candidate score per edge
    cpr_graph.es["weights"] = calibrated_edges[score_vars].min(axis=1)

    if cpr_graph.is_directed():
        # upstream traversal uses the upstream topology weight instead
        upstream_score_vars = [*reaction_attrs.keys(), "upstream_topo_weights"]
        cpr_graph.es["upstream_weights"] = calibrated_edges[upstream_score_vars].min(
            axis=1
        )

    # write the (possibly transformed) reaction attributes back to the graph
    for attr_name in reaction_attrs:
        cpr_graph.es[attr_name] = calibrated_edges[attr_name]

    return cpr_graph
971
+
972
+
973
def _add_edge_attr_to_vertex_graph(
    cpr_graph: ig.Graph,
    edge_attr_list: list,
    shared_node_key: str = "r_id",
) -> ig.Graph:
    """
    Merge edge attribute(s) from edge_attr_list to vertices of an igraph.

    Parameters
    ----------
    cpr_graph : iGraph
        A graph generated by create_cpr_graph()
    edge_attr_list: list
        A list containing attributes to pull out of edges, then to add to vertices
    shared_node_key : str
        key in edge that is shared with vertex, to map edge ids to corresponding vertex ids

    Returns:
    ----------
    An Igraph network
    """

    if len(edge_attr_list) == 0:
        logger.warning(
            "No edge attributes were passed, " "thus return the input graph."
        )
        return cpr_graph

    graph_vertex_df = cpr_graph.get_vertex_dataframe()
    graph_edge_df = cpr_graph.get_edge_dataframe()

    if shared_node_key not in graph_edge_df.columns:
        logger.warning(
            f"{shared_node_key} is not in the current edge attributes. "
            "shared_node_key must be an existing edge attribute"
        )
        return cpr_graph

    graph_edge_df_sub = graph_edge_df.loc[:, [shared_node_key] + edge_attr_list].copy()

    # check whether duplicated edge ids by shared_node_key have the same attribute values.
    # If not, give warning, and keep the first value. (which can be improved later)
    check_edgeid_attr_unique = (
        graph_edge_df_sub.groupby(shared_node_key)[edge_attr_list].nunique() == 1
    )

    # columns where at least one shared_node_key group carries >1 distinct value
    # (boolean Index selection replaces the earlier enumerate/index loop)
    non_unique_mask = (check_edgeid_attr_unique.isin([False])).any()  # type: ignore
    non_unique_egde_attr = non_unique_mask.index[non_unique_mask].to_list()

    if len(non_unique_egde_attr) == 0:
        logger.info("Per duplicated edge ids, attributes have only 1 unique value.")
    else:
        logger.warning(
            f"Per duplicated edge ids, attributes: {non_unique_egde_attr} "
            "contain more than 1 unique values"
        )

    # remove duplicated edge attribute values; "first" matches the warning above
    graph_edge_df_sub_no_duplicate = graph_edge_df_sub.drop_duplicates(
        subset=shared_node_key, keep="first"
    )

    # rename shared_node_key to vertex key 'name'
    # as in net_create.create_cpr_graph(), vertex_name_attr is set to 'name'
    graph_edge_df_sub_no_duplicate = graph_edge_df_sub_no_duplicate.rename(
        columns={shared_node_key: "name"},
    )

    # merge edge attributes in graph_edge_df_sub_no_duplicate to vertex_df,
    # by shared key 'name'
    graph_vertex_df_w_edge_attr = pd.merge(
        graph_vertex_df,
        graph_edge_df_sub_no_duplicate,
        on="name",
        how="outer",
    )

    logger.info(f"Adding {edge_attr_list} to vertex attributes")
    # Warning for NaN values in vertex attributes:
    if graph_vertex_df_w_edge_attr.isnull().values.any():
        logger.warning(
            "NaN values are present in the newly added vertex attributes. "
            "Please assign proper values to those vertex attributes."
        )

    # assign the edge_attrs from edge_attr_list to cpr_graph's vertices:
    # keep the same edge attribute names:
    for col_name in edge_attr_list:
        cpr_graph.vs[col_name] = graph_vertex_df_w_edge_attr[col_name]

    return cpr_graph
1072
+
1073
+
1074
def _summarize_weight_calibration_table(
    calibrated_edges: pd.DataFrame,
    score_calibration_df: pd.DataFrame,
    score_calibration_df_calibrated: pd.DataFrame,
):
    """Create a table comparing edge weights from multiple sources."""

    # Calibration points defined in DEFINED_WEIGHT_TRANSFORMATION mark what we consider
    # strong versus dubious edges. They are compared to the observed score distribution
    # to check that they land near the expected quantiles, and across measures to check
    # that a "strong" score on one measure is numerically similar to a "strong" score
    # on another.

    def _melt_calibration(wide_df: pd.DataFrame, value_name: str) -> pd.DataFrame:
        # reshape a wide calibration table into long (edge_strength, weight_measure, value)
        return (
            wide_df.reset_index()
            .rename({"index": "edge_strength"}, axis=1)
            .melt(
                id_vars="edge_strength",
                var_name="weight_measure",
                value_name=value_name,
            )
        )

    score_calibration_table_long = _melt_calibration(
        score_calibration_df, "raw_weight"
    ).merge(_melt_calibration(score_calibration_df_calibrated, "trans_weight"))

    # locate each calibration point within the observed score distribution
    score_calibration_table_long["quantile_of_score_dist"] = [
        1
        - np.mean(
            calibrated_edges[row["weight_measure"]].dropna() >= row["trans_weight"]
        )
        for _, row in score_calibration_table_long.iterrows()
    ]

    return utils.style_df(score_calibration_table_long, headers="keys")
1126
+
1127
+
1128
def _summarize_weight_calibration_plots(
    calibrated_edges: pd.DataFrame, score_calibration_df_calibrated: pd.DataFrame
) -> None:
    """Create a couple of plots summarizing the relationships between different scoring measures."""

    # set up a 2 x 1 plot
    fig, (hist_ax, scatter_ax) = plt.subplots(1, 2)

    # left: distribution of the two calibrated scoring measures
    calibrated_edges[["topo_weights", "string_wt"]].plot(
        kind="hist", bins=50, alpha=0.5, ax=hist_ax
    )
    hist_ax.set_title("Distribution of scores\npost calibration")

    # right: do the calibration points line up across measures?
    score_calibration_df_calibrated.plot(
        "weights", "string_wt", kind="scatter", ax=scatter_ax
    )
    for point_label, point_coords in score_calibration_df_calibrated.iterrows():
        scatter_ax.annotate(point_label, point_coords)
    # the y = x line marks perfect agreement between the measures
    scatter_ax.axline((0, 0), slope=1.0, color="C0", label="by slope")
    scatter_ax.set_title("Comparing STRING and\nTopology calibration points")

    return None
1149
+
1150
+
1151
def _create_source_weights(
    edges_df: pd.DataFrame,
    source_wt_var: str = "source_wt",
    source_vars_dict: dict = SOURCE_VARS_DICT,
    source_wt_default: int = 1,
) -> pd.DataFrame:
    """
    Create Source Weights

    Create weights based on an edge's source. This is a simple but crude way of allowing different
    data sources to have different support if we think that some are more trustworthy than others.

    Args:
        edges_df: pd.DataFrame
            The edges dataframe to add the source weights to.
        source_wt_var: str
            The name of the column to store the source weights.
        source_vars_dict: dict
            Dictionary with keys indicating edge attributes and values indicating the weight to assign
            to that attribute. This value is generally the largest weight that can be assigned to an
            edge so that the numeric weight is chosen over the default.
        source_wt_default: int
            The default weight to assign to an edge if no other weight attribute is found.

    Returns:
        pd.DataFrame
            The edges dataframe with the source weights added.

    """

    logger.warning(
        "_create_source_weights should be reimplemented once https://github.com/calico/pathadex-data/issues/95 "
        "is fixed. The current implementation is quite limited."
    )

    # currently, we will look for values of source_indicator_var which are non NA and set them to
    # source_indicator_match_score and setting entries which are NA as source_indicator_nonmatch_score.
    #
    # this is a simple way of flagging string vs. non-string scores

    included_weight_vars = set(source_vars_dict.keys()).intersection(
        set(edges_df.columns)
    )
    if len(included_weight_vars) == 0:
        logger.warning(
            f"No edge attributes were found which match those in source_vars_dict: {', '.join(source_vars_dict.keys())}"
        )
        edges_df[source_wt_var] = source_wt_default
        return edges_df

    edges_df_source_wts = edges_df[list(included_weight_vars)].copy()
    for wt in included_weight_vars:
        # a present (non-NA) annotation earns that source's weight; absent ones get the default
        edges_df_source_wts[wt] = np.where(
            edges_df[wt].isna(), source_wt_default, source_vars_dict[wt]
        )

    # an edge's source weight is the best (max) weight across its annotated sources
    source_wt_edges_df = edges_df.join(
        edges_df_source_wts.max(axis=1).rename(source_wt_var)
    )

    return source_wt_edges_df
1213
+
1214
+
1215
def _wt_transformation_identity(x):
    """Identity weight transformation: return *x* unchanged."""
    return x
1218
+
1219
+
1220
+ def _wt_transformation_string(x):
1221
+ """Map STRING scores to a similar scale as topology weights."""
1222
+
1223
+ return 250000 / np.power(x, 1.7)
1224
+
1225
+
1226
+ def _wt_transformation_string_inv(x):
1227
+ """Map STRING scores so they work with source weights."""
1228
+
1229
+ # string scores are bounded on [0, 1000]
1230
+ # and score/1000 is roughly a probability that
1231
+ # there is a real interaction (physical, genetic, ...)
1232
+ # reported string scores are currently on [150, 1000]
1233
+ # so this transformation will map these onto {6.67, 1}
1234
+
1235
+ return 1 / (x / 1000)
1236
+
1237
+
1238
def _format_interactors_for_tiered_graph(
    r_id: str, rxn_species: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
) -> pd.DataFrame:
    """
    Format an undirected interaction for a tiered graph so the two interactors
    are linked even though they would otherwise sit on the same tier.

    Args:
        r_id (str): id of the interaction-type reaction.
        rxn_species (pd.DataFrame): the reaction's species indexed by sbo_term.
        sbml_dfs (sbml_dfs_core.SBML_dfs): pathway model; used for reaction names in errors.

    Returns:
        pd.DataFrame: a single edge (from, to, sbo_term, stoichiometry, r_id).

    Raises:
        ValueError: if there are not exactly two interactors or if either has
            non-zero stoichiometry.
    """

    interactor_data = rxn_species.loc[MINI_SBO_FROM_NAME["interactor"]]
    if interactor_data.shape[0] != 2:
        raise ValueError(
            f"{interactor_data.shape[0]} interactors present for {str(r_id)} : "
            f"{sbml_dfs.reactions.loc[r_id]['r_name']}. "
            "Reactions with interactors must have exactly two interactors"
        )

    # BUGFIX: was `.any()`, which accepted a pair where only ONE interactor had
    # zero stoichiometry; both must be zero for a valid undirected interaction
    if not (interactor_data["stoichiometry"] == 0).all():
        raise ValueError(
            f"Interactors had non-zero stoichiometry for {str(r_id)} : {sbml_dfs.reactions.loc[r_id]['r_name']}. "
            "If stoichiometry is important for this reaction then it should use other SBO terms "
            "(e.g., substrate and product)."
        )

    # set the first entry as "from" and second as "to" if stoi is zero.
    # the reverse reaction will generally be added later because these
    # reactions should be reversible

    return pd.DataFrame(
        {
            "from": interactor_data["sc_id"].iloc[0],
            "to": interactor_data["sc_id"].iloc[1],
            "sbo_term": MINI_SBO_FROM_NAME["interactor"],
            "stoichiometry": 0,
            "r_id": r_id,
        },
        index=[0],
    )
1272
+
1273
+
1274
def _add_graph_species_attribute(
    cpr_graph: ig.Graph,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_graph_attrs: dict,
) -> ig.Graph:
    """Add meta-data from species_data to existing igraph's vertices."""

    if not isinstance(species_graph_attrs, dict):
        raise TypeError(
            f"species_graph_attrs must be a dict, but was {type(species_graph_attrs)}"
        )

    # fail fast: validate each species_graph_attrs entry before touching the graph
    for attr_spec in species_graph_attrs.values():
        _validate_entity_attrs(attr_spec)

    # the vertex attributes to add are the union of every entry's keys
    flat_sp_node_attr_list = [
        attr_name
        for attr_spec in species_graph_attrs.values()
        for attr_name in attr_spec.keys()
    ]

    logger.info("Adding meta-data from species_data")

    curr_network_nodes_df = cpr_graph.get_vertex_dataframe()

    # add species-level attributes to nodes dataframe
    augmented_network_nodes_df = _augment_network_nodes(
        curr_network_nodes_df,
        sbml_dfs,
        species_graph_attrs,
    )

    for vs_attr in flat_sp_node_attr_list:
        # in case more than one vs_attr in the flat_sp_node_attr_list
        logger.info(f"Adding new attribute {vs_attr} to vertices")
        cpr_graph.vs[vs_attr] = augmented_network_nodes_df[vs_attr].values

    return cpr_graph
1316
+
1317
+
1318
def _augment_network_nodes(
    network_nodes: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    species_graph_attrs: dict = dict(),
) -> pd.DataFrame:
    """Add species-level attributes, expand network_nodes with s_id and c_id and then map to species-level attributes by s_id.

    Args:
        network_nodes (pd.DataFrame): vertex table which must contain name, node_name and
            node_type columns; "name" is matched against compartmentalized species ids.
        sbml_dfs (sbml_dfs_core.SBML_dfs): pathway model providing compartmentalized_species
            and the species_data consumed by pluck_entity_data.
        species_graph_attrs (dict): specification of which species_data attributes to pull in.

    Returns:
        pd.DataFrame: network_nodes joined with the requested species attributes
        (NaNs zero-filled, join keys dropped).
        NOTE(review): when pluck_entity_data returns None nothing is returned
        (implicitly None) — confirm callers only reach this with a non-empty
        species_graph_attrs specification.
    """

    REQUIRED_NETWORK_NODE_ATTRS = {
        "name",
        "node_name",
        "node_type",
    }

    missing_required_network_nodes_attrs = REQUIRED_NETWORK_NODE_ATTRS.difference(
        set(network_nodes.columns.tolist())
    )
    if len(missing_required_network_nodes_attrs) > 0:
        raise ValueError(
            f"{len(missing_required_network_nodes_attrs)} required attributes were missing "
            "from network_nodes: "
            f"{', '.join(missing_required_network_nodes_attrs)}"
        )

    # include matching s_ids and c_ids of sc_ids
    # (the index of network_nodes df) in network_nodes df
    network_nodes_sid = pd.merge(
        network_nodes,
        sbml_dfs.compartmentalized_species[["s_id", "c_id"]],
        left_on="name",
        right_index=True,
        how="left",
    )

    # assign species_data related attributes to s_id
    species_graph_data = pluck_entity_data(sbml_dfs, species_graph_attrs, "species")

    if species_graph_data is not None:
        # add species_graph_data to the network_nodes df, based on s_id
        network_nodes_wdata = network_nodes_sid.merge(
            species_graph_data, left_on="s_id", right_index=True, how="left"
        )

        # Note: multiple sc_ids with the same s_id will be assign with the same species_graph_data

        # zero-fill unmatched vertices (e.g., reaction nodes) then drop the join keys
        network_nodes_wdata.fillna(0, inplace=True)
        network_nodes_wdata.drop(columns=["s_id", "c_id"], inplace=True)

        return network_nodes_wdata
1366
+
1367
+
1368
def _augment_network_edges(
    network_edges: pd.DataFrame,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    reaction_graph_attrs: dict = dict(),
) -> pd.DataFrame:
    """Add reversibility and other metadata from reactions.

    Args:
        network_edges (pd.DataFrame): edge table which must contain the attributes
            listed in REQUIRED_NETWORK_EDGE_ATTRS below.
        sbml_dfs (sbml_dfs_core.SBML_dfs): pathway model providing reactions and
            the reactions_data consumed by pluck_entity_data.
        reaction_graph_attrs (dict): specification of reaction attributes to pull in.

    Returns:
        pd.DataFrame: the required edge attributes plus r_isreversible and any
        requested reaction-level attributes.

    Raises:
        ValueError: if any required edge attribute is missing.
    """

    # ordered tuple (not a set) so the selected column order — and hence downstream
    # edge attribute order — is deterministic across runs under hash randomization
    REQUIRED_NETWORK_EDGE_ATTRS = (
        "from",
        "to",
        "stoichiometry",
        "sbo_term",
        "sc_degree",
        "sc_children",
        "sc_parents",
        "species_type",
        "r_id",
    )

    missing_required_network_edges_attrs = set(REQUIRED_NETWORK_EDGE_ATTRS).difference(
        set(network_edges.columns.tolist())
    )
    if len(missing_required_network_edges_attrs) > 0:
        raise ValueError(
            f"{len(missing_required_network_edges_attrs)} required attributes were missing "
            "from network_edges: "
            f"{', '.join(missing_required_network_edges_attrs)}"
        )

    network_edges = (
        network_edges[list(REQUIRED_NETWORK_EDGE_ATTRS)]
        # add reaction-level attributes
        .merge(
            sbml_dfs.reactions[SBML_DFS.R_ISREVERSIBLE],
            left_on=SBML_DFS.R_ID,
            right_index=True,
        )
    )

    # add other attributes based on reactions data
    reaction_graph_data = pluck_entity_data(
        sbml_dfs, reaction_graph_attrs, SBML_DFS.REACTIONS
    )
    if reaction_graph_data is not None:
        network_edges = network_edges.merge(
            reaction_graph_data, left_on=SBML_DFS.R_ID, right_index=True, how="left"
        )

    return network_edges
1417
+
1418
+
1419
def _reverse_network_edges(augmented_network_edges: pd.DataFrame) -> pd.DataFrame:
    """Flip reversible reactions to derive the reverse reaction.

    Args:
        augmented_network_edges (pd.DataFrame): edge table containing at least
            CPR_GRAPH_REQUIRED_EDGE_VARS.

    Returns:
        pd.DataFrame: reverse-direction edges for reversible reactions with
        from/to swapped, stoichiometry negated, reactant/product SBO terms
        exchanged, and a direction column set to "reverse".
    """

    # validate inputs
    missing_required_vars = CPR_GRAPH_REQUIRED_EDGE_VARS.difference(
        set(augmented_network_edges.columns.tolist())
    )

    if len(missing_required_vars) > 0:
        raise ValueError(
            "augmented_network_edges is missing required variables: "
            f"{', '.join(missing_required_vars)}"
        )

    # select all edges derived from reversible reactions
    reversible_reaction_edges = augmented_network_edges[
        augmented_network_edges[CPR_GRAPH_EDGES.R_ISREVERSIBLE]
    ]

    r_reaction_edges = (
        # ignore edges which start in a regulator or catalyst; even for a reversible reaction it
        # doesn't make sense for a regulator to be impacted by a target
        reversible_reaction_edges[
            ~reversible_reaction_edges[CPR_GRAPH_EDGES.SBO_TERM].isin(
                [
                    MINI_SBO_FROM_NAME[x]
                    for x in SBO_MODIFIER_NAMES.union({SBOTERM_NAMES.CATALYST})
                ]
            )
        ]
        # flip parent and child attributes
        .rename(
            {
                CPR_GRAPH_EDGES.FROM: CPR_GRAPH_EDGES.TO,
                CPR_GRAPH_EDGES.TO: CPR_GRAPH_EDGES.FROM,
                CPR_GRAPH_EDGES.SC_CHILDREN: CPR_GRAPH_EDGES.SC_PARENTS,
                CPR_GRAPH_EDGES.SC_PARENTS: CPR_GRAPH_EDGES.SC_CHILDREN,
            },
            axis=1,
        )
    )

    # switch substrates and products
    r_reaction_edges[CPR_GRAPH_EDGES.STOICHIOMETRY] = r_reaction_edges[
        CPR_GRAPH_EDGES.STOICHIOMETRY
    ].apply(
        # the ifelse statement prevents 0 being converted to -0 ...
        lambda x: -1 * x if x != 0 else 0
    )

    # relabel reactants as products and vice versa; all other SBO terms pass through unchanged
    transformed_r_reaction_edges = pd.concat(
        [
            (
                r_reaction_edges[
                    r_reaction_edges[CPR_GRAPH_EDGES.SBO_TERM]
                    == MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
                ].assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT])
            ),
            (
                r_reaction_edges[
                    r_reaction_edges[CPR_GRAPH_EDGES.SBO_TERM]
                    == MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
                ].assign(sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT])
            ),
            r_reaction_edges[
                ~r_reaction_edges[CPR_GRAPH_EDGES.SBO_TERM].isin(
                    [
                        MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT],
                        MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT],
                    ]
                )
            ],
        ]
    )

    # the three partitions above are disjoint and exhaustive, so no edge is lost or duplicated
    assert transformed_r_reaction_edges.shape[0] == r_reaction_edges.shape[0]

    return transformed_r_reaction_edges.assign(
        direction=CPR_GRAPH_EDGE_DIRECTIONS.REVERSE
    )
1499
+
1500
+
1501
def _create_topology_weights(
    cpr_graph: ig.Graph,
    base_score: float = 2,
    protein_multiplier: int = 1,
    metabolite_multiplier: int = 3,
    unknown_multiplier: int = 10,
    scale_multiplier_by_meandegree: bool = True,
) -> ig.Graph:
    """
    Create Topology Weights

    Add weights to a network based on its topology. Edges downstream of nodes
    with many connections receive a higher weight suggesting that any one
    of them is less likely to be regulatory. This is a simple and clearly
    flawed heuristic which can be combined with more principled weighting
    schemes.

    Args:
        cpr_graph (ig.Graph): a graph containing connections between molecules, proteins, and reactions.
        base_score (float): offset which will be added to all weights.
        protein_multiplier (int): multiplier for non-metabolite species (lower weight paths will tend to be selected).
        metabolite_multiplier (int): multiplier for metabolites [defined a species with a ChEBI ID).
        unknown_multiplier (int): multiplier for species without any identifier. See sbml_dfs_core.species_type_types.
        scale_multiplier_by_meandegree (bool): if True then multipliers will be rescaled by the average number of
            connections a node has (i.e., its degree) so that weights will be relatively similar regardless of network
            size and sparsity.

    Returns:
        cpr_graph (ig.Graph): graph with added topology weights

    Raises:
        ValueError: if required edge attributes are missing, base_score is negative,
            or either species multiplier exceeds unknown_multiplier.
    """

    # check for required attribute before proceeding

    required_attrs = {
        CPR_GRAPH_EDGES.SC_DEGREE,
        CPR_GRAPH_EDGES.SC_CHILDREN,
        CPR_GRAPH_EDGES.SC_PARENTS,
        CPR_GRAPH_EDGES.SPECIES_TYPE,
    }

    missing_required_attrs = required_attrs.difference(set(cpr_graph.es.attributes()))
    if len(missing_required_attrs) != 0:
        raise ValueError(
            f"model is missing {len(missing_required_attrs)} required attributes: {', '.join(missing_required_attrs)}"
        )

    if base_score < 0:
        raise ValueError(f"base_score was {base_score} and must be non-negative")
    if protein_multiplier > unknown_multiplier:
        raise ValueError(
            f"protein_multiplier was {protein_multiplier} and unknown_multiplier "
            f"was {unknown_multiplier}. unknown_multiplier must be greater than "
            "protein_multiplier"
        )
    # BUGFIX: this message previously said "protein_multiplier" while reporting
    # and validating metabolite_multiplier (copy-paste error)
    if metabolite_multiplier > unknown_multiplier:
        raise ValueError(
            f"metabolite_multiplier was {metabolite_multiplier} and unknown_multiplier "
            f"was {unknown_multiplier}. unknown_multiplier must be greater than "
            "metabolite_multiplier"
        )

    # create a new weight variable

    weight_table = pd.DataFrame(
        {
            CPR_GRAPH_EDGES.SC_DEGREE: cpr_graph.es[CPR_GRAPH_EDGES.SC_DEGREE],
            CPR_GRAPH_EDGES.SC_CHILDREN: cpr_graph.es[CPR_GRAPH_EDGES.SC_CHILDREN],
            CPR_GRAPH_EDGES.SC_PARENTS: cpr_graph.es[CPR_GRAPH_EDGES.SC_PARENTS],
            CPR_GRAPH_EDGES.SPECIES_TYPE: cpr_graph.es[CPR_GRAPH_EDGES.SPECIES_TYPE],
        }
    )

    lookup_multiplier_dict = {
        "protein": protein_multiplier,
        "metabolite": metabolite_multiplier,
        "unknown": unknown_multiplier,
    }
    weight_table["multiplier"] = weight_table["species_type"].map(
        lookup_multiplier_dict
    )

    # calculate mean degree
    # since topology weights will differ based on the structure of the network
    # and it would be nice to have a consistent notion of edge weights and path weights
    # for interpretability and filtering, we can rescale topology weights by the
    # average degree of nodes
    if scale_multiplier_by_meandegree:
        mean_degree = len(cpr_graph.es) / len(cpr_graph.vs)
        if not cpr_graph.is_directed():
            # for a directed network in- and out-degree are separately treated while
            # an undirected network's degree will be the sum of these two measures.
            mean_degree = mean_degree * 2

        weight_table["multiplier"] = weight_table["multiplier"] / mean_degree

    if cpr_graph.is_directed():
        weight_table["connection_weight"] = weight_table[CPR_GRAPH_EDGES.SC_CHILDREN]
    else:
        weight_table["connection_weight"] = weight_table[CPR_GRAPH_EDGES.SC_DEGREE]

    # weight traveling through a species based on
    # - a constant
    # - how plausibly that species type mediates a change
    # - the number of connections that the node can bridge to
    weight_table["topo_weights"] = [
        base_score + (x * y)
        for x, y in zip(weight_table["multiplier"], weight_table["connection_weight"])
    ]
    cpr_graph.es["topo_weights"] = weight_table["topo_weights"]

    # if directed and we want to use travel upstream define a corresponding weighting scheme
    if cpr_graph.is_directed():
        weight_table["upstream_topo_weights"] = [
            base_score + (x * y)
            for x, y in zip(weight_table["multiplier"], weight_table["sc_parents"])
        ]
        cpr_graph.es["upstream_topo_weights"] = weight_table["upstream_topo_weights"]

    return cpr_graph
1621
+
1622
+
1623
def _validate_entity_attrs(
    entity_attrs: dict, validate_transformations: bool = True
) -> None:
    """Validate that graph attributes are a valid format.

    Args:
        entity_attrs (dict): mapping of attribute name -> spec dict; each spec must
            satisfy the _EntityAttrValidator schema (table, variable, optional trans).
        validate_transformations (bool): if True, also require the spec's "trans"
            alias to exist in DEFINED_WEIGHT_TRANSFORMATION.

    Returns:
        None

    Raises:
        ValueError: if a transformation alias is not defined.
    """

    assert isinstance(entity_attrs, dict)
    for v in entity_attrs.values():
        # check structure against pydantic config; raises on malformed entries.
        # BUGFIX: previously the result was assigned back to `entity_attrs`,
        # clobbering the parameter mid-iteration.
        _EntityAttrValidator(**v).model_dump()

        if validate_transformations:
            # BUGFIX: use .get with the validator's default rather than v["trans"],
            # which raised KeyError for entries relying on the default transformation
            trans_name = v.get("trans", DEFAULT_WT_TRANS)
            if trans_name not in DEFINED_WEIGHT_TRANSFORMATION.keys():
                raise ValueError(
                    f"transformation {trans_name} was not defined as an alias in "
                    "DEFINED_WEIGHT_TRANSFORMATION. The defined transformations "
                    f"are {', '.join(DEFINED_WEIGHT_TRANSFORMATION.keys())}"
                )

    return None
1642
+
1643
+
1644
class _EntityAttrValidator(BaseModel):
    """Pydantic schema for a single entity-attribute entry in a graph_attrs dict."""

    # table: entity table the attribute is drawn from — presumably e.g. "species"
    # or "reactions" (see pluck_entity_data callers) — TODO confirm
    table: str
    # variable: column of that table's data to extract
    variable: str
    # trans: alias of a weight transformation; optional, defaults to DEFAULT_WT_TRANS
    trans: Optional[str] = DEFAULT_WT_TRANS