napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
@@ -0,0 +1,652 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import os
|
5
|
+
import random
|
6
|
+
import textwrap
|
7
|
+
import yaml
|
8
|
+
from typing import Any
|
9
|
+
from typing import Sequence
|
10
|
+
|
11
|
+
import igraph as ig
|
12
|
+
import numpy as np
|
13
|
+
import pandas as pd
|
14
|
+
from napistu import sbml_dfs_core
|
15
|
+
from napistu import source
|
16
|
+
from napistu.network import net_create
|
17
|
+
|
18
|
+
from napistu.constants import SBML_DFS
|
19
|
+
from napistu.constants import SOURCE_SPEC
|
20
|
+
|
21
|
+
from napistu.network.constants import CPR_GRAPH_NODES
|
22
|
+
from napistu.network.constants import CPR_GRAPH_TYPES
|
23
|
+
|
24
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
25
|
+
|
26
|
+
|
27
|
+
def compartmentalize_species(
    sbml_dfs: sbml_dfs_core.SBML_dfs, species: str | list[str]
) -> pd.DataFrame:
    """
    Compartmentalize Species

    Look up the compartmentalized species IDs (sc_ids) which belong to one or
    more species (s_ids).

    Parameters
    ----------
    sbml_dfs : SBML_dfs
        A pathway model; its ``compartmentalized_species`` table is searched
    species : str or list
        A single species ID or a list of species IDs

    Returns
    -------
    pd.DataFrame with one row per matching compartmentalized species,
    restricted to the s_id and sc_id columns

    Raises
    ------
    TypeError
        If ``species`` is neither a str nor a list
    """

    # normalize the input to a list of species IDs
    if isinstance(species, str):
        species_list = [species]
    elif isinstance(species, list):
        species_list = species
    else:
        raise TypeError("species is not a str or list")

    cspecies = sbml_dfs.compartmentalized_species
    is_requested = cspecies[SBML_DFS.S_ID].isin(species_list)

    # reset_index() turns the table's index into a regular column before selecting
    matches = cspecies[is_requested].reset_index()
    return matches[[SBML_DFS.S_ID, SBML_DFS.SC_ID]]
|
55
|
+
|
56
|
+
|
57
|
+
def compartmentalize_species_pairs(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    origin_species: str | list[str],
    dest_species: str | list[str],
) -> pd.DataFrame:
    """
    Compartmentalize Shortest Paths

    For a set of origin and destination species pairs, consider each species in
    every compartment it operates in, separately.

    Parameters
    ----------
    sbml_dfs : SBML_dfs
        A model formed by aggregating pathways
    origin_species : str or list
        Species IDs as starting points
    dest_species : str or list
        Species IDs as ending points

    Returns
    -------
    pd.DataFrame containing pairs of origin and destination compartmentalized
    species (columns s_id_origin, s_id_dest, sc_id_origin and sc_id_dest)

    Raises
    ------
    ValueError
        If no origin/destination pair could be matched to compartmentalized species
    """

    # look up the compartmentalized species of both endpoints
    origin_cspecies = compartmentalize_species(sbml_dfs, origin_species).rename(
        columns={SBML_DFS.S_ID: "s_id_origin", SBML_DFS.SC_ID: "sc_id_origin"}
    )
    dest_cspecies = compartmentalize_species(sbml_dfs, dest_species).rename(
        columns={SBML_DFS.S_ID: "s_id_dest", SBML_DFS.SC_ID: "sc_id_dest"}
    )

    # a lone string counts as a single-element list
    origins = [origin_species] if isinstance(origin_species, str) else origin_species
    dests = [dest_species] if isinstance(dest_species, str) else dest_species

    # create an all x all of origins and destinations
    species_pairs = pd.DataFrame(
        [(origin, dest) for origin in origins for dest in dests]
    )
    species_pairs.columns = ["s_id_origin", "s_id_dest"]

    # expand every species-level pair to its compartmentalized counterparts;
    # both merges join on the columns the two tables share
    cspecies_pairs = species_pairs.merge(origin_cspecies)
    cspecies_pairs = cspecies_pairs.merge(dest_cspecies)

    if len(cspecies_pairs) == 0:
        raise ValueError(
            "No compartmentalized paths exist, this is unexpected behavior"
        )

    return cspecies_pairs
|
110
|
+
|
111
|
+
|
112
|
+
def get_minimal_sources_edges(
    vertices: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
) -> pd.DataFrame | None:
    """Assign edges to a set of sources.

    Args:
        vertices (pd.DataFrame): a table with a "node" column; entries which
            match the index of ``sbml_dfs.reactions`` are treated as reactions.
        sbml_dfs (sbml_dfs_core.SBML_dfs): the pathway model providing the
            reactions table and its schema.

    Returns:
        pd.DataFrame | None: a table with the reaction id, pathway id and name
            columns taken from ``source.greedy_set_coverge_of_sources``, or None
            if no vertex is a reaction or if ``source.unnest_sources`` returns None.
    """

    all_reactions = sbml_dfs.reactions
    vertex_nodes = vertices["node"].tolist()
    present_reactions = all_reactions[all_reactions.index.isin(vertex_nodes)]

    # none of the vertices are reactions, so there is nothing to assign
    if len(present_reactions) == 0:
        return None

    reactions_schema = sbml_dfs.schema[SBML_DFS.REACTIONS]
    source_df = source.unnest_sources(present_reactions, reactions_schema["source"])
    if source_df is None:
        return None

    # NOTE(review): presumably selects a small set of pathways covering all
    # reactions - confirm against source.greedy_set_coverge_of_sources
    edge_sources = source.greedy_set_coverge_of_sources(source_df, reactions_schema)
    output_columns = [SBML_DFS.R_ID, SOURCE_SPEC.PATHWAY_ID, SOURCE_SPEC.NAME]
    return edge_sources.reset_index()[output_columns]
|
133
|
+
|
134
|
+
|
135
|
+
def get_graph_summary(graph: ig.Graph) -> dict[str, Any]:
    """Compute a collection of summary statistics describing a network.

    Args:
        graph (ig.Graph): the network to summarize

    Returns:
        dict: summary statistics keyed by name
            n_edges [int]: number of edges
            n_vertices [int]: number of vertices
            n_components [int]: number of weakly connected components
                (i.e. edge directionality is ignored)
            stats_component_sizes [dict[str, float]]: ``pd.Series.describe()`` of the component sizes
            top10_large_components [list[dict[str, Any]]]: the 10 largest components, each with up to 10 example vertices
            top10_smallest_components [list[dict[str, Any]]]: the 10 smallest components, each with up to 10 example vertices
            average_path_length [float]: the value of ``graph.average_path_length()``
            top10_betweenness [list[dict[str, Any]]]: the 10 vertices with the highest betweenness
                (computed with ``directed=False``). Roughly: how many shortest paths pass through a vertex
            top10_harmonic_centrality [list[dict[str, Any]]]: the 10 vertices with the highest harmonic centrality.
                Roughly: mean inverse distance to all other vertices
    """

    weak_components = graph.components(mode="weak")
    sizes = [len(members) for members in weak_components]

    summary: dict[str, Any] = {
        "n_edges": graph.ecount(),
        "n_vertices": graph.vcount(),
        "n_components": len(weak_components),
        "stats_component_sizes": pd.Series(sizes).describe().to_dict(),
    }

    # largest and smallest components, each with 10 example nodes
    summary["top10_large_components"] = _get_top_n_component_stats(
        graph, weak_components, sizes, n=10, ascending=False
    )
    summary["top10_smallest_components"] = _get_top_n_component_stats(
        graph, weak_components, sizes, n=10, ascending=True
    )

    summary["average_path_length"] = graph.average_path_length()

    # vertex-level centrality rankings
    betweenness_scores = list(graph.betweenness(directed=False))
    summary["top10_betweenness"] = _get_top_n_nodes(
        graph, betweenness_scores, "betweenness", n=10, ascending=False
    )

    harmonic_scores = list(graph.harmonic_centrality())
    summary["top10_harmonic_centrality"] = _get_top_n_nodes(
        graph, harmonic_scores, "harmonic_centrality", n=10, ascending=False
    )

    return summary
|
186
|
+
|
187
|
+
|
188
|
+
def export_networks(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    model_prefix: str,
    outdir: str,
    directeds: list[bool] = [True, False],
    graph_types: list[str] = [CPR_GRAPH_TYPES.BIPARTITE, CPR_GRAPH_TYPES.REGULATORY],
) -> None:
    """
    Exports Networks

    Create one or more networks from a pathway model and pickle the results.
    One file is written for every combination of graph type and directedness.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        A pathway model
    model_prefix: str
        Label to prepend to all exported files
    outdir: str
        Path to an existing directory where results should be saved
    directeds : [bool]
        List of directed types to export: a directed (True) or undirected (False) graph
    graph_types : [str]
        Types of graphs to construct, valid values are:
            - bipartite: substrates and modifiers point to the reaction they drive, this reaction points to products
            - regulatory: non-enzymatic modifiers point to enzymes, enzymes point to substrates and products

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If sbml_dfs, model_prefix, directeds or graph_types has the wrong type
    FileNotFoundError
        If outdir is not an existing directory
    """

    # validate the inputs before any network is built
    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
        raise TypeError(
            f"sbml_dfs must be a sbml_dfs_core.SBML_dfs, but was {type(sbml_dfs)}"
        )
    if not isinstance(model_prefix, str):
        raise TypeError(f"model_prefix was a {type(model_prefix)} and must be a str")
    if not os.path.isdir(outdir):
        raise FileNotFoundError(f"{outdir} does not exist")
    if not isinstance(directeds, list):
        raise TypeError(f"directeds must be a list, but was {type(directeds)}")
    if not isinstance(graph_types, list):
        raise TypeError(f"graph_types must be a list but was a {type(graph_types)}")

    # every graph type is exported once per requested directedness
    export_settings = [
        (graph_type, directed)
        for graph_type in graph_types
        for directed in directeds
    ]

    for graph_type, directed in export_settings:
        export_pkl_path = _create_network_save_string(
            model_prefix=model_prefix,
            outdir=outdir,
            directed=directed,
            graph_type=graph_type,
        )
        print(f"Exporting {graph_type} network to {export_pkl_path}")

        network_graph = net_create.process_cpr_graph(
            sbml_dfs=sbml_dfs,
            directed=directed,
            graph_type=graph_type,
            verbose=True,
        )
        network_graph.write_pickle(export_pkl_path)
|
254
|
+
|
255
|
+
|
256
|
+
def read_network_pkl(
    model_prefix: str,
    network_dir: str,
    graph_type: str,
    directed: bool = True,
) -> ig.Graph:
    """
    Read Network Pickle

    Load a network which was previously saved as a pickle file.

    Parameters
    ----------
    model_prefix: str
        Label which was prepended to the saved network files
    network_dir: str
        Path to a directory containing all saved networks.
    graph_type : str
        Type of graph to read, valid values are:
            - bipartite: substrates and modifiers point to the reaction they drive, this reaction points to products
            - regulatory: non-enzymatic modifiers point to enzymes, enzymes point to substrates and products
    directed : bool
        Should a directed (True) or undirected (False) graph be loaded

    Returns
    -------
    network_graph: igraph.Graph
        An igraph network of the pathway

    Raises
    ------
    TypeError
        If model_prefix, directed or graph_type has the wrong type
    FileNotFoundError
        If network_dir or the expected pickle file does not exist
    """

    # validate the inputs
    if not isinstance(model_prefix, str):
        raise TypeError(f"model_prefix was a {type(model_prefix)} and must be a str")
    if not os.path.isdir(network_dir):
        raise FileNotFoundError(f"{network_dir} does not exist")
    if not isinstance(directed, bool):
        raise TypeError(f"directed must be a bool, but was {type(directed)}")
    if not isinstance(graph_type, str):
        raise TypeError(f"graph_type must be a str but was a {type(graph_type)}")

    # the file name is derived from the same settings which were used for saving
    import_pkl_path = _create_network_save_string(
        model_prefix, network_dir, directed, graph_type
    )
    if not os.path.isfile(import_pkl_path):
        raise FileNotFoundError(f"{import_pkl_path} does not exist")

    print(f"Importing {graph_type} network from {import_pkl_path}")

    # NOTE(review): unpickling can execute arbitrary code - only load network
    # files which come from a trusted source
    return ig.Graph.Read_Pickle(fname=import_pkl_path)
|
306
|
+
|
307
|
+
|
308
|
+
def filter_to_largest_subgraph(cpr_graph: ig.Graph) -> ig.Graph:
    """Filter a graph to its largest weakly connected component.

    Args:
        cpr_graph (ig.Graph): the network to filter

    Returns:
        ig.Graph: the subgraph induced by the members of the largest weakly
            connected component. If several components share the largest size,
            the first one reported by ``components()`` is used. A graph without
            any components (i.e. without vertices) is returned as a copy.
    """

    component_members = cpr_graph.components(mode="weak")

    # max() scans the components once and keeps the first maximum on ties
    largest_component = max(component_members, key=len, default=None)
    if largest_component is None:
        # nothing to filter in an empty graph
        return cpr_graph.copy()

    return cpr_graph.induced_subgraph(largest_component)
|
323
|
+
|
324
|
+
|
325
|
+
def validate_assets(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    cpr_graph: ig.Graph,
    precomputed_distances: pd.DataFrame,
    identifiers_df: pd.DataFrame,
) -> None:
    """
    Validate Assets

    Run a few quick pairwise consistency checks across a set of related inputs.

    Args:
        sbml_dfs (sbml_dfs_core.SBML_dfs):
            A pathway representation.
        cpr_graph (igraph.Graph):
            A network-based representation of "sbml_dfs".
        precomputed_distances (pd.DataFrame):
            Precomputed distances between vertices in "cpr_graph".
        identifiers_df (pd.DataFrame):
            A table of systematic identifiers for species in "sbml_dfs".

    Returns:
        None

    """

    consistency_checks = (
        # cpr_graph vs. sbml_dfs: are the sc_id to sc_name mappings consistent?
        (_validate_assets_sbml_graph, (sbml_dfs, cpr_graph)),
        # precomputed_distances vs. cpr_graph: do length 1 paths agree with
        # the weights of the graph's edges?
        (_validate_assets_graph_dist, (cpr_graph, precomputed_distances)),
        # identifiers_df vs. sbml_dfs: do the species names of shared s_ids match?
        (_validate_assets_sbml_ids, (sbml_dfs, identifiers_df)),
    )

    for run_check, check_args in consistency_checks:
        run_check(*check_args)
|
365
|
+
|
366
|
+
|
367
|
+
def cpr_graph_to_pandas_dfs(cpr_graph: ig.Graph) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    CPR Graph to Pandas DataFrames

    Convert an igraph representation of a network into a vertices table and an edges table.

    Args:
        cpr_graph(ig.Graph): an igraph network

    Returns:
        vertices (pd.DataFrame):
            A table with one row per vertex: the vertex's "index" plus its attributes.
        edges (pd.DataFrame):
            A table with one row per edge: the edge's "source" and "target" plus its attributes.
    """

    vertex_records = []
    for vertex in cpr_graph.vs:
        record = {"index": vertex.index}
        # attributes are applied last so they win over a same-named key
        record.update(vertex.attributes())
        vertex_records.append(record)

    edge_records = []
    for edge in cpr_graph.es:
        record = {"source": edge.source, "target": edge.target}
        record.update(edge.attributes())
        edge_records.append(record)

    vertices = pd.DataFrame(vertex_records)
    edges = pd.DataFrame(edge_records)

    return vertices, edges
|
394
|
+
|
395
|
+
|
396
|
+
def safe_fill(x: str, fill_width: int = 15) -> str:
    """Wrap a string so that no line is longer than ``fill_width`` characters.

    Args:
        x (str): the text to wrap
        fill_width (int, optional): maximum line width passed to
            ``textwrap.fill``. Defaults to 15.

    Returns:
        str: the wrapped text; an empty string is returned unchanged.
    """

    if x == "":
        return ""

    # honor the requested width (it used to be hard-coded to 15)
    return textwrap.fill(x, fill_width)
|
401
|
+
|
402
|
+
|
403
|
+
def read_graph_attrs_spec(graph_attrs_spec_uri: str) -> dict:
    """Read a YAML file containing the specification for adding reaction- and/or species-attributes to a cpr_graph.

    Args:
        graph_attrs_spec_uri (str): path to the YAML specification

    Returns:
        dict: the parsed specification; every "species" and "reactions" section
            which is present was passed through ``net_create._validate_entity_attrs``.

    Raises:
        ValueError: if the file does not contain a mapping, or if it contains
            neither a "species" nor a "reactions" section.
    """

    with open(graph_attrs_spec_uri) as f:
        graph_attrs_spec = yaml.safe_load(f)

    VALID_SPEC_SECTIONS = ["species", "reactions"]

    # an empty file is parsed as None and a top-level list has no sections
    if not isinstance(graph_attrs_spec, dict):
        raise ValueError(
            "The provided graph attributes spec must be a mapping containing one or "
            f"more of the sections: {', '.join(VALID_SPEC_SECTIONS)}"
        )

    defined_spec_sections = set(graph_attrs_spec.keys()).intersection(
        VALID_SPEC_SECTIONS
    )

    if len(defined_spec_sections) == 0:
        raise ValueError(
            f"The provided graph attributes spec did not contain either of the expected sections: {', '.join(VALID_SPEC_SECTIONS)}"
        )

    # validate each section against its own entry; the species section was
    # previously checked against the "reactions" entry by mistake
    for section in ("reactions", "species"):
        if section in defined_spec_sections:
            net_create._validate_entity_attrs(graph_attrs_spec[section])

    return graph_attrs_spec
|
426
|
+
|
427
|
+
|
428
|
+
def _create_network_save_string(
    model_prefix: str, outdir: str, directed: bool, graph_type: str
) -> str:
    """Build the path of the pickle file used to store a network.

    Args:
        model_prefix (str): label at the start of the file name
        outdir (str): directory containing the file
        directed (bool): selects the "directed" or "undirected" file name suffix
        graph_type (str): graph type included in the file name

    Returns:
        str: ``<outdir>/<model_prefix>_network_<graph_type>_<directed|undirected>.pkl``
    """

    direction_label = "directed" if directed else "undirected"
    file_stem = "_".join([model_prefix, "network", graph_type, direction_label])

    return os.path.join(outdir, file_stem + ".pkl")
|
441
|
+
|
442
|
+
|
443
|
+
def _create_induced_subgraph(
    cpr_graph: ig.Graph, vertices=None, n_vertices: int = 5000
) -> ig.Graph:
    """
    Utility function for creating subgraphs including a set of vertices and their connections

    Args:
        cpr_graph (ig.Graph): the network to subset
        vertices: the vertices to retain. If None, vertices are sampled at random.
        n_vertices (int): the number of vertices to sample when "vertices" is None.
            Graphs with fewer vertices than this contribute all of their vertices.

    Returns:
        ig.Graph: the subgraph induced by the selected vertices
    """

    if vertices is not None:
        selected_vertices = vertices
    else:
        vertex_names = cpr_graph.vs[CPR_GRAPH_NODES.NAME]
        # random.sample() raises a ValueError if more items are requested than
        # exist, so cap the sample size at the size of the graph
        sample_size = min(n_vertices, len(vertex_names))
        selected_vertices = random.sample(vertex_names, sample_size)

    subgraph = cpr_graph.induced_subgraph(selected_vertices)

    return subgraph
|
460
|
+
|
461
|
+
|
462
|
+
def _validate_assets_sbml_graph(
    sbml_dfs: sbml_dfs_core.SBML_dfs, cpr_graph: ig.Graph
) -> None:
    """Check an sbml_dfs model and cpr_graph for inconsistencies.

    Species vertices of the graph are matched to the model's compartmentalized
    species (vertex "name" == "sc_id") and their names are compared
    ("node_name" vs. "sc_name").

    Raises:
        ValueError: if one or more matched species have different names;
            up to 10 examples are included in the message.
    """

    vertex_records = []
    for vertex in cpr_graph.vs:
        record = {"index": vertex.index}
        record.update(vertex.attributes())
        vertex_records.append(record)
    vertices = pd.DataFrame(vertex_records)

    species_vertices = vertices.query("node_type == 'species'")
    model_names = sbml_dfs.compartmentalized_species.reset_index()[
        ["sc_id", "sc_name"]
    ]
    matched_cspecies = model_names.merge(
        species_vertices,
        left_on=["sc_id"],
        right_on=["name"],
    )

    # collect a readable description of every disagreement
    mismatched_names = []
    for model_name, graph_name in zip(
        matched_cspecies["sc_name"], matched_cspecies["node_name"]
    ):
        if model_name != graph_name:
            mismatched_names.append(f"{model_name} != {graph_name}")

    if mismatched_names:
        example_names = mismatched_names[:10]

        raise ValueError(
            f"{len(mismatched_names)} species names do not match between sbml_dfs and cpr_graph: {example_names}"
        )

    return None
|
493
|
+
|
494
|
+
|
495
|
+
def _validate_assets_graph_dist(
    cpr_graph: ig.Graph, precomputed_distances: pd.DataFrame
) -> None:
    """Check a cpr_graph and precomputed distances table for inconsistencies.

    Paths of length 1 in "precomputed_distances" are matched to the graph's edges
    (sc_id_origin == from, sc_id_dest == to) and the path's "path_weights" is
    compared to the edge's "weights". Disagreements are logged as a warning;
    no exception is raised for them.

    Args:
        cpr_graph (ig.Graph): a network whose edges have the attributes
            "from", "to", "weights" and "upstream_weights".
        precomputed_distances (pd.DataFrame): a table with the columns
            "sc_id_origin", "sc_id_dest", "path_length" and "path_weights".

    Returns:
        None
    """

    edges = pd.DataFrame(
        [{**{"index": e.index}, **e.attributes()} for e in cpr_graph.es]
    )

    direct_interactions = precomputed_distances.query("path_length == 1")

    edges_with_distances = direct_interactions.merge(
        edges[["from", "to", "weights", "upstream_weights"]],
        left_on=["sc_id_origin", "sc_id_dest"],
        right_on=["from", "to"],
    )

    inconsistent_weights = edges_with_distances.query("path_weights != weights")
    n_inconsistent = inconsistent_weights.shape[0]

    if n_inconsistent > 0:
        # a single message string: passing several strings to logger.warning()
        # makes logging treat the extras as %-format arguments and fail.
        # edges_with_distances is non-empty here, so the division is safe.
        logger.warning(
            f"{n_inconsistent} edges' weights are inconsistent between "
            "edges in the cpr_graph and length 1 paths in precomputed_distances. "
            f"This is {n_inconsistent / edges_with_distances.shape[0]:.2%} of all edges."
        )

    return None
|
521
|
+
|
522
|
+
|
523
|
+
def _validate_assets_sbml_ids(
    sbml_dfs: sbml_dfs_core.SBML_dfs, identifiers_df: pd.DataFrame
) -> None:
    """Check an sbml_dfs file and identifiers table for inconsistencies.

    The distinct (s_id, s_name) pairs of "identifiers_df" are joined to
    "sbml_dfs.species" by s_id and the two species names are compared.

    Raises:
        ValueError: if one or more species have different names in the two
            tables; up to 10 examples are included in the message.
    """

    identifier_names = identifiers_df[["s_id", "s_name"]].drop_duplicates()
    joined_species_w_ids = sbml_dfs.species.merge(
        identifier_names,
        left_index=True,
        right_on="s_id",
    )

    # both tables contribute an s_name column, so the merge suffixes them _x and _y;
    # dropna() discards every conflicting row which contains a missing value
    conflicts = joined_species_w_ids.query("s_name_x != s_name_y").dropna()
    conflict_labels = [
        f"{model_name} != {identifier_name}"
        for model_name, identifier_name in zip(
            conflicts["s_name_x"], conflicts["s_name_y"]
        )
    ]

    if not conflict_labels:
        return None

    example_conflicts = conflict_labels[:10]
    raise ValueError(
        f"{len(conflict_labels)} species names do not match between "
        f"sbml_dfs and identifiers_df including: {', '.join(example_conflicts)}"
    )
|
553
|
+
|
554
|
+
|
555
|
+
def _get_top_n_idx(arr: Sequence, n: int, ascending: bool = False) -> Sequence[int]:
    """Returns the indices of the top n values in an array

    Args:
        arr (Sequence): An array of values
        n (int): The number of top values to return. If "arr" holds fewer than
            n values all indices are returned; n <= 0 returns no indices.
        ascending (bool, optional): If True, return the indices of the n smallest
            values (smallest first); otherwise the indices of the n largest
            values (largest first). Defaults to False.

    Returns:
        Sequence[int]: The indices of the top n values
    """
    order = np.argsort(arr)

    if n <= 0:
        # order[-0:] would select the whole array rather than nothing
        return order[:0]  # type: ignore

    if ascending:
        return order[:n]  # type: ignore

    # reverse first so that slicing counts from the largest value
    return order[::-1][:n]  # type: ignore
|
571
|
+
|
572
|
+
|
573
|
+
def _get_top_n_objects(
    object_vals: Sequence, objects: Sequence, n: int = 10, ascending: bool = False
) -> list:
    """Get the top N objects based on a ranking measure.

    Args:
        object_vals (Sequence): the value used to rank each object
        objects (Sequence): the objects to select from, aligned with "object_vals"
        n (int, optional): the number of objects to return. Defaults to 10.
        ascending (bool, optional): passed to ``_get_top_n_idx`` to choose
            between the smallest (True) and largest (False) values. Defaults to False.

    Returns:
        list: the selected objects, in the order given by ``_get_top_n_idx``
    """

    return [
        objects[position]
        for position in _get_top_n_idx(object_vals, n, ascending=ascending)
    ]
|
581
|
+
|
582
|
+
|
583
|
+
def _get_top_n_component_stats(
    graph: ig.Graph,
    components,
    component_sizes: Sequence[int],
    n: int = 10,
    ascending: bool = False,
) -> list[dict[str, Any]]:
    """Summarize the top N components' network properties.

    Args:
        graph (ig.Graph): the network containing the components
        components: the components of "graph", each a collection of vertex indices
        component_sizes (Sequence[int]): the size of each component, aligned with "components"
        n (int, optional): the number of components to summarize. Defaults to 10.
        ascending (bool, optional): summarize the smallest (True) or the
            largest (False) components. Defaults to False.

    Returns:
        list[dict[str, Any]]: one entry per selected component with its size ("n")
            and the attributes of up to 10 of its vertices ("examples")
    """

    selected_components = _get_top_n_objects(component_sizes, components, n, ascending)

    component_summaries = []
    for members in selected_components:
        # the first 10 members serve as examples
        example_attrs = [graph.vs[vertex_id].attributes() for vertex_id in members[:10]]
        component_summaries.append({"n": len(members), "examples": example_attrs})

    return component_summaries
|
598
|
+
|
599
|
+
|
600
|
+
def _get_top_n_nodes(
    graph: ig.Graph, vals: Sequence, val_name: str, n: int = 10, ascending: bool = False
) -> list[dict[str, Any]]:
    """Get the top N nodes by a node attribute.

    Args:
        graph (ig.Graph): the network containing the nodes
        vals (Sequence): one value per vertex, indexed like ``graph.vs``
        val_name (str): the key under which a node's value is reported
        n (int, optional): the number of nodes to return. Defaults to 10.
        ascending (bool, optional): select the smallest (True) or largest (False)
            values. Defaults to False.

    Returns:
        list[dict[str, Any]]: one dict per selected node holding its value under
            "val_name" together with the node's attributes; an attribute with
            the same name as "val_name" takes precedence over the value.
    """

    ranked_nodes = []
    for vertex_idx in _get_top_n_idx(vals, n, ascending=ascending):
        node_summary = {val_name: vals[vertex_idx]}
        node_summary.update(graph.vs[vertex_idx].attributes())
        ranked_nodes.append(node_summary)

    return ranked_nodes
|
609
|
+
|
610
|
+
|
611
|
+
def _validate_edge_attributes(
    graph: ig.Graph, edge_attributes: list[str] | str
) -> None:
    """Check for the existence of one or more edge attributes.

    Args:
        graph (ig.Graph): the network to check
        edge_attributes (list[str] | str): the required edge attribute(s)

    Returns:
        None

    Raises:
        TypeError: if "edge_attributes" is neither a list nor a str
        ValueError: if one or more of the attributes are not defined for the graph's edges
    """

    if isinstance(edge_attributes, list):
        attrs = edge_attributes
    elif isinstance(edge_attributes, str):
        attrs = [edge_attributes]
    else:
        raise TypeError('"edge_attributes" must be a list or str')

    # ask the graph for its edge attribute names rather than inspecting the
    # first edge, which fails with an IndexError for a graph without edges
    available_attributes = graph.edge_attributes()
    # sorted so that the error message is deterministic
    missing_attributes = sorted(set(attrs).difference(available_attributes))
    n_missing_attrs = len(missing_attributes)

    if n_missing_attrs > 0:
        raise ValueError(
            f"{n_missing_attrs} edge attributes were missing ({', '.join(missing_attributes)}). The available edge attributes are {', '.join(available_attributes)}"
        )

    return None
|
631
|
+
|
632
|
+
|
633
|
+
def _validate_vertex_attributes(
    graph: ig.Graph, vertex_attributes: list[str] | str
) -> None:
    """Check for the existence of one or more vertex attributes.

    Args:
        graph (ig.Graph): the network to check
        vertex_attributes (list[str] | str): the required vertex attribute(s)

    Returns:
        None

    Raises:
        TypeError: if "vertex_attributes" is neither a list nor a str
        ValueError: if one or more of the attributes are not defined for the graph's vertices
    """

    if isinstance(vertex_attributes, list):
        attrs = vertex_attributes
    elif isinstance(vertex_attributes, str):
        attrs = [vertex_attributes]
    else:
        raise TypeError('"vertex_attributes" must be a list or str')

    # ask the graph for its vertex attribute names rather than inspecting the
    # first vertex, which fails with an IndexError for a graph without vertices
    available_attributes = graph.vertex_attributes()
    # sorted so that the error message is deterministic
    missing_attributes = sorted(set(attrs).difference(available_attributes))
    n_missing_attrs = len(missing_attributes)

    if n_missing_attrs > 0:
        raise ValueError(
            f"{n_missing_attrs} vertex attributes were missing ({', '.join(missing_attributes)}). The available vertex attributes are {', '.join(available_attributes)}"
        )

    return None
|