napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
napistu/network/net_utils.py ADDED
@@ -0,0 +1,652 @@
+ from __future__ import annotations
+
+ import logging
+ import os
+ import random
+ import textwrap
+ import yaml
+ from typing import Any
+ from typing import Sequence
+
+ import igraph as ig
+ import numpy as np
+ import pandas as pd
+ from napistu import sbml_dfs_core
+ from napistu import source
+ from napistu.network import net_create
+
+ from napistu.constants import SBML_DFS
+ from napistu.constants import SOURCE_SPEC
+
+ from napistu.network.constants import CPR_GRAPH_NODES
+ from napistu.network.constants import CPR_GRAPH_TYPES
+
+ logger = logging.getLogger(__name__)
+
+
+ def compartmentalize_species(
+     sbml_dfs: sbml_dfs_core.SBML_dfs, species: str | list[str]
+ ) -> pd.DataFrame:
+     """
+     Compartmentalize Species
+
+     Returns the compartmentalized species IDs (sc_ids) corresponding to a list of species (s_ids).
+
+     Parameters
+     ----------
+     sbml_dfs : SBML_dfs
+         A model formed by aggregating pathways
+     species : str | list[str]
+         One or more species IDs
+
+     Returns
+     -------
+     pd.DataFrame containing the s_id and sc_id pairs
+     """
+
+     if isinstance(species, str):
+         species = [species]
+     if not isinstance(species, list):
+         raise TypeError("species is not a str or list")
+
+     return sbml_dfs.compartmentalized_species[
+         sbml_dfs.compartmentalized_species[SBML_DFS.S_ID].isin(species)
+     ].reset_index()[[SBML_DFS.S_ID, SBML_DFS.SC_ID]]
+
+
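+ # Usage sketch (a minimal illustration; "sbml_dfs" and the species ID are hypothetical
+ # placeholders, not values shipped with the package):
+ #
+ #     sc_map = compartmentalize_species(sbml_dfs, "S00000001")
+ #     # one row per compartment the species occurs in, with s_id and sc_id columns
+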
+ def compartmentalize_species_pairs(
+     sbml_dfs: sbml_dfs_core.SBML_dfs,
+     origin_species: str | list[str],
+     dest_species: str | list[str],
+ ) -> pd.DataFrame:
+     """
+     Compartmentalize Species Pairs
+
+     For a set of origin and destination species pairs, consider each species in every
+     compartment it operates in, separately.
+
+     Parameters
+     ----------
+     sbml_dfs : SBML_dfs
+         A model formed by aggregating pathways
+     origin_species : str | list[str]
+         Species IDs as starting points
+     dest_species : str | list[str]
+         Species IDs as ending points
+
+     Returns
+     -------
+     pd.DataFrame containing pairs of origin and destination compartmentalized species
+     """
+
+     compartmentalized_origins = compartmentalize_species(
+         sbml_dfs, origin_species
+     ).rename(columns={SBML_DFS.SC_ID: "sc_id_origin", SBML_DFS.S_ID: "s_id_origin"})
+     if isinstance(origin_species, str):
+         origin_species = [origin_species]
+
+     compartmentalized_dests = compartmentalize_species(sbml_dfs, dest_species).rename(
+         columns={SBML_DFS.SC_ID: "sc_id_dest", SBML_DFS.S_ID: "s_id_dest"}
+     )
+     if isinstance(dest_species, str):
+         dest_species = [dest_species]
+
+     # create an all x all of origins and destinations
+     target_species_paths = pd.DataFrame(
+         [(x, y) for x in origin_species for y in dest_species]
+     )
+     target_species_paths.columns = ["s_id_origin", "s_id_dest"]
+
+     target_species_paths = target_species_paths.merge(compartmentalized_origins).merge(
+         compartmentalized_dests
+     )
+
+     if target_species_paths.shape[0] == 0:
+         raise ValueError(
+             "No compartmentalized paths exist; this is unexpected behavior"
+         )
+
+     return target_species_paths
+
+
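+ # Usage sketch (hypothetical IDs): two origins and two destinations expand to a
+ # 2 x 2 cross product before per-compartment matches multiply rows further:
+ #
+ #     pairs = compartmentalize_species_pairs(sbml_dfs, ["S001", "S002"], ["S003", "S004"])
+ #     # columns: s_id_origin, s_id_dest, sc_id_origin, sc_id_dest
+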
+ def get_minimal_sources_edges(
+     vertices: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
+ ) -> pd.DataFrame | None:
+     """Assign each reaction edge to a minimal set of pathway sources."""
+
+     nodes = vertices["node"].tolist()
+     present_reactions = sbml_dfs.reactions[sbml_dfs.reactions.index.isin(nodes)]
+
+     if len(present_reactions) == 0:
+         return None
+
+     table_schema = sbml_dfs.schema[SBML_DFS.REACTIONS]
+     source_df = source.unnest_sources(present_reactions, table_schema["source"])
+
+     if source_df is None:
+         return None
+     else:
+         edge_sources = source.greedy_set_coverge_of_sources(source_df, table_schema)
+         return edge_sources.reset_index()[
+             [SBML_DFS.R_ID, SOURCE_SPEC.PATHWAY_ID, SOURCE_SPEC.NAME]
+         ]
+
+
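+ # Usage sketch (hypothetical inputs; assumes "graph_vertices" carries a "node" column
+ # of r_ids drawn from the same model):
+ #
+ #     edge_sources = get_minimal_sources_edges(graph_vertices, sbml_dfs)
+ #     # None when no reactions are present; otherwise r_id -> (pathway_id, name) rows
+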
+ def get_graph_summary(graph: ig.Graph) -> dict[str, Any]:
+     """Calculates common summary statistics for a network
+
+     Args:
+         graph (ig.Graph): An igraph network
+
+     Returns:
+         dict: A dictionary of summary statistics with values
+             n_edges [int]: number of edges
+             n_vertices [int]: number of vertices
+             n_components [int]: number of weakly connected components
+                 (i.e. without considering edge directionality)
+             stats_component_sizes [dict[str, float]]: summary statistics for the component sizes
+             top10_large_components [list[dict[str, Any]]]: the top 10 largest components with 10 example vertices
+             top10_smallest_components [list[dict[str, Any]]]: the top 10 smallest components with 10 example vertices
+             average_path_length [float]: the average shortest path length between all vertices
+             top10_betweenness [list[dict[str, Any]]]: the top 10 vertices by betweenness centrality.
+                 Roughly: measures how many shortest paths go through a vertex
+             top10_harmonic_centrality [list[dict[str, Any]]]: the top 10 vertices by harmonic centrality.
+                 Roughly: mean inverse distance to all other vertices
+     """
+     stats = {}
+     stats["n_edges"] = graph.ecount()
+     stats["n_vertices"] = graph.vcount()
+     components = graph.components(mode="weak")
+     stats["n_components"] = len(components)
+     component_sizes = [len(c) for c in components]
+     stats["stats_component_sizes"] = pd.Series(component_sizes).describe().to_dict()
+
+     # get the top 10 largest components and 10 example nodes from each
+     stats["top10_large_components"] = _get_top_n_component_stats(
+         graph, components, component_sizes, n=10, ascending=False
+     )
+
+     stats["top10_smallest_components"] = _get_top_n_component_stats(
+         graph, components, component_sizes, n=10, ascending=True
+     )
+
+     stats["average_path_length"] = graph.average_path_length()
+
+     between = list(graph.betweenness(directed=False))
+     stats["top10_betweenness"] = _get_top_n_nodes(
+         graph, between, "betweenness", n=10, ascending=False
+     )
+
+     harmonic_centrality = list(graph.harmonic_centrality())
+     stats["top10_harmonic_centrality"] = _get_top_n_nodes(
+         graph, harmonic_centrality, "harmonic_centrality", n=10, ascending=False
+     )
+
+     return stats
+
+
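+ # Usage sketch: the summary works on any igraph, e.g. a toy ring graph:
+ #
+ #     summary = get_graph_summary(ig.Graph.Ring(10))
+ #     (summary["n_edges"], summary["n_vertices"], summary["n_components"])
+ #     # -> (10, 10, 1)
+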
+ def export_networks(
+     sbml_dfs: sbml_dfs_core.SBML_dfs,
+     model_prefix: str,
+     outdir: str,
+     directeds: list[bool] = [True, False],
+     graph_types: list[str] = [CPR_GRAPH_TYPES.BIPARTITE, CPR_GRAPH_TYPES.REGULATORY],
+ ) -> None:
+     """
+     Export Networks
+
+     Create one or more networks from a pathway model and pickle the results.
+
+     Parameters
+     ----------
+     sbml_dfs : sbml_dfs_core.SBML_dfs
+         A pathway model
+     model_prefix : str
+         Label to prepend to all exported files
+     outdir : str
+         Path to an existing directory where results should be saved
+     directeds : list[bool]
+         Directedness settings to export: make a directed (True) and/or undirected (False) graph
+     graph_types : list[str]
+         Types of graphs to construct, valid values are:
+             - bipartite: substrates and modifiers point to the reaction they drive, this reaction points to products
+             - regulatory: non-enzymatic modifiers point to enzymes, enzymes point to substrates and products
+
+     Returns
+     -------
+     None
+     """
+
+     if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
+         raise TypeError(
+             f"sbml_dfs must be a sbml_dfs_core.SBML_dfs, but was {type(sbml_dfs)}"
+         )
+     if not isinstance(model_prefix, str):
+         raise TypeError(f"model_prefix was a {type(model_prefix)} and must be a str")
+     if not os.path.isdir(outdir):
+         raise FileNotFoundError(f"{outdir} does not exist")
+     if not isinstance(directeds, list):
+         raise TypeError(f"directeds must be a list, but was {type(directeds)}")
+     if not isinstance(graph_types, list):
+         raise TypeError(f"graph_types must be a list but was a {type(graph_types)}")
+
+     # iterate through provided graph_types and export each type
+     for graph_type in graph_types:
+         for directed in directeds:
+             export_pkl_path = _create_network_save_string(
+                 model_prefix=model_prefix,
+                 outdir=outdir,
+                 directed=directed,
+                 graph_type=graph_type,
+             )
+             print(f"Exporting {graph_type} network to {export_pkl_path}")
+
+             network_graph = net_create.process_cpr_graph(
+                 sbml_dfs=sbml_dfs,
+                 directed=directed,
+                 graph_type=graph_type,
+                 verbose=True,
+             )
+
+             network_graph.write_pickle(export_pkl_path)
+
+     return None
+
+
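+ # Usage sketch (hypothetical paths; "sbml_dfs" is a loaded model and the output
+ # directory must already exist):
+ #
+ #     export_networks(sbml_dfs, model_prefix="reactome", outdir="/tmp/networks")
+ #     # writes e.g. /tmp/networks/reactome_network_bipartite_directed.pkl
+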
+ def read_network_pkl(
+     model_prefix: str,
+     network_dir: str,
+     graph_type: str,
+     directed: bool = True,
+ ) -> ig.Graph:
+     """
+     Read Network Pickle
+
+     Read a saved network representation.
+
+     Parameters
+     ----------
+     model_prefix : str
+         Prefix of the saved model to import
+     network_dir : str
+         Path to a directory containing all saved networks.
+     graph_type : str
+         Type of graph to read, valid values are:
+             - bipartite: substrates and modifiers point to the reaction they drive, this reaction points to products
+             - regulatory: non-enzymatic modifiers point to enzymes, enzymes point to substrates and products
+     directed : bool
+         Should a directed (True) or undirected (False) graph be loaded
+
+     Returns
+     -------
+     network_graph : igraph.Graph
+         An igraph network of the pathway
+     """
+
+     if not isinstance(model_prefix, str):
+         raise TypeError(f"model_prefix was a {type(model_prefix)} and must be a str")
+     if not os.path.isdir(network_dir):
+         raise FileNotFoundError(f"{network_dir} does not exist")
+     if not isinstance(directed, bool):
+         raise TypeError(f"directed must be a bool, but was {type(directed)}")
+     if not isinstance(graph_type, str):
+         raise TypeError(f"graph_type must be a str but was a {type(graph_type)}")
+
+     import_pkl_path = _create_network_save_string(
+         model_prefix, network_dir, directed, graph_type
+     )
+     if not os.path.isfile(import_pkl_path):
+         raise FileNotFoundError(f"{import_pkl_path} does not exist")
+     print(f"Importing {graph_type} network from {import_pkl_path}")
+
+     network_graph = ig.Graph.Read_Pickle(fname=import_pkl_path)
+
+     return network_graph
+
+
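+ # Usage sketch: pairs with export_networks above, so the prefix, directory,
+ # graph_type, and directed arguments must match what was exported:
+ #
+ #     graph = read_network_pkl("reactome", "/tmp/networks", graph_type="bipartite", directed=True)
+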
+ def filter_to_largest_subgraph(cpr_graph: ig.Graph) -> ig.Graph:
+     """Filter a graph to its largest weakly connected component."""
+
+     component_members = cpr_graph.components(mode="weak")
+     component_sizes = [len(x) for x in component_members]
+
+     top_component_members = [
+         m
+         for s, m in zip(component_sizes, component_members)
+         if s == max(component_sizes)
+     ][0]
+
+     largest_subgraph = cpr_graph.induced_subgraph(top_component_members)
+
+     return largest_subgraph
+
+
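+ # Usage sketch: for a toy graph with components {0, 1} and {2, 3, 4}, only the
+ # three-vertex component survives:
+ #
+ #     filter_to_largest_subgraph(ig.Graph([(0, 1), (2, 3), (3, 4)])).vcount()
+ #     # -> 3
+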
+ def validate_assets(
+     sbml_dfs: sbml_dfs_core.SBML_dfs,
+     cpr_graph: ig.Graph,
+     precomputed_distances: pd.DataFrame,
+     identifiers_df: pd.DataFrame,
+ ) -> None:
+     """
+     Validate Assets
+
+     Perform a few quick checks of inputs to catch inconsistencies.
+
+     Args:
+         sbml_dfs (sbml_dfs_core.SBML_dfs):
+             A pathway representation.
+         cpr_graph (igraph.Graph):
+             A network-based representation of "sbml_dfs".
+         precomputed_distances (pd.DataFrame):
+             Precomputed distances between vertices in "cpr_graph".
+         identifiers_df (pd.DataFrame):
+             A table of systematic identifiers for compartmentalized species in "sbml_dfs".
+
+     Returns:
+         None
+     """
+
+     # compare cpr_graph to sbml_dfs
+     # test for consistent sc_id to sc_name mappings
+     _validate_assets_sbml_graph(sbml_dfs, cpr_graph)
+
+     # compare precomputed_distances to cpr_graph
+     # test whether directly connected sc_ids are in the same reaction
+     _validate_assets_graph_dist(cpr_graph, precomputed_distances)
+
+     # compare identifiers_df to sbml_dfs
+     # do the (s_id, s_name) tuples in identifiers_df match those in sbml_dfs?
+     _validate_assets_sbml_ids(sbml_dfs, identifiers_df)
+
+     return None
+
+
+ def cpr_graph_to_pandas_dfs(cpr_graph: ig.Graph):
+     """
+     CPR Graph to Pandas DataFrames
+
+     Take an igraph representation of a network and turn it into vertices and edges tables.
+
+     Args:
+         cpr_graph (ig.Graph): an igraph network
+
+     Returns:
+         vertices (pd.DataFrame):
+             A table with one row per vertex.
+         edges (pd.DataFrame):
+             A table with one row per edge.
+     """
+
+     vertices = pd.DataFrame(
+         [{**{"index": v.index}, **v.attributes()} for v in cpr_graph.vs]
+     )
+     edges = pd.DataFrame(
+         [
+             {**{"source": e.source, "target": e.target}, **e.attributes()}
+             for e in cpr_graph.es
+         ]
+     )
+
+     return vertices, edges
+
+
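+ # Usage sketch: a two-edge toy graph yields a three-row vertex table and a
+ # two-row edge table:
+ #
+ #     vertices, edges = cpr_graph_to_pandas_dfs(ig.Graph([(0, 1), (1, 2)]))
+ #     # vertices.shape[0] == 3; edges has "source" and "target" columns
+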
+ def safe_fill(x: str, fill_width: int = 15) -> str:
+     """Wrap a string to "fill_width" characters, passing empty strings through."""
+
+     if x == "":
+         return ""
+     else:
+         return textwrap.fill(x, fill_width)
+
+
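+ # Usage sketch: with the default width of 15 characters, longer labels wrap onto
+ # multiple lines:
+ #
+ #     safe_fill("a long compartment name")
+ #     # -> "a long\ncompartment\nname"
+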
+ def read_graph_attrs_spec(graph_attrs_spec_uri: str) -> dict:
+     """Read a YAML file containing the specification for adding reaction- and/or species-attributes to a cpr_graph."""
+
+     with open(graph_attrs_spec_uri) as f:
+         graph_attrs_spec = yaml.safe_load(f)
+
+     VALID_SPEC_SECTIONS = ["species", "reactions"]
+     defined_spec_sections = set(graph_attrs_spec.keys()).intersection(
+         VALID_SPEC_SECTIONS
+     )
+
+     if len(defined_spec_sections) == 0:
+         raise ValueError(
+             f"The provided graph attributes spec did not contain either of the expected sections: {', '.join(VALID_SPEC_SECTIONS)}"
+         )
+
+     if "reactions" in defined_spec_sections:
+         net_create._validate_entity_attrs(graph_attrs_spec["reactions"])
+
+     if "species" in defined_spec_sections:
+         net_create._validate_entity_attrs(graph_attrs_spec["species"])
+
+     return graph_attrs_spec
+
+
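+ # Usage sketch (hypothetical file and attribute names): the spec needs a "species"
+ # and/or "reactions" section; the shape of each entry is checked by
+ # net_create._validate_entity_attrs rather than here:
+ #
+ #     # graph_attrs.yaml
+ #     # reactions:
+ #     #   my_reaction_attr: {...}
+ #
+ #     spec = read_graph_attrs_spec("graph_attrs.yaml")
+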
+ def _create_network_save_string(
+     model_prefix: str, outdir: str, directed: bool, graph_type: str
+ ) -> str:
+     if directed:
+         directed_str = "directed"
+     else:
+         directed_str = "undirected"
+
+     export_pkl_path = os.path.join(
+         outdir, model_prefix + "_network_" + graph_type + "_" + directed_str + ".pkl"
+     )
+
+     return export_pkl_path
+
+
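+ # Usage sketch:
+ #
+ #     _create_network_save_string("reactome", "/tmp", True, "regulatory")
+ #     # -> "/tmp/reactome_network_regulatory_directed.pkl"
+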
+ def _create_induced_subgraph(
+     cpr_graph: ig.Graph, vertices=None, n_vertices: int = 5000
+ ) -> ig.Graph:
+     """Create a subgraph from a set of vertices (or a random sample) and their connections."""
+
+     if vertices is not None:
+         selected_vertices = vertices
+     else:
+         vertex_names = cpr_graph.vs[CPR_GRAPH_NODES.NAME]
+         selected_vertices = random.sample(vertex_names, n_vertices)
+
+     subgraph = cpr_graph.induced_subgraph(selected_vertices)
+
+     return subgraph
+
+
+ def _validate_assets_sbml_graph(
+     sbml_dfs: sbml_dfs_core.SBML_dfs, cpr_graph: ig.Graph
+ ) -> None:
+     """Check an sbml_dfs model and cpr_graph for inconsistencies."""
+
+     vertices = pd.DataFrame(
+         [{**{"index": v.index}, **v.attributes()} for v in cpr_graph.vs]
+     )
+
+     matched_cspecies = sbml_dfs.compartmentalized_species.reset_index()[
+         ["sc_id", "sc_name"]
+     ].merge(
+         vertices.query("node_type == 'species'"),
+         left_on=["sc_id"],
+         right_on=["name"],
+     )
+
+     mismatched_names = [
+         f"{x} != {y}"
+         for x, y in zip(matched_cspecies["sc_name"], matched_cspecies["node_name"])
+         if x != y
+     ]
+
+     if len(mismatched_names) > 0:
+         example_names = mismatched_names[: min(10, len(mismatched_names))]
+
+         raise ValueError(
+             f"{len(mismatched_names)} species names do not match between sbml_dfs and cpr_graph: {example_names}"
+         )
+
+     return None
+
+
+ def _validate_assets_graph_dist(
+     cpr_graph: ig.Graph, precomputed_distances: pd.DataFrame
+ ) -> None:
+     """Check a cpr_graph and precomputed distances table for inconsistencies."""
+
+     edges = pd.DataFrame(
+         [{**{"index": e.index}, **e.attributes()} for e in cpr_graph.es]
+     )
+
+     direct_interactions = precomputed_distances.query("path_length == 1")
+
+     edges_with_distances = direct_interactions.merge(
+         edges[["from", "to", "weights", "upstream_weights"]],
+         left_on=["sc_id_origin", "sc_id_dest"],
+         right_on=["from", "to"],
+     )
+
+     inconsistent_weights = edges_with_distances.query("path_weights != weights")
+     if inconsistent_weights.shape[0] > 0:
+         logger.warning(
+             f"{inconsistent_weights.shape[0]} edges' weights are inconsistent between "
+             "edges in the cpr_graph and length 1 paths in precomputed_distances. "
+             f"This is {inconsistent_weights.shape[0] / edges_with_distances.shape[0]:.2%} of all edges."
+         )
+
+     return None
+
+
+ def _validate_assets_sbml_ids(
+     sbml_dfs: sbml_dfs_core.SBML_dfs, identifiers_df: pd.DataFrame
+ ) -> None:
+     """Check an sbml_dfs file and identifiers table for inconsistencies."""
+
+     joined_species_w_ids = sbml_dfs.species.merge(
+         identifiers_df[["s_id", "s_name"]].drop_duplicates(),
+         left_index=True,
+         right_on="s_id",
+     )
+
+     inconsistent_names_df = joined_species_w_ids.query("s_name_x != s_name_y").dropna()
+     inconsistent_names_list = [
+         f"{x} != {y}"
+         for x, y in zip(
+             inconsistent_names_df["s_name_x"], inconsistent_names_df["s_name_y"]
+         )
+     ]
+
+     if len(inconsistent_names_list):
+         example_inconsistent_names = inconsistent_names_list[
+             : min(10, len(inconsistent_names_list))
+         ]
+
+         raise ValueError(
+             f"{len(inconsistent_names_list)} species names do not match between "
+             f"sbml_dfs and identifiers_df including: {', '.join(example_inconsistent_names)}"
+         )
+
+     return None
+
+
+ def _get_top_n_idx(arr: Sequence, n: int, ascending: bool = False) -> Sequence[int]:
+     """Returns the indices of the top n values in an array
+
+     Args:
+         arr (Sequence): An array of values
+         n (int): The number of top values to return
+         ascending (bool, optional): Whether to return the bottom (True) or top (False) n values. Defaults to False.
+
+     Returns:
+         Sequence[int]: The indices of the top n values
+     """
+     order = np.argsort(arr)
+     if ascending:
+         return order[:n]  # type: ignore
+     else:
+         return order[-n:][::-1]  # type: ignore
+
+
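+ # Usage sketch: with the default descending order, indices of the largest values
+ # come first:
+ #
+ #     _get_top_n_idx([3, 1, 2], 2)
+ #     # -> array([0, 2]) (the indices of 3 and 2)
+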
+ def _get_top_n_objects(
+     object_vals: Sequence, objects: Sequence, n: int = 10, ascending: bool = False
+ ) -> list:
+     """Get the top N objects based on a ranking measure."""
+
+     idxs = _get_top_n_idx(object_vals, n, ascending=ascending)
+     top_objects = [objects[idx] for idx in idxs]
+     return top_objects
+
+
+ def _get_top_n_component_stats(
+     graph: ig.Graph,
+     components,
+     component_sizes: Sequence[int],
+     n: int = 10,
+     ascending: bool = False,
+ ) -> list[dict[str, Any]]:
+     """Summarize the top N components' network properties."""
+
+     top_components = _get_top_n_objects(component_sizes, components, n, ascending)
+     top_component_stats = [
+         {"n": len(c), "examples": [graph.vs[v].attributes() for v in c[:10]]}
+         for c in top_components
+     ]
+     return top_component_stats
+
+
+ def _get_top_n_nodes(
+     graph: ig.Graph, vals: Sequence, val_name: str, n: int = 10, ascending: bool = False
+ ) -> list[dict[str, Any]]:
+     """Get the top N nodes by a node attribute."""
+
+     top_idxs = _get_top_n_idx(vals, n, ascending=ascending)
+     top_node_attrs = [graph.vs[idx].attributes() for idx in top_idxs]
+     top_vals = [vals[idx] for idx in top_idxs]
+     return [{val_name: val, **node} for val, node in zip(top_vals, top_node_attrs)]
+
+
+ def _validate_edge_attributes(graph: ig.Graph, edge_attributes: list[str]) -> None:
+     """Check for the existence of one or more edge attributes."""
+
+     if isinstance(edge_attributes, list):
+         attrs = edge_attributes
+     elif isinstance(edge_attributes, str):
+         attrs = [edge_attributes]
+     else:
+         raise TypeError('"edge_attributes" must be a list or str')
+
+     available_attributes = graph.es[0].attributes().keys()
+     missing_attributes = set(attrs).difference(available_attributes)
+     n_missing_attrs = len(missing_attributes)
+
+     if n_missing_attrs > 0:
+         raise ValueError(
+             f"{n_missing_attrs} edge attributes were missing ({', '.join(missing_attributes)}). "
+             f"The available edge attributes are {', '.join(available_attributes)}"
+         )
+
+     return None
+
+
+ def _validate_vertex_attributes(graph: ig.Graph, vertex_attributes: list[str]) -> None:
+     """Check for the existence of one or more vertex attributes."""
+
+     if isinstance(vertex_attributes, list):
+         attrs = vertex_attributes
+     elif isinstance(vertex_attributes, str):
+         attrs = [vertex_attributes]
+     else:
+         raise TypeError('"vertex_attributes" must be a list or str')
+
+     available_attributes = graph.vs[0].attributes().keys()
+     missing_attributes = set(attrs).difference(available_attributes)
+     n_missing_attrs = len(missing_attributes)
+
+     if n_missing_attrs > 0:
+         raise ValueError(
+             f"{n_missing_attrs} vertex attributes were missing ({', '.join(missing_attributes)}). "
+             f"The available vertex attributes are {', '.join(available_attributes)}"
+         )
+
+     return None