napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,500 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import math
5
+ import warnings
6
+ from typing import Any
7
+
8
+ import igraph as ig
9
+ import pandas as pd
10
+ from napistu import sbml_dfs_core
11
+ from napistu import utils
12
+ from napistu.constants import CPR_PATH_REQ_VARS
13
+ from napistu.constants import MINI_SBO_NAME_TO_POLARITY
14
+ from napistu.constants import MINI_SBO_TO_NAME
15
+ from napistu.network import net_utils
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def find_shortest_reaction_paths(
    cpr_graph: ig.Graph,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    origin: str,
    dest: str | list,
    weight_var: str,
) -> tuple[pd.DataFrame, pd.DataFrame] | None:
    """
    Shortest Reaction Paths

    Find all shortest paths between an origin and destination entity

    Parameters
    ----------
    cpr_graph : igraph.Graph
        A bipartite network connecting molecular species and reactions
    sbml_dfs : sbml_dfs_core.SBML_dfs
        A model formed by aggregating pathways
    origin : str
        A node to start at
    dest : str | list
        Node(s) to reach
    weight_var : str
        An edge attribute to use when forming a weighted shortest path

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame] | None
        ``(paths_df, edges_df)``: one row per node visited in each shortest
        path, and one row per edge traversed in each shortest path.
        ``None`` is returned when the destination is (only) the origin
        itself or when no shortest path was found.

    Raises
    ------
    ValueError
        For an undirected graph, if the edge attributes merged onto a path
        do not line up one-to-one with the steps of that path.
    """

    # update destination list to exclude the origin vertex
    if isinstance(dest, str):
        if origin == dest:
            logger.info("origin = dest; returning None")
            return None
    elif isinstance(dest, list):
        # drop list entries which are the origin itself
        dest = [d for d in dest if d != origin]
        if not dest:
            logger.info("origin = dest; returning None")
            return None

    # the search is run exactly once; it is the expensive step of this function
    with warnings.catch_warnings():
        # igraph throws warnings for each pair of unconnected species
        warnings.simplefilter("ignore")

        shortest_paths = cpr_graph.get_all_shortest_paths(
            origin, to=dest, weights=weight_var
        )

    if len(shortest_paths) == 0:
        return None

    # summarize the graph which is being evaluated
    cpr_graph_names = cpr_graph.vs["name"]

    cpr_graph_edges = pd.DataFrame(
        {
            "from": cpr_graph.es.get_attribute_values("from"),
            "to": cpr_graph.es.get_attribute_values("to"),
            "weights": cpr_graph.es.get_attribute_values(weight_var),
            "sbo_term": cpr_graph.es.get_attribute_values("sbo_term"),
            "direction": cpr_graph.es.get_attribute_values("direction"),
        }
    )

    directed = cpr_graph.is_directed()

    # format shortest paths
    # summaries of nodes
    path_list = []
    # summaries of edges
    edge_list = []

    for entry, path in enumerate(shortest_paths):
        path_df = (
            pd.DataFrame({"node": [cpr_graph_names[x] for x in path]})
            .reset_index()
            .rename(columns={"index": "step"})
            .assign(path=entry)
        )
        path_df["node_number"] = path

        # reconstruct edges from consecutive pairs of nodes
        path_edges = pd.DataFrame(
            {"from": path_df["node"][:-1].tolist(), "to": path_df["node"][1:].tolist()}
        ).assign(path=entry)

        # add weights to edges
        if directed:
            path_edges = path_edges.merge(
                cpr_graph_edges,
                left_on=["from", "to"],
                right_on=["from", "to"],
            )

            path_edges["link_polarity"] = (
                path_edges["sbo_term"]
                .map(MINI_SBO_TO_NAME)
                .map(MINI_SBO_NAME_TO_POLARITY)
            )
            # is the edge predicted to be activating, inhibiting or ambiguous?
            path_edges["net_polarity"] = _calculate_net_polarity(
                path_edges["link_polarity"]
            )

        else:
            # if undirected then edges utilized may not be defined in the edgelist
            path_edges["step"] = range(0, path_edges.shape[0])

            # allow for matching in either orientation, then
            # keep at most 1 entry per step (the lowest-weight one)
            path_edges = (
                pd.concat(
                    [
                        path_edges,
                        path_edges.rename(columns={"to": "from", "from": "to"}),
                    ]
                )
                .merge(
                    cpr_graph_edges,
                    left_on=["from", "to"],
                    right_on=["from", "to"],
                )
                .sort_values(["step", "weights"])
                .groupby("step")
                .first()
                .reset_index()
            )

            if path_edges.shape[0] != path_df.shape[0] - 1:
                raise ValueError(
                    "Something has gone wrong when merging attribute onto undirected edges"
                )

            # in an undirected graph it wouldn't make sense to use molecule's
            # SBO terms to determine the likely sign of a regulatory effect
            path_edges = path_edges.assign(link_polarity="ambiguous").assign(
                net_polarity="ambiguous"
            )

            # resort to recover the pre-merge order
            path_edges = path_edges.sort_values(["path", "step"]).drop("step", axis=1)

        # add weights to nodes; the first node of a path has no incoming edge
        path_df["weights"] = [0] + path_edges["weights"].tolist()

        path_list.append(path_df)
        edge_list.append(path_edges)

    paths_df_raw = pd.concat(path_list).reset_index(drop=True)
    edges_df = pd.concat(edge_list).reset_index(drop=True)

    # annotate reactions (None when the paths contain no reaction nodes;
    # pd.concat below drops None entries)
    labelled_reactions = _label_path_reactions(sbml_dfs, paths_df_raw)

    # annotate species: the inner merge keeps only the nodes which are
    # compartmentalized species and brings in their name and species id
    labelled_species = (
        paths_df_raw.merge(
            sbml_dfs.compartmentalized_species,
            left_on="node",
            right_index=True,
            how="inner",
        )[["step", "node", "path", "sc_name", "node_number", "weights", "s_id"]]
        .rename(columns={"sc_name": "label"})
        .assign(node_type="species")
    )

    # add uri urls
    labelled_species = labelled_species.merge(
        sbml_dfs.get_uri_urls("species", labelled_species["s_id"].tolist()),
        left_on="s_id",
        right_index=True,
        how="left",
    ).drop("s_id", axis=1)

    paths_df = (
        pd.concat([labelled_reactions, labelled_species])
        .sort_values(["path", "step"])
        .fillna("")
    )

    return paths_df, edges_df
221
+
222
+
223
def find_all_shortest_reaction_paths(
    cpr_graph: ig.Graph,
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    target_species_paths: pd.DataFrame,
    weight_var: str = "weights",
    precomputed_distances: pd.DataFrame | None = None,
):
    """
    All Shortest Reaction Paths

    Run a shortest-path search for every origin -> destination pair and pool
    the results into combined tables plus a small graph of the paths found.

    Parameters
    ----------
    cpr_graph : igraph.Graph
        A bipartite network connecting molecular species and reactions
    sbml_dfs : SBML_dfs
        A model formed by aggregating pathways
    target_species_paths : pd.DataFrame
        Pairs of source and destination compartmentalized species; produced by compartmentalize_species_pairs()
    weight_var : str
        An edge attribute to use when forming a weighted shortest path
    precomputed_distances : pd.DataFrame | None
        A table containing precalculated path summaries between pairs of compartmentalized species

    Returns
    -------
    all_shortest_reaction_paths_df : pd.DataFrame
        Nodes in all shortest paths
    all_shortest_reaction_path_edges_df : pd.DataFrame
        Edges in all shortest paths
    edge_sources : pd.DataFrame
        Sources of edge identifying the models where they originated
    paths_graph : igraph.Graph
        Network formed by all shortest paths

    Raises
    ------
    TypeError
        If weight_var is not a str
    ValueError
        If no path was found for any of the requested pairs
    """

    if not isinstance(weight_var, str):
        raise TypeError(f"weight_var must be a str, but was {type(weight_var)}")

    # filter to valid paths if precomputed distances are provided
    target_species_paths = _filter_paths_by_precomputed_distances(
        target_species_paths, precomputed_distances
    )

    # one search per origin -> destination pair
    node_tables = []
    edge_tables = []
    for _, one_search in target_species_paths.iterrows():
        origin_id = one_search["sc_id_origin"]
        dest_id = one_search["sc_id_dest"]

        found = find_shortest_reaction_paths(
            cpr_graph,
            sbml_dfs,
            origin=origin_id,
            dest=dest_id,
            weight_var=weight_var,
        )
        if found is None:
            # nothing to record for this pair
            continue

        pair_nodes, pair_edges = found
        node_tables.append(pair_nodes.assign(origin=origin_id, dest=dest_id))
        edge_tables.append(pair_edges.assign(origin=origin_id, dest=dest_id))

    if not node_tables or not edge_tables:
        raise ValueError("No paths found")

    # reset_index() keeps each table's former index as an "index" column
    all_shortest_reaction_paths_df = pd.concat(node_tables).reset_index()
    all_shortest_reaction_path_edges_df = pd.concat(edge_tables).reset_index()

    # add a minimal set of pathway sources to organize reactions
    edge_sources = net_utils.get_minimal_sources_edges(
        all_shortest_reaction_paths_df, sbml_dfs
    )

    # create a new small network of shortest paths: one vertex per distinct
    # node, without the columns which only describe a node's place in a path
    path_specific_columns = ["index", "step", "path", "origin", "dest"]
    unique_path_nodes = (
        all_shortest_reaction_paths_df.groupby(["node"])
        .first()
        .reset_index()
        .drop(columns=path_specific_columns)
    )

    paths_graph = ig.Graph.DictList(
        vertices=unique_path_nodes.to_dict("records"),
        edges=all_shortest_reaction_path_edges_df.to_dict("records"),
        directed=cpr_graph.is_directed(),
        vertex_name_attr="node",
        edge_foreign_keys=("from", "to"),
    )

    return (
        all_shortest_reaction_paths_df,
        all_shortest_reaction_path_edges_df,
        edge_sources,
        paths_graph,
    )
340
+
341
+
342
def plot_shortest_paths(paths_graph: ig.Graph) -> ig.plot:
    """
    Plot a shortest paths graph.

    Vertices are colored by their "node_type" attribute and edge widths are
    the square root of the "weights" edge attribute. If the graph has no
    "label" vertex attribute, an empty one is set on the graph passed in.

    Parameters
    ----------
    paths_graph : igraph.Graph
        A graph of shortest paths, e.g., the graph returned by
        find_all_shortest_reaction_paths()

    Returns
    -------
    The object returned by igraph.plot()
    """

    if "label" not in paths_graph.vs.attributes():
        logger.warning(
            "label was not defined as a vertex attribute so paths will not be colored"
        )
        paths_graph.vs.set_attribute_values("label", "")

    node_palette = {"reaction": "dodgerblue", "species": "firebrick"}
    kk_layout = paths_graph.layout("kk")

    plot_options: dict[str, Any] = {
        "background": "black",
        "vertex_size": 10,
        "vertex_label": [net_utils.safe_fill(lbl) for lbl in paths_graph.vs["label"]],
        "vertex_label_color": "white",
        "vertex_label_size": 8,
        "vertex_label_angle": 90,
        "vertex_color": [node_palette[kind] for kind in paths_graph.vs["node_type"]],
        "edge_width": [math.sqrt(w) for w in paths_graph.es["weights"]],
        "edge_color": "dimgray",
        "layout": kk_layout,
        "bbox": (2000, 2000),
        "margin": 50,
    }

    return ig.plot(paths_graph, **plot_options)
372
+
373
+
374
def _filter_paths_by_precomputed_distances(
    all_species_pairs: pd.DataFrame, precomputed_distances: pd.DataFrame | None = None
) -> pd.DataFrame:
    """
    Filter source -> destination pairs based on precomputed distances if they were provided.

    Parameters
    ----------
    all_species_pairs : pd.DataFrame
        Candidate origin -> destination pairs; must contain the CPR_PATH_REQ_VARS columns
    precomputed_distances : pd.DataFrame | None
        Optional table of pairs with a known path; must contain the CPR_PATH_REQ_VARS columns

    Returns
    -------
    pd.DataFrame
        all_species_pairs unchanged when no distances were provided; otherwise
        its merge with the (sc_id_origin, sc_id_dest) pairs in precomputed_distances

    Raises
    ------
    TypeError
        If precomputed_distances is neither None nor a pd.DataFrame
    """

    utils.match_pd_vars(all_species_pairs, CPR_PATH_REQ_VARS).assert_present()

    # nothing to filter against
    if precomputed_distances is None:
        logger.info(
            "precomputed_distances were not provided; all paths will be calculated on-the-fly"
        )
        return all_species_pairs

    if not isinstance(precomputed_distances, pd.DataFrame):
        raise TypeError('"precomputed_distances" must be a pd.DataFrame')

    utils.match_pd_vars(precomputed_distances, CPR_PATH_REQ_VARS).assert_present()

    # filter to pairs which are connected in the pre-computed distances table;
    # the merge joins on the columns shared by both tables
    connected_pairs = precomputed_distances[["sc_id_origin", "sc_id_dest"]]
    return all_species_pairs.merge(connected_pairs)
398
+
399
+
400
def _calculate_net_polarity(link_polarity_series: pd.Series) -> list[str]:
    """
    Determine whether a path implies activation, inhibition, or an ambiguous regulatory relationship.

    Parameters
    ----------
    link_polarity_series : pd.Series
        A Series named "link_polarity" holding the polarity of each
        consecutive step of a path: "activation", "inhibition" or "ambiguous"

    Returns
    -------
    list[str]
        The cumulative polarity after each step, one entry per step. Every
        "inhibition" step flips the running sign. Once an "ambiguous" step
        has been seen, every later entry is prefixed with "ambiguous ". If
        all steps are ambiguous, each entry is simply "ambiguous".

    Raises
    ------
    ValueError
        If any entry is not a valid polarity; this includes missing values
        such as NaN
    """

    assert isinstance(link_polarity_series, pd.Series)
    assert link_polarity_series.name == "link_polarity"

    valid_polarities = {"activation", "inhibition", "ambiguous"}
    invalid_polarities = set(link_polarity_series.tolist()) - valid_polarities
    if invalid_polarities:
        # str() each entry so non-string values (e.g., NaN from an unmapped
        # term) are reported in the ValueError rather than breaking join()
        invalid_labels = sorted({str(p) for p in invalid_polarities})
        raise ValueError(
            f"Some edge polarities were invalid: {', '.join(invalid_labels)}. "
            f"Valid polarities are {', '.join(sorted(valid_polarities))}."
        )

    # catch fully ambiguous case
    if link_polarity_series.eq("ambiguous").all():
        return ["ambiguous"] * link_polarity_series.shape[0]

    # walk through the link polarities to determine the cumulative polarity,
    # accounting for inhibition steps which flip polarity and ambiguous steps
    # which add an ambiguous label to the net result
    running_polarity: list[str] = []
    current_polarity = 1
    ambig_prefix = ""

    for polarity in link_polarity_series:
        if polarity == "ambiguous":
            # once a polarity becomes ambiguous it is stuck
            ambig_prefix = "ambiguous "
        elif polarity == "inhibition":
            current_polarity = -current_polarity

        net_label = "activation" if current_polarity == 1 else "inhibition"
        running_polarity.append(ambig_prefix + net_label)

    return running_polarity
445
+
446
+
447
def _terminal_net_polarity(link_polarity_series: pd.Series) -> str:
    """
    Figure out the net polarity for the vertex at the end of a path.

    Parameters
    ----------
    link_polarity_series : pd.Series
        Per-step link polarities, as accepted by _calculate_net_polarity()

    Returns
    -------
    str
        The last entry of the running net polarity
    """

    # the running polarity holds one entry per step; only the final one matters here
    return _calculate_net_polarity(link_polarity_series)[-1]
453
+
454
+
455
def _patch(x: Any) -> None:
    """Log a message; the argument is unused and exists only so `Any` is referenced."""
    logger.info("silly stub to define Any")
457
+
458
+
459
def _label_path_reactions(sbml_dfs: sbml_dfs_core.SBML_dfs, paths_df: pd.DataFrame):
    """
    Create labels for reactions in a shortest path.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        A model whose `reactions` table is indexed by reaction id
    paths_df : pd.DataFrame
        Path nodes, with the node identifier in a "node" column

    Returns
    -------
    pd.DataFrame | None
        The rows of paths_df which are reactions, with their reaction summary,
        a "label" column (the reaction's r_name), node_type = "reaction" and
        URI URLs added; None if no node in paths_df is a reaction
    """

    # find reactions among nodes, keeping only the columns paths_df started with
    original_columns = paths_df.columns.tolist()
    reaction_paths = paths_df.merge(
        sbml_dfs.reactions, left_on="node", right_index=True, how="inner"
    ).loc[:, original_columns]

    if reaction_paths.shape[0] == 0:
        # the path doesn't contain any reactions
        # this can happen with the "regulatory" model
        # network_type specification
        return None

    # add reaction label based off stoichiometry + the r_name
    # (one summary per distinct reaction; presumably each is a Series
    # indexed by reaction id - TODO confirm against reaction_summaries)
    summaries = [
        sbml_dfs_core.reaction_summaries(sbml_dfs, r_ids=r_id)
        for r_id in set(reaction_paths["node"])
    ]
    reaction_info = pd.concat(summaries).to_frame().join(sbml_dfs.reactions["r_name"])

    labelled_reactions = reaction_paths.merge(
        reaction_info, left_on="node", right_index=True
    )
    labelled_reactions = labelled_reactions.rename(columns={"r_name": "label"})
    labelled_reactions = labelled_reactions.assign(node_type="reaction")

    # add uri urls
    reaction_urls = sbml_dfs.get_uri_urls(
        "reactions", labelled_reactions["node"].tolist()
    )
    return labelled_reactions.merge(
        reaction_urls,
        left_on="node",
        right_index=True,
        how="left",
    )