napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,1594 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import math
5
+ import os
6
+ import pickle
7
+ import shutil
8
+ import textwrap
9
+ import warnings
10
+ from collections import ChainMap
11
+ from typing import Any
12
+
13
+ import igraph as ig
14
+ import numpy as np
15
+ import pandas as pd
16
+ from napistu import sbml_dfs_core
17
+ from napistu import utils
18
+ from napistu.network import net_utils
19
+ from napistu.network import paths
20
+
21
+ from napistu.constants import SBML_DFS
22
+ from napistu.constants import MINI_SBO_NAME_TO_POLARITY
23
+ from napistu.constants import MINI_SBO_TO_NAME
24
+
25
+ from napistu.network.constants import CPR_GRAPH_TYPES
26
+ from napistu.network.constants import NEIGHBORHOOD_NETWORK_TYPES
27
+ from napistu.network.constants import VALID_NEIGHBORHOOD_NETWORK_TYPES
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ def find_and_prune_neighborhoods(
33
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
34
+ cpr_graph: ig.Graph,
35
+ compartmentalized_species: str | list[str],
36
+ precomputed_distances: pd.DataFrame | None = None,
37
+ network_type: str = NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
38
+ order: int = 3,
39
+ verbose: bool = True,
40
+ top_n: int = 10,
41
+ ) -> dict[str, Any]:
42
+ """
43
+ Find and Prune Neighborhoods
44
+
45
+ Wrapper which combines find_neighborhoods() and prune_neighborhoods()
46
+
47
+ Parameters
48
+ ----------
49
+ sbml_dfs: sbml_dfs_core.SBML_dfs
50
+ A mechanistic molecular model
51
+ cpr_graph : igraph.Graph
52
+ A bipartite network connecting molecular species and reactions
53
+ compartmentalized_species : [str] or str
54
+ Compartmentalized species IDs for neighborhood centers
55
+ precomputed_distances : pd.DataFrame or None
56
+ If provided, an edgelist of origin->destination path weights and lengths
57
+ network_type: str
58
+ If the network is directed, should neighbors be located "downstream",
59
+ or "upstream" of each compartmentalized species. The "hourglass" option
60
+ locates both upstream and downstream species.
61
+ order: int
62
+ Max steps away from center node
63
+ verbose: bool
64
+ Extra reporting
65
+ top_n: int
66
+ How many neighboring molecular species should be retained?
67
+ If the neighborhood includes both upstream and downstream connections
68
+ (i.e., hourglass), this filter will be applied to both sets separately.
69
+
70
+ Returns:
71
+ ----------
72
+ A dict containing the neighborhood of each compartmentalized species.
73
+ Each entry in the dict is a dict of the subgraph, vertices, and edges.
74
+ """
75
+
76
+ if not isinstance(network_type, str):
77
+ raise TypeError(f"network_type was a {type(network_type)} and must be an str")
78
+
79
+ if not isinstance(order, int):
80
+ raise TypeError(f"order was a {type(order)} and must be an int")
81
+
82
+ if not isinstance(top_n, int):
83
+ raise TypeError(f"top_n was a {type(top_n)} and must be an int")
84
+
85
+ if isinstance(compartmentalized_species, str):
86
+ compartmentalized_species = [compartmentalized_species]
87
+ assert isinstance(compartmentalized_species, list)
88
+
89
+ if isinstance(precomputed_distances, pd.DataFrame):
90
+ logger.info("Pre-computed neighbors based on precomputed_distances")
91
+
92
+ precomputed_neighbors = _precompute_neighbors(
93
+ compartmentalized_species,
94
+ precomputed_distances=precomputed_distances,
95
+ sbml_dfs=sbml_dfs,
96
+ network_type=network_type,
97
+ order=order,
98
+ top_n=math.ceil(top_n * 1.1),  # pad top_n to allow for ties when using head()
99
+ )
100
+ else:
101
+ precomputed_neighbors = None
102
+
103
+ neighborhoods = find_neighborhoods(
104
+ sbml_dfs=sbml_dfs,
105
+ cpr_graph=cpr_graph,
106
+ compartmentalized_species=compartmentalized_species,
107
+ network_type=network_type,
108
+ order=order,
109
+ verbose=verbose,
110
+ precomputed_neighbors=precomputed_neighbors,
111
+ )
112
+
113
+ pruned_neighborhoods = prune_neighborhoods(neighborhoods, top_n=top_n)
114
+
115
+ return pruned_neighborhoods
116
+
117
+
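# Illustrative usage sketch (hypothetical objects and IDs; not part of the
# packaged module). Assuming an SBML_dfs model and its igraph network have
# already been constructed, a single compartmentalized species' neighborhood
# can be built and pruned in one call:
#
#     neighborhoods = find_and_prune_neighborhoods(
#         sbml_dfs,
#         cpr_graph,
#         compartmentalized_species="SC00000001",  # hypothetical sc_id
#         network_type=NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
#         order=3,
#         top_n=10,
#     )
#     neighborhoods["SC00000001"]["vertices"].head()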
118
+ def load_neighborhoods(
119
+ s_ids: list[str],
120
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
121
+ cpr_graph: ig.Graph,
122
+ output_dir: str,
123
+ network_type: str,
124
+ order: int,
125
+ top_n: int,
126
+ overwrite: bool = False,
127
+ verbose: bool = False,
128
+ ) -> tuple[pd.DataFrame, dict[str, Any]]:
129
+ """
130
+ Load Neighborhoods
131
+
132
+ Load existing neighborhoods if they exist
133
+ (and overwrite = False) and otherwise construct
134
+ neighborhoods using the provided settings
135
+
136
+ Parameters
137
+ ----------
138
+ s_ids: list(str)
139
+ create a neighborhood around each species
140
+ sbml_dfs: sbml_dfs_core.SBML_dfs
141
+ network model
142
+ cpr_graph: igraph.Graph
143
+ network associated with sbml_dfs
144
+ output_dir: str
145
+ path to existing output directory
146
+ network_type: str
147
+ downstream, upstream or hourglass (i.e., downstream and upstream)
148
+ order: int
149
+ maximum number of steps from the focal node
150
+ top_n: int
151
+ target number of upstream and downstream species to retain
152
+ overwrite: bool
153
+ ignore cached files and regenerate neighborhoods
154
+ verbose: bool
155
+ extra reporting
156
+
157
+ Returns
158
+ -------
159
+ all_neighborhoods_df: pd.DataFrame
160
+ A table containing all species in each query s_id's neighborhood
161
+ neighborhoods_dict: dict
162
+ Outputs from find_and_prune_neighborhoods for each s_id
163
+
164
+ """
165
+
166
+ if not os.path.isdir(output_dir):
167
+ raise FileNotFoundError(f"{output_dir} does not exist")
168
+
169
+ neighborhood_prefix = create_neighborhood_prefix(network_type, order, top_n)
170
+ vertices_path = os.path.join(output_dir, f"{neighborhood_prefix}_vertices.tsv")
171
+ networks_path = os.path.join(output_dir, f"{neighborhood_prefix}_networks.pkl")
172
+ neighborhood_paths = [vertices_path, networks_path]
173
+
174
+ if all([os.path.isfile(x) for x in neighborhood_paths]) and overwrite is False:
175
+ print(f"loading existing neighborhoods for {neighborhood_prefix}")
176
+
177
+ all_neighborhoods_df = pd.read_csv(vertices_path, sep="\t")
178
+ with open(networks_path, "rb") as in_file:
179
+ neighborhoods_dict = pickle.load(in_file)
180
+
181
+ else:
182
+ print(f"creating neighborhoods based on {neighborhood_prefix}")
183
+
184
+ all_neighborhoods_df, neighborhoods_dict = create_neighborhoods(
185
+ s_ids=s_ids,
186
+ sbml_dfs=sbml_dfs,
187
+ cpr_graph=cpr_graph,
188
+ network_type=network_type,
189
+ order=order,
190
+ top_n=top_n,
191
+ verbose=verbose,
192
+ )
193
+
194
+ # save df
195
+ all_neighborhoods_df.to_csv(vertices_path, sep="\t", index=False)
196
+
197
+ # pickle neighborhoods
198
+ with open(networks_path, "wb") as fh:
199
+ pickle.dump(neighborhoods_dict, fh)
200
+
201
+ return all_neighborhoods_df, neighborhoods_dict
202
+
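# Illustrative sketch of the caching behavior (hypothetical paths and IDs).
# With network_type="downstream", order=3, and top_n=10 the prefix is "d3s10n",
# so results are cached as "d3s10n_vertices.tsv" and "d3s10n_networks.pkl" in
# output_dir; a second call with the same settings and overwrite=False reads
# those files instead of rebuilding the neighborhoods:
#
#     all_neighborhoods_df, neighborhoods_dict = load_neighborhoods(
#         s_ids=["S00000001"],
#         sbml_dfs=sbml_dfs,
#         cpr_graph=cpr_graph,
#         output_dir="/tmp/neighborhoods",
#         network_type="downstream",
#         order=3,
#         top_n=10,
#     )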
203
+
204
+ def create_neighborhoods(
205
+ s_ids: list[str],
206
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
207
+ cpr_graph: ig.Graph,
208
+ network_type: str,
209
+ order: int,
210
+ top_n: int,
211
+ verbose: bool = False,
212
+ ) -> tuple[pd.DataFrame, dict]:
213
+ """
214
+ Create Neighborhoods
215
+
216
+ Create neighborhoods for a set of species and return a summary table of neighborhood vertices plus the per-species outputs
217
+
218
+ Parameters
219
+ ----------
220
+ s_ids: list(str)
221
+ create a neighborhood around each species
222
+ sbml_dfs: sbml_dfs_core.SBML_dfs
223
+ network model
224
+ cpr_graph: igraph.Graph
225
+ network associated with sbml_dfs
226
+ network_type: str
227
+ downstream, upstream or hourglass (i.e., downstream and upstream)
228
+ order: int
229
+ maximum number of steps from the focal node
230
+ top_n: int
231
+ target number of upstream and downstream species to retain
232
+ verbose: bool
233
+ extra reporting
234
+
235
+ Returns
236
+ -------
237
+ all_neighborhoods_df: pd.DataFrame
238
+ A table containing all species in each query s_ids neighborhood
239
+ neighborhoods_dict: dict
240
+ Outputs from find_and_prune_neighborhoods for each s_id
241
+ """
242
+
243
+ if not isinstance(s_ids, list):
244
+ raise TypeError(f"s_ids was a {type(s_ids)} and must be an list")
245
+
246
+ for s_id in s_ids:
247
+ if not isinstance(s_id, str):
248
+ raise TypeError(f"s_id was a {type(s_id)} and must be an str")
249
+
250
+ if not isinstance(network_type, str):
251
+ raise TypeError(f"network_type was a {type(network_type)} and must be an str")
252
+
253
+ if not isinstance(order, int):
254
+ raise TypeError(f"order was a {type(order)} and must be an int")
255
+
256
+ if not isinstance(top_n, int):
257
+ raise TypeError(f"top_n was a {type(top_n)} and must be an int")
258
+
259
+ neighborhoods_list = list()
260
+ neighborhoods_dict = dict()
261
+ for s_id in s_ids:
262
+ query_sc_species = net_utils.compartmentalize_species(sbml_dfs, s_id)
263
+
264
+ compartmentalized_species = query_sc_species[SBML_DFS.SC_ID].tolist()
265
+
266
+ neighborhoods = find_and_prune_neighborhoods(
267
+ sbml_dfs,
268
+ cpr_graph,
269
+ compartmentalized_species=compartmentalized_species,
270
+ network_type=network_type,
271
+ order=order,
272
+ top_n=top_n,
273
+ verbose=verbose,
274
+ )
275
+
276
+ # combine multiple neighborhoods
277
+
278
+ neighborhood_entities = pd.concat(
279
+ [
280
+ neighborhoods[sc_id]["vertices"].assign(focal_sc_id=sc_id)
281
+ for sc_id in neighborhoods.keys()
282
+ ]
283
+ ).assign(focal_s_id=s_id)
284
+
285
+ neighborhood_species = neighborhood_entities.merge(
286
+ sbml_dfs.compartmentalized_species[SBML_DFS.S_ID],
287
+ left_on="name",
288
+ right_index=True,
289
+ )
290
+
291
+ neighborhoods_list.append(neighborhood_species)
292
+ neighborhoods_dict[s_id] = neighborhoods
293
+
294
+ all_neighborhoods_df = pd.concat(neighborhoods_list).reset_index(drop=True)
295
+
296
+ return all_neighborhoods_df, neighborhoods_dict
297
+
298
+
299
+ def create_neighborhood_prefix(network_type: str, order: int, top_n: int) -> str:
300
+ if not isinstance(network_type, str):
301
+ raise TypeError(f"network_type was a {type(network_type)} and must be a str")
302
+
303
+ if network_type not in VALID_NEIGHBORHOOD_NETWORK_TYPES:
304
+ raise ValueError(
305
+ f"network_type was {network_type} and must be one of {', '.join(VALID_NEIGHBORHOOD_NETWORK_TYPES)}"
306
+ )
307
+ if not isinstance(order, int):
308
+ raise ValueError("order must be an int")
309
+ if not isinstance(top_n, int):
310
+ raise ValueError("top_n must be an int")
311
+
312
+ return f"{network_type[0]}{order}s{top_n}n"
313
+
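# For example, create_neighborhood_prefix("downstream", 3, 10) returns "d3s10n":
# the first letter of the network type, the order, "s", top_n, and "n". This
# prefix names the cached files written by load_neighborhoods().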
314
+
315
+ def load_neighborhoods_by_partition(
316
+ selected_partition: int,
317
+ neighborhood_outdir: str,
318
+ graph_type: str = CPR_GRAPH_TYPES.REGULATORY,
319
+ ) -> None:
320
+ """
321
+ Load Neighborhoods By Partition
322
+
323
+ Call load_neighborhoods for a subset of species ids defined by a partition.
324
+ This function is set up to be called in a slurm job.
325
+
326
+ Params
327
+ ------
328
+ selected_partition: int
329
+ A partition of s_ids to search
330
+ neighborhood_outdir: str
331
+ Output directory
332
+
333
+
334
+ Returns
335
+ -------
336
+ None, used for side-effects
337
+
338
+ """
339
+
340
+ consensus_root = "/group/cpr/consensus"
341
+ consensus_name = "reactome"
342
+ consensus_outdir = os.path.join(consensus_root, consensus_name)
343
+
344
+ if not os.path.isdir(neighborhood_outdir):
345
+ raise FileNotFoundError(f"{neighborhood_outdir} does not exist")
346
+
347
+ partition_output = os.path.join(
348
+ neighborhood_outdir, f"partition_{selected_partition}"
349
+ )
350
+ # initialize an empty output
351
+ if os.path.isdir(partition_output):
352
+ print(f"removing existing directory: {partition_output}")
353
+ shutil.rmtree(partition_output)
354
+ os.makedirs(partition_output)
355
+
356
+ # format partition s_ids
357
+
358
+ sids_to_partition = pd.read_csv(os.path.join(neighborhood_outdir, "partitions.csv"))
359
+ parition_sids_df = sids_to_partition[
360
+ sids_to_partition["partition"] == selected_partition
361
+ ]
362
+
363
+ if parition_sids_df.shape[0] == 0:
364
+ raise ValueError(f"No s_ids associated with partition {selected_partition}")
365
+
366
+ parition_sids = parition_sids_df["s_id"].tolist()
367
+
368
+ # read pathway and network data
369
+
370
+ # read the model containing Calico curations; this is primarily to support search programs.
371
+ # to exclude these curations, switch to refined.pkl
372
+ refined_model_pkl_path = os.path.join(consensus_outdir, "curated.pkl")
373
+ with open(refined_model_pkl_path, "rb") as in_file:
374
+ refined_model = pickle.load(in_file)
375
+ refined_model.validate()
376
+
377
+ # load the graph
378
+ cpr_graph = net_utils.read_network_pkl(
379
+ model_prefix="curated",
380
+ network_dir=consensus_outdir,
381
+ directed=True,
382
+ graph_type=graph_type,
383
+ )
384
+
385
+ all_neighborhoods_df, neighborhoods_dict = load_neighborhoods(
386
+ s_ids=parition_sids,
387
+ sbml_dfs=refined_model,
388
+ cpr_graph=cpr_graph,
389
+ output_dir=partition_output,
390
+ network_type="hourglass",
391
+ order=12,
392
+ top_n=100,
393
+ overwrite=True,
394
+ verbose=True,
395
+ )
396
+
397
+ return None
398
+
399
+
400
+ def read_paritioned_neighborhoods(
401
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
402
+ cpr_graph: ig.Graph,
403
+ partitions_path: str,
404
+ n_partitions: int = 200,
405
+ ) -> tuple[pd.DataFrame, dict[str, Any]]:
406
+ """
407
+ Read Partitioned Neighborhoods
408
+
409
+ Import a set of neighborhoods produced by the find_neighborhoods_batch.sh slurm job
410
+
411
+ Params
412
+ ------
413
+ sbml_dfs: sbml_dfs_core.SBML_dfs
414
+ network model
415
+ cpr_graph: igraph.Graph
416
+ network associated with sbml_dfs
417
+ partitions_path: str
418
+ Path to a directory containing folders for each partition's results
419
+ n_partitions: int
420
+ Number of partitions that exist
421
+
422
+ Returns
423
+ -------
424
+ all_neighborhoods_df: pd.DataFrame
425
+ A table containing all species in each query s_ids neighborhood
426
+ neighborhoods_dict: dict
427
+ Outputs from find_and_prune_neighborhoods for each s_id
428
+
429
+ """
430
+
431
+ # check for partition directories
432
+ expected_partition_dirs = ["partition_" + str(p) for p in range(0, n_partitions)]
433
+ missing_partition_dirs = set(expected_partition_dirs).difference(
434
+ set(os.listdir(partitions_path))
435
+ )
436
+ if len(missing_partition_dirs) != 0:
437
+ raise FileNotFoundError(
438
+ f"{len(missing_partition_dirs)} neighborhood partition directories were not found:"
439
+ f" {', '.join(missing_partition_dirs)}"
440
+ )
441
+
442
+ # check for required files
443
+ expected_files = ["h12s100n_vertices.tsv", "h12s100n_networks.pkl"]
444
+ expected_paths_df = pd.DataFrame(
445
+ [
446
+ {"partition": p, "file": f}
447
+ for p in expected_partition_dirs
448
+ for f in expected_files
449
+ ]
450
+ )
451
+ expected_paths_df["path"] = [
452
+ os.path.join(partitions_path, p, f)
453
+ for p, f in zip(expected_paths_df["partition"], expected_paths_df["file"])
454
+ ]
455
+ expected_paths_df["exists"] = [os.path.isfile(p) for p in expected_paths_df["path"]]
456
+ missing_expected_paths_df = expected_paths_df[~expected_paths_df["exists"]]
457
+
458
+ if missing_expected_paths_df.shape[0] > 0:
459
+ styled_df = utils.style_df(
460
+ missing_expected_paths_df.drop(["exists"], axis=1), headers="keys"
461
+ )
462
+ logger.warning(styled_df)
463
+
464
+ raise FileNotFoundError(
465
+ f"missing {missing_expected_paths_df.shape[0]} required files"
466
+ )
467
+
468
+ neighborhood_paths_list = list()
469
+ path_dict_list = list()
470
+
471
+ for p in expected_partition_dirs:
472
+ partition_paths, partition_dict = load_neighborhoods(
473
+ s_ids=["stub"],
474
+ sbml_dfs=sbml_dfs,
475
+ cpr_graph=cpr_graph,
476
+ output_dir=os.path.join(partitions_path, p),
477
+ # these settings define the neighborhood string so they must
478
+ # match the settings at the time of network generation
479
+ network_type="hourglass",
480
+ order=12,
481
+ top_n=100,
482
+ overwrite=False,
483
+ verbose=False,
484
+ )
485
+
486
+ neighborhood_paths_list.append(partition_paths)
487
+ path_dict_list.append(partition_dict)
488
+
489
+ # combine all partitions' dfs and dicts
490
+ all_neighborhoods_df = pd.concat(neighborhood_paths_list).reset_index(drop=True)
491
+ neighborhoods_dict = dict(ChainMap(*path_dict_list))
492
+
493
+ # TO DO - remove s_id duplication (these are present in the vertices table in the partition outputs)
494
+ if not all(all_neighborhoods_df["s_id_x"] == all_neighborhoods_df["s_id_y"]):
495
+ raise ValueError("The patch won't hold")
496
+ all_neighborhoods_df = all_neighborhoods_df.drop(["s_id_y"], axis=1).rename(
497
+ {"s_id_x": "s_id"}, axis=1
498
+ )
499
+
500
+ return all_neighborhoods_df, neighborhoods_dict
501
+
502
+
503
+ def find_neighborhoods(
504
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
505
+ cpr_graph: ig.Graph,
506
+ compartmentalized_species: list[str],
507
+ network_type: str = "downstream",
508
+ order: int = 3,
509
+ verbose: bool = True,
510
+ precomputed_neighbors: pd.DataFrame | None = None,
511
+ ) -> dict:
512
+ """
513
+ Find Neighborhoods
514
+
515
+ Create a network composed of all species and reactions within N steps of
516
+ each of a set of compartmentalized species.
517
+
518
+ Parameters
519
+ ----------
520
+ sbml_dfs: sbml_dfs_core.SBML_dfs
521
+ A mechanistic molecular model
522
+ cpr_graph : igraph.Graph
523
+ A network connecting molecular species and reactions
524
+ compartmentalized_species : [str]
525
+ Compartmentalized species IDs for neighborhood centers
526
+ network_type: str
527
+ If the network is directed, should neighbors be located "downstream",
528
+ or "upstream" of each compartmentalized species. The "hourglass" option
529
+ locates both upstream and downstream species.
530
+ order: int
531
+ Max steps away from center node
532
+ verbose: bool
533
+ Extra reporting
534
+ precomputed_neighbors: pd.DataFrame or None
535
+ If provided, a pre-filtered table of nodes nearby the compartmentalized species
536
+ which will be used to skip on-the-fly neighborhood generation.
537
+
538
+ Returns:
539
+ ----------
540
+ A dict containing the neighborhood of each compartmentalized species.
541
+ Each entry in the dict is a dict of the subgraph, vertices, and edges.
542
+ """
543
+
544
+ if not isinstance(network_type, str):
545
+ raise TypeError(f"network_type was a {type(network_type)} and must be a str")
546
+
547
+ valid_network_types = ["downstream", "upstream", "hourglass"]
548
+ if network_type not in valid_network_types:
549
+ raise ValueError(
550
+ f"network_type must be one of {', '.join(valid_network_types)}"
551
+ )
552
+
553
+ if not isinstance(order, int):
554
+ raise TypeError(f"order was a {type(order)} and must be an int")
555
+
556
+ # create a table which includes the cspecies and reactions nearby each of the
557
+ # focal compartmentalized_species
558
+ neighborhood_df = _build_raw_neighborhood_df(
559
+ cpr_graph=cpr_graph,
560
+ compartmentalized_species=compartmentalized_species,
561
+ network_type=network_type,
562
+ order=order,
563
+ precomputed_neighbors=precomputed_neighbors,
564
+ )
565
+
566
+ # format the vertices and edges in each compartmentalized species' network
567
+ neighborhood_dict = {
568
+ sc_id: create_neighborhood_dict_entry(
569
+ sc_id, neighborhood_df, sbml_dfs, cpr_graph, verbose=verbose
570
+ )
571
+ for sc_id in compartmentalized_species
572
+ }
573
+
574
+ return neighborhood_dict
575
+
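# Illustrative sketch of the returned structure (hypothetical IDs); each value
# is the per-species dict assembled by create_neighborhood_dict_entry():
#
#     neighborhood_dict = find_neighborhoods(
#         sbml_dfs, cpr_graph, ["SC00000001"], network_type="hourglass", order=2
#     )
#     entry = neighborhood_dict["SC00000001"]
#     entry["graph"]     # igraph.Graph subgraph around the focal node
#     entry["vertices"]  # pd.DataFrame of neighborhood nodes
#     entry["edges"]     # pd.DataFrame of neighborhood edges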
576
+
577
+ def create_neighborhood_dict_entry(
578
+ sc_id: str,
579
+ neighborhood_df: pd.DataFrame,
580
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
581
+ cpr_graph: ig.Graph,
582
+ verbose: bool = False,
583
+ ) -> dict[str, Any]:
584
+ """
585
+ Create Neighborhood Dict Entry
586
+
587
+ Generate a summary of a compartmentalized species' neighborhood
588
+
589
+ Parameters
590
+ ----------
591
+ sc_id: str
592
+ A compartmentalized species id
593
+ neighborhood_df: pd.DataFrame
594
+ A table of upstream and/or downstream neighbors of all compartmentalized species
595
+ sbml_dfs: sbml_dfs_core.SBML_dfs
596
+ A mechanistic molecular model
597
+ cpr_graph: igraph.Graph
598
+ A network connecting molecular species and reactions
599
+ verbose: bool
600
+ Extra reporting?
601
+
602
+ Returns
603
+ -------
604
+ dict containing:
605
+ graph: igraph.Graph
606
+ subgraph of sc_id's neighborhood,
607
+ vertices: pd.DataFrame
608
+ nodes in the neighborhood
609
+ edges: pd.DataFrame
610
+ edges in the neighborhood
611
+ edge_sources: pd.DataFrame
612
+ models that edges were derived from
613
+ neighborhood_path_entities: dict
614
+ upstream and downstream dicts representing entities in paths.
615
+ If the keys are to be included in a neighborhood, the
616
+ values should be as well in order to maintain connection to the
617
+ focal node.
618
+ """
619
+
620
+ one_neighborhood_df = neighborhood_df[neighborhood_df["sc_id"] == sc_id]
621
+
622
+ if verbose:
623
+ _create_neighborhood_dict_entry_logging(sc_id, one_neighborhood_df, sbml_dfs)
624
+
625
+ if not one_neighborhood_df["name"].eq(sc_id).any():
626
+ raise ValueError(
627
+ f"The focal node sc_id = {sc_id} was not in 'one_neighborhood_df'.\
628
+ By convention it should be part of its neighborhood"
629
+ )
630
+
631
+ # create the subgraph formed by filtering to neighborhoods
632
+ neighborhood_graph = cpr_graph.subgraph(
633
+ cpr_graph.vs[one_neighborhood_df["neighbor"]], implementation="auto"
634
+ )
635
+
636
+ vertices = pd.DataFrame([v.attributes() for v in neighborhood_graph.vs])
637
+ edges = pd.DataFrame([e.attributes() for e in neighborhood_graph.es])
638
+
639
+ # add edge polarity: whether edges are activating, inhibiting or unknown
640
+ if edges.shape[0] > 0:
641
+ edges["link_polarity"] = (
642
+ edges["sbo_term"].map(MINI_SBO_TO_NAME).map(MINI_SBO_NAME_TO_POLARITY)
643
+ )
644
+
645
+ try:
646
+ edge_sources = net_utils.get_minimal_sources_edges(
647
+ vertices.rename(columns={"name": "node"}), sbml_dfs
648
+ )
649
+ except Exception:
650
+ edge_sources = None
651
+
652
+ # to add weights to the network solve the shortest path problem
653
+ # from the focal node to each neighbor
654
+ # solve this problem separately whether a given neighbor is an
655
+ # ancestor or descendant
656
+
657
+ # focal node -> descendants
658
+
659
+ one_descendants_df = one_neighborhood_df[
660
+ one_neighborhood_df["relationship"] == "descendants"
661
+ ]
662
+ descendants_list = list(set(one_descendants_df["name"].tolist()).union({sc_id}))
663
+
664
+ # hide warnings which are mostly just Dijkstra complaining about not finding neighbors
665
+ with warnings.catch_warnings():
666
+ # igraph throws warnings for each pair of unconnected species
667
+ warnings.simplefilter("ignore")
668
+
669
+ neighborhood_paths = neighborhood_graph.get_shortest_paths(
670
+ # focal node
671
+ v=sc_id,
672
+ to=descendants_list,
673
+ weights="weights",
674
+ mode="out",
675
+ output="epath",
676
+ )
677
+
678
+ downstream_path_attrs, downstream_entity_dict = _calculate_path_attrs(
679
+ neighborhood_paths, edges, vertices=descendants_list, weight_var="weights"
680
+ )
681
+ downstream_path_attrs = downstream_path_attrs.assign(node_orientation="downstream")
682
+
683
+ # ancestors -> focal_node
684
+
685
+ one_ancestors_df = one_neighborhood_df[
686
+ one_neighborhood_df["relationship"] == "ancestors"
687
+ ]
688
+ ancestors_list = list(set(one_ancestors_df["name"].tolist()).union({sc_id}))
689
+
690
+ with warnings.catch_warnings():
691
+ # igraph throws warnings for each pair of unconnected species
692
+ warnings.simplefilter("ignore")
693
+
694
+ neighborhood_paths = neighborhood_graph.get_shortest_paths(
695
+ v=sc_id,
696
+ to=ancestors_list,
697
+ weights="upstream_weights",
698
+ mode="in",
699
+ output="epath",
700
+ )
701
+
702
+ upstream_path_attrs, upstream_entity_dict = _calculate_path_attrs(
703
+ neighborhood_paths,
704
+ edges,
705
+ vertices=ancestors_list,
706
+ weight_var="upstream_weights",
707
+ )
708
+ upstream_path_attrs = upstream_path_attrs.assign(node_orientation="upstream")
709
+
710
+ # combine upstream and downstream shortest paths
711
+ # in cases where a node is both upstream and downstream of the focal node
712
+ # by taking the lowest path weight
713
+ vertex_neighborhood_attrs = (
714
+ pd.concat([downstream_path_attrs, upstream_path_attrs])
715
+ .sort_values("path_weight")
716
+ .groupby("neighbor")
717
+ .first()
718
+ )
719
+ # label the focal node
720
+ vertex_neighborhood_attrs.loc[sc_id, "node_orientation"] = "focal"
721
+
722
+ # if the precomputed distances, graph and/or sbml_dfs are inconsistent
723
+ # then the shortest paths search may just return empty lists
724
+ # throw a clearer error message in this case.
725
+ EXPECTED_VERTEX_ATTRS = {"final_from", "final_to", "net_polarity"}
726
+ missing_vertex_attrs = EXPECTED_VERTEX_ATTRS.difference(
727
+ set(vertex_neighborhood_attrs.columns.tolist())
728
+ )
729
+
730
+ if len(missing_vertex_attrs) > 0:
731
+ raise ValueError(
732
+ f"vertex_neighborhood_attrs did not contain the expected columns: {EXPECTED_VERTEX_ATTRS}."
733
+ "This is likely because of inconsistencies between the precomputed distances, graph and/or sbml_dfs."
734
+ "Please try net_utils.validate_assets() to check for consistency."
735
+ )
736
+
737
+ # add net_polarity to edges in addition to nodes
738
+ edges = edges.merge(
739
+ vertex_neighborhood_attrs.reset_index()[
740
+ ["final_from", "final_to", "net_polarity"]
741
+ ].dropna(),
742
+ left_on=["from", "to"],
743
+ right_on=["final_from", "final_to"],
744
+ how="left",
745
+ )
746
+
747
+ vertices = vertices.merge(
748
+ vertex_neighborhood_attrs, left_on="name", right_index=True
749
+ )
750
+
751
+ # drop nodes with a path length / weight of zero
752
+ # which are NOT the focal node
753
+ # these are cases where no path between the focal node and the query node was found
754
+ disconnected_neighbors = vertices.query(
755
+ "(not node_orientation == 'focal') and path_weight == 0"
756
+ )
757
+ vertices = vertices[~vertices.index.isin(disconnected_neighbors.index.tolist())]
758
+
759
+ # add reference urls
760
+ vertices = add_vertices_uri_urls(vertices, sbml_dfs)
761
+
762
+ neighborhood_path_entities = {
763
+ "downstream": downstream_entity_dict,
764
+ "upstream": upstream_entity_dict,
765
+ }
766
+
767
+ # update graph with additional vertex and edge attributes
768
+ updated_cpr_graph = ig.Graph.DictList(
769
+ vertices=vertices.to_dict("records"),
770
+ edges=edges.to_dict("records"),
771
+ directed=cpr_graph.is_directed(),
772
+ vertex_name_attr="name",
773
+ edge_foreign_keys=("from", "to"),
774
+ )
775
+
776
+ outdict = {
777
+ "graph": updated_cpr_graph,
778
+ "vertices": vertices,
779
+ "edges": edges,
780
+ "edge_sources": edge_sources,
781
+ "neighborhood_path_entities": neighborhood_path_entities,
782
+ }
783
+
784
+ return outdict
785
+
786
+
787
+ def _create_neighborhood_dict_entry_logging(
788
+ sc_id: str, one_neighborhood_df: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
789
+ ):
790
+ df_summary = one_neighborhood_df.copy()
791
+ df_summary["node_type"] = [
792
+ "species" if x else "reactions"
793
+ for x in df_summary["name"].isin(sbml_dfs.compartmentalized_species.index)
794
+ ]
795
+ relationship_counts = df_summary.value_counts(
796
+ ["relationship", "node_type"]
797
+ ).sort_index()
798
+
799
+ relation_strings = list()
800
+ for relation in relationship_counts.index.get_level_values(0).unique():
801
+ relation_str = " and ".join(
802
+ [
803
+ f"{relationship_counts[relation][i]} {i}"
804
+ for i in relationship_counts[relation].index
805
+ ]
806
+ )
807
+ relation_strings.append(f"{relation}: {relation_str}")
808
+
809
+ msg = f"{sc_id} neighborhood: {'; '.join(relation_strings)}"
810
+ logger.info(msg)
811
+
812
+
813
+ def add_vertices_uri_urls(
814
+ vertices: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
815
+ ) -> pd.DataFrame:
816
+ """
817
+ Add Vertices URI URLs
818
+
819
+ Add a url variable to the neighborhood vertices pd.DataFrame
820
+
821
+ Parameters
822
+ ----------
823
+ vertices: pd.DataFrame
824
+ table of neighborhood vertices
825
+ sbml_dfs: sbml_dfs_core.SBML_dfs
826
+ consensus network model
827
+
828
+ Returns
829
+ -------
830
+ vertices: pd.DataFrame
831
+ input table with a url field
832
+
833
+ """
834
+
835
+ assert isinstance(vertices, pd.DataFrame)
836
+ assert vertices.shape[0] > 0
837
+
838
+ # add uri urls for each node
839
+
840
+ # add s_ids
841
+ neighborhood_species = vertices[vertices["node_type"] == "species"].merge(
842
+ sbml_dfs.compartmentalized_species["s_id"],
843
+ left_on="name",
844
+ right_index=True,
845
+ how="left",
846
+ )
847
+
848
+ # add a standard reference identifier
849
+ neighborhood_species_aug = neighborhood_species.merge(
850
+ sbml_dfs.get_uri_urls("species", neighborhood_species["s_id"]),
851
+ left_on="s_id",
852
+ right_index=True,
853
+ how="left",
854
+ # add pharos ids where available
855
+ ).merge(
856
+ sbml_dfs.get_uri_urls(
857
+ "species", neighborhood_species["s_id"], required_ontology="pharos"
858
+ ).rename("pharos"),
859
+ left_on="s_id",
860
+ right_index=True,
861
+ how="left",
862
+ )
863
+
864
+ if sum(vertices["node_type"] == "reaction") > 0:
865
+ neighborhood_reactions = vertices[vertices["node_type"] == "reaction"].merge(
866
+ sbml_dfs.get_uri_urls(
867
+ "reactions", vertices[vertices["node_type"] == "reaction"]["name"]
868
+ ),
869
+ left_on="name",
870
+ right_index=True,
871
+ how="left",
872
+ )
873
+ else:
874
+ neighborhood_reactions = None
875
+
876
+ if neighborhood_reactions is None:
877
+ updated_vertices = neighborhood_species_aug.fillna("")
878
+ else:
879
+ updated_vertices = pd.concat(
880
+ [neighborhood_species_aug, neighborhood_reactions]
881
+ ).fillna("")
882
+
883
+ assert isinstance(updated_vertices, pd.DataFrame)
884
+ if vertices.shape[0] != updated_vertices.shape[0]:
885
+ raise ValueError("output vertices rows did not match input")
886
+
887
+ return updated_vertices
888
+
889
+
890
+ def prune_neighborhoods(neighborhoods: dict, top_n: int = 100) -> dict:
891
+ """
892
+ Prune Neighborhoods
893
+
894
+ Take a possibly very large neighborhood around a set of focal nodes
895
+ and prune to the most highly weighted nodes. Nodes weights are
896
+ constructed as the sum of path weights from the focal node to each
897
+ neighbor so each pruned neighborhood will still be a single subnetwork.
898
+
899
+ Parameters
900
+ ----------
901
+ neighborhoods: dict
902
+ A dictionary of sc_id neighborhoods as produced by find_neighborhoods()
903
+ top_n: int
904
+ How many neighbors should be retained? If the neighborhood includes
905
+ both upstream and downstream connections (i.e., hourglass), this filter
906
+ will be applied to both sets separately
907
+
908
+ Returns
909
+ -------
910
+ neighborhoods: dict
911
+ Same structure as neighborhoods input
912
+ """
913
+
914
+ if not isinstance(top_n, int):
915
+ raise TypeError(f"top_n was a {type(top_n)} and must be an int")
916
+
917
+ pruned_neighborhoods_dict = dict()
918
+
919
+ for an_sc_id in neighborhoods.keys():
920
+ one_neighborhood = neighborhoods[an_sc_id]
921
+
922
+ # filter to the desired number of vertices w/ lowest path_weight (from focal node)
923
+ # filter neighborhood to high-weight vertices
924
+ pruned_vertices = _prune_vertex_set(one_neighborhood, top_n=top_n)
925
+
926
+ # reduce neighborhood to this set of high-weight vertices
927
+ all_neighbors = pd.DataFrame({"name": one_neighborhood["graph"].vs["name"]})
928
+ pruned_vertices_indices = all_neighbors[
929
+ all_neighbors["name"].isin(pruned_vertices["name"])
930
+ ].index.tolist()
931
+
932
+ pruned_neighborhood = one_neighborhood["graph"].subgraph(
933
+ one_neighborhood["graph"].vs[pruned_vertices_indices],
934
+ implementation="auto",
935
+ )
936
+
937
+ pruned_edges = pd.DataFrame([e.attributes() for e in pruned_neighborhood.es])
938
+
939
+ pruned_reactions = pruned_vertices[pruned_vertices["node_type"] == "reaction"][
940
+ "name"
941
+ ]
942
+
943
+ if pruned_reactions.shape[0] != 0:
944
+ if one_neighborhood["edge_sources"] is None:
945
+ # allow for missing source information since this is currently optional
946
+ pruned_edge_sources = one_neighborhood["edge_sources"]
947
+ else:
948
+ pruned_edge_sources = one_neighborhood["edge_sources"][
949
+ one_neighborhood["edge_sources"]["r_id"].isin(pruned_reactions)
950
+ ]
951
+ else:
952
+ pruned_edge_sources = one_neighborhood["edge_sources"]
953
+
954
+ pruned_neighborhoods_dict[an_sc_id] = {
955
+ "graph": pruned_neighborhood,
956
+ "vertices": pruned_vertices,
957
+ "edges": pruned_edges,
958
+ "edge_sources": pruned_edge_sources,
959
+ }
960
+
961
+ return pruned_neighborhoods_dict
962
+
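# Illustrative sketch (hypothetical objects): pruning neighborhoods returned by
# find_neighborhoods() down to the 25 lowest path-weight species per direction:
#
#     pruned = prune_neighborhoods(neighborhood_dict, top_n=25)
#     pruned["SC00000001"]["vertices"]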
963
+
964
+ def plot_neighborhood(
965
+ neighborhood_graph: ig.Graph,
966
+ name_nodes: bool = False,
967
+ plot_size: int = 1000,
968
+ network_layout: str = "drl",
969
+ ) -> ig.plot:
970
+ """
971
+ Plot Neighborhood
972
+
973
+ Parameters:
974
+ ----------
975
+ neighborhood_graph: igraph.Graph
976
+ An igraph network
977
+ name_nodes: bool
978
+ Should nodes be named
979
+ plot_size: int
980
+ Plot width/height in pixels
981
+ network_layout: str
982
+ Igraph network layout method
983
+
984
+ Returns:
985
+ ----------
986
+ An igraph plot
987
+ """
988
+
989
+ neighborhood_graph_layout = neighborhood_graph.layout(network_layout)
990
+
991
+ if "net_polarity" not in neighborhood_graph.es.attributes():
992
+ logger.warning(
993
+ "net_polarity was not defined as an edge attribute so edges will not be colored"
994
+ )
995
+ neighborhood_graph.es.set_attribute_values("net_polarity", np.nan)
996
+
997
+ color_dict = {
998
+ "focal disease": "lime",
999
+ "disease": "aquamarine",
1000
+ "focal": "lightcoral",
1001
+ "species": "firebrick",
1002
+ "reaction": "dodgerblue",
1003
+ }
1004
+
1005
+ edge_polarity_colors = {
1006
+ "ambiguous": "dimgray",
1007
+ "activation": "gold",
1008
+ "inhibition": "royalblue",
1009
+ "ambiguous activation": "palegoldenrod",
1010
+ "ambiguous inhibition": "powerblue",
1011
+ np.nan: "dimgray",
1012
+ }
1013
+
1014
+ visual_style = {} # type: dict[str,Any]
1015
+ visual_style["background"] = "black"
1016
+ visual_style["vertex_size"] = 10
1017
+ if name_nodes:
1018
+ visual_style["vertex_label"] = [
1019
+ textwrap.fill(x, 15) for x in neighborhood_graph.vs["node_name"]
1020
+ ]
1021
+ visual_style["vertex_label_color"] = "white"
1022
+ visual_style["vertex_label_size"] = 8
1023
+ visual_style["vertex_label_angle"] = 90
1024
+ visual_style["vertex_label_dist"] = 3
1025
+ visual_style["vertex_color"] = [
1026
+ color_dict[x] for x in neighborhood_graph.vs["node_type"]
1027
+ ]
1028
+ visual_style["edge_color"] = [
1029
+ edge_polarity_colors[x] for x in neighborhood_graph.es["net_polarity"]
1030
+ ]
1031
+ visual_style["layout"] = neighborhood_graph_layout
1032
+ visual_style["bbox"] = (plot_size, plot_size)
1033
+ visual_style["margin"] = 50
1034
+ visual_style["title"] = "foobar"
1035
+
1036
+ return ig.plot(neighborhood_graph, **visual_style)
1037
+
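# Illustrative sketch (hypothetical objects): plotting one pruned neighborhood
# with labeled nodes and a Fruchterman-Reingold layout:
#
#     plot_neighborhood(
#         pruned["SC00000001"]["graph"],
#         name_nodes=True,
#         plot_size=800,
#         network_layout="fr",
#     )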
1038
+
1039
+ def _precompute_neighbors(
1040
+ compartmentalized_species: list[str],
1041
+ precomputed_distances: pd.DataFrame,
1042
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
1043
+ network_type: str = NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
1044
+ order: int = 3,
1045
+ top_n: int = 10,
1046
+ ) -> pd.DataFrame:
1047
+ """
1048
+ Precompute Neighbors
1049
+
1050
+ Identify compartmentalized_species' most tightly connected neighbors using parameters
1051
+ shared by the on-the-fly methods (order for identifying neighbors within N steps;
1052
+ top_n for retaining the lowest-weight network paths between the focal node
1052
+ and each possible neighbor). This precomputation will greatly speed up the neighborhood
1054
+ generation for highly connected species or densely connected networks. In those situations
1055
+ a neighborhood naively built within N steps could contain thousands of neighbors.
1056
+
1057
+ """
1058
+
1059
+ # check that compartmentalized_species are included in precomputed_distances
1060
+ all_cspecies = {
1061
+ *precomputed_distances["sc_id_origin"].tolist(),
1062
+ *precomputed_distances["sc_id_dest"].tolist(),
1063
+ }
1064
+ missing_cspecies = set(compartmentalized_species).difference(all_cspecies)
1065
+ if len(missing_cspecies) > 0:
1066
+ logged_specs = ", ".join(list(missing_cspecies)[0:10])
1067
+ logger.warning(
1068
+ f"{len(missing_cspecies)} cspecies were missing from precomputed_distances including {logged_specs}"
1069
+ )
1070
+
1071
+ # filter precomputed_distances to those which originate or end with one of the compartmentalized_species
1072
+ # if we are looking for downstream species then we want relationships where a cspecies is the origin
1073
+ if network_type in [
1074
+ NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
1075
+ NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
1076
+ ]:
1077
+ valid_origin = precomputed_distances["sc_id_origin"].isin(
1078
+ compartmentalized_species
1079
+ )
1080
+ if network_type in [
1081
+ NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
1082
+ NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
1083
+ ]:
1084
+ valid_dest = precomputed_distances["sc_id_dest"].isin(compartmentalized_species)
1085
+
1086
+ if network_type == NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS:
1087
+ cspecies_subset_precomputed_distances = precomputed_distances[
1088
+ valid_origin | valid_dest
1089
+ ]
1090
+ elif network_type == NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
1091
+ cspecies_subset_precomputed_distances = precomputed_distances.loc[valid_origin]
1092
+ elif network_type == NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM:
1093
+ cspecies_subset_precomputed_distances = precomputed_distances.loc[valid_dest]
1094
+ else:
1095
+ raise ValueError(
1096
+ f"network_type was {network_type} and must by one of 'hourglass', 'downstream', 'upstream'"
1097
+ )
1098
+
1099
+ logger.debug(
1100
+ f"Pre-filtered neighbors {cspecies_subset_precomputed_distances.shape[0]}"
1101
+ )
1102
+
1103
+ # filter by distance
1104
+ close_cspecies_subset_precomputed_distances = cspecies_subset_precomputed_distances[
1105
+ cspecies_subset_precomputed_distances["path_length"] <= order
1106
+ ]
1107
+
1108
+ # filter to retain top_n
1109
+ if network_type in [
1110
+ NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
1111
+ NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
1112
+ ]:
1113
+ top_descendants = (
1114
+ close_cspecies_subset_precomputed_distances[
1115
+ close_cspecies_subset_precomputed_distances["sc_id_origin"].isin(
1116
+ compartmentalized_species
1117
+ )
1118
+ ]
1119
+ # sort by path_weight so we can retain the lowest weight neighbors
1120
+ .sort_values("path_weights")
1121
+ .groupby("sc_id_origin")
1122
+ .head(top_n)
1123
+ )
1124
+
1125
+ logger.debug(f"N top_descendants {top_descendants.shape[0]}")
1126
+
1127
+ if network_type in [
1128
+ NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
1129
+ NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
1130
+ ]:
1131
+ top_ancestors = (
1132
+ close_cspecies_subset_precomputed_distances[
1133
+ close_cspecies_subset_precomputed_distances["sc_id_dest"].isin(
1134
+ compartmentalized_species
1135
+ )
1136
+ ]
1137
+ # sort by path_upstream_weights so we can retain the lowest weight neighbors
1138
+ # we allow for upstream weights to differ from downstream weights
1139
+ # when creating a network in process_cpr_graph.
1140
+ #
1141
+ # the default network weighting penalizes an edge from a node
1142
+ # based on the number of children it has. this captures the idea
1143
+ # that if there are many children we might expect that each
1144
+ # of them is less likely to transduce an effect.
1145
+ # the logic is flipped if we are looking for ancestors where
1146
+ # we penalize based on the number of parents of a node when
1147
+ # we use it (i.e., the default upstream_weights).
1148
+ .sort_values("path_upstream_weights")
1149
+ .groupby("sc_id_dest")
1150
+ .head(top_n)
1151
+ )
1152
+
1153
+ logger.debug(f"N top_ancestors {top_ancestors.shape[0]}")
1154
+
1155
+ # add reactions
1156
+
1157
+ if network_type in [
1158
+ NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
1159
+ NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
1160
+ ]:
1161
+ downstream_reactions = _find_reactions_by_relationship(
1162
+ precomputed_neighbors=top_descendants,
1163
+ compartmentalized_species=compartmentalized_species,
1164
+ sbml_dfs=sbml_dfs,
1165
+ relationship="descendants",
1166
+ )
1167
+
1168
+ if downstream_reactions is not None:
1169
+ logger.debug(f"N downstream reactions {downstream_reactions.shape[0]}")
1170
+
1171
+ if network_type in [
1172
+ NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
1173
+ NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
1174
+ ]:
1175
+ upstream_reactions = _find_reactions_by_relationship(
1176
+ precomputed_neighbors=top_ancestors,
1177
+ compartmentalized_species=compartmentalized_species,
1178
+ sbml_dfs=sbml_dfs,
1179
+ relationship="ancestors",
1180
+ )
1181
+
1182
+ if upstream_reactions is not None:
1183
+ logger.debug(f"N upstream reactions {upstream_reactions.shape[0]}")
1184
+
1185
+ # add the self links since sc_id_dest will be used to define
1186
+ # an sc_id_origin-specific subgraph
1187
+ identity_df = pd.DataFrame(
1188
+ {
1189
+ "sc_id_origin": compartmentalized_species,
1190
+ "sc_id_dest": compartmentalized_species,
1191
+ }
1192
+ )
1193
+
1194
+ # combine all ancestor-descendant edges into the precomputed_neighbors edgelist
1195
+ if network_type == NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS:
1196
+ precomputed_neighbors = pd.concat(
1197
+ [
1198
+ top_ancestors,
1199
+ top_descendants,
1200
+ upstream_reactions, # type: ignore
1201
+ downstream_reactions, # type: ignore
1202
+ identity_df,
1203
+ ]
1204
+ )[["sc_id_origin", "sc_id_dest"]].drop_duplicates()
1205
+ elif network_type == NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
1206
+ precomputed_neighbors = pd.concat([top_descendants, downstream_reactions, identity_df])[ # type: ignore
1207
+ ["sc_id_origin", "sc_id_dest"]
1208
+ ].drop_duplicates()
1209
+ elif network_type == NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM:
1210
+ precomputed_neighbors = pd.concat([top_ancestors, upstream_reactions, identity_df])[ # type: ignore
1211
+ ["sc_id_origin", "sc_id_dest"]
1212
+ ].drop_duplicates()
1213
+ else:
1214
+ raise ValueError("This error shouldn't happen")
1215
+
1216
+ return precomputed_neighbors
1217
+
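# Illustrative sketch of the precomputed_distances edgelist consumed above; the
# column names match this module's usage but the rows are made up:
#
#     precomputed_distances = pd.DataFrame(
#         {
#             "sc_id_origin": ["SC01", "SC01"],
#             "sc_id_dest": ["SC02", "SC03"],
#             "path_length": [1, 2],
#             "path_weights": [0.5, 1.7],
#             "path_upstream_weights": [0.6, 2.0],
#         }
#     )
#     _precompute_neighbors(
#         ["SC01"], precomputed_distances, sbml_dfs, network_type="downstream", order=3, top_n=10
#     )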
1218
+
1219
+ def _build_raw_neighborhood_df(
1220
+ cpr_graph: ig.Graph,
1221
+ compartmentalized_species: list[str],
1222
+ network_type: str,
1223
+ order: int,
1224
+ precomputed_neighbors: pd.DataFrame | None = None,
1225
+ ) -> pd.DataFrame:
1226
+ # report if network_type is not the default and will be ignored due to the network
1227
+ # being undirected
1228
+ is_directed = cpr_graph.is_directed()
1229
+ if not is_directed and network_type != NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
1230
+ logger.warning(
1231
+ "Network is undirected; network_type will be treated as 'downstream'"
1232
+ )
1233
+ network_type = NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM
1234
+
1235
+ # create the "out-network" of descendant nodes
1236
+ if network_type in [
1237
+ NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
1238
+ NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
1239
+ ]:
1240
+ descendants_df = _find_neighbors(
1241
+ cpr_graph=cpr_graph,
1242
+ compartmentalized_species=compartmentalized_species,
1243
+ relationship="descendants",
1244
+ order=order,
1245
+ precomputed_neighbors=precomputed_neighbors,
1246
+ )
1247
+
1248
+ # create the "in-network" of ancestor nodes
1249
+ if network_type in [
1250
+ NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
1251
+ NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
1252
+ ]:
1253
+ ancestors_df = _find_neighbors(
1254
+ cpr_graph=cpr_graph,
1255
+ compartmentalized_species=compartmentalized_species,
1256
+ relationship="ancestors",
1257
+ order=order,
1258
+ precomputed_neighbors=precomputed_neighbors,
1259
+ )
1260
+
1261
+ if network_type == NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS:
1262
+ # merge descendants and ancestors
1263
+ neighborhood_df = pd.concat([ancestors_df, descendants_df])
1264
+ elif network_type == NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
1265
+ neighborhood_df = descendants_df
1266
+ elif network_type == NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM:
1267
+ neighborhood_df = ancestors_df
1268
+ else:
1269
+ raise NotImplementedError("invalid network_type")
1270
+
1271
+ # add name since this is an easy way to lookup igraph vertices
1272
+ neighborhood_df["name"] = [
1273
+ x["name"] for x in cpr_graph.vs[neighborhood_df["neighbor"]]
1274
+ ]
1275
+
1276
+ return neighborhood_df
1277
+
1278
+
1279
+ def _find_neighbors(
1280
+ cpr_graph: ig.Graph,
1281
+ compartmentalized_species: list[str],
1282
+ relationship: str,
1283
+ order: int = 3,
1284
+ precomputed_neighbors: pd.DataFrame | None = None,
1285
+ ) -> pd.DataFrame:
1286
+ """
1287
+ Find Neighbors
1288
+
1289
+ Identify the neighbors nearby each of the requested compartmentalized_species
1290
+
1291
+ If 'precomputed_neighbors' are provided, neighbors will be summarized by reformatting
1292
+ this table. Otherwise, neighbors will be found on-the-fly using the igraph.neighborhood() method.
1293
+
1294
+ """
1295
+
1296
+ if isinstance(precomputed_neighbors, pd.DataFrame):
1297
+ # add graph indices to neighbors
1298
+ nodes_to_names = (
1299
+ pd.DataFrame({"name": cpr_graph.vs["name"]})
1300
+ .reset_index()
1301
+ .rename({"index": "neighbor"}, axis=1)
1302
+ )
1303
+
1304
+ if relationship == "descendants":
1305
+ bait_id = "sc_id_origin"
1306
+ target_id = "sc_id_dest"
1307
+ elif relationship == "ancestors":
1308
+ bait_id = "sc_id_dest"
1309
+ target_id = "sc_id_origin"
1310
+ else:
1311
+ raise ValueError(
1312
+ f"relationship must be 'descendants' or 'ancestors' but was {relationship}"
1313
+ )
1314
+
1315
+ neighbors_df = (
1316
+ precomputed_neighbors[
1317
+ precomputed_neighbors[bait_id].isin(compartmentalized_species)
1318
+ ]
1319
+ .merge(nodes_to_names.rename({"name": target_id}, axis=1))
1320
+ .rename({bait_id: "sc_id"}, axis=1)
1321
+ .drop([target_id], axis=1)
1322
+ .assign(relationship=relationship)
1323
+ )
1324
+ else:
1325
+ if relationship == "descendants":
1326
+ mode_type = "out"
1327
+ elif relationship == "ancestors":
1328
+ mode_type = "in"
1329
+ else:
1330
+ raise ValueError(
1331
+ f"relationship must be 'descendants' or 'ancestors' but was {relationship}"
1332
+ )
1333
+
1334
+ neighbors = cpr_graph.neighborhood(
1335
+ # mode = out queries outgoing edges and is ignored if the network is undirected
1336
+ vertices=compartmentalized_species,
1337
+ order=order,
1338
+ mode=mode_type,
1339
+ )
1340
+
1341
+ neighbors_df = pd.concat(
1342
+ [
1343
+ pd.DataFrame({"sc_id": c, "neighbor": x}, index=range(0, len(x)))
1344
+ for c, x in zip(compartmentalized_species, neighbors)
1345
+ ]
1346
+ ).assign(relationship=relationship)
1347
+
1348
+ return neighbors_df
1349
+
1350
+
1351
+ def _find_reactions_by_relationship(
1352
+ precomputed_neighbors,
1353
+ compartmentalized_species: list,
1354
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
1355
+ relationship: str,
1356
+ ) -> pd.DataFrame | None:
1357
+ """
1358
+ Find Reactions by Relationship
1359
+
1360
+ Based on an ancestor-descendant edgelist of compartmentalized species find all reactions which involve 2+ members
1361
+
1362
+ Since we primarily care about paths between species, and reactions are more of a means-to-an-end for
1363
+ connecting pairs of species, precomputed_distances are generated between just pairs of species.
1364
+ This also makes the problem feasible since the number of species is upper bounded at <100K but
1365
+ the number of reactions is unbounded. Having a bound ensures that we can calculate
1366
+ the precomputed_distances efficiently using matrix operations whose memory footprint scales with O(N^2).
1367
+ """
1368
+
1369
+ # if there are no neighboring cspecies then there will be no reactions
1370
+ if precomputed_neighbors.shape[0] == 0:
1371
+ return None
1372
+
1373
+ if relationship == "descendants":
1374
+ bait_id = "sc_id_origin"
1375
+ target_id = "sc_id_dest"
1376
+ elif relationship == "ancestors":
1377
+ bait_id = "sc_id_dest"
1378
+ target_id = "sc_id_origin"
1379
+ else:
1380
+ raise ValueError(
1381
+ f"relationship must be 'descendants' or 'ancestors' but was {relationship}"
1382
+ )
1383
+
1384
+ # index by the bait id to create a series with all relatives of the specified relationship
1385
+ indexed_relatives = (
1386
+ precomputed_neighbors[
1387
+ precomputed_neighbors[bait_id].isin(compartmentalized_species)
1388
+ ]
1389
+ .set_index(bait_id)
1390
+ .sort_index()
1391
+ )
1392
+
1393
+ reaction_relatives = list()
1394
+
1395
+ # loop through compartmentalized species in precomputed_neighbors
1396
+ for uq in indexed_relatives.index.unique():
1397
+ relatives = indexed_relatives.loc[uq, target_id]
1398
+ if isinstance(relatives, str):
1399
+ relatives = [relatives]
1400
+ elif isinstance(relatives, pd.Series):
1401
+ relatives = relatives.tolist()
1402
+ else:
1403
+ raise ValueError("relatives is an unexpected type")
1404
+
1405
+ # add the focal node to the set of relatives
1406
+ relatives_cspecies = {*relatives, *[uq]}
1407
+ # count the number of relative cspecies including each reaction
1408
+ rxn_species_counts = sbml_dfs.reaction_species[
1409
+ sbml_dfs.reaction_species["sc_id"].isin(relatives_cspecies)
1410
+ ].value_counts("r_id")
1411
+
1412
+ # retain reactions involving 2+ cspecies.
1413
+ # some of these reactions will be irrelevant and will be excluded when
1414
+ # calculating the shortest paths from/to the focal node from each neighbor
1415
+ # in prune_neighborhoods()
1416
+ neighboring_reactions = rxn_species_counts[
1417
+ rxn_species_counts >= 2
1418
+ ].index.tolist()
1419
+
1420
+ # create new entries for reaction relatives
1421
+ kws = {bait_id: uq}
1422
+ new_entries = pd.DataFrame({target_id: neighboring_reactions}).assign(**kws)
1423
+
1424
+ reaction_relatives.append(new_entries)
1425
+
1426
+ reactions_df = pd.concat(reaction_relatives)
1427
+
1428
+ return reactions_df
1429
+
1430
+
1431
+ def _prune_vertex_set(one_neighborhood: dict, top_n: int) -> pd.DataFrame:
1432
+ """
1433
+ Prune Vertex Set
1434
+
1435
+ Filter a neighborhood to the lowest weight neighbors connected to the focal node.
1436
+ During this process upstream and downstream nodes are treated separately.
1437
+
1438
+ Parameters
1439
+ ----------
1440
+ one_neighborhood: dict
1441
+ The neighborhood around a single compartmentalized species - one of the values
1442
+ in dict created by find_neighborhoods().
1443
+ top_n: int
1444
+ How many neighboring molecular species should be retained?
1445
+ If the neighborhood includes both upstream and downstream connections
1446
+ (i.e., hourglass), this filter will be applied to both sets separately.
1447
+
1448
+ Returns
1449
+ -------
1450
+ vertices: pd.DataFrame
1451
+ the vertices in one_neighborhood with high weight neighbors removed.
1452
+
1453
+ """
1454
+
1455
+ neighborhood_vertices = one_neighborhood["vertices"]
1456
+
1457
+ indexed_neighborhood_species = neighborhood_vertices[
1458
+ neighborhood_vertices["node_type"] == "species"
1459
+ ].set_index("node_orientation")
1460
+
1461
+ pruned_oriented_neighbors = list()
1462
+ for a_node_orientation in indexed_neighborhood_species.index.unique().tolist():
1463
+ vertex_subset = indexed_neighborhood_species.loc[a_node_orientation]
1464
+ if type(vertex_subset) is pd.Series:
1465
+ # handle cases where only one entry exists and DataFrame -> Series coercion occurs
1466
+ vertex_subset = vertex_subset.to_frame().T
1467
+
1468
+ sorted_vertex_set = vertex_subset.sort_values("path_weight")
1469
+ weight_cutoff = sorted_vertex_set["path_weight"].iloc[
1470
+ min(top_n - 1, sorted_vertex_set.shape[0] - 1)
1471
+ ]
1472
+
1473
+ top_neighbors = sorted_vertex_set[
1474
+ sorted_vertex_set["path_weight"] <= weight_cutoff
1475
+ ]["name"].tolist()
1476
+
1477
+ # include reactions and other species necessary to reach the top neighbors
1478
+ # by pulling in the past solutions to weighted shortest paths problems
1479
+ if a_node_orientation in one_neighborhood["neighborhood_path_entities"].keys():
1480
+ # path to/from focal node to each species
1481
+ neighborhood_path_entities = one_neighborhood["neighborhood_path_entities"][
1482
+ a_node_orientation
1483
+ ]
1484
+
1485
+ top_neighbors = set().union(
1486
+ *[neighborhood_path_entities[p] for p in top_neighbors]
1487
+ )
1488
+
1489
+ pruned_oriented_neighbors.append(top_neighbors)
1490
+
1491
+ # combine all neighbors
1492
+ pruned_neighbors = set().union(*pruned_oriented_neighbors)
1493
+ pruned_vertices = neighborhood_vertices[
1494
+ neighborhood_vertices["name"].isin(pruned_neighbors)
1495
+ ].reset_index(drop=True)
1496
+
1497
+ return pruned_vertices
1498
+
1499
+
1500
+ def _calculate_path_attrs(
1501
+ neighborhood_paths: list[list],
1502
+ edges: pd.DataFrame,
1503
+ vertices: list,
1504
+ weight_var: str = "weights",
1505
+ ) -> tuple[pd.DataFrame, dict[Any, set]]:
1506
+ """
1507
+ Calculate Path Attributes
1508
+
1509
+ Return the vertices and path weights (sum of edge weights) for a list of paths.
1510
+
1511
+ Parameters
1512
+ ----------
1513
+ neighborhood_paths: list
1514
+ List of lists of edge indices
1515
+ edges: pd.DataFrame
1516
+ Edges with rows corresponding to entries in neighborhood_paths inner lists
1517
+ vertices: list
1518
+ List of vertices corresponding to the ordering of neighborhood_paths
1519
+ weight_var: str
1520
+ variable in edges to use for scoring path weights
1521
+
1522
+ Returns
1523
+ -------
1524
+ path_attributes_df: pd.DataFrame
1525
+ A table containing attributes summarizing the path to each neighbor
1526
+ neighborhood_path_entities: dict
1527
+ Dict mapping from each neighbor to the entities connecting it to the focal node
1528
+
1529
+ """
1530
+
1531
+ if not isinstance(neighborhood_paths, list):
1532
+ raise TypeError("neighborhood_paths should be a list of lists of edge indices")
1533
+ if not isinstance(vertices, list):
1534
+ raise TypeError("vertices should be a list of list of vertices")
1535
+ assert len(vertices) > 0 # control for length zero vertices upstream
1536
+ if len(neighborhood_paths) != len(vertices):
1537
+ raise ValueError("vertices and neighborhood_paths were not the same length")
1538
+
1539
+ if any([len(x) > 0 for x in neighborhood_paths]):
1540
+ all_path_edges = (
1541
+ # create a table of edges traversed to reach each neighbor
1542
+ pd.concat(
1543
+ [
1544
+ edges.iloc[neighborhood_paths[i]].assign(neighbor=vertices[i])
1545
+ for i in range(0, len(neighborhood_paths))
1546
+ ]
1547
+ ).groupby("neighbor")
1548
+ )
1549
+
1550
+ # if all_path_edges.ngroups > 0:
1551
+ path_attributes_df = pd.concat(
1552
+ [
1553
+ all_path_edges[weight_var].agg("sum").rename("path_weight"),
1554
+ all_path_edges.agg("size").rename("path_length"),
1555
+ all_path_edges["link_polarity"]
1556
+ .agg(paths._terminal_net_polarity)
1557
+ .rename("net_polarity"),
1558
+ # add the final edge since this can be used to add path attributes to edges
1559
+ # i.e., apply net_polarity to an edge
1560
+ all_path_edges["from"].agg("last").rename("final_from"),
1561
+ all_path_edges["to"].agg("last").rename("final_to"),
1562
+ ],
1563
+ axis=1,
1564
+ ).reset_index()
1565
+
1566
+ # create a dict mapping from a neighbor to all mediating nodes
1567
+ neighborhood_path_entities = {
1568
+ group_name: set().union(*[dat["from"], dat["to"]])
1569
+ for group_name, dat in all_path_edges
1570
+ }
1571
+
1572
+ else:
1573
+ # catch case where there are no paths
1574
+ path_attributes_df = pd.DataFrame()
1575
+ neighborhood_path_entities = dict()
1576
+
1577
+ # add entries with no edges
1578
+ edgeless_nodes = [
1579
+ vertices[i]
1580
+ for i in range(0, len(neighborhood_paths))
1581
+ if len(neighborhood_paths[i]) == 0
1582
+ ]
1583
+ edgeless_nodes_df = pd.DataFrame({"neighbor": edgeless_nodes}).assign(
1584
+ path_length=0, path_weight=0, net_polarity=None
1585
+ )
1586
+
1587
+ # add edgeless entries as entries in the two outputs
1588
+ path_attributes_df = pd.concat([path_attributes_df, edgeless_nodes_df])
1589
+ neighborhood_path_entities.update({x: {x} for x in edgeless_nodes})
1590
+
1591
+ assert path_attributes_df.shape[0] == len(neighborhood_paths)
1592
+ assert len(neighborhood_path_entities) == len(neighborhood_paths)
1593
+
1594
+ return path_attributes_df, neighborhood_path_entities
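# Illustrative sketch of _calculate_path_attrs() on a toy two-edge path
# (made-up values; columns match those referenced above):
#
#     toy_edges = pd.DataFrame(
#         {
#             "from": ["A", "R1"],
#             "to": ["R1", "B"],
#             "weights": [0.5, 0.25],
#             "link_polarity": ["activation", "activation"],
#         }
#     )
#     path_attrs, path_entities = _calculate_path_attrs(
#         neighborhood_paths=[[0, 1]], edges=toy_edges, vertices=["B"], weight_var="weights"
#     )
#     # path_attrs has one row for neighbor "B" with path_weight 0.75, path_length 2,
#     # final_from "R1", and final_to "B"; path_entities == {"B": {"A", "R1", "B"}}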