napistu 0.4.5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,13 +18,26 @@ from napistu import utils
  from napistu.network import ng_utils
  from napistu.network import paths

- from napistu.constants import SBML_DFS
- from napistu.constants import MINI_SBO_NAME_TO_POLARITY
- from napistu.constants import MINI_SBO_TO_NAME
-
- from napistu.network.constants import GRAPH_WIRING_APPROACHES
- from napistu.network.constants import NEIGHBORHOOD_NETWORK_TYPES
- from napistu.network.constants import VALID_NEIGHBORHOOD_NETWORK_TYPES
+ from napistu.constants import (
+     MINI_SBO_NAME_TO_POLARITY,
+     MINI_SBO_TO_NAME,
+     NAPISTU_EDGELIST,
+     ONTOLOGIES,
+     SBML_DFS,
+ )
+
+ from napistu.network.constants import (
+     DISTANCES,
+     GRAPH_RELATIONSHIPS,
+     GRAPH_WIRING_APPROACHES,
+     NAPISTU_GRAPH_EDGES,
+     NAPISTU_GRAPH_NODE_TYPES,
+     NAPISTU_GRAPH_VERTICES,
+     NEIGHBORHOOD_DICT_KEYS,
+     NEIGHBORHOOD_NETWORK_TYPES,
+     NET_POLARITY,
+     VALID_NEIGHBORHOOD_NETWORK_TYPES,
+ )

  logger = logging.getLogger(__name__)

@@ -34,8 +47,9 @@ def find_and_prune_neighborhoods(
      napistu_graph: ig.Graph,
      compartmentalized_species: str | list[str],
      precomputed_distances: pd.DataFrame | None = None,
+     min_pw_size: int = 3,
      source_total_counts: pd.Series | None = None,
-     network_type: str = NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
+     network_type: str = NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
      order: int = 3,
      verbose: bool = True,
      top_n: int = 10,
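Taken together, 0.4.7 threads a new `min_pw_size` argument through the public entry point and flips the default `network_type` from DOWNSTREAM to HOURGLASS. A minimal usage sketch against the new signature, assuming this module is napistu.network.neighborhoods and that `sbml_dfs`, `napistu_graph`, and the sc_id are pre-existing placeholders:

    from napistu.network import neighborhoods
    from napistu.network.constants import NEIGHBORHOOD_NETWORK_TYPES

    pruned = neighborhoods.find_and_prune_neighborhoods(
        sbml_dfs,                               # an existing SBML_dfs model
        napistu_graph,                          # its igraph representation
        compartmentalized_species=["SC00001"],  # hypothetical sc_id
        min_pw_size=3,                          # new in 0.4.7
        network_type=NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,  # pin to keep the 0.4.5 default
        order=3,
        top_n=10,
    )

Callers that relied on the implicit DOWNSTREAM default should pass it explicitly, since upgrading otherwise switches them silently to hourglass neighborhoods.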
@@ -55,6 +69,8 @@ def find_and_prune_neighborhoods(
          Compartmentalized species IDs for neighborhood centers
      precomputed_distances : pd.DataFrame or None
          If provided, an edgelist of origin->destination path weights and lengths
+     min_pw_size: int
+         the minimum size of a pathway to be considered
      source_total_counts: pd.Series | None
          Optional, A series of the total counts of each source. As produced by
          source.get_source_total_counts()
@@ -91,6 +107,16 @@ def find_and_prune_neighborhoods(
      if not isinstance(compartmentalized_species, list):
          raise TypeError("compartmentalized_species must be a list")

+     invalid_cspecies = [
+         x
+         for x in compartmentalized_species
+         if x not in sbml_dfs.compartmentalized_species.index
+     ]
+     if len(invalid_cspecies) > 0:
+         raise ValueError(
+             f"compartmentalized_species contains invalid species: {invalid_cspecies}"
+         )
+
      if isinstance(precomputed_distances, pd.DataFrame):
          logger.info("Pre-computed neighbors based on precomputed_distances")

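The same membership check is added to find_neighborhoods further down, so unknown IDs now fail fast with a ValueError instead of surfacing later as empty igraph lookups. A sketch of the new failure mode, assuming "not_a_real_id" is absent from sbml_dfs.compartmentalized_species.index:

    try:
        neighborhoods.find_and_prune_neighborhoods(
            sbml_dfs,
            napistu_graph,
            compartmentalized_species=["not_a_real_id"],
        )
    except ValueError as err:
        print(err)  # compartmentalized_species contains invalid species: ['not_a_real_id']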
@@ -105,18 +131,19 @@ def find_and_prune_neighborhoods(
      else:
          precomputed_neighbors = None

-     neighborhoods = find_neighborhoods(
+     neighborhood_dicts = find_neighborhoods(
          sbml_dfs=sbml_dfs,
          napistu_graph=napistu_graph,
          compartmentalized_species=compartmentalized_species,
          network_type=network_type,
          order=order,
-         verbose=verbose,
          precomputed_neighbors=precomputed_neighbors,
+         min_pw_size=min_pw_size,
          source_total_counts=source_total_counts,
+         verbose=verbose,
      )

-     pruned_neighborhoods = prune_neighborhoods(neighborhoods, top_n=top_n)
+     pruned_neighborhoods = prune_neighborhoods(neighborhood_dicts, top_n=top_n)

      return pruned_neighborhoods

@@ -164,7 +191,7 @@ def load_neighborhoods(
      -------
      all_neighborhoods_df: pd.DataFrame
          A table containing all species in each query s_ids neighborhood
-     neighborhoods_dict: dict
+     neighborhood_dicts: dict
          Outputs from find_and_prune_neighborhoods for each s_id

      """
@@ -178,16 +205,16 @@ def load_neighborhoods(
      neighborhood_paths = [vertices_path, networks_path]

      if all([os.path.isfile(x) for x in neighborhood_paths]) and overwrite is False:
-         print(f"loading existing neighborhoods for {neighborhood_prefix}")
+         logger.info(f"loading existing neighborhoods for {neighborhood_prefix}")

          all_neighborhoods_df = pd.read_csv(vertices_path, sep="\t")
          with open(networks_path, "rb") as in_file:
-             neighborhoods_dict = pickle.load(in_file)
+             neighborhood_dicts = pickle.load(in_file)

      else:
-         print(f"creating neighborhoods based on {neighborhood_prefix}")
+         logger.info(f"creating neighborhoods based on {neighborhood_prefix}")

-         all_neighborhoods_df, neighborhoods_dict = create_neighborhoods(
+         all_neighborhoods_df, neighborhood_dicts = create_neighborhoods(
              s_ids=s_ids,
              sbml_dfs=sbml_dfs,
              napistu_graph=napistu_graph,
@@ -202,9 +229,9 @@ def load_neighborhoods(
          # pickle neighborhoods
          with open(networks_path, "wb") as fh:
-             pickle.dump(neighborhoods_dict, fh)
+             pickle.dump(neighborhood_dicts, fh)

-     return all_neighborhoods_df, neighborhoods_dict
+     return all_neighborhoods_df, neighborhood_dicts


 def create_neighborhoods(
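Because load_neighborhoods now reports through the module logger rather than print, its progress messages are silent under a default logging configuration. A one-line sketch for callers who want the old console feedback back:

    import logging

    # surfaces the "loading existing neighborhoods ..." / "creating neighborhoods ..." messages
    logging.basicConfig(level=logging.INFO)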
@@ -242,7 +269,7 @@ def create_neighborhoods(
      -------
      all_neighborhoods_df: pd.DataFrame
          A table containing all species in each query s_ids neighborhood
-     neighborhoods_dict: dict
+     neighborhood_dicts: dict
          Outputs from find_and_prune_neighborhoods for each s_id
      """

@@ -263,13 +290,13 @@ def create_neighborhoods(
          raise TypeError(f"top_n was a {type(top_n)} and must be an int")

      neighborhoods_list = list()
-     neighborhoods_dict = dict()
+     neighborhood_dicts = dict()
      for s_id in s_ids:
          query_sc_species = ng_utils.compartmentalize_species(sbml_dfs, s_id)

          compartmentalized_species = query_sc_species[SBML_DFS.SC_ID].tolist()

-         neighborhoods = find_and_prune_neighborhoods(
+         neighborhood_dicts = find_and_prune_neighborhoods(
              sbml_dfs,
              napistu_graph,
              compartmentalized_species=compartmentalized_species,
@@ -283,23 +310,25 @@ def create_neighborhoods(

          neighborhood_entities = pd.concat(
              [
-                 neighborhoods[sc_id]["vertices"].assign(focal_sc_id=sc_id)
-                 for sc_id in neighborhoods.keys()
+                 neighborhood_dicts[sc_id][NEIGHBORHOOD_DICT_KEYS.VERTICES].assign(
+                     focal_sc_id=sc_id
+                 )
+                 for sc_id in neighborhood_dicts.keys()
              ]
          ).assign(focal_s_id=s_id)

          neighborhood_species = neighborhood_entities.merge(
              sbml_dfs.compartmentalized_species[SBML_DFS.S_ID],
-             left_on="name",
+             left_on=NAPISTU_GRAPH_VERTICES.NAME,
              right_index=True,
          )

          neighborhoods_list.append(neighborhood_species)
-         neighborhoods_dict[s_id] = neighborhoods
+         neighborhood_dicts[s_id] = neighborhood_dicts

      all_neighborhoods_df = pd.concat(neighborhoods_list).reset_index(drop=True)

-     return all_neighborhoods_df, neighborhoods_dict
+     return all_neighborhoods_df, neighborhood_dicts


 def create_neighborhood_prefix(network_type: str, order: int, top_n: int) -> str:
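With string keys like "vertices" replaced by NEIGHBORHOOD_DICT_KEYS throughout, downstream code should read neighborhood entries via the constants as well. A sketch, continuing the earlier example and treating the sc_id as a hypothetical placeholder:

    from napistu.network.constants import NEIGHBORHOOD_DICT_KEYS

    one_neighborhood = pruned["SC00001"]
    vertices = one_neighborhood[NEIGHBORHOOD_DICT_KEYS.VERTICES]
    edges = one_neighborhood[NEIGHBORHOOD_DICT_KEYS.EDGES]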
@@ -321,6 +350,7 @@ def create_neighborhood_prefix(network_type: str, order: int, top_n: int) -> str
 def load_neighborhoods_by_partition(
     selected_partition: int,
     neighborhood_outdir: str,
+    cache_dir: str,
     wiring_approach: str = GRAPH_WIRING_APPROACHES.REGULATORY,
 ) -> None:
     """
@@ -343,19 +373,18 @@ def load_neighborhoods_by_partition(

      """

-     consensus_root = "/group/cpr/consensus"
-     consensus_name = "reactome"
-     consensus_outdir = os.path.join(consensus_root, consensus_name)
-
      if not os.path.isdir(neighborhood_outdir):
          raise FileNotFoundError(f"{neighborhood_outdir} does not exist")

+     if not os.path.isdir(cache_dir):
+         raise FileNotFoundError(f"{cache_dir} does not exist")
+
      partition_output = os.path.join(
          neighborhood_outdir, f"partition_{selected_partition}"
      )
      # initialize an empty output
      if os.path.isdir(partition_output):
-         print(f"removing existing directory: {partition_output}")
+         logger.warning(f"removing existing directory: {partition_output}")
          shutil.rmtree(partition_output)
      os.makedirs(partition_output)

@@ -369,13 +398,13 @@ def load_neighborhoods_by_partition(
      if parition_sids_df.shape[0] == 0:
          raise ValueError(f"No s_ids associated with partition {selected_partition}")

-     parition_sids = parition_sids_df["s_id"].tolist()
+     parition_sids = parition_sids_df[SBML_DFS.S_ID].tolist()

      # read pathway and network data

      # read model containing Calico curations. this is primarily to support search programs
      # to not use these switch to refined.pkl
-     refined_model_pkl_path = os.path.join(consensus_outdir, "curated.pkl")
+     refined_model_pkl_path = os.path.join(cache_dir, "curated.pkl")
      with open(refined_model_pkl_path, "rb") as in_file:
          refined_model = pickle.load(in_file)
      refined_model.validate()
@@ -383,12 +412,12 @@ def load_neighborhoods_by_partition(
      # load the graph
      napistu_graph = ng_utils.read_network_pkl(
          model_prefix="curated",
-         network_dir=consensus_outdir,
+         network_dir=cache_dir,
          directed=True,
          wiring_approach=wiring_approach,
      )

-     all_neighborhoods_df, neighborhoods_dict = load_neighborhoods(
+     _, _ = load_neighborhoods(
          s_ids=parition_sids,
          sbml_dfs=refined_model,
          napistu_graph=napistu_graph,
@@ -429,7 +458,7 @@ def read_paritioned_neighborhoods(
      -------
      all_neighborhoods_df: pd.DataFrame
          A table containing all species in each query s_ids neighborhood
-     neighborhoods_dict: dict
+     neighborhood_dicts: dict
          Outputs from find_and_prune_neighborhoods for each s_id

      """
@@ -494,7 +523,7 @@ def read_paritioned_neighborhoods(

      # combine all partitions' dfs and dicts
      all_neighborhoods_df = pd.concat(neighborhood_paths_list).reset_index(drop=True)
-     neighborhoods_dict = dict(ChainMap(*path_dict_list))
+     neighborhood_dicts = dict(ChainMap(*path_dict_list))

      # TO DO - remove s_id duplication (these are present in the vertices table in the partition outputs)
      if not all(all_neighborhoods_df["s_id_x"] == all_neighborhoods_df["s_id_y"]):
@@ -503,14 +532,14 @@ def read_paritioned_neighborhoods(
              {"s_id_x": "s_id"}, axis=1
          )

-     return all_neighborhoods_df, neighborhoods_dict
+     return all_neighborhoods_df, neighborhood_dicts


 def find_neighborhoods(
     sbml_dfs: sbml_dfs_core.SBML_dfs,
     napistu_graph: ig.Graph,
     compartmentalized_species: list[str],
-    network_type: str = "downstream",
+    network_type: str = NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
     order: int = 3,
     min_pw_size: int = 3,
     precomputed_neighbors: pd.DataFrame | None = None,
@@ -544,7 +573,7 @@ def find_neighborhoods(
          the minimum size of a pathway to be considered
      source_total_counts: pd.Series | None
          Optional, A series of the total counts of each source. As produced by
-         source.get_source_total_counts()\
+         source.get_source_total_counts()
      verbose: bool
          Extra reporting

@@ -557,15 +586,24 @@ def find_neighborhoods(
      if not isinstance(network_type, str):
          raise TypeError(f"network_type was a {type(network_type)} and must be a str")

-     valid_network_types = ["downstream", "upstream", "hourglass"]
-     if network_type not in valid_network_types:
+     if network_type not in VALID_NEIGHBORHOOD_NETWORK_TYPES:
          raise ValueError(
-             f"network_type must be one of {', '.join(valid_network_types)}"
+             f"network_type must be one of {', '.join(VALID_NEIGHBORHOOD_NETWORK_TYPES)}"
          )

      if not isinstance(order, int):
          raise TypeError(f"order was a {type(order)} and must be an int")

+     invalid_cspecies = [
+         x
+         for x in compartmentalized_species
+         if x not in sbml_dfs.compartmentalized_species.index
+     ]
+     if len(invalid_cspecies) > 0:
+         raise ValueError(
+             f"compartmentalized_species contains invalid species: {invalid_cspecies}"
+         )
+
      # create a table which includes cspecies and reaction nearby each of the
      # focal compartmentalized_speecies
      neighborhood_df = _build_raw_neighborhood_df(
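Validation now derives from VALID_NEIGHBORHOOD_NETWORK_TYPES instead of a hand-written list, so the accepted values and the error message can no longer drift apart from the NEIGHBORHOOD_NETWORK_TYPES constants. A sketch of the behavior implied by this hunk:

    from napistu.network.constants import (
        NEIGHBORHOOD_NETWORK_TYPES,
        VALID_NEIGHBORHOOD_NETWORK_TYPES,
    )

    # the new default must itself be a valid type
    assert NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS in VALID_NEIGHBORHOOD_NETWORK_TYPES

    # find_neighborhoods(..., network_type="sideways") now raises:
    # ValueError: network_type must be one of <the constant-backed list>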
@@ -634,8 +672,8 @@ def create_neighborhood_dict_entry(
          nodes in the neighborhood
      edges: pd.DataFrame
          edges in the neighborhood
-     edge_sources: pd.DataFrame
-         models that edges were derived from
+     reaction_sources: pd.DataFrame
+         models that reactions were derived from
      neighborhood_path_entities: dict
          upstream and downstream dicts representing entities in paths.
          If the keys are to be included in a neighborhood, the
@@ -643,12 +681,12 @@ def create_neighborhood_dict_entry(
          focal node.
      """

-     one_neighborhood_df = neighborhood_df[neighborhood_df["sc_id"] == sc_id]
+     one_neighborhood_df = neighborhood_df[neighborhood_df[SBML_DFS.SC_ID] == sc_id]

      if verbose:
          _create_neighborhood_dict_entry_logging(sc_id, one_neighborhood_df, sbml_dfs)

-     if not one_neighborhood_df["name"].eq(sc_id).any():
+     if not one_neighborhood_df[NAPISTU_GRAPH_VERTICES.NAME].eq(sc_id).any():
          raise ValueError(
              f"The focal node sc_id = {sc_id} was not in 'one_neighborhood_df'.\
              By convention it should be part of its neighborhood"
@@ -664,95 +702,61 @@ def create_neighborhood_dict_entry(

      # add edge polarity: whether edges are activating, inhibiting or unknown
      if edges.shape[0] > 0:
-         edges["link_polarity"] = (
-             edges["sbo_term"].map(MINI_SBO_TO_NAME).map(MINI_SBO_NAME_TO_POLARITY)
+         edges[NET_POLARITY.LINK_POLARITY] = (
+             edges[SBML_DFS.SBO_TERM]
+             .map(MINI_SBO_TO_NAME)
+             .map(MINI_SBO_NAME_TO_POLARITY)
          )

      try:
-         edge_sources = ng_utils.get_minimal_sources_edges(
-             vertices.rename(columns={"name": "node"}),
+         reaction_sources = ng_utils.get_minimal_sources_edges(
+             vertices.rename(columns={NAPISTU_GRAPH_VERTICES.NAME: "node"}),
              sbml_dfs,
              min_pw_size=min_pw_size,
              # optional, counts of sources across the whole model
              source_total_counts=source_total_counts,
          )
      except Exception:
-         edge_sources = None
+         logger.warning(f"Could not get reaction sources for {sc_id}; returning None")
+         reaction_sources = None

      # to add weights to the network solve the shortest path problem
      # from the focal node to each neighbor
      # solve this problem separately whether a given neighbor is an
      # ancestor or descendant

-     # focal node -> descendants
-
-     one_descendants_df = one_neighborhood_df[
-         one_neighborhood_df["relationship"] == "descendants"
-     ]
-     descendants_list = list(set(one_descendants_df["name"].tolist()).union({sc_id}))
-
-     # hide warnings which are mostly just Dijkstra complaining about not finding neighbors
-     with warnings.catch_warnings():
-         # igraph throws warnings for each pair of unconnected species
-         warnings.simplefilter("ignore")
-
-         neighborhood_paths = neighborhood_graph.get_shortest_paths(
-             # focal node
-             v=sc_id,
-             to=descendants_list,
-             weights="weights",
-             mode="out",
-             output="epath",
-         )
-
-     downstream_path_attrs, downstream_entity_dict = _calculate_path_attrs(
-         neighborhood_paths, edges, vertices=descendants_list, weight_var="weights"
-     )
-     downstream_path_attrs = downstream_path_attrs.assign(node_orientation="downstream")
-
-     # ancestors -> focal_node
-
-     one_ancestors_df = one_neighborhood_df[
-         one_neighborhood_df["relationship"] == "ancestors"
-     ]
-     ancestors_list = list(set(one_ancestors_df["name"].tolist()).union({sc_id}))
-
-     with warnings.catch_warnings():
-         # igraph throws warnings for each pair of unconnected species
-         warnings.simplefilter("ignore")
-
-         neighborhood_paths = neighborhood_graph.get_shortest_paths(
-             v=sc_id,
-             to=ancestors_list,
-             weights="upstream_weights",
-             mode="in",
-             output="epath",
-         )
-
-     upstream_path_attrs, upstream_entity_dict = _calculate_path_attrs(
-         neighborhood_paths,
+     (
+         downstream_path_attrs,
+         downstream_entity_dict,
+         upstream_path_attrs,
+         upstream_entity_dict,
+     ) = _find_neighbors_paths(
+         neighborhood_graph,
+         one_neighborhood_df,
+         sc_id,
          edges,
-         vertices=ancestors_list,
-         weight_var="upstream_weights",
      )
-     upstream_path_attrs = upstream_path_attrs.assign(node_orientation="upstream")

      # combine upstream and downstream shortest paths
      # in cases a node is upstream and downstream of the focal node
      # by taking the lowest path weight
      vertex_neighborhood_attrs = (
          pd.concat([downstream_path_attrs, upstream_path_attrs])
-         .sort_values("path_weight")
+         .sort_values(DISTANCES.PATH_WEIGHTS)
          .groupby("neighbor")
          .first()
      )
      # label the focal node
-     vertex_neighborhood_attrs.loc[sc_id, "node_orientation"] = "focal"
+     vertex_neighborhood_attrs.loc[sc_id, "node_orientation"] = GRAPH_RELATIONSHIPS.FOCAL

      # if the precomputed distances, graph and/or sbml_dfs are inconsistent
      # then the shortest paths search may just return empty lists
      # throw a clearer error message in this case.
-     EXPECTED_VERTEX_ATTRS = {"final_from", "final_to", "net_polarity"}
+     EXPECTED_VERTEX_ATTRS = {
+         DISTANCES.FINAL_FROM,
+         DISTANCES.FINAL_TO,
+         NET_POLARITY.NET_POLARITY,
+     }
      missing_vertex_attrs = EXPECTED_VERTEX_ATTRS.difference(
          set(vertex_neighborhood_attrs.columns.tolist())
      )
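The polarity assignment is unchanged in substance: sbo_term is mapped to an SBO name, and the name to a polarity, via two chained Series.map calls. A self-contained pandas illustration of that lookup shape (the mapping dicts here are stand-ins, not napistu's full MINI_SBO tables):

    import pandas as pd

    edges = pd.DataFrame({"sbo_term": ["SBO:0000459", "SBO:0000020"]})
    sbo_to_name = {"SBO:0000459": "stimulator", "SBO:0000020": "inhibitor"}
    name_to_polarity = {"stimulator": "activation", "inhibitor": "inhibition"}

    edges["link_polarity"] = edges["sbo_term"].map(sbo_to_name).map(name_to_polarity)
    print(edges["link_polarity"].tolist())  # ['activation', 'inhibition']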
@@ -767,22 +771,22 @@ def create_neighborhood_dict_entry(
      # add net_polarity to edges in addition to nodes
      edges = edges.merge(
          vertex_neighborhood_attrs.reset_index()[
-             ["final_from", "final_to", "net_polarity"]
+             [DISTANCES.FINAL_FROM, DISTANCES.FINAL_TO, NET_POLARITY.NET_POLARITY]
          ].dropna(),
-         left_on=["from", "to"],
-         right_on=["final_from", "final_to"],
+         left_on=[NAPISTU_GRAPH_EDGES.FROM, NAPISTU_GRAPH_EDGES.TO],
+         right_on=[DISTANCES.FINAL_FROM, DISTANCES.FINAL_TO],
          how="left",
      )

      vertices = vertices.merge(
-         vertex_neighborhood_attrs, left_on="name", right_index=True
+         vertex_neighborhood_attrs, left_on=NAPISTU_GRAPH_VERTICES.NAME, right_index=True
      )

      # drop nodes with a path length / weight of zero
      # which are NOT the focal node
      # these were cases where no path to/from the focal node to the query node was found
      disconnected_neighbors = vertices.query(
-         "(not node_orientation == 'focal') and path_weight == 0"
+         f"(not node_orientation == '{GRAPH_RELATIONSHIPS.FOCAL}') and {DISTANCES.PATH_WEIGHTS} == 0"
      )
      vertices = vertices[~vertices.index.isin(disconnected_neighbors.index.tolist())]

@@ -790,8 +794,8 @@ def create_neighborhood_dict_entry(
      vertices = add_vertices_uri_urls(vertices, sbml_dfs)

      neighborhood_path_entities = {
-         "downstream": downstream_entity_dict,
-         "upstream": upstream_entity_dict,
+         NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM: downstream_entity_dict,
+         NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM: upstream_entity_dict,
      }

      # update graph with additional vertex and edge attributes
@@ -799,16 +803,16 @@ def create_neighborhood_dict_entry(
          vertices=vertices.to_dict("records"),
          edges=edges.to_dict("records"),
          directed=napistu_graph.is_directed(),
-         vertex_name_attr="name",
-         edge_foreign_keys=("from", "to"),
+         vertex_name_attr=NAPISTU_GRAPH_VERTICES.NAME,
+         edge_foreign_keys=(NAPISTU_GRAPH_EDGES.FROM, NAPISTU_GRAPH_EDGES.TO),
      )

      outdict = {
-         "graph": updated_napistu_graph,
-         "vertices": vertices,
-         "edges": edges,
-         "edge_sources": edge_sources,
-         "neighborhood_path_entities": neighborhood_path_entities,
+         NEIGHBORHOOD_DICT_KEYS.GRAPH: updated_napistu_graph,
+         NEIGHBORHOOD_DICT_KEYS.VERTICES: vertices,
+         NEIGHBORHOOD_DICT_KEYS.EDGES: edges,
+         NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES: reaction_sources,
+         NEIGHBORHOOD_DICT_KEYS.NEIGHBORHOOD_PATH_ENTITIES: neighborhood_path_entities,
      }

      return outdict
@@ -818,9 +822,11 @@ def _create_neighborhood_dict_entry_logging(
     sc_id: str, one_neighborhood_df: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
 ):
     df_summary = one_neighborhood_df.copy()
-    df_summary["node_type"] = [
-        "species" if x else "reactions"
-        for x in df_summary["name"].isin(sbml_dfs.compartmentalized_species.index)
+    df_summary[NAPISTU_GRAPH_VERTICES.NODE_TYPE] = [
+        NAPISTU_GRAPH_NODE_TYPES.SPECIES if x else NAPISTU_GRAPH_NODE_TYPES.REACTION
+        for x in df_summary[NAPISTU_GRAPH_VERTICES.NAME].isin(
+            sbml_dfs.compartmentalized_species.index
+        )
     ]
     relationship_counts = df_summary.value_counts(
         ["relationship", "node_type"]
@@ -844,22 +850,45 @@ def add_vertices_uri_urls(
     vertices: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
 ) -> pd.DataFrame:
     """
-    Add Vertices URI URLs
+    Add URI URLs to neighborhood vertices DataFrame.

-    Add a url variable to the neighborhood vertices pd.DataFrame
+    This function enriches a vertices DataFrame with URI URLs for both species and
+    reactions. For species, it adds standard reference identifiers and Pharos IDs
+    where available. For reactions, it adds reaction-specific URI URLs.

     Parameters
     ----------
     vertices: pd.DataFrame
-        table of neighborhood vertices
+        DataFrame containing neighborhood vertices with the following required columns:
+        - NAPISTU_GRAPH_VERTICES.NAME: The name/identifier of each vertex
+        - NAPISTU_GRAPH_VERTICES.NODE_TYPE: The type of node, either
+          NAPISTU_GRAPH_NODE_TYPES.SPECIES or NAPISTU_GRAPH_NODE_TYPES.REACTION
     sbml_dfs: sbml_dfs_core.SBML_dfs
-        consensus network model
+        Pathway model including species, compartmentalized species, reactions and ontologies

     Returns
     -------
-    vertices: pd.DataFrame
-        input table with a url field
+    pd.DataFrame
+        Input vertices DataFrame enriched with URI URL columns:
+        - For species: standard reference identifier URLs and Pharos IDs
+        - For reactions: reaction-specific URI URLs
+        - Empty strings for missing URLs

+    Raises
+    ------
+    ValueError
+        If vertices DataFrame is empty (no rows)
+    TypeError
+        If the output is not a pandas DataFrame
+    ValueError
+        If the output row count doesn't match the input row count
+
+    Notes
+    -----
+    - Species vertices are merged with compartmentalized_species to get s_id mappings
+    - Reaction vertices are processed directly using their names
+    - Missing URLs are filled with empty strings
+    - The function preserves the original row order and count
     """

     if vertices.shape[0] <= 0:
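A sketch of the documented contract, assuming a minimal vertices frame carrying the two required columns (the IDs are hypothetical):

    import pandas as pd
    from napistu.network.constants import (
        NAPISTU_GRAPH_NODE_TYPES,
        NAPISTU_GRAPH_VERTICES,
    )

    vertices = pd.DataFrame(
        {
            NAPISTU_GRAPH_VERTICES.NAME: ["SC00001", "R00001"],
            NAPISTU_GRAPH_VERTICES.NODE_TYPE: [
                NAPISTU_GRAPH_NODE_TYPES.SPECIES,
                NAPISTU_GRAPH_NODE_TYPES.REACTION,
            ],
        }
    )
    annotated = add_vertices_uri_urls(vertices, sbml_dfs)
    assert annotated.shape[0] == vertices.shape[0]  # row count is preserved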
@@ -868,35 +897,54 @@ def add_vertices_uri_urls(
      # add uri urls for each node

      # add s_ids
-     neighborhood_species = vertices[vertices["node_type"] == "species"].merge(
-         sbml_dfs.compartmentalized_species["s_id"],
-         left_on="name",
+     neighborhood_species = vertices[
+         vertices[NAPISTU_GRAPH_VERTICES.NODE_TYPE] == NAPISTU_GRAPH_NODE_TYPES.SPECIES
+     ].merge(
+         sbml_dfs.compartmentalized_species[SBML_DFS.S_ID],
+         left_on=NAPISTU_GRAPH_VERTICES.NAME,
          right_index=True,
          how="left",
      )

      # add a standard reference identifier
      neighborhood_species_aug = neighborhood_species.merge(
-         sbml_dfs.get_uri_urls("species", neighborhood_species["s_id"]),
-         left_on="s_id",
+         sbml_dfs.get_uri_urls(
+             NAPISTU_GRAPH_NODE_TYPES.SPECIES, neighborhood_species[SBML_DFS.S_ID]
+         ),
+         left_on=SBML_DFS.S_ID,
          right_index=True,
          how="left",
          # add pharos ids where available
      ).merge(
          sbml_dfs.get_uri_urls(
-             "species", neighborhood_species["s_id"], required_ontology="pharos"
-         ).rename("pharos"),
-         left_on="s_id",
+             NAPISTU_GRAPH_NODE_TYPES.SPECIES,
+             neighborhood_species[SBML_DFS.S_ID],
+             required_ontology=ONTOLOGIES.PHAROS,
+         ).rename(ONTOLOGIES.PHAROS),
+         left_on=SBML_DFS.S_ID,
          right_index=True,
          how="left",
      )

-     if sum(vertices["node_type"] == "reaction") > 0:
-         neighborhood_reactions = vertices[vertices["node_type"] == "reaction"].merge(
+     if (
+         sum(
+             vertices[NAPISTU_GRAPH_VERTICES.NODE_TYPE]
+             == NAPISTU_GRAPH_NODE_TYPES.REACTION
+         )
+         > 0
+     ):
+         neighborhood_reactions = vertices[
+             vertices[NAPISTU_GRAPH_VERTICES.NODE_TYPE]
+             == NAPISTU_GRAPH_NODE_TYPES.REACTION
+         ].merge(
              sbml_dfs.get_uri_urls(
-                 "reactions", vertices[vertices["node_type"] == "reaction"]["name"]
+                 SBML_DFS.REACTIONS,
+                 vertices[
+                     vertices[NAPISTU_GRAPH_VERTICES.NODE_TYPE]
+                     == NAPISTU_GRAPH_NODE_TYPES.REACTION
+                 ][NAPISTU_GRAPH_VERTICES.NAME],
              ),
-             left_on="name",
+             left_on=NAPISTU_GRAPH_VERTICES.NAME,
              right_index=True,
              how="left",
          )
@@ -945,7 +993,7 @@ def prune_neighborhoods(neighborhoods: dict, top_n: int = 100) -> dict:
      if not isinstance(top_n, int):
          raise TypeError(f"top_n was a {type(top_n)} and must be an int")

-     pruned_neighborhoods_dict = dict()
+     pruned_neighborhood_dicts = dict()

      for an_sc_id in neighborhoods.keys():
          one_neighborhood = neighborhoods[an_sc_id]
@@ -955,41 +1003,58 @@ def prune_neighborhoods(neighborhoods: dict, top_n: int = 100) -> dict:
          pruned_vertices = _prune_vertex_set(one_neighborhood, top_n=top_n)

          # reduce neighborhood to this set of high-weight vertices
-         all_neighbors = pd.DataFrame({"name": one_neighborhood["graph"].vs["name"]})
+         all_neighbors = pd.DataFrame(
+             {
+                 NAPISTU_GRAPH_VERTICES.NAME: one_neighborhood[
+                     NEIGHBORHOOD_DICT_KEYS.GRAPH
+                 ].vs[NAPISTU_GRAPH_VERTICES.NAME]
+             }
+         )
          pruned_vertices_indices = all_neighbors[
-             all_neighbors["name"].isin(pruned_vertices["name"])
+             all_neighbors[NAPISTU_GRAPH_VERTICES.NAME].isin(
+                 pruned_vertices[NAPISTU_GRAPH_VERTICES.NAME]
+             )
          ].index.tolist()

-         pruned_neighborhood = one_neighborhood["graph"].subgraph(
-             one_neighborhood["graph"].vs[pruned_vertices_indices],
+         pruned_neighborhood = one_neighborhood[NEIGHBORHOOD_DICT_KEYS.GRAPH].subgraph(
+             one_neighborhood[NEIGHBORHOOD_DICT_KEYS.GRAPH].vs[pruned_vertices_indices],
              implementation="auto",
          )

          pruned_edges = pd.DataFrame([e.attributes() for e in pruned_neighborhood.es])

-         pruned_reactions = pruned_vertices[pruned_vertices["node_type"] == "reaction"][
-             "name"
-         ]
+         pruned_reactions = pruned_vertices[
+             pruned_vertices[NAPISTU_GRAPH_VERTICES.NODE_TYPE]
+             == NAPISTU_GRAPH_NODE_TYPES.REACTION
+         ][NAPISTU_GRAPH_VERTICES.NAME]

          if pruned_reactions.shape[0] != 0:
-             if one_neighborhood["edge_sources"] is None:
+             if one_neighborhood[NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES] is None:
                  # allow for missing source information since this is currently optional
-                 pruned_edge_sources = one_neighborhood["edge_sources"]
+                 pruned_reaction_sources = one_neighborhood[
+                     NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES
+                 ]
              else:
-                 pruned_edge_sources = one_neighborhood["edge_sources"][
-                     one_neighborhood["edge_sources"]["r_id"].isin(pruned_reactions)
+                 pruned_reaction_sources = one_neighborhood[
+                     NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES
+                 ][
+                     one_neighborhood[NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES][
+                         SBML_DFS.R_ID
+                     ].isin(pruned_reactions)
                  ]
          else:
-             pruned_edge_sources = one_neighborhood["edge_sources"]
+             pruned_reaction_sources = one_neighborhood[
+                 NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES
+             ]

-         pruned_neighborhoods_dict[an_sc_id] = {
-             "graph": pruned_neighborhood,
-             "vertices": pruned_vertices,
-             "edges": pruned_edges,
-             "edge_sources": pruned_edge_sources,
+         pruned_neighborhood_dicts[an_sc_id] = {
+             NEIGHBORHOOD_DICT_KEYS.GRAPH: pruned_neighborhood,
+             NEIGHBORHOOD_DICT_KEYS.VERTICES: pruned_vertices,
+             NEIGHBORHOOD_DICT_KEYS.EDGES: pruned_edges,
+             NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES: pruned_reaction_sources,
          }

-     return pruned_neighborhoods_dict
+     return pruned_neighborhood_dicts


 def plot_neighborhood(
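prune_neighborhoods keeps its public signature; the changes here are the renamed internals and, importantly, the dict key: pruned entries now expose NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES where "edge_sources" used to sit. A round-trip sketch with a hypothetical sc_id:

    neighborhood_dicts = find_neighborhoods(
        sbml_dfs=sbml_dfs,
        napistu_graph=napistu_graph,
        compartmentalized_species=["SC00001"],
    )
    pruned = prune_neighborhoods(neighborhood_dicts, top_n=100)
    sources = pruned["SC00001"][NEIGHBORHOOD_DICT_KEYS.REACTION_SOURCES]  # may be None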
@@ -1029,16 +1094,16 @@ def plot_neighborhood(
          "focal disease": "lime",
          "disease": "aquamarine",
          "focal": "lightcoral",
-         "species": "firebrick",
-         "reaction": "dodgerblue",
+         NAPISTU_GRAPH_NODE_TYPES.SPECIES: "firebrick",
+         NAPISTU_GRAPH_NODE_TYPES.REACTION: "dodgerblue",
      }

      edge_polarity_colors = {
-         "ambiguous": "dimgray",
-         "activation": "gold",
-         "inhibition": "royalblue",
-         "ambiguous activation": "palegoldenrod",
-         "ambiguous inhibition": "powerblue",
+         NET_POLARITY.AMBIGUOUS: "dimgray",
+         NET_POLARITY.ACTIVATION: "gold",
+         NET_POLARITY.INHIBITION: "royalblue",
+         NET_POLARITY.AMBIGUOUS_ACTIVATION: "palegoldenrod",
+         NET_POLARITY.AMBIGUOUS_INHIBITION: "powerblue",
          np.nan: "dimgray",
      }

@@ -1047,17 +1112,19 @@ def plot_neighborhood(
      visual_style["vertex_size"] = 10
      if name_nodes:
          visual_style["vertex_label"] = [
-             textwrap.fill(x, 15) for x in neighborhood_graph.vs["node_name"]
+             textwrap.fill(x, 15)
+             for x in neighborhood_graph.vs[NAPISTU_GRAPH_VERTICES.NODE_NAME]
          ]
      visual_style["vertex_label_color"] = "white"
      visual_style["vertex_label_size"] = 8
      visual_style["vertex_label_angle"] = 90
      visual_style["vertex_label_dist"] = 3
      visual_style["vertex_color"] = [
-         color_dict[x] for x in neighborhood_graph.vs["node_type"]
+         color_dict[x] for x in neighborhood_graph.vs[NAPISTU_GRAPH_VERTICES.NODE_TYPE]
      ]
      visual_style["edge_color"] = [
-         edge_polarity_colors[x] for x in neighborhood_graph.es["net_polarity"]
+         edge_polarity_colors[x]
+         for x in neighborhood_graph.es[NET_POLARITY.NET_POLARITY]
      ]
      visual_style["layout"] = neighborhood_graph_layout
      visual_style["bbox"] = (plot_size, plot_size)
@@ -1089,8 +1156,8 @@ def _precompute_neighbors(

      # check that compartmentalized_species are included in precomputed_distances
      all_cspecies = {
-         *precomputed_distances["sc_id_origin"].tolist(),
-         *precomputed_distances["sc_id_dest"].tolist(),
+         *precomputed_distances[NAPISTU_EDGELIST.SC_ID_ORIGIN].tolist(),
+         *precomputed_distances[NAPISTU_EDGELIST.SC_ID_DEST].tolist(),
      }
      missing_cspecies = set(compartmentalized_species).difference(all_cspecies)
      if len(missing_cspecies) > 0:
@@ -1105,14 +1172,16 @@ def _precompute_neighbors(
          NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM,
          NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
      ]:
-         valid_origin = precomputed_distances["sc_id_origin"].isin(
+         valid_origin = precomputed_distances[NAPISTU_EDGELIST.SC_ID_ORIGIN].isin(
              compartmentalized_species
          )
      if network_type in [
          NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM,
          NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS,
      ]:
-         valid_dest = precomputed_distances["sc_id_dest"].isin(compartmentalized_species)
+         valid_dest = precomputed_distances[NAPISTU_EDGELIST.SC_ID_DEST].isin(
+             compartmentalized_species
+         )

      if network_type == NEIGHBORHOOD_NETWORK_TYPES.HOURGLASS:
          cspecies_subset_precomputed_distances = precomputed_distances[
@@ -1133,7 +1202,7 @@ def _precompute_neighbors(

      # filter by distance
      close_cspecies_subset_precomputed_distances = cspecies_subset_precomputed_distances[
-         cspecies_subset_precomputed_distances["path_length"] <= order
+         cspecies_subset_precomputed_distances[DISTANCES.PATH_LENGTH] <= order
      ]

      # filter to retain top_n
@@ -1143,13 +1212,13 @@ def _precompute_neighbors(
      ]:
          top_descendants = (
              close_cspecies_subset_precomputed_distances[
-                 close_cspecies_subset_precomputed_distances["sc_id_origin"].isin(
-                     compartmentalized_species
-                 )
+                 close_cspecies_subset_precomputed_distances[
+                     DISTANCES.SC_ID_ORIGIN
+                 ].isin(compartmentalized_species)
              ]
              # sort by path_weight so we can retain the lowest weight neighbors
-             .sort_values("path_weights")
-             .groupby("sc_id_origin")
+             .sort_values(DISTANCES.PATH_WEIGHTS)
+             .groupby(NAPISTU_EDGELIST.SC_ID_ORIGIN)
              .head(top_n)
          )

@@ -1161,9 +1230,9 @@ def _precompute_neighbors(
      ]:
          top_ancestors = (
              close_cspecies_subset_precomputed_distances[
-                 close_cspecies_subset_precomputed_distances["sc_id_dest"].isin(
-                     compartmentalized_species
-                 )
+                 close_cspecies_subset_precomputed_distances[
+                     NAPISTU_EDGELIST.SC_ID_DEST
+                 ].isin(compartmentalized_species)
              ]
              # sort by path_upstream_weights so we can retain the lowest weight neighbors
              # we allow for upstream weights to differ from downstream weights
@@ -1176,8 +1245,8 @@ def _precompute_neighbors(
              # the logic is flipped if we are looking for ancestors where
              # we penalize based on the number of parents of a node when
              # we use it (i.e., the default upstream_weights).
-             .sort_values("path_upstream_weights")
-             .groupby("sc_id_dest")
+             .sort_values(DISTANCES.PATH_UPSTREAM_WEIGHTS)
+             .groupby(NAPISTU_EDGELIST.SC_ID_DEST)
              .head(top_n)
          )

@@ -1193,7 +1262,7 @@ def _precompute_neighbors(
              precomputed_neighbors=top_descendants,
              compartmentalized_species=compartmentalized_species,
              sbml_dfs=sbml_dfs,
-             relationship="descendants",
+             relationship=GRAPH_RELATIONSHIPS.DESCENDANTS,
          )

          if downstream_reactions is not None:
@@ -1207,7 +1276,7 @@ def _precompute_neighbors(
              precomputed_neighbors=top_ancestors,
              compartmentalized_species=compartmentalized_species,
              sbml_dfs=sbml_dfs,
-             relationship="ancestors",
+             relationship=GRAPH_RELATIONSHIPS.ANCESTORS,
          )

          if upstream_reactions is not None:
@@ -1217,8 +1286,8 @@ def _precompute_neighbors(
      # an sc_id_origin-specific subgraph
      identity_df = pd.DataFrame(
          {
-             "sc_id_origin": compartmentalized_species,
-             "sc_id_dest": compartmentalized_species,
+             NAPISTU_EDGELIST.SC_ID_ORIGIN: compartmentalized_species,
+             NAPISTU_EDGELIST.SC_ID_DEST: compartmentalized_species,
          }
      )

@@ -1232,14 +1301,16 @@ def _precompute_neighbors(
                  downstream_reactions,  # type: ignore
                  identity_df,
              ]
-         )[["sc_id_origin", "sc_id_dest"]].drop_duplicates()
+         )[
+             [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
+         ].drop_duplicates()
      elif network_type == NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM:
          precomputed_neighbors = pd.concat([top_descendants, downstream_reactions, identity_df])[  # type: ignore
-             ["sc_id_origin", "sc_id_dest"]
+             [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
          ].drop_duplicates()
      elif network_type == NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM:
          precomputed_neighbors = pd.concat([top_ancestors, upstream_reactions, identity_df])[  # type: ignore
-             ["sc_id_origin", "sc_id_dest"]
+             [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
          ].drop_duplicates()
      else:
          raise ValueError("This error shouldn't happen")
@@ -1271,7 +1342,7 @@ def _build_raw_neighborhood_df(
      descendants_df = _find_neighbors(
          napistu_graph=napistu_graph,
          compartmentalized_species=compartmentalized_species,
-         relationship="descendants",
+         relationship=GRAPH_RELATIONSHIPS.DESCENDANTS,
          order=order,
          precomputed_neighbors=precomputed_neighbors,
      )
@@ -1284,7 +1355,7 @@ def _build_raw_neighborhood_df(
      ancestors_df = _find_neighbors(
          napistu_graph=napistu_graph,
          compartmentalized_species=compartmentalized_species,
-         relationship="ancestors",
+         relationship=GRAPH_RELATIONSHIPS.ANCESTORS,
          order=order,
          precomputed_neighbors=precomputed_neighbors,
      )
@@ -1300,8 +1371,9 @@ def _build_raw_neighborhood_df(
          raise NotImplementedError("invalid network_type")

      # add name since this is an easy way to lookup igraph vertices
-     neighborhood_df["name"] = [
-         x["name"] for x in napistu_graph.vs[neighborhood_df["neighbor"]]
+     neighborhood_df[NAPISTU_GRAPH_VERTICES.NAME] = [
+         x[NAPISTU_GRAPH_VERTICES.NAME]
+         for x in napistu_graph.vs[neighborhood_df["neighbor"]]
      ]

      return neighborhood_df
@@ -1327,17 +1399,23 @@ def _find_neighbors(
      if isinstance(precomputed_neighbors, pd.DataFrame):
          # add graph indices to neighbors
          nodes_to_names = (
-             pd.DataFrame({"name": napistu_graph.vs["name"]})
+             pd.DataFrame(
+                 {
+                     NAPISTU_GRAPH_VERTICES.NAME: napistu_graph.vs[
+                         NAPISTU_GRAPH_VERTICES.NAME
+                     ]
+                 }
+             )
              .reset_index()
              .rename({"index": "neighbor"}, axis=1)
          )

-         if relationship == "descendants":
-             bait_id = "sc_id_origin"
-             target_id = "sc_id_dest"
-         elif relationship == "ancestors":
-             bait_id = "sc_id_dest"
-             target_id = "sc_id_origin"
+         if relationship == GRAPH_RELATIONSHIPS.DESCENDANTS:
+             bait_id = NAPISTU_EDGELIST.SC_ID_ORIGIN
+             target_id = NAPISTU_EDGELIST.SC_ID_DEST
+         elif relationship == GRAPH_RELATIONSHIPS.ANCESTORS:
+             bait_id = NAPISTU_EDGELIST.SC_ID_DEST
+             target_id = NAPISTU_EDGELIST.SC_ID_ORIGIN
          else:
              raise ValueError(
                  f"relationship must be 'descendants' or 'ancestors' but was {relationship}"
@@ -1347,15 +1425,17 @@ def _find_neighbors(
              precomputed_neighbors[
                  precomputed_neighbors[bait_id].isin(compartmentalized_species)
              ]
-             .merge(nodes_to_names.rename({"name": target_id}, axis=1))
-             .rename({bait_id: "sc_id"}, axis=1)
+             .merge(
+                 nodes_to_names.rename({NAPISTU_GRAPH_VERTICES.NAME: target_id}, axis=1)
+             )
+             .rename({bait_id: SBML_DFS.SC_ID}, axis=1)
              .drop([target_id], axis=1)
              .assign(relationship=relationship)
          )
      else:
-         if relationship == "descendants":
+         if relationship == GRAPH_RELATIONSHIPS.DESCENDANTS:
              mode_type = "out"
-         elif relationship == "ancestors":
+         elif relationship == GRAPH_RELATIONSHIPS.ANCESTORS:
              mode_type = "in"
          else:
              raise ValueError(
@@ -1371,7 +1451,7 @@ def _find_neighbors(

          neighbors_df = pd.concat(
              [
-                 pd.DataFrame({"sc_id": c, "neighbor": x}, index=range(0, len(x)))
+                 pd.DataFrame({SBML_DFS.SC_ID: c, "neighbor": x}, index=range(0, len(x)))
                  for c, x in zip(compartmentalized_species, neighbors)
              ]
          ).assign(relationship=relationship)
@@ -1401,12 +1481,12 @@ def _find_reactions_by_relationship(
      if precomputed_neighbors.shape[0] == 0:
          return None

-     if relationship == "descendants":
-         bait_id = "sc_id_origin"
-         target_id = "sc_id_dest"
-     elif relationship == "ancestors":
-         bait_id = "sc_id_dest"
-         target_id = "sc_id_origin"
+     if relationship == GRAPH_RELATIONSHIPS.DESCENDANTS:
+         bait_id = NAPISTU_EDGELIST.SC_ID_ORIGIN
+         target_id = NAPISTU_EDGELIST.SC_ID_DEST
+     elif relationship == GRAPH_RELATIONSHIPS.ANCESTORS:
+         bait_id = NAPISTU_EDGELIST.SC_ID_DEST
+         target_id = NAPISTU_EDGELIST.SC_ID_ORIGIN
      else:
          raise ValueError(
              f"relationship must be 'descendants' or 'ancestors' but was {relationship}"
@@ -1437,8 +1517,8 @@ def _find_reactions_by_relationship(
          relatives_cspecies = {*relatives, *[uq]}
          # count the number of relative cspecies including each reaction
          rxn_species_counts = sbml_dfs.reaction_species[
-             sbml_dfs.reaction_species["sc_id"].isin(relatives_cspecies)
-         ].value_counts("r_id")
+             sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(relatives_cspecies)
+         ].value_counts(SBML_DFS.R_ID)

          # retain reactions involving 2+ cspecies.
          # some of these reactions will be irrelevant and will be excluded when
@@ -1483,10 +1563,11 @@ def _prune_vertex_set(one_neighborhood: dict, top_n: int) -> pd.DataFrame:

      """

-     neighborhood_vertices = one_neighborhood["vertices"]
+     neighborhood_vertices = one_neighborhood[NEIGHBORHOOD_DICT_KEYS.VERTICES]

      indexed_neighborhood_species = neighborhood_vertices[
-         neighborhood_vertices["node_type"] == "species"
+         neighborhood_vertices[NAPISTU_GRAPH_VERTICES.NODE_TYPE]
+         == NAPISTU_GRAPH_NODE_TYPES.SPECIES
      ].set_index("node_orientation")

      pruned_oriented_neighbors = list()
@@ -1496,14 +1577,14 @@ def _prune_vertex_set(one_neighborhood: dict, top_n: int) -> pd.DataFrame:
          # handle cases where only one entry exists to DF->series coercion occurs
          vertex_subset = vertex_subset.to_frame().T

-         sorted_vertex_set = vertex_subset.sort_values("path_weight")
-         weight_cutoff = sorted_vertex_set["path_weight"].iloc[
+         sorted_vertex_set = vertex_subset.sort_values(DISTANCES.PATH_WEIGHTS)
+         weight_cutoff = sorted_vertex_set[DISTANCES.PATH_WEIGHTS].iloc[
              min(top_n - 1, sorted_vertex_set.shape[0] - 1)
          ]

          top_neighbors = sorted_vertex_set[
-             sorted_vertex_set["path_weight"] <= weight_cutoff
-         ]["name"].tolist()
+             sorted_vertex_set[DISTANCES.PATH_WEIGHTS] <= weight_cutoff
+         ][NAPISTU_GRAPH_VERTICES.NAME].tolist()

          # include reactions and other species necessary to reach the top neighbors
          # by pulling in the past solutions to weighted shortest paths problems
@@ -1522,7 +1603,7 @@ def _prune_vertex_set(one_neighborhood: dict, top_n: int) -> pd.DataFrame:
      # combine all neighbors
      pruned_neighbors = set().union(*pruned_oriented_neighbors)
      pruned_vertices = neighborhood_vertices[
-         neighborhood_vertices["name"].isin(pruned_neighbors)
+         neighborhood_vertices[NAPISTU_GRAPH_VERTICES.NAME].isin(pruned_neighbors)
      ].reset_index(drop=True)

      return pruned_vertices
@@ -1532,7 +1613,7 @@ def _calculate_path_attrs(
     neighborhood_paths: list[list],
     edges: pd.DataFrame,
     vertices: list,
-    weight_var: str = "weights",
+    weight_var: str = NAPISTU_GRAPH_EDGES.WEIGHTS,
 ) -> tuple[pd.DataFrame, dict[Any, set]]:
     """
     Calculate Path Attributes
@@ -1582,15 +1663,15 @@ def _calculate_path_attrs(
      # if all_path_edges.ngroups > 0:
      path_attributes_df = pd.concat(
          [
-             all_path_edges[weight_var].agg("sum").rename("path_weight"),
-             all_path_edges.agg("size").rename("path_length"),
-             all_path_edges["link_polarity"]
+             all_path_edges[weight_var].agg("sum").rename(DISTANCES.PATH_WEIGHTS),
+             all_path_edges.agg("size").rename(DISTANCES.PATH_LENGTH),
+             all_path_edges[NET_POLARITY.LINK_POLARITY]
              .agg(paths._terminal_net_polarity)
-             .rename("net_polarity"),
+             .rename(NET_POLARITY.NET_POLARITY),
              # add the final edge since this can be used to add path attributes to edges
              # i.e., apply net_polarity to an edge
-             all_path_edges["from"].agg("last").rename("final_from"),
-             all_path_edges["to"].agg("last").rename("final_to"),
+             all_path_edges["from"].agg("last").rename(DISTANCES.FINAL_FROM),
+             all_path_edges["to"].agg("last").rename(DISTANCES.FINAL_TO),
          ],
          axis=1,
      ).reset_index()
@@ -1613,7 +1694,11 @@ def _calculate_path_attrs(
          if len(neighborhood_paths[i]) == 0
      ]
      edgeles_nodes_df = pd.DataFrame({"neighbor": edgeless_nodes}).assign(
-         path_length=0, path_weight=0, net_polarity=None
+         **{
+             DISTANCES.PATH_LENGTH: 0,
+             DISTANCES.PATH_WEIGHTS: 0,
+             NET_POLARITY.NET_POLARITY: None,
+         }
      )

      # add edgeless entries as entries in the two outputs
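The aggregation above groups one row per path edge by the path's terminal neighbor, then sums edge weights into a path weight and counts rows into a path length. A self-contained toy of the same groupby/agg shape (made-up data; column names mirror the DISTANCES constants):

    import pandas as pd

    # one row per edge on a shortest path, keyed by the path's terminal neighbor
    path_edges = pd.DataFrame(
        {"neighbor": ["B", "C", "C"], "weights": [1.0, 0.5, 2.0]}
    )
    grouped = path_edges.groupby("neighbor")
    path_attrs = pd.concat(
        [
            grouped["weights"].agg("sum").rename("path_weights"),
            grouped.agg("size").rename("path_length"),
        ],
        axis=1,
    ).reset_index()
    # neighbor  path_weights  path_length
    #        B           1.0            1
    #        C           2.5            2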
@@ -1630,3 +1715,118 @@ def _calculate_path_attrs(
      )

      return path_attributes_df, neighborhood_path_entities
+
+
+ def _find_neighbors_paths(
+     neighborhood_graph: ig.Graph,
+     one_neighborhood_df: pd.DataFrame,
+     sc_id: str,
+     edges: pd.DataFrame,
+ ) -> tuple[pd.DataFrame, dict[Any, set], pd.DataFrame, dict[Any, set]]:
+     """
+     Find shortest paths between the focal node and its neighbors in both directions.
+
+     This function calculates shortest paths from the focal node to its descendants
+     (downstream) and ancestors (upstream) using igraph's shortest path algorithms.
+     It uses _calculate_path_attrs to compute path attributes including path weights,
+     lengths, and polarity information.
+
+     Parameters
+     ----------
+     neighborhood_graph: ig.Graph
+         The igraph Graph object representing the neighborhood network
+     one_neighborhood_df: pd.DataFrame
+         DataFrame containing neighborhood information with 'relationship' column
+         indicating 'descendants' or 'ancestors' for each node
+     sc_id: str
+         The compartmentalized species ID of the focal node
+     edges: pd.DataFrame
+         DataFrame containing edge information with columns for 'from', 'to',
+         weights, and link polarity
+
+     Returns
+     -------
+     downstream_path_attrs: pd.DataFrame
+         DataFrame containing path attributes for downstream paths from focal node
+         to descendants. Includes columns: neighbor, path_weight, path_length,
+         net_polarity, final_from, final_to, node_orientation
+     downstream_entity_dict: dict[Any, set]
+         Dictionary mapping each descendant neighbor to the set of entities
+         (nodes) connecting it to the focal node
+     upstream_path_attrs: pd.DataFrame
+         DataFrame containing path attributes for upstream paths from focal node
+         to ancestors. Includes columns: neighbor, path_weight, path_length,
+         net_polarity, final_from, final_to, node_orientation
+     upstream_entity_dict: dict[Any, set]
+         Dictionary mapping each ancestor neighbor to the set of entities
+         (nodes) connecting it to the focal node
+     """
+
+     one_descendants_df = one_neighborhood_df[
+         one_neighborhood_df["relationship"] == GRAPH_RELATIONSHIPS.DESCENDANTS
+     ]
+     descendants_list = list(
+         set(one_descendants_df[NAPISTU_GRAPH_VERTICES.NAME].tolist()).union({sc_id})
+     )
+
+     # hide warnings which are mostly just Dijkstra complaining about not finding neighbors
+     with warnings.catch_warnings():
+         # igraph throws warnings for each pair of unconnected species
+         warnings.simplefilter("ignore")
+
+         neighborhood_paths = neighborhood_graph.get_shortest_paths(
+             # focal node
+             v=sc_id,
+             to=descendants_list,
+             weights=NAPISTU_GRAPH_EDGES.WEIGHTS,
+             mode="out",
+             output="epath",
+         )
+
+     downstream_path_attrs, downstream_entity_dict = _calculate_path_attrs(
+         neighborhood_paths,
+         edges,
+         vertices=descendants_list,
+         weight_var=NAPISTU_GRAPH_EDGES.WEIGHTS,
+     )
+     downstream_path_attrs = downstream_path_attrs.assign(
+         node_orientation=NEIGHBORHOOD_NETWORK_TYPES.DOWNSTREAM
+     )
+
+     # ancestors -> focal_node
+
+     one_ancestors_df = one_neighborhood_df[
+         one_neighborhood_df["relationship"] == GRAPH_RELATIONSHIPS.ANCESTORS
+     ]
+     ancestors_list = list(
+         set(one_ancestors_df[NAPISTU_GRAPH_VERTICES.NAME].tolist()).union({sc_id})
+     )
+
+     with warnings.catch_warnings():
+         # igraph throws warnings for each pair of unconnected species
+         warnings.simplefilter("ignore")
+
+         neighborhood_paths = neighborhood_graph.get_shortest_paths(
+             v=sc_id,
+             to=ancestors_list,
+             weights=NAPISTU_GRAPH_EDGES.UPSTREAM_WEIGHTS,
+             mode="in",
+             output="epath",
+         )
+
+     upstream_path_attrs, upstream_entity_dict = _calculate_path_attrs(
+         neighborhood_paths,
+         edges,
+         vertices=ancestors_list,
+         weight_var=NAPISTU_GRAPH_EDGES.UPSTREAM_WEIGHTS,
+     )
+     upstream_path_attrs = upstream_path_attrs.assign(
+         node_orientation=NEIGHBORHOOD_NETWORK_TYPES.UPSTREAM
+     )
+
+     return (
+         downstream_path_attrs,
+         downstream_entity_dict,
+         upstream_path_attrs,
+         upstream_entity_dict,
+     )
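The helper's two searches differ only in edge weights and traversal direction: descendants are reached with mode="out" over the standard weights, ancestors with mode="in" over the upstream weights. A toy igraph illustration of the epath output that _calculate_path_attrs consumes (three-node graph, made up):

    import igraph as ig

    g = ig.Graph(directed=True)
    g.add_vertices(["focal", "a", "b"])
    g.add_edges([("focal", "a"), ("a", "b")])
    g.es["weights"] = [1.0, 2.0]

    epaths = g.get_shortest_paths(
        v="focal", to=["a", "b"], weights="weights", mode="out", output="epath"
    )
    print(epaths)  # [[0], [0, 1]] -- edge indices along each path, one list per target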