napistu 0.3.6__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. napistu/__main__.py +28 -13
  2. napistu/consensus.py +19 -25
  3. napistu/constants.py +102 -83
  4. napistu/indices.py +3 -1
  5. napistu/ingestion/napistu_edgelist.py +4 -4
  6. napistu/ingestion/sbml.py +298 -295
  7. napistu/ingestion/string.py +14 -18
  8. napistu/ingestion/trrust.py +22 -27
  9. napistu/matching/interactions.py +41 -39
  10. napistu/matching/species.py +1 -1
  11. napistu/modify/gaps.py +2 -1
  12. napistu/network/constants.py +61 -45
  13. napistu/network/data_handling.py +1 -1
  14. napistu/network/neighborhoods.py +3 -3
  15. napistu/network/net_create.py +440 -616
  16. napistu/network/net_create_utils.py +734 -0
  17. napistu/network/net_propagation.py +1 -1
  18. napistu/network/{napistu_graph_core.py → ng_core.py} +57 -15
  19. napistu/network/ng_utils.py +28 -21
  20. napistu/network/paths.py +4 -4
  21. napistu/network/precompute.py +35 -74
  22. napistu/ontologies/genodexito.py +5 -1
  23. napistu/ontologies/renaming.py +4 -0
  24. napistu/sbml_dfs_core.py +127 -64
  25. napistu/sbml_dfs_utils.py +50 -0
  26. napistu/utils.py +132 -46
  27. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/METADATA +2 -2
  28. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/RECORD +47 -44
  29. tests/conftest.py +171 -13
  30. tests/test_consensus.py +74 -5
  31. tests/test_gaps.py +26 -15
  32. tests/test_network_data_handling.py +5 -2
  33. tests/test_network_net_create.py +93 -202
  34. tests/test_network_net_create_utils.py +538 -0
  35. tests/test_network_ng_core.py +19 -0
  36. tests/test_network_ng_utils.py +1 -1
  37. tests/test_network_precompute.py +5 -4
  38. tests/test_ontologies_renaming.py +28 -24
  39. tests/test_rpy2_callr.py +0 -1
  40. tests/test_rpy2_init.py +0 -1
  41. tests/test_sbml_dfs_core.py +165 -15
  42. tests/test_sbml_dfs_utils.py +45 -0
  43. tests/test_utils.py +45 -2
  44. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/WHEEL +0 -0
  45. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/entry_points.txt +0 -0
  46. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/licenses/LICENSE +0 -0
  47. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,734 @@
1
+ import logging
2
+
3
+ import pandas as pd
4
+
5
+ from napistu import utils
6
+ from napistu.constants import (
7
+ MINI_SBO_FROM_NAME,
8
+ MINI_SBO_TO_NAME,
9
+ SBML_DFS,
10
+ SBOTERM_NAMES,
11
+ SBML_DFS_SCHEMA,
12
+ SCHEMA_DEFS,
13
+ )
14
+ from napistu.network.constants import (
15
+ NAPISTU_GRAPH_EDGES,
16
+ NAPISTU_GRAPH_NODE_TYPES,
17
+ DROP_REACTIONS_WHEN,
18
+ VALID_DROP_REACTIONS_WHEN,
19
+ GRAPH_WIRING_HIERARCHIES,
20
+ VALID_GRAPH_WIRING_APPROACHES,
21
+ )
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
def wire_reaction_species(
    reaction_species: pd.DataFrame, wiring_approach: str, drop_reactions_when: str
) -> pd.DataFrame:
    """
    Convert reaction species data into network edges using specified wiring approach.

    Reactions made up of exactly two interactors are converted to edges en-masse;
    every other reaction is processed one at a time with the tiered algorithm
    defined by the wiring approach's hierarchy.

    Parameters
    ----------
    reaction_species : pd.DataFrame
        DataFrame containing reaction species data with columns:
        - r_id : str
            Reaction identifier
        - sc_id : str
            Compartmentalized species identifier
        - stoichiometry : float
            Stoichiometric coefficient (negative for reactants, positive for products, 0 for modifiers)
        - sbo_term : str
            Systems Biology Ontology term defining the role of the species in the reaction
            (e.g., 'SBO:0000010' for reactant, 'SBO:0000011' for product, 'SBO:0000336' for interactor)
    wiring_approach : str
        The wiring approach to use for creating the network. Must be one of:
        - 'bipartite' : Creates bipartite network with molecules connected to reactions
        - 'regulatory' : Creates regulatory hierarchy (modifiers -> catalysts -> reactants -> reactions -> products)
        - 'surrogate' : Alternative layout with enzymes downstream of substrates
    drop_reactions_when : str
        Condition under which to drop reactions as network vertices. Must be one of
        the values in `VALID_DROP_REACTIONS_WHEN`:
        - 'always' : Always drop reaction vertices
        - 'edgelist' : Drop if there are exactly 2 participants
        - 'same_tier' : Drop if all participants sit on the same tier of the hierarchy

    Returns
    -------
    pd.DataFrame
        DataFrame containing network edges with columns:
        - from : str
            Source node identifier (species or reaction ID)
        - to : str
            Target node identifier (species or reaction ID)
        - stoichiometry : float
            Stoichiometric coefficient for the edge
        - sbo_term : str
            SBO term defining the relationship type
        - r_id : str
            Reaction identifier associated with the edge

        The interactor edges and the tiered edges are concatenated without
        resetting the index, so index labels are not guaranteed to be unique.

    Raises
    ------
    ValueError
        If `wiring_approach` is not a valid value.
        If `drop_reactions_when` is not a valid value.
        If reaction species have unusable SBO terms.

    Notes
    -----
    The function processes reaction species in two phases:

    1. **Interactor Processing**: Reactions consisting of exactly two interactors
       are processed en-masse and converted to wide format edges.

    2. **Tiered Processing**: All remaining reactions are processed using tiered
       algorithms based on the wiring approach hierarchy. This creates edges
       between entities at different tiers in the hierarchy.

    Reactions with <=1 species are dropped (with a warning) by
    `format_tiered_reaction_species`.

    Examples
    --------
    >>> from napistu.network import net_create_utils
    >>> from napistu.constants import SBML_DFS, MINI_SBO_FROM_NAME, SBOTERM_NAMES
    >>> import pandas as pd
    >>>
    >>> # Create sample reaction species data
    >>> reaction_species = pd.DataFrame({
    ...     SBML_DFS.R_ID: ['R1', 'R1', 'R2', 'R2'],
    ...     SBML_DFS.SC_ID: ['A', 'B', 'C', 'D'],
    ...     SBML_DFS.STOICHIOMETRY: [-1, 1, 0, 0],
    ...     SBML_DFS.SBO_TERM: [
    ...         MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT],
    ...         MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT],
    ...         MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR],
    ...         MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR]
    ...     ]
    ... })
    >>>
    >>> # Wire the reaction species using regulatory approach
    >>> edges = net_create_utils.wire_reaction_species(
    ...     reaction_species,
    ...     wiring_approach='regulatory',
    ...     drop_reactions_when='same_tier'
    ... )

    See Also
    --------
    format_tiered_reaction_species : Process individual reactions with tiered algorithms
    create_graph_hierarchy_df : Create hierarchy DataFrame for wiring approach
    """

    # check whether all SBO terms are ones which the wiring hierarchies understand
    invalid_sbo_terms = reaction_species[
        ~reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
    ]

    if invalid_sbo_terms.shape[0] != 0:
        invalid_counts = invalid_sbo_terms.value_counts(SBML_DFS.SBO_TERM).to_frame("N")
        logger.warning(utils.style_df(invalid_counts, headers="keys"))  # type: ignore
        raise ValueError("Some reaction species have unusable SBO terms")

    # load and validate the schema of wiring_approach
    graph_hierarchy_df = create_graph_hierarchy_df(wiring_approach)

    # fail fast on a bad drop_reactions_when: downstream it is only inspected for
    # cross-tier reactions, so without this check an invalid value would be
    # silently accepted whenever the input holds only interactor pairs or
    # same-tier reactions
    if drop_reactions_when not in VALID_DROP_REACTIONS_WHEN:
        raise ValueError(
            f"Invalid drop_reactions: {drop_reactions_when}; valid values are {VALID_DROP_REACTIONS_WHEN}"
        )

    # handle interactors since they can easily be processed en-masse
    interactor_pairs = _find_sbo_duos(
        reaction_species, MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR]
    )

    if len(interactor_pairs) > 0:
        logger.info(f"Processing {len(interactor_pairs)} interaction pairs")
        interactor_duos = reaction_species.loc[
            reaction_species[SBML_DFS.R_ID].isin(interactor_pairs)
        ]

        interactor_edges = _interactor_duos_to_wide(interactor_duos)
    else:
        interactor_edges = pd.DataFrame()

    non_interactors_rspecies = reaction_species.loc[
        ~reaction_species[SBML_DFS.R_ID].isin(interactor_pairs)
    ]

    if non_interactors_rspecies.shape[0] > 0:

        logger.info(
            f"Processing {non_interactors_rspecies.shape[0]} reaction species using the {wiring_approach} hierarchy"
        )

        # filter to just the fields which will be processed with the tiered algorithm
        rspecies_fields = SBML_DFS_SCHEMA.SCHEMA[SBML_DFS.REACTION_SPECIES][
            SCHEMA_DEFS.VARS
        ]
        reaction_groups = non_interactors_rspecies[rspecies_fields].groupby(
            SBML_DFS.R_ID
        )

        all_tiered_edges = [
            format_tiered_reaction_species(
                # the per-reaction formatter expects participants indexed by SBO term
                rxn_group.drop(columns=[SBML_DFS.R_ID])
                .set_index(SBML_DFS.SBO_TERM)
                .sort_index(),
                r_id,
                graph_hierarchy_df,
                drop_reactions_when,
            )
            for r_id, rxn_group in reaction_groups
        ]

        all_tiered_edges_df = pd.concat(all_tiered_edges).reset_index(drop=True)
    else:
        all_tiered_edges_df = pd.DataFrame()

    return pd.concat([interactor_edges, all_tiered_edges_df])
194
+
195
+
196
def format_tiered_reaction_species(
    rxn_species: pd.DataFrame,
    r_id: str,
    graph_hierarchy_df: pd.DataFrame,
    drop_reactions_when: str = DROP_REACTIONS_WHEN.SAME_TIER,
) -> pd.DataFrame:
    """
    Build the network edges for a single reaction from its participants.

    Parameters
    ----------
    rxn_species : pd.DataFrame
        The reaction's participants. Must be indexed by `sbo_term` and have the
        columns checked by `_validate_sbo_indexed_rsc_stoi`.
    r_id : str
        The ID of the reaction.
    graph_hierarchy_df : pd.DataFrame
        The graph hierarchy (see `create_graph_hierarchy_df`).
    drop_reactions_when : str, optional
        The condition under which to drop reactions as a network vertex. Default is 'same_tier'.
        Only consulted when participants span more than one tier.

    Returns
    -------
    pd.DataFrame
        The edges of the Napistu graph for a single reaction. An empty DataFrame
        is returned when the reaction has one or fewer participants.
    """

    _validate_sbo_indexed_rsc_stoi(rxn_species)

    n_species = rxn_species.shape[0]
    if n_species <= 1:
        logger.warning(
            f"Reaction {r_id} has {n_species} species. "
            "This reaction will be dropped."
        )
        return pd.DataFrame()

    # place each participant (plus the reaction itself) on its tier of the hierarchy;
    # higher tiers point to lower tiers
    tiered_entities = _reaction_species_to_tiers(rxn_species, graph_hierarchy_df, r_id)
    distinct_tiers = tiered_entities.index.get_level_values("tier").unique()

    # two tiers means one tier of participants plus the reaction's own tier, i.e.,
    # all participants sit on the same tier of the wiring hierarchy
    if len(distinct_tiers) == 2:
        return _format_same_tier_edges(rxn_species, r_id)

    return _format_cross_tier_edges(tiered_entities, r_id, drop_reactions_when)
247
+
248
+
249
def create_graph_hierarchy_df(wiring_approach: str) -> pd.DataFrame:
    """
    Tabulate the tiers of a wiring approach's hierarchy.

    Parameters
    ----------
    wiring_approach : str
        The type of tiered graph to work with. Looked up in `GRAPH_WIRING_HIERARCHIES`.

    Returns
    -------
    pd.DataFrame
        One row per hierarchy entry with columns sbo_name, tier, and sbo_term.
        The reaction entry gets an empty string as its sbo_term.

    Raises
    ------
    ValueError
        If wiring_approach is not valid.
    """

    if wiring_approach not in VALID_GRAPH_WIRING_APPROACHES:
        raise ValueError(
            f"{wiring_approach} is not a valid wiring approach. Valid approaches are {', '.join(VALID_GRAPH_WIRING_APPROACHES)}"
        )

    tiers = GRAPH_WIRING_HIERARCHIES[wiring_approach]

    # one small table per tier, tagged with the tier's position in the hierarchy
    tier_tables = []
    for tier_position in range(len(tiers)):
        tier_table = pd.DataFrame({"sbo_name": tiers[tier_position]})
        tier_tables.append(tier_table.assign(tier=tier_position))

    graph_hierarchy_df = pd.concat(tier_tables).reset_index(drop=True)

    def _sbo_name_to_term(sbo_name):
        # the reaction vertex is not an SBO role so it has no term
        if sbo_name == NAPISTU_GRAPH_NODE_TYPES.REACTION:
            return ""
        return MINI_SBO_FROM_NAME[sbo_name]

    graph_hierarchy_df[SBML_DFS.SBO_TERM] = graph_hierarchy_df["sbo_name"].apply(
        _sbo_name_to_term
    )

    # ensure that the output is expected
    required_vars = {NAPISTU_GRAPH_EDGES.SBO_NAME, "tier", SBML_DFS.SBO_TERM}
    utils.match_pd_vars(
        graph_hierarchy_df,
        req_vars=required_vars,
        allow_series=False,
    ).assert_present()

    return graph_hierarchy_df
297
+
298
+
299
def _should_drop_reaction(
    entities_ordered_by_tier: pd.DataFrame,
    drop_reactions_when: str = DROP_REACTIONS_WHEN.SAME_TIER,
):
    """
    Decide whether a reaction's own vertex should be left out of the graph.

    Parameters
    ----------
    entities_ordered_by_tier : pd.DataFrame
        The reaction's entities (participants plus the reaction itself), indexed by tier.
    drop_reactions_when : str, optional
        The desired stringency for dropping reactions. Default is 'same_tier'.

    Returns
    -------
    bool
        True if the reaction should be dropped, False otherwise.

    Raises
    ------
    ValueError
        If drop_reactions_when is not a valid value.

    Notes
    -----
    - ALWAYS: the reaction is dropped unconditionally.
    - EDGELIST: the reaction is dropped when there are exactly three rows
      (two participants plus the reaction).
    - SAME_TIER: the reaction is dropped when all non-reaction rows share a single tier.
    """

    if drop_reactions_when == DROP_REACTIONS_WHEN.ALWAYS:
        return True

    if drop_reactions_when == DROP_REACTIONS_WHEN.EDGELIST:
        # 2 members + 1 for reaction
        return entities_ordered_by_tier.shape[0] == 3

    if drop_reactions_when == DROP_REACTIONS_WHEN.SAME_TIER:
        participants = entities_ordered_by_tier.query("sbo_name != 'reaction'")
        participant_tiers = participants.index.unique().tolist()
        return len(participant_tiers) == 1

    raise ValueError(
        f"Invalid drop_reactions: {drop_reactions_when}; valid values are {VALID_DROP_REACTIONS_WHEN}"
    )
354
+
355
+
356
def _format_same_tier_edges(rxn_species: pd.DataFrame, r_id: str) -> pd.DataFrame:
    """
    Format edges for reactions where all participants are on the same tier of a wiring hierarchy.

    Parameters
    ----------
    rxn_species : pd.DataFrame
        DataFrame of reaction species for the reaction.
    r_id : str
        Reaction ID.

    Returns
    -------
    pd.DataFrame
        One edge per unordered pair of participants. If participants differ in
        SBO term or stoichiometry, a warning is logged and an empty DataFrame
        is returned instead.
    """

    # number the participants so each unordered pair can be picked exactly once
    numbered_species = rxn_species.reset_index().assign(
        entry=range(0, rxn_species.shape[0])
    )

    # if they have the same SBO_term and stoichiometry, then the
    # reaction can be trivially treated as reversible
    metadata_fields = [SBML_DFS.SBO_TERM, SBML_DFS.STOICHIOMETRY]
    distinct_metadata = numbered_species[metadata_fields].drop_duplicates()
    if distinct_metadata.shape[0] > 1:
        _log_pathological_same_tier(distinct_metadata, r_id)
        return pd.DataFrame()

    all_pairs = numbered_species.merge(
        numbered_species, how="cross", suffixes=("_left", "_right")
    )
    # keep one orientation of each pair and discard self-pairs
    unique_pairs = all_pairs[all_pairs["entry_left"] < all_pairs["entry_right"]]

    column_renames = {
        "sc_id_left": NAPISTU_GRAPH_EDGES.FROM,
        "sc_id_right": NAPISTU_GRAPH_EDGES.TO,
        "stoichiometry_right": NAPISTU_GRAPH_EDGES.STOICHIOMETRY,
        "sbo_term_left": NAPISTU_GRAPH_EDGES.SBO_TERM,
    }
    same_tier_edges = unique_pairs.rename(columns=column_renames).assign(r_id=r_id)

    out_attrs = [
        NAPISTU_GRAPH_EDGES.FROM,
        NAPISTU_GRAPH_EDGES.TO,
        NAPISTU_GRAPH_EDGES.STOICHIOMETRY,
        NAPISTU_GRAPH_EDGES.SBO_TERM,
        SBML_DFS.R_ID,
    ]

    return same_tier_edges[out_attrs]
415
+
416
+
417
def _log_pathological_same_tier(distinct_metadata: pd.DataFrame, r_id: str) -> None:
    """
    Log a warning for a same-tier reaction whose members have differing metadata.

    Parameters
    ----------
    distinct_metadata : pd.DataFrame
        The distinct combinations of "sbo_term" and "stoichiometry" among the
        reaction's members.
    r_id : str
        Reaction ID.

    Returns
    -------
    None
    """
    header = f"Ignoring reaction {r_id}; its members have distinct annotations but they exist on the same level of a wiring hierarchy so their relationships cannot be determined."

    # only report the attribute(s) which actually differ between members
    details = []
    sbo_terms = distinct_metadata["sbo_term"].map(MINI_SBO_TO_NAME).unique().tolist()
    if len(sbo_terms) > 1:
        details.append(f"SBO terms: {sbo_terms}")
    stoichiometries = distinct_metadata["stoichiometry"].unique().tolist()
    if len(stoichiometries) > 1:
        details.append(f"Stoichiometries: {stoichiometries}")

    # separate the header sentence from the details; previously they were
    # concatenated with no whitespace in between
    message = header
    if details:
        message = f"{header} {'; '.join(details)}"
    logger.warning(message)
433
+
434
+
435
def _format_cross_tier_edges(
    entities_ordered_by_tier: pd.DataFrame,
    r_id: str,
    drop_reactions_when: str = DROP_REACTIONS_WHEN.SAME_TIER,
) -> pd.DataFrame:
    """
    Format edges for reactions where participants are on different tiers of a wiring hierarchy.

    Consecutive tiers are connected all-vs-all. When the reaction vertex is
    dropped, the tier upstream of the reaction is connected directly to the tier
    downstream of it.

    Parameters
    ----------
    entities_ordered_by_tier : pd.DataFrame
        DataFrame of entities ordered by tier (indexed by "tier") which includes
        a row for the reaction itself.
    r_id : str
        Reaction ID.
    drop_reactions_when : str, optional
        The condition under which to drop reactions as a network vertex. Default is 'same_tier'.

    Returns
    -------
    pd.DataFrame
        DataFrame of formatted edges for cross-tier reactions with columns
        from, to, stoichiometry, sbo_term and r_id. An empty DataFrame is
        returned if no edges can be formed (e.g., the reaction vertex is dropped
        and only one other tier remains).

    Raises
    ------
    ValueError
        If drop_reactions_when is not a valid value (raised by `_should_drop_reaction`).
    """

    ordered_tiers = entities_ordered_by_tier.index.get_level_values("tier").unique()
    reaction_tier = entities_ordered_by_tier.query(
        "sbo_name == 'reaction'"
    ).index.tolist()[0]
    drop_reaction = _should_drop_reaction(entities_ordered_by_tier, drop_reactions_when)

    n_tiers = len(ordered_tiers)
    rxn_edges = []
    past_reaction = False
    for i in range(n_tiers - 1):

        if ordered_tiers[i] == reaction_tier:
            # edges leaving (or skipping over) the reaction take their attributes
            # from the downstream tier. This also covers reactions where the
            # reaction is the first tier, which was previously missed.
            past_reaction = True
            if drop_reaction:
                continue

        next_is_reaction = ordered_tiers[i + 1] == reaction_tier
        next_tier = ordered_tiers[i + 1]
        if next_is_reaction and drop_reaction:
            # hop over the reaction tier
            if i + 2 >= n_tiers:
                # the dropped reaction is the last tier so there is nothing to hop to
                continue
            next_tier = ordered_tiers[i + 2]

        formatted_tier_combo = _format_tier_combo(
            entities_ordered_by_tier.loc[[ordered_tiers[i]]],
            entities_ordered_by_tier.loc[[next_tier]],
            past_reaction,
        )

        if next_is_reaction:
            past_reaction = True

        rxn_edges.append(formatted_tier_combo)

    if not rxn_edges:
        # pd.concat() raises on an empty list
        return pd.DataFrame()

    rxn_edges_df = (
        pd.concat(rxn_edges)[
            [
                NAPISTU_GRAPH_EDGES.FROM,
                NAPISTU_GRAPH_EDGES.TO,
                NAPISTU_GRAPH_EDGES.STOICHIOMETRY,
                NAPISTU_GRAPH_EDGES.SBO_TERM,
            ]
        ]
        .reset_index(drop=True)
        .assign(r_id=r_id)
    )

    return rxn_edges_df
503
+
504
+
505
def _validate_sbo_indexed_rsc_stoi(rxn_species: pd.DataFrame) -> None:
    """
    Check that rxn_species has the layout expected by the tiered wiring functions.

    Parameters
    ----------
    rxn_species : pd.DataFrame
        DataFrame of reaction species, indexed by SBO_TERM with exactly the
        columns SC_ID and STOICHIOMETRY (in that order).

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If rxn_species is not a pandas DataFrame.
    ValueError
        If index or columns are not as expected.
    """

    if not isinstance(rxn_species, pd.DataFrame):
        raise TypeError("rxn_species must be a pandas DataFrame")

    expected_index_names = [SBML_DFS.SBO_TERM]
    observed_index_names = list(rxn_species.index.names)
    if observed_index_names != expected_index_names:
        raise ValueError("rxn_species index names must be [SBML_DFS.SBO_TERM]")

    # column order matters: the comparison is list-vs-list
    expected_columns = [SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY]
    observed_columns = rxn_species.columns.tolist()
    if observed_columns != expected_columns:
        raise ValueError(
            "rxn_species columns must be [SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY]"
        )
536
+
537
+
538
def _reaction_species_to_tiers(
    rxn_species: pd.DataFrame, graph_hierarchy_df: pd.DataFrame, r_id: str
) -> pd.DataFrame:
    """
    Place a reaction's participants, and the reaction itself, on hierarchy tiers.

    Parameters
    ----------
    rxn_species : pd.DataFrame
        DataFrame of reaction species.
    graph_hierarchy_df : pd.DataFrame
        DataFrame defining the graph hierarchy.
    r_id : str
        Reaction ID.

    Returns
    -------
    pd.DataFrame
        Participants and the reaction as rows with an "entity_id" column,
        sorted by and indexed on "tier".
    """

    # participants pick up their tier by joining on the columns shared with the hierarchy
    species_on_tiers = (
        rxn_species.reset_index()
        .rename(columns={SBML_DFS.SC_ID: "entity_id"})
        .merge(graph_hierarchy_df)
    )

    # the reaction is represented by the hierarchy's own "reaction" row
    is_reaction_row = (
        graph_hierarchy_df[NAPISTU_GRAPH_EDGES.SBO_NAME]
        == NAPISTU_GRAPH_NODE_TYPES.REACTION
    )
    reaction_on_tier = graph_hierarchy_df[is_reaction_row].assign(
        entity_id=r_id, r_id=r_id
    )

    combined = pd.concat([species_on_tiers, reaction_on_tier])
    return combined.sort_values(["tier"]).set_index("tier")
577
+
578
+
579
def _format_tier_combo(
    upstream_tier: pd.DataFrame, downstream_tier: pd.DataFrame, past_reaction: bool
) -> pd.DataFrame:
    """
    Create all edges between two tiers of a tiered reaction graph.

    Every entity in the upstream tier is paired with every entity in the
    downstream tier. The edge attributes (stoichiometry and sbo_term) are taken
    from exactly one of the two tiers, chosen by `past_reaction`.

    Parameters
    ----------
    upstream_tier : pd.DataFrame
        DataFrame containing upstream entities in a reaction (e.g., regulators or substrates).
    downstream_tier : pd.DataFrame
        DataFrame containing downstream entities in a reaction (e.g., products or targets).
    past_reaction : bool
        If True, attributes (stoichiometry, sbo_term) are taken from downstream_tier;
        if False, from upstream_tier.

    Returns
    -------
    pd.DataFrame
        DataFrame of edges with columns 'from', 'to', 'stoichiometry' and 'sbo_term',
        plus the '_joiner' helper column used for the all-vs-all join.
        The number of edges is the product of the number of entities in the upstream tier
        and the number in the downstream tier.

    Notes
    -----
    - This function is used to build the edge list for tiered graphs, where each tier represents
      a functional group (e.g., substrates, products, modifiers, reaction).
    - Callers set `past_reaction` so that attributes come from the tier furthest from the
      reaction tier; the reaction row itself carries no meaningful stoichiometry or sbo_term.
    """

    fields_with_attributes = ["entity_id", SBML_DFS.STOICHIOMETRY, SBML_DFS.SBO_TERM]
    fields_id_only = ["entity_id"]

    if past_reaction:
        from_fields, to_fields = fields_id_only, fields_with_attributes
    else:
        from_fields, to_fields = fields_with_attributes, fields_id_only

    # a constant key on both sides turns the merge into an all-vs-all join
    sources = (
        upstream_tier[from_fields]
        .rename(columns={"entity_id": NAPISTU_GRAPH_EDGES.FROM})
        .assign(_joiner=1)
    )
    targets = (
        downstream_tier[to_fields]
        .rename(columns={"entity_id": NAPISTU_GRAPH_EDGES.TO})
        .assign(_joiner=1)
    )

    return sources.merge(targets, left_on="_joiner", right_on="_joiner")
640
+
641
+
642
def _find_sbo_duos(
    reaction_species: pd.DataFrame,
    target_sbo_term: str = MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR],
) -> list[str]:
    """
    Find reactions made up of exactly two participants which both carry the target sbo_term.

    Parameters
    ----------
    reaction_species : pd.DataFrame
        DataFrame with columns: sbo_term, sc_id, stoichiometry, r_id
    target_sbo_term : str
        The sbo_term to match (e.g., "SBO:0000336" aka "interactor")

    Returns
    -------
    list
        List of r_ids that meet the criteria
    """
    # a reaction qualifies only if every row matches the target AND it has exactly 2 rows
    return [
        r_id
        for r_id, members in reaction_species.groupby(SBML_DFS.R_ID)
        if (members[SBML_DFS.SBO_TERM] == target_sbo_term).all()
        and len(members) == 2
    ]
671
+
672
+
673
def _interactor_duos_to_wide(interactor_duos: pd.DataFrame):
    """
    Convert paired long format to wide format with 'from' and 'to' columns.

    Parameters
    ----------
    interactor_duos : pd.DataFrame
        DataFrame with exactly 2 rows per r_id, containing r_id, sc_id, sbo_term
        and stoichiometry

    Returns
    -------
    pd.DataFrame
        One row per r_id with columns r_id, from, to, sbo_term and stoichiometry.
        Within a pair, 'from' is the sc_id which sorts first. sbo_term is set to
        the interactor term and stoichiometry to 0 for every row.
    """

    _validate_interactor_duos(interactor_duos)

    # sort by sc_id within each reaction to ensure consistent from/to ordering
    ordered_duos = interactor_duos.sort_values([SBML_DFS.R_ID, SBML_DFS.SC_ID])

    # number the rows within each reaction (0, 1)
    ordered_duos = ordered_duos.assign(
        pair_order=ordered_duos.groupby(SBML_DFS.R_ID).cumcount()
    )

    # one row per reaction, one column per member of the pair
    paired = ordered_duos.pivot(
        index=SBML_DFS.R_ID, columns="pair_order", values=SBML_DFS.SC_ID
    )
    paired.columns = ["from", "to"]

    # make r_id a regular column and attach the constant edge attributes
    interactor_term = MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR]
    return paired.reset_index().assign(sbo_term=interactor_term, stoichiometry=0)
707
+
708
+
709
def _validate_interactor_duos(interactor_duos: pd.DataFrame):
    """
    Check the columns of interactor pairs and warn about non-zero stoichiometry.

    Parameters
    ----------
    interactor_duos : pd.DataFrame
        Reaction species belonging to interactor pairs. Must contain the
        r_id, sc_id, sbo_term and stoichiometry columns.

    Returns
    -------
    None
        A warning is logged if any row has a non-zero stoichiometry; nothing
        is modified. Missing columns are reported by `assert_present()`.
    """

    utils.match_pd_vars(
        interactor_duos,
        req_vars={
            SBML_DFS.R_ID,
            SBML_DFS.SC_ID,
            SBML_DFS.SBO_TERM,
            SBML_DFS.STOICHIOMETRY,
        },
    ).assert_present()

    non_zero_stoi = interactor_duos[interactor_duos[SBML_DFS.STOICHIOMETRY] != 0]

    if not non_zero_stoi.empty:
        affected_r_ids = non_zero_stoi[SBML_DFS.R_ID].unique()
        n_reactions = len(affected_r_ids)
        sample_r_ids = affected_r_ids[:5].tolist()

        # adjacent literals are concatenated verbatim, so each fragment carries
        # its own trailing space
        logger.warning(
            f"Found {n_reactions} reactions constructed from pairs of interactors with non-zero "
            "stoichiometry. These should likely be assigned to another SBO term so their relationship "
            "can be properly represented.\n"
            f"Affected r_ids (showing up to 5): {sample_r_ids}"
        )