napistu 0.3.7__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +8 -4
- napistu/constants.py +30 -35
- napistu/gcs/constants.py +11 -11
- napistu/ingestion/napistu_edgelist.py +4 -4
- napistu/matching/interactions.py +41 -39
- napistu/modify/gaps.py +2 -1
- napistu/network/constants.py +61 -45
- napistu/network/data_handling.py +1 -1
- napistu/network/neighborhoods.py +3 -3
- napistu/network/net_create.py +440 -616
- napistu/network/net_create_utils.py +734 -0
- napistu/network/net_propagation.py +1 -1
- napistu/network/{napistu_graph_core.py → ng_core.py} +57 -15
- napistu/network/ng_utils.py +28 -21
- napistu/network/paths.py +4 -4
- napistu/network/precompute.py +35 -74
- napistu/ontologies/id_tables.py +282 -0
- napistu/sbml_dfs_core.py +53 -63
- napistu/sbml_dfs_utils.py +126 -16
- napistu/utils.py +80 -5
- {napistu-0.3.7.dist-info → napistu-0.4.1.dist-info}/METADATA +7 -2
- {napistu-0.3.7.dist-info → napistu-0.4.1.dist-info}/RECORD +39 -34
- tests/conftest.py +102 -1
- tests/test_network_data_handling.py +5 -2
- tests/test_network_net_create.py +92 -201
- tests/test_network_net_create_utils.py +538 -0
- tests/test_network_ng_core.py +19 -0
- tests/test_network_ng_utils.py +1 -1
- tests/test_network_precompute.py +4 -3
- tests/test_ontologies_id_tables.py +198 -0
- tests/test_rpy2_callr.py +0 -1
- tests/test_rpy2_init.py +0 -1
- tests/test_sbml_dfs_core.py +30 -19
- tests/test_sbml_dfs_utils.py +115 -0
- tests/test_utils.py +26 -2
- {napistu-0.3.7.dist-info → napistu-0.4.1.dist-info}/WHEEL +0 -0
- {napistu-0.3.7.dist-info → napistu-0.4.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.7.dist-info → napistu-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.7.dist-info → napistu-0.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,734 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
import pandas as pd
|
4
|
+
|
5
|
+
from napistu import utils
|
6
|
+
from napistu.constants import (
|
7
|
+
MINI_SBO_FROM_NAME,
|
8
|
+
MINI_SBO_TO_NAME,
|
9
|
+
SBML_DFS,
|
10
|
+
SBOTERM_NAMES,
|
11
|
+
SBML_DFS_SCHEMA,
|
12
|
+
SCHEMA_DEFS,
|
13
|
+
)
|
14
|
+
from napistu.network.constants import (
|
15
|
+
NAPISTU_GRAPH_EDGES,
|
16
|
+
NAPISTU_GRAPH_NODE_TYPES,
|
17
|
+
DROP_REACTIONS_WHEN,
|
18
|
+
VALID_DROP_REACTIONS_WHEN,
|
19
|
+
GRAPH_WIRING_HIERARCHIES,
|
20
|
+
VALID_GRAPH_WIRING_APPROACHES,
|
21
|
+
)
|
22
|
+
|
23
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
24
|
+
|
25
|
+
|
26
|
+
def wire_reaction_species(
    reaction_species: pd.DataFrame, wiring_approach: str, drop_reactions_when: str
) -> pd.DataFrame:
    """
    Convert reaction species data into network edges using specified wiring approach.

    This function processes reaction species data to create network edges that represent
    the relationships between molecular entities in a biological network. It handles
    both interactor pairs (processed en-masse) and other reaction species (processed
    using tiered algorithms based on the wiring approach).

    Parameters
    ----------
    reaction_species : pd.DataFrame
        DataFrame containing reaction species data with columns:
        - r_id : str
            Reaction identifier
        - sc_id : str
            Compartmentalized species identifier
        - stoichiometry : float
            Stoichiometric coefficient (negative for reactants, positive for products, 0 for modifiers)
        - sbo_term : str
            Systems Biology Ontology term defining the role of the species in the reaction
            (e.g., 'SBO:0000010' for reactant, 'SBO:0000011' for product, 'SBO:0000336' for interactor)
    wiring_approach : str
        The wiring approach to use for creating the network. Must be one of:
        - 'bipartite' : Creates bipartite network with molecules connected to reactions
        - 'regulatory' : Creates regulatory hierarchy (modifiers -> catalysts -> reactants -> reactions -> products)
        - 'surrogate' : Alternative layout with enzymes downstream of substrates
    drop_reactions_when : str
        Condition under which to drop reactions as network vertices. Must be one of:
        - 'always' : Always drop reaction vertices
        - 'edgelist' : Drop if there are exactly 2 participants
        - 'same_tier' : Drop only when all participants sit on the same tier of the
          wiring hierarchy (e.g., a pair of interactors)

    Returns
    -------
    pd.DataFrame
        DataFrame containing network edges with columns:
        - from : str
            Source node identifier (species or reaction ID)
        - to : str
            Target node identifier (species or reaction ID)
        - stoichiometry : float
            Stoichiometric coefficient for the edge
        - sbo_term : str
            SBO term defining the relationship type
        - r_id : str
            Reaction identifier associated with the edge

    Notes
    -----
    The function processes reaction species in two phases:

    1. **Interactor Processing**: Pairs of interactors (SBO:0000336) are processed
       en-masse and converted to wide format edges.

    2. **Tiered Processing**: Non-interactor species are processed using tiered
       algorithms based on the wiring approach hierarchy. This creates edges
       between entities at different tiers in the hierarchy.

    Reactions with ≤1 species are automatically dropped as they represent
    underspecified reactions (e.g., autoregulation or reactions with removed cofactors).

    Examples
    --------
    >>> from napistu.network import net_create_utils
    >>> from napistu.constants import SBML_DFS, MINI_SBO_FROM_NAME, SBOTERM_NAMES
    >>> import pandas as pd
    >>>
    >>> # Create sample reaction species data
    >>> reaction_species = pd.DataFrame({
    ...     SBML_DFS.R_ID: ['R1', 'R1', 'R2', 'R2'],
    ...     SBML_DFS.SC_ID: ['A', 'B', 'C', 'D'],
    ...     SBML_DFS.STOICHIOMETRY: [-1, 1, 0, 0],
    ...     SBML_DFS.SBO_TERM: [
    ...         MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT],
    ...         MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT],
    ...         MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR],
    ...         MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR]
    ...     ]
    ... })
    >>>
    >>> # Wire the reaction species using regulatory approach
    >>> edges = net_create_utils.wire_reaction_species(
    ...     reaction_species,
    ...     wiring_approach='regulatory',
    ...     drop_reactions_when='same_tier'
    ... )

    Raises
    ------
    ValueError
        If `wiring_approach` is not a valid value.
        If `drop_reactions_when` is not a valid value.
        If reaction species have unusable SBO terms.

    See Also
    --------
    format_tiered_reaction_species : Process individual reactions with tiered algorithms
    create_graph_hierarchy_df : Create hierarchy DataFrame for wiring approach
    """

    # check whether all SBO terms are among the expected (mini SBO) terms
    invalid_sbo_terms = reaction_species[
        ~reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
    ]

    if invalid_sbo_terms.shape[0] != 0:
        # summarize the offending terms in the log before failing
        invalid_counts = invalid_sbo_terms.value_counts(SBML_DFS.SBO_TERM).to_frame("N")
        logger.warning(utils.style_df(invalid_counts, headers="keys"))  # type: ignore
        raise ValueError("Some reaction species have unusable SBO terms")

    # load and validate the schema of wiring_approach
    graph_hierarchy_df = create_graph_hierarchy_df(wiring_approach)

    # validate drop_reactions_when eagerly: downstream it is only checked when a
    # cross-tier reaction is formatted, so a bad value could otherwise pass
    # silently for inputs made up of interactor pairs or same-tier reactions
    if drop_reactions_when not in VALID_DROP_REACTIONS_WHEN:
        raise ValueError(
            f"Invalid drop_reactions: {drop_reactions_when}; valid values are {VALID_DROP_REACTIONS_WHEN}"
        )

    # handle interactors since they can easily be processed en-masse
    interactor_pairs = _find_sbo_duos(
        reaction_species, MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR]
    )

    if len(interactor_pairs) > 0:
        logger.info(f"Processing {len(interactor_pairs)} interaction pairs")
        interactor_duos = reaction_species.loc[
            reaction_species[SBML_DFS.R_ID].isin(interactor_pairs)
        ]

        interactor_edges = _interactor_duos_to_wide(interactor_duos)
    else:
        interactor_edges = pd.DataFrame()

    # everything which is not a pure interactor pair goes through the tiered algorithm
    non_interactors_rspecies = reaction_species.loc[
        ~reaction_species[SBML_DFS.R_ID].isin(interactor_pairs)
    ]

    if non_interactors_rspecies.shape[0] > 0:

        logger.info(
            f"Processing {non_interactors_rspecies.shape[0]} reaction species using the {wiring_approach} hierarchy"
        )

        # filter to just the entries which will be processed with the tiered algorithm
        rspecies_fields = SBML_DFS_SCHEMA.SCHEMA[SBML_DFS.REACTION_SPECIES][
            SCHEMA_DEFS.VARS
        ]
        reaction_groups = non_interactors_rspecies[rspecies_fields].groupby(
            SBML_DFS.R_ID
        )

        # each reaction is wired independently; the per-reaction frame is indexed
        # by SBO term because format_tiered_reaction_species validates that layout
        all_tiered_edges = [
            format_tiered_reaction_species(
                rxn_group.drop(columns=[SBML_DFS.R_ID])
                .set_index(SBML_DFS.SBO_TERM)
                .sort_index(),
                r_id,
                graph_hierarchy_df,
                drop_reactions_when,
            )
            for r_id, rxn_group in reaction_groups
        ]

        all_tiered_edges_df = pd.concat(all_tiered_edges).reset_index(drop=True)
    else:
        all_tiered_edges_df = pd.DataFrame()

    return pd.concat([interactor_edges, all_tiered_edges_df])
|
194
|
+
|
195
|
+
|
196
|
+
def format_tiered_reaction_species(
    rxn_species: pd.DataFrame,
    r_id: str,
    graph_hierarchy_df: pd.DataFrame,
    drop_reactions_when: str = DROP_REACTIONS_WHEN.SAME_TIER,
) -> pd.DataFrame:
    """
    Build the edges contributed by a single reaction and its species.

    Parameters
    ----------
    rxn_species : pd.DataFrame
        The reaction's participants. Must be indexed by `sbo_term` and carry the
        compartmentalized species ID and stoichiometry columns (this layout is
        checked by `_validate_sbo_indexed_rsc_stoi`).
    r_id : str
        The ID of the reaction.
    graph_hierarchy_df : pd.DataFrame
        The graph hierarchy.
    drop_reactions_when : str, optional
        The condition under which to drop reactions as a network vertex. Default is 'same_tier'.
        Only consulted when participants span more than one tier.

    Returns
    -------
    pd.DataFrame
        The edges of the Napistu graph for a single reaction. An empty DataFrame
        is returned when the reaction has one or fewer species.
    """

    _validate_sbo_indexed_rsc_stoi(rxn_species)

    # reactions with at most one participant cannot form an edge
    n_species = rxn_species.shape[0]
    if n_species <= 1:
        logger.warning(
            f"Reaction {r_id} has {n_species} species. This reaction will be dropped."
        )
        return pd.DataFrame()

    # map reaction species to the tiers of the graph hierarchy. higher levels point
    # to lower levels; same-level entries point at each other only if there is
    # only a single tier
    tiered_entities = _reaction_species_to_tiers(rxn_species, graph_hierarchy_df, r_id)
    distinct_tiers = tiered_entities.index.get_level_values("tier").unique()

    # two tiers means one tier of species plus the reaction's own tier, i.e.,
    # every participant sits on the same tier of the wiring hierarchy
    if len(distinct_tiers) == 2:
        return _format_same_tier_edges(rxn_species, r_id)

    return _format_cross_tier_edges(tiered_entities, r_id, drop_reactions_when)
|
247
|
+
|
248
|
+
|
249
|
+
def create_graph_hierarchy_df(wiring_approach: str) -> pd.DataFrame:
    """
    Tabulate the tier hierarchy associated with a wiring approach.

    Parameters
    ----------
    wiring_approach : str
        The type of tiered graph to work with. Looked up in GRAPH_WIRING_HIERARCHIES.

    Returns
    -------
    pd.DataFrame
        One row per hierarchy member with its sbo_name, its tier (the position of
        the member's group in the hierarchy), and its sbo_term. The reaction
        entry gets an empty string as its sbo_term.

    Raises
    ------
    ValueError
        If wiring_approach is not valid.
    """

    if wiring_approach not in VALID_GRAPH_WIRING_APPROACHES:
        raise ValueError(
            f"{wiring_approach} is not a valid wiring approach. Valid approaches are {', '.join(VALID_GRAPH_WIRING_APPROACHES)}"
        )

    def _sbo_term_for(sbo_name):
        # the reaction vertex is not an SBO role so it has no term to look up
        if sbo_name == NAPISTU_GRAPH_NODE_TYPES.REACTION:
            return ""
        return MINI_SBO_FROM_NAME[sbo_name]

    tiers = GRAPH_WIRING_HIERARCHIES[wiring_approach]

    # one small frame per tier, tagged with the tier's position in the hierarchy
    tier_frames = []
    for tier_number, tier_members in enumerate(tiers):
        tier_frames.append(
            pd.DataFrame({"sbo_name": tier_members}).assign(tier=tier_number)
        )

    hierarchy = pd.concat(tier_frames).reset_index(drop=True)
    hierarchy[SBML_DFS.SBO_TERM] = hierarchy["sbo_name"].apply(_sbo_term_for)

    # ensure that the output has the expected variables
    utils.match_pd_vars(
        hierarchy,
        req_vars={NAPISTU_GRAPH_EDGES.SBO_NAME, "tier", SBML_DFS.SBO_TERM},
        allow_series=False,
    ).assert_present()

    return hierarchy
|
297
|
+
|
298
|
+
|
299
|
+
def _should_drop_reaction(
    entities_ordered_by_tier: pd.DataFrame,
    drop_reactions_when: str = DROP_REACTIONS_WHEN.SAME_TIER,
) -> bool:
    """
    Determine if a reaction should be dropped based on regulatory relationships and stringency.

    Parameters
    ----------
    entities_ordered_by_tier : pd.DataFrame
        The entities ordered by tier. Indexed by tier, with an `sbo_name` column
        and one row for the reaction itself in addition to its participants.
    drop_reactions_when : str, optional
        The desired stringency for dropping reactions. Default is 'same_tier'.

    Returns
    -------
    bool
        True if the reaction should be dropped, False otherwise.

    Notes
    -----
    Reactions are always dropped if they are on the same tier. This greatly decreases the number of vertices
    in a graph constructed from relatively dense interaction networks like STRING.

    Raises
    ------
    ValueError
        If drop_reactions_when is not a valid value.

    """

    if drop_reactions_when == DROP_REACTIONS_WHEN.ALWAYS:
        return True

    if drop_reactions_when == DROP_REACTIONS_WHEN.EDGELIST:
        # 2 members + 1 row for the reaction itself
        return entities_ordered_by_tier.shape[0] == 3

    if drop_reactions_when == DROP_REACTIONS_WHEN.SAME_TIER:
        # the index holds the tier, so count the distinct tiers occupied by
        # everything except the reaction's own row
        species_tiers = entities_ordered_by_tier.query(
            "sbo_name != 'reaction'"
        ).index.unique()
        return len(species_tiers) == 1

    raise ValueError(
        f"Invalid drop_reactions: {drop_reactions_when}; valid values are {VALID_DROP_REACTIONS_WHEN}"
    )
|
354
|
+
|
355
|
+
|
356
|
+
def _format_same_tier_edges(rxn_species: pd.DataFrame, r_id: str) -> pd.DataFrame:
    """
    Pair up the participants of a reaction whose members all share one tier.

    Parameters
    ----------
    rxn_species : pd.DataFrame
        DataFrame of reaction species for the reaction, indexed by SBO term.
    r_id : str
        Reaction ID.

    Returns
    -------
    pd.DataFrame
        One edge per unordered pair of participants with from, to, stoichiometry,
        sbo_term and r_id columns. If the participants do not all share the same
        SBO term and stoichiometry, a warning is logged and an empty DataFrame is
        returned instead.
    """

    # number the participants so that each unordered pair is emitted exactly once
    numbered_species = rxn_species.reset_index().assign(
        entry=range(rxn_species.shape[0])
    )

    # if they have the same SBO_term and stoichiometry, then the
    # reaction can be trivially treated as reversible
    metadata_combos = numbered_species[
        [SBML_DFS.SBO_TERM, SBML_DFS.STOICHIOMETRY]
    ].drop_duplicates()
    if metadata_combos.shape[0] > 1:
        _log_pathological_same_tier(metadata_combos, r_id)
        return pd.DataFrame()

    column_renames = {
        "sc_id_left": NAPISTU_GRAPH_EDGES.FROM,
        "sc_id_right": NAPISTU_GRAPH_EDGES.TO,
        "stoichiometry_right": NAPISTU_GRAPH_EDGES.STOICHIOMETRY,
        "sbo_term_left": NAPISTU_GRAPH_EDGES.SBO_TERM,
    }
    output_columns = [
        NAPISTU_GRAPH_EDGES.FROM,
        NAPISTU_GRAPH_EDGES.TO,
        NAPISTU_GRAPH_EDGES.STOICHIOMETRY,
        NAPISTU_GRAPH_EDGES.SBO_TERM,
        SBML_DFS.R_ID,
    ]

    # all-vs-all combination of participants, then keep the upper triangle
    all_pairs = numbered_species.merge(
        numbered_species, how="cross", suffixes=("_left", "_right")
    )
    unique_pairs = all_pairs.query("entry_left < entry_right")
    labelled_pairs = unique_pairs.rename(column_renames, axis=1).assign(r_id=r_id)

    return labelled_pairs[output_columns]
|
415
|
+
|
416
|
+
|
417
|
+
def _log_pathological_same_tier(distinct_metadata: pd.DataFrame, r_id: str) -> None:
    """
    Log a warning if a reaction has multiple distinct metadata.

    Parameters
    ----------
    distinct_metadata : pd.DataFrame
        The distinct SBO term / stoichiometry combinations found among the
        reaction's participants.
    r_id : str
        Reaction ID named in the warning.

    Returns
    -------
    None
    """

    headline = f"Ignoring reaction {r_id}; its members have distinct annotations but they exist on the same level of a wiring hierarchy so their relationships cannot be determined."

    # only report the attributes which actually differ between participants
    details = []
    sbo_names = (
        distinct_metadata[SBML_DFS.SBO_TERM].map(MINI_SBO_TO_NAME).unique().tolist()
    )
    if len(sbo_names) > 1:
        details.append(f"SBO terms: {sbo_names}")
    stoichiometries = distinct_metadata[SBML_DFS.STOICHIOMETRY].unique().tolist()
    if len(stoichiometries) > 1:
        details.append(f"Stoichiometries: {stoichiometries}")

    # separate the headline from the details with a space so the sentence and
    # the first detail do not run together
    if details:
        logger.warning(headline + " " + "; ".join(details))
    else:
        logger.warning(headline)
|
433
|
+
|
434
|
+
|
435
|
+
def _format_cross_tier_edges(
    entities_ordered_by_tier: pd.DataFrame,
    r_id: str,
    drop_reactions_when: str = DROP_REACTIONS_WHEN.SAME_TIER,
) -> pd.DataFrame:
    """
    Format edges for reactions where participants are on different tiers of a wiring hierarchy.

    Consecutive tiers are connected all-vs-all. When the reaction vertex is
    dropped, the tier before the reaction is connected directly to the tier
    after it.

    Parameters
    ----------
    entities_ordered_by_tier : pd.DataFrame
        DataFrame of entities ordered by tier. Indexed by tier and containing a
        row for the reaction itself (sbo_name == 'reaction').
    r_id : str
        Reaction ID.
    drop_reactions_when : str, optional
        The condition under which to drop reactions as a network vertex. Default is 'same_tier'.

    Returns
    -------
    pd.DataFrame
        DataFrame of formatted edges for cross-tier reactions with from, to,
        stoichiometry, sbo_term and r_id columns. An empty DataFrame is returned
        (with a warning) if no pair of tiers can be connected.
    """

    ordered_tiers = entities_ordered_by_tier.index.get_level_values("tier").unique()
    reaction_tier = entities_ordered_by_tier.query(
        "sbo_name == 'reaction'"
    ).index.tolist()[0]
    drop_reaction = _should_drop_reaction(entities_ordered_by_tier, drop_reactions_when)

    n_tiers = len(ordered_tiers)
    rxn_edges = []
    past_reaction = False
    for i in range(n_tiers - 1):
        current_tier = ordered_tiers[i]

        # a dropped reaction vertex never acts as the source of an edge
        if drop_reaction and current_tier == reaction_tier:
            continue

        next_tier = ordered_tiers[i + 1]
        next_is_reaction = next_tier == reaction_tier
        if next_is_reaction and drop_reaction:
            if i + 2 >= n_tiers:
                # the reaction is the last tier present so there is nothing
                # downstream of it to hop to
                continue
            # hop over the reaction tier
            next_tier = ordered_tiers[i + 2]

        rxn_edges.append(
            _format_tier_combo(
                entities_ordered_by_tier.loc[[current_tier]],
                entities_ordered_by_tier.loc[[next_tier]],
                past_reaction,
            )
        )

        # later tier pairs lie after the reaction, which switches the tier that
        # edge attributes are taken from
        if next_is_reaction:
            past_reaction = True

    if not rxn_edges:
        # pd.concat raises on an empty list; report the reaction and drop it
        logger.warning(
            f"Reaction {r_id} did not produce any cross-tier edges. "
            "This reaction will be dropped."
        )
        return pd.DataFrame()

    rxn_edges_df = (
        pd.concat(rxn_edges)[
            [
                NAPISTU_GRAPH_EDGES.FROM,
                NAPISTU_GRAPH_EDGES.TO,
                NAPISTU_GRAPH_EDGES.STOICHIOMETRY,
                NAPISTU_GRAPH_EDGES.SBO_TERM,
            ]
        ]
        .reset_index(drop=True)
        .assign(r_id=r_id)
    )

    return rxn_edges_df
|
503
|
+
|
504
|
+
|
505
|
+
def _validate_sbo_indexed_rsc_stoi(rxn_species: pd.DataFrame) -> None:
    """
    Check that rxn_species has the layout expected by the tiered wiring code.

    Parameters
    ----------
    rxn_species : pd.DataFrame
        DataFrame of reaction species, indexed by SBO_TERM, whose columns are
        exactly SC_ID followed by STOICHIOMETRY.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If rxn_species is not a pandas DataFrame.
    ValueError
        If index or columns are not as expected.
    """

    if not isinstance(rxn_species, pd.DataFrame):
        raise TypeError("rxn_species must be a pandas DataFrame")

    expected_index_names = [SBML_DFS.SBO_TERM]
    if list(rxn_species.index.names) != expected_index_names:
        raise ValueError("rxn_species index names must be [SBML_DFS.SBO_TERM]")

    # column order matters here: this is a list comparison
    expected_columns = [SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY]
    if rxn_species.columns.tolist() != expected_columns:
        raise ValueError(
            "rxn_species columns must be [SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY]"
        )
|
536
|
+
|
537
|
+
|
538
|
+
def _reaction_species_to_tiers(
    rxn_species: pd.DataFrame, graph_hierarchy_df: pd.DataFrame, r_id: str
) -> pd.DataFrame:
    """
    Place a reaction's species, and the reaction itself, on hierarchy tiers.

    Parameters
    ----------
    rxn_species : pd.DataFrame
        DataFrame of reaction species.
    graph_hierarchy_df : pd.DataFrame
        DataFrame defining the graph hierarchy.
    r_id : str
        Reaction ID.

    Returns
    -------
    pd.DataFrame
        DataFrame of entities ordered by tier and indexed by tier. Species are
        identified by `entity_id` (their sc_id); the reaction contributes one
        extra row whose `entity_id` and `r_id` are both the reaction ID.
    """

    # attach hierarchy information to each species through the columns shared
    # with the hierarchy table
    species_on_tiers = (
        rxn_species.reset_index()
        .rename({SBML_DFS.SC_ID: "entity_id"}, axis=1)
        .merge(graph_hierarchy_df)
    )

    # the reaction itself occupies the hierarchy's "reaction" entry
    is_reaction_entry = (
        graph_hierarchy_df[NAPISTU_GRAPH_EDGES.SBO_NAME]
        == NAPISTU_GRAPH_NODE_TYPES.REACTION
    )
    reaction_on_tier = graph_hierarchy_df[is_reaction_entry].assign(
        entity_id=r_id, r_id=r_id
    )

    all_entities = pd.concat([species_on_tiers, reaction_on_tier])

    return all_entities.sort_values(["tier"]).set_index("tier")
|
577
|
+
|
578
|
+
|
579
|
+
def _format_tier_combo(
    upstream_tier: pd.DataFrame, downstream_tier: pd.DataFrame, past_reaction: bool
) -> pd.DataFrame:
    """
    Create all edges between two tiers of a tiered reaction graph.

    Every entity in the upstream tier is paired with every entity in the
    downstream tier. Stoichiometry and sbo_term are carried over from only one
    of the two tiers, selected by `past_reaction`.

    Parameters
    ----------
    upstream_tier : pd.DataFrame
        DataFrame containing upstream entities in a reaction (e.g., regulators or substrates).
    downstream_tier : pd.DataFrame
        DataFrame containing downstream entities in a reaction (e.g., products or targets).
    past_reaction : bool
        If True, attributes (stoichiometry, sbo_term) are taken from downstream_tier;
        if False, from upstream_tier. This controls the direction of attribute assignment
        depending on whether the reaction tier has already been passed in the tier ordering.

    Returns
    -------
    pd.DataFrame
        DataFrame of edges with 'from', 'to', 'stoichiometry' and 'sbo_term'
        columns plus the '_joiner' helper column used for the merge.
        The number of edges is the product of the number of entities in the upstream tier
        and the number in the downstream tier.

    Notes
    -----
    - This function is used to build the edge list for tiered graphs, where each tier represents
      a functional group (e.g., substrates, products, modifiers, reaction).
    - The tier supplying the edge attributes depends on the position relative to the reaction tier.
    """

    attributed_fields = ["entity_id", SBML_DFS.STOICHIOMETRY, SBML_DFS.SBO_TERM]
    id_only_fields = ["entity_id"]

    # exactly one side contributes stoichiometry and sbo_term
    if past_reaction:
        source_fields, target_fields = id_only_fields, attributed_fields
    else:
        source_fields, target_fields = attributed_fields, id_only_fields

    sources = (
        upstream_tier[source_fields]
        .rename({"entity_id": NAPISTU_GRAPH_EDGES.FROM}, axis=1)
        .assign(_joiner=1)
    )
    targets = (
        downstream_tier[target_fields]
        .rename({"entity_id": NAPISTU_GRAPH_EDGES.TO}, axis=1)
        .assign(_joiner=1)
    )

    # joining on a constant column yields the all-vs-all combination
    return sources.merge(targets, left_on="_joiner", right_on="_joiner")
|
640
|
+
|
641
|
+
|
642
|
+
def _find_sbo_duos(
    reaction_species: pd.DataFrame,
    target_sbo_term: str = MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR],
) -> list[str]:
    """
    Identify reactions made up of exactly two participants which both carry the target SBO term.

    Parameters
    ----------
    reaction_species : pd.DataFrame
        DataFrame with columns: sbo_term, sc_id, stoichiometry, r_id
    target_sbo_term : str
        The sbo_term to match (e.g., "SBO:0000336" aka "interactor")

    Returns
    -------
    list
        List of r_ids that meet the criteria
    """

    # a reaction qualifies when it has exactly 2 rows and every row has the target term
    return [
        r_id
        for r_id, members in reaction_species.groupby(SBML_DFS.R_ID)
        if len(members) == 2
        and (members[SBML_DFS.SBO_TERM] == target_sbo_term).all()
    ]
|
671
|
+
|
672
|
+
|
673
|
+
def _interactor_duos_to_wide(interactor_duos: pd.DataFrame):
    """
    Reshape paired long-format interactors into one 'from' -> 'to' edge per reaction.

    Parameters
    ----------
    interactor_duos : pd.DataFrame
        DataFrame with exactly 2 rows per r_id, containing sc_id and stoichiometry

    Returns
    -------
    pd.DataFrame
        One row per r_id with columns r_id, from, to, sbo_term and stoichiometry.
        Within a pair, 'from' is the sc_id that sorts first. sbo_term is set to
        the interactor term and stoichiometry to 0 for every row.
    """

    _validate_interactor_duos(interactor_duos)

    # sort by sc_id within each reaction to ensure consistent ordering, then
    # number the two members of each pair (0, 1)
    ordered_duos = interactor_duos.sort_values([SBML_DFS.R_ID, SBML_DFS.SC_ID])
    ordered_duos["pair_order"] = ordered_duos.groupby(SBML_DFS.R_ID).cumcount()

    # pivot to wide format: one row per reaction, one column per pair member
    pairs_wide = ordered_duos.pivot(
        index=SBML_DFS.R_ID, columns="pair_order", values=SBML_DFS.SC_ID
    )
    pairs_wide.columns = ["from", "to"]

    # make r_id a regular column and attach the constant edge attributes
    edges = pairs_wide.reset_index()
    return edges.assign(
        sbo_term=MINI_SBO_FROM_NAME[SBOTERM_NAMES.INTERACTOR], stoichiometry=0
    )
|
707
|
+
|
708
|
+
|
709
|
+
def _validate_interactor_duos(interactor_duos: pd.DataFrame) -> None:
    """
    Check the columns of interactor pairs and log pairs with non-zero stoichiometry.

    Parameters
    ----------
    interactor_duos : pd.DataFrame
        Reaction species belonging to interactor pairs. The r_id, sc_id,
        sbo_term and stoichiometry columns are required; their presence is
        asserted through `utils.match_pd_vars(...).assert_present()`.

    Returns
    -------
    None
        Non-zero stoichiometries only produce a warning listing up to five of
        the affected reaction IDs.
    """

    utils.match_pd_vars(
        interactor_duos,
        req_vars={
            SBML_DFS.R_ID,
            SBML_DFS.SC_ID,
            SBML_DFS.SBO_TERM,
            SBML_DFS.STOICHIOMETRY,
        },
    ).assert_present()

    non_zero_stoi = interactor_duos[interactor_duos[SBML_DFS.STOICHIOMETRY] != 0]

    if not non_zero_stoi.empty:
        affected_r_ids = non_zero_stoi[SBML_DFS.R_ID].unique()
        n_reactions = len(affected_r_ids)
        sample_r_ids = affected_r_ids[:5].tolist()

        # adjacent string literals are concatenated verbatim, so each fragment
        # carries its own trailing space
        logger.warning(
            f"Found {n_reactions} reactions constructed from pairs of interactors with non-zero "
            "stoichiometry. These should likely be assigned to another SBO term so their relationship "
            "can be properly represented.\n"
            f"Affected r_ids (showing up to 5): {sample_r_ids}"
        )
|