napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -145
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +47 -65
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +156 -19
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev7.dist-info/RECORD +0 -98
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
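
Note on the largest structural change: napistu/mechanism_matching.py (entry 94, reproduced in full below) was removed in favor of the new napistu/matching subpackage (entries 16-20). As a minimal migration sketch, assuming the matching utilities were redistributed across the new napistu.matching modules (this diff lists those files but does not show their contents), downstream imports would change roughly like this; the exact module targets are hypothetical:

    # 0.2.5.dev7: all matching utilities lived in napistu/mechanism_matching.py
    from napistu import mechanism_matching

    annotated = mechanism_matching.bind_wide_results(sbml_dfs, results_df, "my_results")

    # 0.3.1: hypothetical equivalent, assuming bind_wide_results landed in one of
    # the new modules such as napistu/matching/mount.py
    from napistu.matching import mount

    annotated = mount.bind_wide_results(sbml_dfs, results_df, "my_results")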
napistu/mechanism_matching.py (deleted)
@@ -1,1353 +0,0 @@
- from __future__ import annotations
-
- import logging
- from typing import Optional, Union, Set, Dict, List
-
- import igraph as ig
- import numpy as np
- import pandas as pd
-
- from napistu import identifiers
- from napistu import sbml_dfs_core
- from napistu import utils
- from napistu.constants import SBML_DFS
- from napistu.constants import CPR_EDGELIST
- from napistu.constants import CPR_EDGELIST_REQ_VARS
- from napistu.constants import FEATURE_ID_VAR_DEFAULT
- from napistu.constants import RESOLVE_MATCHES_AGGREGATORS
- from napistu.constants import RESOLVE_MATCHES_TMP_WEIGHT_COL
- from napistu.constants import IDENTIFIERS
- from napistu.constants import IDENTIFIER_EDGELIST_REQ_VARS
- from napistu.constants import ONTOLOGIES_LIST
- from napistu.network.constants import CPR_GRAPH_EDGES
- from napistu.network import paths
-
- logger = logging.getLogger(__name__)
-
-
- def bind_wide_results(
-     sbml_dfs: sbml_dfs_core.SBML_dfs,
-     results_df: pd.DataFrame,
-     results_name: str,
-     ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
-     dogmatic: bool = False,
-     species_identifiers: Optional[pd.DataFrame] = None,
-     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-     keep_id_col: bool = True,
-     verbose: bool = False,
- ) -> sbml_dfs_core.SBML_dfs:
-     """
-     Bind wide results to an sbml_dfs object.
-
-     Takes a table of molecular species-level attributes tied to systematic identifiers, matches it to an sbml_dfs model, and transfers these attributes to species_data.
-
-     Parameters
-     ----------
-     sbml_dfs : sbml_dfs_core.SBML_dfs
-         The sbml_dfs object to bind the results to.
-     results_df : pd.DataFrame
-         The table containing the results to bind.
-     results_name : str
-         The name of the results to bind.
-     ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
-         Either:
-         - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
-         - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
-         - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
-     dogmatic : bool
-         Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
-     species_identifiers : Optional[pd.DataFrame]
-         Systematic identifiers for the molecular species of "sbml_dfs". If None, these will be generated on-the-fly.
-     feature_id_var : str
-         The name of the column in results_df that contains the feature identifiers. If it does not exist it will be created.
-     numeric_agg : str
-         The aggregation method to use for resolving degeneracy.
-     keep_id_col : bool
-         Whether to keep the identifier column in the results_df.
-     verbose : bool
-         Whether to log cases of 1-to-many and many-to-one mapping and the behavior used for resolving degeneracy.
-
-     Returns
-     -------
-     sbml_dfs : sbml_dfs_core.SBML_dfs
-         The sbml_dfs object with the results bound.
-     """
-
-     species_identifiers = identifiers._prepare_species_identifiers(
-         sbml_dfs, dogmatic=dogmatic, species_identifiers=species_identifiers
-     )
-
-     # match
-     matched_s_ids_from_wide = match_features_to_wide_pathway_species(
-         results_df,
-         species_identifiers,
-         ontologies=ontologies,
-         feature_id_var=feature_id_var,
-         verbose=verbose,
-     )
-
-     disambiguated_matches = resolve_matches(
-         matched_data=matched_s_ids_from_wide,
-         feature_id_var=feature_id_var,
-         numeric_agg=numeric_agg,
-         keep_id_col=keep_id_col,
-     )
-
-     clean_species_data = utils.drop_extra_cols(
-         results_df, disambiguated_matches, always_include=[feature_id_var]
-     )
-
-     sbml_dfs.add_species_data(results_name, clean_species_data)
-
-     return sbml_dfs
-
-
- def features_to_pathway_species(
-     feature_identifiers: pd.DataFrame,
-     species_identifiers: pd.DataFrame,
-     ontologies: set,
-     feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
-     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-     expand_identifiers: bool = False,
-     identifier_delimiter: str = "/",
-     verbose: bool = False,
- ) -> pd.DataFrame:
-     """
-     Features to Pathway Species
-
-     Match a table of molecular species to their corresponding species in a pathway representation.
-
-     Parameters:
-     feature_identifiers: pd.DataFrame
-         pd.DataFrame containing a "feature_identifiers_var" variable used to match entries
-     species_identifiers: pd.DataFrame
-         A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species"),
-         generally using sbml_dfs_core.export_sbml_dfs()
-     ontologies: set
-         A set of ontologies used to match features to pathway species
-     feature_identifiers_var: str
-         Variable in "feature_identifiers" containing identifiers
-     expand_identifiers: bool, default=False
-         If True, split identifiers in feature_identifiers_var by identifier_delimiter and explode into multiple rows
-     identifier_delimiter: str, default="/"
-         Delimiter to use for splitting identifiers if expand_identifiers is True
-     verbose: bool, default=False
-         If True, log mapping statistics at the end of the function
-
-     Returns:
-     pathway_species: pd.DataFrame
-         species_identifiers joined to feature_identifiers based on shared identifiers
-     """
-
-     # Check for identifier column
-     if feature_identifiers_var not in feature_identifiers.columns.to_list():
-         raise ValueError(
-             f"{feature_identifiers_var} must be a variable in 'feature_identifiers', "
-             f"possible variables are {', '.join(feature_identifiers.columns.tolist())}"
-         )
-
-     # Respect or create feature_id column
-     feature_identifiers = _ensure_feature_id_var(feature_identifiers, feature_id_var)
-
-     # Optionally expand identifiers into multiple rows
-     if expand_identifiers:
-         # Count the number of expansions by counting delimiters
-         n_expansions = (
-             feature_identifiers[feature_identifiers_var]
-             .astype(str)
-             .str.count(identifier_delimiter)
-             .sum()
-         )
-         if n_expansions > 0:
-             logger.info(
-                 f"Expanding identifiers: {n_expansions} delimiters found in '{feature_identifiers_var}', will expand to more rows."
-             )
-
-         # Split, strip whitespace, and explode
-         feature_identifiers = feature_identifiers.copy()
-         feature_identifiers[feature_identifiers_var] = (
-             feature_identifiers[feature_identifiers_var]
-             .astype(str)
-             .str.split(identifier_delimiter)
-             .apply(lambda lst: [x.strip() for x in lst])
-         )
-         feature_identifiers = feature_identifiers.explode(
-             feature_identifiers_var, ignore_index=True
-         )
-
-     # check identifiers table
-     identifiers._check_species_identifiers_table(species_identifiers)
-
-     available_ontologies = set(species_identifiers[IDENTIFIERS.ONTOLOGY].tolist())
-     unavailable_ontologies = ontologies.difference(available_ontologies)
-
-     # no ontologies present
-     if len(unavailable_ontologies) == len(ontologies):
-         raise ValueError(
-             f"None of the requested ontologies ({', '.join(ontologies)}) "
-             "were used to annotate pathway species. Available ontologies are: "
-             f"{', '.join(available_ontologies)}"
-         )
-
-     # 1+ desired ontologies are not present
-     if len(unavailable_ontologies) > 0:
-         raise ValueError(
-             f"Some of the requested ontologies ({', '.join(unavailable_ontologies)}) "
-             "were NOT used to annotate pathway species. Available ontologies are: "
-             f"{', '.join(available_ontologies)}"
-         )
-
-     relevant_identifiers = species_identifiers[
-         species_identifiers[IDENTIFIERS.ONTOLOGY].isin(ontologies)
-     ]
-
-     # map features to pathway species
-     pathway_species = feature_identifiers.merge(
-         relevant_identifiers,
-         left_on=feature_identifiers_var,
-         right_on=IDENTIFIERS.IDENTIFIER,
-     )
-
-     if pathway_species.shape[0] == 0:
-         logger.warning(
-             "None of the provided species identifiers matched entries of the pathway; returning None"
-         )
-         return None
-
-     # report the fraction of unmapped species
-     if verbose:
-         _log_feature_species_mapping_stats(pathway_species, feature_id_var)
-
-     return pathway_species
-
-
- def edgelist_to_pathway_species(
-     formatted_edgelist: pd.DataFrame,
-     species_identifiers: pd.DataFrame,
-     ontologies: set,
-     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-     verbose: bool = False,
- ) -> pd.DataFrame:
-     """
-     Edgelist to Pathway Species
-
-     Match an edgelist of molecular species pairs to their corresponding species in a pathway representation.
-
-     Parameters:
-     formatted_edgelist: pd.DataFrame
-         pd.DataFrame containing "identifier_upstream" and "identifier_downstream" variables used to match entries
-     species_identifiers: pd.DataFrame
-         A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species"), generally using
-         sbml_dfs_core.export_sbml_dfs()
-     ontologies: set
-         A set of ontologies used to match features to pathway species
-     feature_id_var: str, default=FEATURE_ID_VAR_DEFAULT
-         Variable in "formatted_edgelist" containing feature ids
-     verbose: bool, default=False
-         Whether to print verbose output
-
-     Returns:
-     edges_on_pathway: pd.DataFrame
-         formatted_edgelist with upstream features mapped
-         to "s_id_upstream" and downstream species mapped
-         to "s_id_downstream"
-     """
-
-     required_vars_distinct_features = {
-         CPR_EDGELIST.IDENTIFIER_UPSTREAM,
-         CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
-     }
-     missing_required_vars_distinct_features = (
-         required_vars_distinct_features.difference(
-             set(formatted_edgelist.columns.tolist())
-         )
-     )
-
-     if len(missing_required_vars_distinct_features) > 0:
-         raise ValueError(
-             f"{len(missing_required_vars_distinct_features)} required variables were "
-             "missing from 'formatted_edgelist': "
-             f"{', '.join(missing_required_vars_distinct_features)}"
-         )
-
-     # define all distinct identifiers in edgelist
-     distinct_identifiers = (
-         pd.concat(
-             [
-                 formatted_edgelist[CPR_EDGELIST.IDENTIFIER_UPSTREAM],
-                 formatted_edgelist[CPR_EDGELIST.IDENTIFIER_DOWNSTREAM],
-             ]
-         )
-         .drop_duplicates()
-         .reset_index(drop=True)
-         .to_frame()
-         .rename({0: feature_id_var}, axis=1)
-     )
-
-     # merge edgelist identifiers with pathway identifiers to map s_ids to identifiers
-     features_on_pathway = features_to_pathway_species(
-         feature_identifiers=distinct_identifiers,
-         species_identifiers=species_identifiers,
-         ontologies=ontologies,
-         feature_identifiers_var=feature_id_var,
-         verbose=verbose,
-     )
-
-     # add s_ids of both upstream and downstream edges to pathway
-     edges_on_pathway = formatted_edgelist.merge(
-         features_on_pathway[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]].rename(
-             {
-                 SBML_DFS.S_ID: CPR_EDGELIST.S_ID_UPSTREAM,
-                 IDENTIFIERS.IDENTIFIER: CPR_EDGELIST.IDENTIFIER_UPSTREAM,
-             },
-             axis=1,
-         )
-     ).merge(
-         features_on_pathway[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]].rename(
-             {
-                 SBML_DFS.S_ID: CPR_EDGELIST.S_ID_DOWNSTREAM,
-                 IDENTIFIERS.IDENTIFIER: CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
-             },
-             axis=1,
-         )
-     )
-
-     return edges_on_pathway
-
-
- def match_features_to_wide_pathway_species(
-     wide_df: pd.DataFrame,
-     species_identifiers: pd.DataFrame,
-     ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
-     feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
-     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-     verbose: bool = False,
- ) -> pd.DataFrame:
-     """
-     Convert a wide-format DataFrame with multiple ontology columns to long format,
-     and match features to pathway species by ontology and identifier.
-
-     Parameters
-     ----------
-     wide_df : pd.DataFrame
-         DataFrame with ontology identifier columns and any number of results columns.
-         All non-ontology columns are treated as results.
-     species_identifiers : pd.DataFrame
-         DataFrame as required by features_to_pathway_species
-     ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
-         Either:
-         - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
-         - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
-         - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
-     feature_identifiers_var : str, default="identifier"
-         Name for the identifier column in the long format
-     feature_id_var: str, default=FEATURE_ID_VAR_DEFAULT
-         Name for the feature id column in the long format
-     verbose : bool, default=False
-         Whether to print verbose output
-
-     Returns
-     -------
-     pd.DataFrame
-         Output of match_by_ontology_and_identifier
-
-     Examples
-     --------
-     >>> # Example with auto-detected ontology columns and multiple results
-     >>> wide_df = pd.DataFrame({
-     ...     'uniprot': ['P12345', 'Q67890'],
-     ...     'chebi': ['15377', '16810'],
-     ...     'log2fc': [1.0, 2.0],
-     ...     'pvalue': [0.01, 0.05]
-     ... })
-     >>> result = match_features_to_wide_pathway_species(
-     ...     wide_df=wide_df,
-     ...     species_identifiers=species_identifiers
-     ... )
-
-     >>> # Example with custom ontology mapping
-     >>> wide_df = pd.DataFrame({
-     ...     'protein_id': ['P12345', 'Q67890'],
-     ...     'compound_id': ['15377', '16810'],
-     ...     'expression': [1.0, 2.0],
-     ...     'confidence': [0.8, 0.9]
-     ... })
-     >>> result = match_features_to_wide_pathway_species(
-     ...     wide_df=wide_df,
-     ...     species_identifiers=species_identifiers,
-     ...     ontologies={'protein_id': 'uniprot', 'compound_id': 'chebi'}
-     ... )
-     """
-     # Make a copy to avoid modifying the input
-     wide_df = wide_df.copy()
-
-     # Validate ontologies and get the set of ontology columns
-     ontology_cols = _validate_wide_ontologies(wide_df, ontologies)
-     melt_cols = list(ontology_cols)
-
-     # Apply renaming if a mapping is provided
-     if isinstance(ontologies, dict):
-         wide_df = wide_df.rename(columns=ontologies)
-
-     # Ensure feature_id column exists
-     wide_df = _ensure_feature_id_var(wide_df, feature_id_var)
-
-     # All non-ontology columns are treated as results
-     results_cols = list(set(wide_df.columns) - set(melt_cols))
-     if not results_cols:
-         raise ValueError("No results columns found in DataFrame")
-
-     logger.info(f"Using columns as results: {results_cols}")
-
-     # Melt ontology columns to long format, keeping all results columns
-     long_df = wide_df.melt(
-         id_vars=results_cols,
-         value_vars=melt_cols,
-         var_name=IDENTIFIERS.ONTOLOGY,
-         value_name=feature_identifiers_var,
-     ).dropna(subset=[feature_identifiers_var])
-
-     logger.debug(f"Final long format shape: {long_df.shape}")
-
-     # Call the matching function with the validated ontologies
-     out = match_by_ontology_and_identifier(
-         feature_identifiers=long_df,
-         species_identifiers=species_identifiers,
-         ontologies=ontology_cols,
-         feature_identifiers_var=feature_identifiers_var,
-     )
-
-     if verbose:
-         _log_feature_species_mapping_stats(out, feature_id_var)
-
-     return out
-
-
- def match_by_ontology_and_identifier(
-     feature_identifiers: pd.DataFrame,
-     species_identifiers: pd.DataFrame,
-     ontologies: Union[str, Set[str], List[str]],
-     feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
-     verbose: bool = False,
- ) -> pd.DataFrame:
-     """
-     Match features to pathway species based on both ontology and identifier matches.
-     Performs separate matching for each ontology and concatenates the results.
-
-     Parameters
-     ----------
-     feature_identifiers : pd.DataFrame
-         DataFrame containing feature identifiers and results.
-         Must have columns [ontology, feature_identifiers_var, results]
-     species_identifiers : pd.DataFrame
-         DataFrame containing species identifiers from pathway.
-         Must have columns [ontology, identifier]
-     ontologies : Union[str, Set[str], List[str]]
-         Ontologies to match on. Can be:
-         - A single ontology string
-         - A set of ontology strings
-         - A list of ontology strings
-     feature_identifiers_var : str, default="identifier"
-         Name of the identifier column in feature_identifiers
-     verbose : bool, default=False
-         Whether to print verbose output
-
-     Returns
-     -------
-     pd.DataFrame
-         Concatenated results of matching for each ontology.
-         Contains all columns from features_to_pathway_species()
-
-     Examples
-     --------
-     >>> # Match using a single ontology
-     >>> result = match_by_ontology_and_identifier(
-     ...     feature_identifiers=features_df,
-     ...     species_identifiers=species_df,
-     ...     ontologies="uniprot"
-     ... )
-
-     >>> # Match using multiple ontologies
-     >>> result = match_by_ontology_and_identifier(
-     ...     feature_identifiers=features_df,
-     ...     species_identifiers=species_df,
-     ...     ontologies={"uniprot", "chebi"}
-     ... )
-     """
-     # Convert string to set for consistent handling
-     if isinstance(ontologies, str):
-         ontologies = {ontologies}
-     elif isinstance(ontologies, list):
-         ontologies = set(ontologies)
-
-     # Validate ontologies
-     invalid_onts = ontologies - set(ONTOLOGIES_LIST)
-     if invalid_onts:
-         raise ValueError(
-             f"Invalid ontologies specified: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
-         )
-
-     # Initialize list to store results
-     matched_dfs = []
-
-     # Process each ontology separately
-     for ont in ontologies:
-         # Filter feature identifiers to current ontology and drop ontology column
-         ont_features = (
-             feature_identifiers[feature_identifiers[IDENTIFIERS.ONTOLOGY] == ont]
-             .drop(columns=[IDENTIFIERS.ONTOLOGY])
-             .copy()
-         )
-
-         if ont_features.empty:
-             logger.warning(f"No features found for ontology: {ont}")
-             continue
-
-         # Filter species identifiers to current ontology
-         ont_species = species_identifiers[
-             species_identifiers[IDENTIFIERS.ONTOLOGY] == ont
-         ].copy()
-
-         if ont_species.empty:
-             logger.warning(f"No species found for ontology: {ont}")
-             continue
-
-         logger.debug(
-             f"Matching {len(ont_features)} features to {len(ont_species)} species for ontology {ont}"
-         )
-
-         # Match features to species for this ontology
-         matched = features_to_pathway_species(
-             feature_identifiers=ont_features,
-             species_identifiers=ont_species,
-             ontologies={ont},
-             feature_identifiers_var=feature_identifiers_var,
-             verbose=verbose,
-         )
-
-         if matched.empty:
-             logger.warning(f"No matches found for ontology: {ont}")
-             continue
-
-         matched_dfs.append(matched)
-
-     if not matched_dfs:
-         logger.warning("No matches found for any ontology")
-         return pd.DataFrame()  # Return empty DataFrame with correct columns
-
-     # Combine results from all ontologies
-     result = pd.concat(matched_dfs, axis=0, ignore_index=True)
-
-     logger.info(
-         f"Found {len(result)} total matches across {len(matched_dfs)} ontologies"
-     )
-
-     return result
-
-
- def resolve_matches(
-     matched_data: pd.DataFrame,
-     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-     index_col: str = SBML_DFS.S_ID,
-     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-     keep_id_col: bool = True,
- ) -> pd.DataFrame:
-     """
-     Resolve many-to-1 and 1-to-many matches in matched data.
-
-     Parameters
-     ----------
-     matched_data : pd.DataFrame
-         DataFrame containing matched data with columns:
-         - feature_id_var: identifier column (e.g. feature_id)
-         - index_col: index column (e.g. s_id)
-         - other columns: data columns to be aggregated
-     feature_id_var : str, default="feature_id"
-         Name of the identifier column
-     index_col : str, default="s_id"
-         Name of the column to use as index
-     numeric_agg : str, default="weighted_mean"
-         Method to aggregate numeric columns:
-         - "weighted_mean": weighted by inverse of feature_id frequency (default)
-         - "mean": simple arithmetic mean
-         - "first": first value after sorting by feature_id_var (requires feature_id_var)
-         - "max": maximum value
-     keep_id_col : bool, default=True
-         Whether to keep and roll up the feature_id_var in the output.
-         If False, feature_id_var will be dropped from the output.
-
-     Returns
-     -------
-     pd.DataFrame
-         DataFrame with resolved matches:
-         - Many-to-1: numeric columns are aggregated using specified method
-         - 1-to-many: adds a count column showing number of matches
-         - Index is set to index_col and named accordingly
-
-     Raises
-     ------
-     KeyError
-         If feature_id_var is not present in the DataFrame
-     TypeError
-         If DataFrame contains unsupported data types (boolean or datetime)
-     """
-     # Make a copy to avoid modifying input
-     df = matched_data.copy()
-
-     # Check for unsupported data types
-     unsupported_dtypes = df.select_dtypes(include=["bool", "datetime64"]).columns
-     if not unsupported_dtypes.empty:
-         raise TypeError(
-             f"Unsupported data types found in columns: {list(unsupported_dtypes)}. "
-             "Boolean and datetime columns are not supported."
-         )
-
-     # Always require feature_id_var
-     if feature_id_var not in df.columns:
-         raise KeyError(feature_id_var)
-
-     # Deduplicate by feature_id within each s_id using groupby and first BEFORE any further processing
-     df = df.groupby([index_col, feature_id_var], sort=False).first().reset_index()
-
-     # Use a unique temporary column name for weights
-     if RESOLVE_MATCHES_TMP_WEIGHT_COL in df.columns:
-         raise ValueError(
-             f"Temporary weight column name '{RESOLVE_MATCHES_TMP_WEIGHT_COL}' already exists in the input data. Please rename or remove this column and try again."
-         )
-
-     # Calculate weights if needed (after deduplication!)
-     if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
-         feature_counts = df[feature_id_var].value_counts()
-         df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = (
-             1 / feature_counts[df[feature_id_var]].values
-         )
-
-     # Set index for grouping
-     df = df.set_index(index_col)
-
-     # Use utility to split columns
-     always_non_numeric = [feature_id_var] if keep_id_col else []
-     numeric_cols, non_numeric_cols = _split_numeric_non_numeric_columns(
-         df, always_non_numeric=always_non_numeric
-     )
-
-     # Get aggregator function
-     numeric_aggregator = _get_numeric_aggregator(
-         method=numeric_agg, feature_id_var=feature_id_var
-     )
-     resolved = _aggregate_grouped_columns(
-         df,
-         numeric_cols,
-         non_numeric_cols,
-         numeric_aggregator,
-         feature_id_var=feature_id_var,
-         numeric_agg=numeric_agg,
-     )
-     # Add count of matches per feature_id
-     match_counts = matched_data.groupby(index_col)[feature_id_var].nunique()
-     resolved[f"{feature_id_var}_match_count"] = match_counts
-
-     # Drop feature_id_var if not keeping it
-     if not keep_id_col and feature_id_var in resolved.columns:
-         resolved = resolved.drop(columns=[feature_id_var])
-
-     # Ensure index is named consistently
-     resolved.index.name = index_col
-
-     return resolved
-
-
- def edgelist_to_scids(
-     formatted_edgelist: pd.DataFrame,
-     sbml_dfs: sbml_dfs_core.SBML_dfs,
-     species_identifiers: pd.DataFrame,
-     ontologies: set,
- ):
-     """
-
-     Edgelist to Compartmentalized Species IDs
-
-     Map an edgelist of possible mechanistic interactions onto a
-     pathadex pathway
-
-     Parameters:
-     formatted_edgelist: pd.DataFrame
-         pd.DataFrame containing "identifier_upstream" and
-         "identifier_downstream" variables used to match entries
-     sbml_dfs: sbml_dfs_core.SBML_dfs
-         A mechanistic model
-     species_identifiers: pd.DataFrame
-         A table of molecular species identifiers produced from
-         sbml_dfs.get_identifiers("species"), generally using sbml_dfs_core.export_sbml_dfs()
-     ontologies: set
-         A set of ontologies used to match features to pathway species
-
-     Returns:
-     edgelist_w_scids: pd.DataFrame
-         formatted_edgelist with upstream features mapped to "sc_id_upstream" and
-         downstream species mapped to "sc_id_downstream"
-     """
-
-     identifiers._check_species_identifiers_table(species_identifiers)
-
-     # map edges onto pathway entities based on shared identifiers
-     edges_on_pathway = edgelist_to_pathway_species(
-         formatted_edgelist=formatted_edgelist,
-         species_identifiers=species_identifiers,
-         ontologies=ontologies,
-     )
-
-     # expand from s_ids to sc_ids
-     s_id_pairs = edges_on_pathway[
-         [CPR_EDGELIST.S_ID_UPSTREAM, CPR_EDGELIST.S_ID_DOWNSTREAM]
-     ].drop_duplicates()
-     sc_id_pairs = s_id_pairs.merge(
-         sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
-         .reset_index()
-         .rename(
-             {
-                 SBML_DFS.S_ID: CPR_EDGELIST.S_ID_UPSTREAM,
-                 SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM,
-             },
-             axis=1,
-         )
-     ).merge(
-         sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
-         .reset_index()
-         .rename(
-             {
-                 SBML_DFS.S_ID: CPR_EDGELIST.S_ID_DOWNSTREAM,
-                 SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM,
-             },
-             axis=1,
-         )
-     )
-
-     # map sc_ids back to edges_on_pathway
-     # join lookup table of s_id_upstream, s_id_downstream -> sc_ids
-     edgelist_w_scids = edges_on_pathway.merge(sc_id_pairs)
-
-     logger_msg = (
-         f"{edgelist_w_scids.shape[0]} interactions mapped "
-         "onto pairs of compartmentalized species in the mechanistic model"
-     )
-     if edgelist_w_scids.shape[0] == 0:
-         logger.warning(logger_msg)
-     else:
-         logger.info(logger_msg)
-
-     return edgelist_w_scids
-
-
- def filter_to_direct_mechanistic_interactions(
-     formatted_edgelist: pd.DataFrame,
-     sbml_dfs: sbml_dfs_core.SBML_dfs,
-     species_identifiers: pd.DataFrame,
-     ontologies: set,
- ) -> pd.DataFrame:
-     """
-     Filter to Direct Mechanistic Interactions
-
-     Filter an edgelist to direct mechanistic interactions
-
-     Parameters:
-     formatted_edgelist: pd.DataFrame
-         pd.DataFrame containing "identifier_upstream" and "identifier_downstream" variables used to match entries
-     sbml_dfs: sbml_dfs_core.SBML_dfs
-         A mechanistic model
-     species_identifiers: pd.DataFrame
-         A table of molecular species identifiers
-         produced from sbml_dfs.get_identifiers("species"), generally
-         using sbml_dfs_core.export_sbml_dfs()
-     ontologies: set
-         A set of ontologies used to match features to pathway species
-
-     Returns:
-     edgelist_w_direct_mechanistic_interactions: pd.DataFrame
-         formatted_edgelist filtered to mechanistic reactions present in the pathway representation
-     """
-
-     edgelist_w_scids = _edgelist_to_scids_if_needed(
-         formatted_edgelist, sbml_dfs, species_identifiers, ontologies
-     )
-
-     # reduce to distinct sc_id pairs
-     sc_id_pairs = edgelist_w_scids[list(CPR_EDGELIST_REQ_VARS)].drop_duplicates()
-
-     # define all existing direct regulatory interactions
-     pathway_interactions = pd.concat(
-         [
-             # pair 0 -> <0 # modifiers affect substrates
-             sbml_dfs.reaction_species[
-                 sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] == 0
-             ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]]
-             .rename({SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM}, axis=1)
-             .merge(
-                 sbml_dfs.reaction_species[
-                     sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] < 0
-                 ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]].rename(
-                     {SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM}, axis=1
-                 )
-             ),
-             # pair <0 -> >0 # substrates affect products
-             sbml_dfs.reaction_species[
-                 sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] < 0
-             ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]]
-             .rename({SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM}, axis=1)
-             .merge(
-                 sbml_dfs.reaction_species[
-                     sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] > 0
-                 ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]].rename(
-                     {SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM}, axis=1
-                 )
-             ),
-             # pair 0 -> >0 # modifiers affect products
-             sbml_dfs.reaction_species[
-                 sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] == 0
-             ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]]
-             .rename({SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM}, axis=1)
-             .merge(
-                 sbml_dfs.reaction_species[
-                     sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] > 0
-                 ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]].rename(
-                     {SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM}, axis=1
-                 )
-             ),
-         ]
-     ).reset_index(drop=True)
-
-     # filter pathway interactions based on matches to sc_id_pairs
-     direct_edge_interactions = (
-         sc_id_pairs.merge(pathway_interactions)
-         .merge(
-             sbml_dfs.species[SBML_DFS.S_NAME]
-             .to_frame()
-             .rename({SBML_DFS.S_NAME: CPR_EDGELIST.S_NAME_UPSTREAM}, axis=1),
-             left_on=CPR_EDGELIST.S_ID_UPSTREAM,
-             right_index=True,
-             # add species metadata for matches
-         )
-         .merge(
-             sbml_dfs.species[SBML_DFS.S_NAME]
-             .to_frame()
-             .rename({SBML_DFS.S_NAME: CPR_EDGELIST.S_NAME_DOWNSTREAM}, axis=1),
-             left_on=CPR_EDGELIST.S_ID_DOWNSTREAM,
-             right_index=True,
-             # add metadata for reactions where the interaction occurs
-         )
-         .merge(
-             sbml_dfs.reactions[SBML_DFS.R_NAME].to_frame(),
-             left_on=SBML_DFS.R_ID,
-             right_index=True,
-         )
-     )
-
-     edgelist_w_direct_mechanistic_interactions = edgelist_w_scids.merge(
-         direct_edge_interactions[
-             [
-                 CPR_EDGELIST.SC_ID_UPSTREAM,
-                 CPR_EDGELIST.SC_ID_DOWNSTREAM,
-                 SBML_DFS.R_ID,
-                 CPR_EDGELIST.S_NAME_UPSTREAM,
-                 CPR_EDGELIST.S_NAME_DOWNSTREAM,
-                 SBML_DFS.R_NAME,
-             ]
-         ]
-     )
-
-     return edgelist_w_direct_mechanistic_interactions
-
-
- def filter_to_indirect_mechanistic_interactions(
-     formatted_edgelist: pd.DataFrame,
-     sbml_dfs: sbml_dfs_core.SBML_dfs,
-     species_identifiers: pd.DataFrame,
-     cpr_graph: ig.Graph,
-     ontologies: set,
-     precomputed_distances=None,
-     max_path_length=10,
- ):
-     """
-     Filter to Indirect Mechanistic Interactions
-
-     Filter an edgelist to indirect mechanistic interactions.
-     Indirect relationships are identified by searching a
-     network for paths from an upstream species to a downstream species.
-
-     Parameters:
-     formatted_edgelist: pd.DataFrame
-         pd.DataFrame containing "identifier_upstream" and
-         "identifier_downstream" variables used to match entries
-     sbml_dfs: sbml_dfs_core.SBML_dfs
-         A mechanistic model
-     species_identifiers: pandas.DataFrame
-         A table of molecular species identifiers produced from
-         sbml_dfs.get_identifiers("species"), generally using sbml_dfs_core.export_sbml_dfs()
-     cpr_graph: igraph.Graph
-         A network representation of the sbml_dfs model
-     ontologies: set
-         A set of ontologies used to match features to pathway species
-     precomputed_distances: None or a pd.DataFrame containing path lengths and weights
-         between pairs of cspecies.
-     max_path_length: int
-         Maximum number of steps to consider.
-
-     Returns:
-     edgelist_w_indirect_mechanistic_interactions: pd.DataFrame
-         formatted_edgelist filtered to mechanistic reactions which can be described
-         by an indirect mechanism. The mechanism is described by a path weight, length,
-         and a vpath and epath list of vertices and edges which were traversed to create the path.
-     """
-
-     edgelist_w_scids = _edgelist_to_scids_if_needed(
-         formatted_edgelist, sbml_dfs, species_identifiers, ontologies
-     )
-
-     if precomputed_distances is not None:
-         # rename to match conventions in precomputed_distances
-         # filter by these precomputed distances and then restore naming
-         edgelist_w_scids = paths._filter_paths_by_precomputed_distances(
-             edgelist_w_scids.rename(
-                 {
-                     CPR_EDGELIST.SC_ID_UPSTREAM: CPR_EDGELIST.SC_ID_ORIGIN,
-                     CPR_EDGELIST.SC_ID_DOWNSTREAM: CPR_EDGELIST.SC_ID_DEST,
-                 },
-                 axis=1,
-             ),
-             precomputed_distances,
-         ).rename(
-             {
-                 CPR_EDGELIST.SC_ID_ORIGIN: CPR_EDGELIST.SC_ID_UPSTREAM,
-                 CPR_EDGELIST.SC_ID_DEST: CPR_EDGELIST.SC_ID_DOWNSTREAM,
-             },
-             axis=1,
-         )
-
-     # find paths from 1 upstream to all desired downstream sc_ids
-     # (this is the convention with igraph)
-     indexed_origin_vertices = edgelist_w_scids.set_index(CPR_EDGELIST.SC_ID_UPSTREAM)
-
-     # loop through upstream cspecies and find paths to all downstream species
-     global_dict = dict()
-     for an_origin_index in indexed_origin_vertices.index.unique():  # type: ignore
-         origin_targets = indexed_origin_vertices.loc[
-             an_origin_index
-         ]  # type: pd.DataFrame
-
-         # if indexing only a single entry pd.DataFrame becomes a pd.Series
-         # convert back to DataFrame for consistency
-         origin_targets = utils.ensure_pd_df(origin_targets)
-
-         # log entry for debugging
-         logger.debug(
-             f"finding paths from {an_origin_index} to "
-             f"{origin_targets.shape[0]} target vertices"
-         )
-
-         # find all paths from indexed_origin to desired destination
-         shortest_paths = paths.find_shortest_reaction_paths(
-             cpr_graph,
-             sbml_dfs,
-             origin=an_origin_index,
-             # find all unique destinations (as a list for compatibility with igraph dest)
-             dest=origin_targets[CPR_EDGELIST.SC_ID_DOWNSTREAM].unique().tolist(),
-             weight_var=CPR_GRAPH_EDGES.WEIGHTS,
-         )
-
-         if shortest_paths is None:
-             continue
-
-         vertices, edges = shortest_paths
-         indexed_edges = edges.set_index("path")
-         indexed_vertices = vertices.set_index("path")
-
-         paths_list = list()
-         for ind in indexed_edges.index.unique():
-             one_path = indexed_edges.loc[ind]
-
-             # make sure that we are working with a DF
-             if type(one_path) is pd.Series:
-                 one_path = one_path.to_frame().T
-
-             if one_path.shape[0] > max_path_length:
-                 continue
-
-             # find the destination node
-             # this is annoying because if the graph is undirected
-             # it's not clear if the from or to edge is the actual destination;
-             # when taking advantage of the fact that igraph lets you
-             # look up multiple destinations at once this information is lost
-             ancestor_species = {an_origin_index}
-             if one_path.shape[0] > 1:
-                 penultimate_edge = one_path.iloc[one_path.shape[0] - 2]
-                 ancestor_species = ancestor_species.union(
-                     {
-                         penultimate_edge[CPR_GRAPH_EDGES.FROM],
-                         penultimate_edge[CPR_GRAPH_EDGES.TO],
-                     }
-                 )
-
-             terminal_edge = one_path.iloc[one_path.shape[0] - 1]
-             ending_cspecies = {terminal_edge[CPR_GRAPH_EDGES.FROM], terminal_edge[CPR_GRAPH_EDGES.TO]}.difference(ancestor_species)  # type: ignore
-
-             if len(ending_cspecies) != 1:
-                 raise ValueError(
-                     "The terminal edge could not be determined when summarizing paths"
-                 )
-             ending_cspecies = ending_cspecies.pop()
-
-             path_series = pd.Series(
-                 {
-                     CPR_GRAPH_EDGES.FROM: an_origin_index,
-                     CPR_GRAPH_EDGES.TO: ending_cspecies,
-                     "weight": sum(one_path[CPR_GRAPH_EDGES.WEIGHTS]),
-                     "path_length": one_path.shape[0],
-                     "vpath": indexed_vertices.loc[ind],
-                     "epath": one_path,
-                 }  # type: ignore
-             )  # type: pd.Series
-
-             paths_list.append(path_series)
-
-         if len(paths_list) > 0:
-             origin_paths = pd.DataFrame(paths_list)
-             global_dict[an_origin_index] = origin_paths
-
-     if len(global_dict.keys()) == 0:
-         logger.warning(
-             "None of the provided molecular pairs could be mechanistically linked with a network path"
-         )
-         return None
-
-     all_shortest_paths = pd.concat(global_dict.values())
-
-     indirect_shortest_paths = edgelist_w_scids.merge(
-         all_shortest_paths,
-         left_on=[CPR_EDGELIST.SC_ID_UPSTREAM, CPR_EDGELIST.SC_ID_DOWNSTREAM],
-         right_on=[CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO],
-     )
-
-     return indirect_shortest_paths
-
-
- def _edgelist_to_scids_if_needed(
-     edgelist: pd.DataFrame,
-     sbml_dfs: sbml_dfs_core.SBML_dfs,
-     species_identifiers: pd.DataFrame,
-     ontologies: set,
- ) -> pd.DataFrame:
-     """Map a set of edgelist species to cspecies or skip if cspecies were provided."""
-
-     if utils.match_pd_vars(edgelist, CPR_EDGELIST_REQ_VARS).are_present:
-         logger.info(
-             f"An edgelist with {', '.join(CPR_EDGELIST_REQ_VARS)} was provided; identifier matching will be skipped"
-         )
-         return edgelist
-     else:
-         utils.match_pd_vars(edgelist, IDENTIFIER_EDGELIST_REQ_VARS).assert_present()
-
-         identifiers._check_species_identifiers_table(species_identifiers)
-
-         edgelist_w_scids = edgelist_to_scids(
-             edgelist,
-             sbml_dfs=sbml_dfs,
-             species_identifiers=species_identifiers,
-             ontologies=ontologies,
-         )
-
-         return edgelist_w_scids
-
-
- def _validate_wide_ontologies(
-     wide_df: pd.DataFrame,
-     ontologies: Optional[Union[str, Set[str], Dict[str, str]]] = None,
- ) -> Set[str]:
-     """
-     Validate ontology specifications against the wide DataFrame and ONTOLOGIES_LIST.
-
-     Parameters
-     ----------
-     wide_df : pd.DataFrame
-         DataFrame with one column per ontology and a results column
-     ontologies : Optional[Union[str, Set[str], Dict[str, str]]]
-         Either:
-         - String specifying a single ontology column
-         - Set of columns to treat as ontologies
-         - Dict mapping wide column names to ontology names
-         - None to automatically detect ontology columns based on ONTOLOGIES_LIST
-
-     Returns
-     -------
-     Set[str]
-         Set of validated ontology names. For dictionary mappings, returns the target ontology names.
-
-     Raises
-     ------
-     ValueError
-         If validation fails for any ontology specification or no valid ontologies are found
-     """
-     # Convert string input to set
-     if isinstance(ontologies, str):
-         ontologies = {ontologies}
-
-     # Get the set of ontology columns
-     if isinstance(ontologies, dict):
-         # Check source columns exist in DataFrame
-         missing_cols = set(ontologies.keys()) - set(wide_df.columns)
-         if missing_cols:
-             raise ValueError(f"Source columns not found in DataFrame: {missing_cols}")
-         # Validate target ontologies against ONTOLOGIES_LIST
-         invalid_onts = set(ontologies.values()) - set(ONTOLOGIES_LIST)
-         if invalid_onts:
-             raise ValueError(
-                 f"Invalid ontologies in mapping: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
-             )
-         # Return target ontology names instead of source column names
-         ontology_cols = set(ontologies.values())
-
-     elif isinstance(ontologies, set):
-         # Check specified columns exist in DataFrame
-         missing_cols = ontologies - set(wide_df.columns)
-         if missing_cols:
-             raise ValueError(
-                 f"Specified ontology columns not found in DataFrame: {missing_cols}"
-             )
-         # Validate specified ontologies against ONTOLOGIES_LIST
-         invalid_onts = ontologies - set(ONTOLOGIES_LIST)
-         if invalid_onts:
-             raise ValueError(
-                 f"Invalid ontologies in set: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
-             )
-         ontology_cols = ontologies
-
-     else:
-         # Auto-detect ontology columns by matching against ONTOLOGIES_LIST
-         ontology_cols = set(wide_df.columns) & set(ONTOLOGIES_LIST)
-         if not ontology_cols:
-             raise ValueError(
-                 f"No valid ontology columns found in DataFrame. Column names must match one of: {ONTOLOGIES_LIST}"
-             )
-         logger.info(f"Auto-detected ontology columns: {ontology_cols}")
-
-     logger.debug(f"Validated ontology columns: {ontology_cols}")
-     return ontology_cols
-
-
- def _ensure_feature_id_var(
-     df: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
- ) -> pd.DataFrame:
-     """
-     Ensure the DataFrame has a feature_id column, creating one if it doesn't exist.
-
-     Parameters
-     ----------
-     df : pd.DataFrame
-         DataFrame to check/modify
-     feature_id_var : str, default=FEATURE_ID_VAR_DEFAULT
-         Name of the feature ID column
-
-     Returns
-     -------
-     pd.DataFrame
-         DataFrame with guaranteed feature_id column
-     """
-     if feature_id_var not in df.columns:
-         logger.warning(f"No {feature_id_var} column found in DataFrame, creating one")
-         df = df.copy()
-         df[feature_id_var] = np.arange(len(df))
-     return df
-
-
- def _get_numeric_aggregator(
-     method: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
- ) -> callable:
-     """
-     Get aggregation function for numeric columns with various methods.
-
-     Parameters
-     ----------
-     method : str, default="weighted_mean"
-         Aggregation method to use:
-         - "weighted_mean": weighted by inverse of feature_id frequency (default)
-         - "mean": simple arithmetic mean
-         - "first": first value after sorting by feature_id_var (requires feature_id_var)
-         - "max": maximum value
-     feature_id_var : str, default="feature_id"
-         Name of the column specifying a measured feature - used for sorting and weighting
-
-     Returns
-     -------
-     callable
-         Aggregation function to use with groupby
-
-     Raises
-     ------
-     ValueError
-         If method is not recognized
-     """
-
-     def weighted_mean(df: pd.DataFrame) -> float:
-         # Get values and weights for this group
-         values = df["value"]
-         weights = df["weight"]
-         # Weights are already normalized globally, just use them directly
-         return (values * weights).sum() / weights.sum()
-
-     def first_by_id(df: pd.DataFrame) -> float:
-         # Sort by feature_id and take first value
-         return df.sort_values(feature_id_var).iloc[0]["value"]
-
-     def simple_mean(series: pd.Series) -> float:
-         return series.mean()
-
-     def simple_max(series: pd.Series) -> float:
-         return series.max()
-
-     aggregators = {
-         RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN: weighted_mean,
-         RESOLVE_MATCHES_AGGREGATORS.MEAN: simple_mean,
-         RESOLVE_MATCHES_AGGREGATORS.FIRST: first_by_id,
-         RESOLVE_MATCHES_AGGREGATORS.MAX: simple_max,
-     }
-
-     if method not in aggregators:
-         raise ValueError(
-             f"Unknown aggregation method: {method}. Must be one of {list(aggregators.keys())}"
-         )
-
-     return aggregators[method]
-
-
- def _split_numeric_non_numeric_columns(df: pd.DataFrame, always_non_numeric=None):
-     """
-     Utility to split DataFrame columns into numeric and non-numeric, always treating specified columns as non-numeric.
-
-     Parameters
-     ----------
-     df : pd.DataFrame
-         The DataFrame to split.
-     always_non_numeric : list or set, optional
-         Columns to always treat as non-numeric (e.g., ['feature_id']).
-
-     Returns
-     -------
-     numeric_cols : pd.Index
-         Columns considered numeric (int64, float64, and not in always_non_numeric).
-     non_numeric_cols : pd.Index
-         Columns considered non-numeric (object, string, etc., plus always_non_numeric).
-     """
-     if always_non_numeric is None:
-         always_non_numeric = []
-     always_non_numeric = set(always_non_numeric)
-     numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.difference(
-         always_non_numeric
-     )
-     non_numeric_cols = df.columns.difference(numeric_cols)
-     return numeric_cols, non_numeric_cols
-
-
- def _aggregate_grouped_columns(
-     df: pd.DataFrame,
-     numeric_cols,
-     non_numeric_cols,
-     numeric_aggregator,
-     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
- ) -> pd.DataFrame:
-     """
-     Aggregate numeric and non-numeric columns for a grouped DataFrame.
-     Assumes deduplication by feature_id within each s_id has already been performed.
-     Returns the combined DataFrame.
-     """
-     results = []
-
-     # Handle non-numeric columns
-     if len(non_numeric_cols) > 0:
-         non_numeric_agg = (
-             df[non_numeric_cols]
-             .groupby(level=0)
-             .agg(lambda x: ",".join(sorted(set(x.astype(str)))))
-         )
-         results.append(non_numeric_agg)
-     # Handle numeric columns
-     if len(numeric_cols) > 0:
-         numeric_results = {}
-         for col in numeric_cols:
-             if numeric_agg in [
-                 RESOLVE_MATCHES_AGGREGATORS.FIRST,
-                 RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-             ]:
-                 agg_df = pd.DataFrame(
-                     {"value": df[col], feature_id_var: df[feature_id_var]}
-                 )
-                 if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
-                     agg_df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = df[
-                         RESOLVE_MATCHES_TMP_WEIGHT_COL
-                     ]
-                 numeric_results[col] = agg_df.groupby(level=0).apply(
-                     lambda x: (
-                         numeric_aggregator(x)
-                         if numeric_agg != RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN
-                         else numeric_aggregator(
-                             x.rename(columns={RESOLVE_MATCHES_TMP_WEIGHT_COL: "weight"})
-                         )
-                     )
-                 )
-             else:
-                 numeric_results[col] = df[col].groupby(level=0).agg(numeric_aggregator)
-         numeric_agg_df = pd.DataFrame(numeric_results)
-         results.append(numeric_agg_df)
-     # Combine results
-     if results:
-         resolved = pd.concat(results, axis=1)
-     else:
-         resolved = pd.DataFrame(index=df.index)
-     return resolved
-
-
- def _log_feature_species_mapping_stats(
-     pathway_species: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
- ):
-     """
-     Log statistics about the mapping between feature_id and s_id in the pathway_species DataFrame.
-     """
-
-     # Percent of feature_ids present one or more times in the output
-     n_feature_ids = pathway_species[feature_id_var].nunique()
-     n_input_feature_ids = (
-         pathway_species[feature_id_var].max() + 1
-         if feature_id_var in pathway_species.columns
-         else 0
-     )
-     percent_present = (
-         100 * n_feature_ids / n_input_feature_ids if n_input_feature_ids else 0
-     )
-     logger.info(
-         f"{percent_present:.1f}% of feature_ids are present one or more times in the output ({n_feature_ids}/{n_input_feature_ids})"
-     )
-
-     # Number of times an s_id maps to 1+ feature_ids (with s_name)
-     s_id_counts = pathway_species.groupby(SBML_DFS.S_ID)[feature_id_var].nunique()
-     s_id_multi = s_id_counts[s_id_counts > 1]
-     logger.info(f"{len(s_id_multi)} s_id(s) map to more than one feature_id.")
-     if not s_id_multi.empty:
-         examples = pathway_species[
-             pathway_species[SBML_DFS.S_ID].isin(s_id_multi.index)
-         ][[SBML_DFS.S_ID, SBML_DFS.S_NAME, feature_id_var]]
-         logger.info(
-             f"Examples of s_id mapping to multiple feature_ids (showing up to 3):\n{examples.groupby([SBML_DFS.S_ID, SBML_DFS.S_NAME])[feature_id_var].apply(list).head(3)}"
-         )
-
-     # Number of times a feature_id maps to 1+ s_ids (with s_name)
-     feature_id_counts = pathway_species.groupby(feature_id_var)[SBML_DFS.S_ID].nunique()
-     feature_id_multi = feature_id_counts[feature_id_counts > 1]
-     logger.info(f"{len(feature_id_multi)} feature_id(s) map to more than one s_id.")
-     if not feature_id_multi.empty:
-         examples = pathway_species[
-             pathway_species[feature_id_var].isin(feature_id_multi.index)
-         ][[feature_id_var, SBML_DFS.S_ID, SBML_DFS.S_NAME]]
-         logger.info(
-             f"Examples of feature_id mapping to multiple s_ids (showing up to 3):\n{examples.groupby([feature_id_var])[[SBML_DFS.S_ID, SBML_DFS.S_NAME]].apply(lambda df: list(df.itertuples(index=False, name=None))).head(3)}"
-         )
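
For reference, a minimal end-to-end sketch of the removed bind_wide_results entry point, reconstructed from the docstrings above (the wide results table is adapted from the match_features_to_wide_pathway_species doctest; sbml_dfs stands in for an existing sbml_dfs_core.SBML_dfs model, which is assumed here rather than built):

    import pandas as pd

    from napistu import mechanism_matching  # removed in 0.3.1

    # Ontology columns ("uniprot", "chebi") are auto-detected against
    # ONTOLOGIES_LIST; every other column is treated as a results column.
    wide_df = pd.DataFrame(
        {
            "uniprot": ["P12345", "Q67890"],
            "chebi": ["15377", "16810"],
            "log2fc": [1.0, 2.0],
            "pvalue": [0.01, 0.05],
        }
    )

    # Match identifiers to model species, resolve 1-to-many and many-to-1
    # matches with a weighted mean, and attach the result as species_data
    # under the name "my_results".
    sbml_dfs = mechanism_matching.bind_wide_results(
        sbml_dfs=sbml_dfs,
        results_df=wide_df,
        results_name="my_results",
        numeric_agg="weighted_mean",  # the documented default
        verbose=True,
    )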