napistu 0.1.0__py3-none-any.whl → 0.2.4.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. napistu/__init__.py +1 -1
  2. napistu/consensus.py +1010 -513
  3. napistu/constants.py +24 -0
  4. napistu/gcs/constants.py +2 -2
  5. napistu/gcs/downloads.py +57 -25
  6. napistu/gcs/utils.py +21 -0
  7. napistu/identifiers.py +105 -6
  8. napistu/ingestion/constants.py +0 -1
  9. napistu/ingestion/obo.py +24 -8
  10. napistu/ingestion/psi_mi.py +20 -5
  11. napistu/ingestion/reactome.py +8 -32
  12. napistu/mcp/__init__.py +69 -0
  13. napistu/mcp/__main__.py +180 -0
  14. napistu/mcp/codebase.py +182 -0
  15. napistu/mcp/codebase_utils.py +298 -0
  16. napistu/mcp/constants.py +72 -0
  17. napistu/mcp/documentation.py +166 -0
  18. napistu/mcp/documentation_utils.py +235 -0
  19. napistu/mcp/execution.py +382 -0
  20. napistu/mcp/profiles.py +73 -0
  21. napistu/mcp/server.py +86 -0
  22. napistu/mcp/tutorials.py +124 -0
  23. napistu/mcp/tutorials_utils.py +230 -0
  24. napistu/mcp/utils.py +47 -0
  25. napistu/mechanism_matching.py +782 -26
  26. napistu/modify/constants.py +41 -0
  27. napistu/modify/curation.py +4 -1
  28. napistu/modify/gaps.py +243 -156
  29. napistu/modify/pathwayannot.py +26 -8
  30. napistu/network/neighborhoods.py +16 -7
  31. napistu/network/net_create.py +209 -54
  32. napistu/network/net_propagation.py +118 -0
  33. napistu/network/net_utils.py +1 -32
  34. napistu/rpy2/netcontextr.py +10 -7
  35. napistu/rpy2/rids.py +7 -5
  36. napistu/sbml_dfs_core.py +46 -29
  37. napistu/sbml_dfs_utils.py +37 -1
  38. napistu/source.py +8 -2
  39. napistu/utils.py +67 -8
  40. napistu-0.2.4.dev3.dist-info/METADATA +84 -0
  41. napistu-0.2.4.dev3.dist-info/RECORD +95 -0
  42. {napistu-0.1.0.dist-info → napistu-0.2.4.dev3.dist-info}/WHEEL +1 -1
  43. tests/conftest.py +11 -5
  44. tests/test_consensus.py +4 -1
  45. tests/test_gaps.py +127 -0
  46. tests/test_gcs.py +3 -2
  47. tests/test_igraph.py +14 -0
  48. tests/test_mcp_documentation_utils.py +13 -0
  49. tests/test_mechanism_matching.py +658 -0
  50. tests/test_net_propagation.py +89 -0
  51. tests/test_net_utils.py +83 -0
  52. tests/test_sbml.py +2 -0
  53. tests/{test_sbml_dfs_create.py → test_sbml_dfs_core.py} +68 -4
  54. tests/test_utils.py +81 -0
  55. napistu-0.1.0.dist-info/METADATA +0 -56
  56. napistu-0.1.0.dist-info/RECORD +0 -77
  57. {napistu-0.1.0.dist-info → napistu-0.2.4.dev3.dist-info}/entry_points.txt +0 -0
  58. {napistu-0.1.0.dist-info → napistu-0.2.4.dev3.dist-info}/licenses/LICENSE +0 -0
  59. {napistu-0.1.0.dist-info → napistu-0.2.4.dev3.dist-info}/top_level.txt +0 -0
napistu/mechanism_matching.py
@@ -1,28 +1,117 @@
  from __future__ import annotations

  import logging
+ from typing import Optional, Union, Set, Dict, List

  import igraph as ig
+ import numpy as np
  import pandas as pd
+
+ from napistu import identifiers
  from napistu import sbml_dfs_core
  from napistu import utils
  from napistu.constants import SBML_DFS
  from napistu.constants import CPR_EDGELIST
  from napistu.constants import CPR_EDGELIST_REQ_VARS
+ from napistu.constants import FEATURE_ID_VAR_DEFAULT
+ from napistu.constants import RESOLVE_MATCHES_AGGREGATORS
+ from napistu.constants import RESOLVE_MATCHES_TMP_WEIGHT_COL
  from napistu.constants import IDENTIFIERS
  from napistu.constants import IDENTIFIER_EDGELIST_REQ_VARS
- from napistu.constants import SPECIES_IDENTIFIERS_REQUIRED_VARS
+ from napistu.constants import ONTOLOGIES_LIST
  from napistu.network.constants import CPR_GRAPH_EDGES
  from napistu.network import paths

  logger = logging.getLogger(__name__)


+ def bind_wide_results(
+     sbml_dfs: sbml_dfs_core.SBML_dfs,
+     results_df: pd.DataFrame,
+     results_name: str,
+     ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
+     dogmatic: bool = False,
+     species_identifiers: Optional[pd.DataFrame] = None,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+     keep_id_col: bool = True,
+     verbose: bool = False,
+ ) -> sbml_dfs_core.SBML_dfs:
+     """
+     Binds wide results to an sbml_dfs object.
+
+     Take a table with molecular species-level attributes tied to systematic identifiers and match them to an sbml_dfs model, transferring these attributes to species_data.
+
+     Parameters
+     ----------
+     sbml_dfs : sbml_dfs_core.SBML_dfs
+         The sbml_dfs object to bind the results to.
+     results_df : pd.DataFrame
+         The table containing the results to bind.
+     results_name : str
+         The name of the results to bind.
+     ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
+         Either:
+         - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
+         - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
+         - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
+     dogmatic : bool
+         Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
+     species_identifiers : Optional[pd.DataFrame]
+         Systematic identifiers for the molecular species of "sbml_dfs". If None, these will be generated on the fly.
+     feature_id_var : str
+         The name of the column in results_df that contains the feature identifiers. If it does not exist it will be created.
+     numeric_agg : str
+         The aggregation method to use for resolving degeneracy.
+     keep_id_col : bool
+         Whether to keep the identifier column in results_df.
+     verbose : bool
+         Whether to log cases of 1-to-many and many-to-one mapping and the behavior used to resolve degeneracy.
+
+     Returns
+     -------
+     sbml_dfs : sbml_dfs_core.SBML_dfs
+         The sbml_dfs object with the results bound.
+     """
+
+     species_identifiers = identifiers._prepare_species_identifiers(
+         sbml_dfs, dogmatic=dogmatic, species_identifiers=species_identifiers
+     )
+
+     # match
+     matched_s_ids_from_wide = match_features_to_wide_pathway_species(
+         results_df,
+         species_identifiers,
+         ontologies=ontologies,
+         feature_id_var=feature_id_var,
+         verbose=verbose,
+     )
+
+     disambiguated_matches = resolve_matches(
+         matched_data=matched_s_ids_from_wide,
+         feature_id_var=feature_id_var,
+         numeric_agg=numeric_agg,
+         keep_id_col=keep_id_col,
+     )
+
+     clean_species_data = utils.drop_extra_cols(
+         results_df, disambiguated_matches, always_include=[feature_id_var]
+     )
+
+     sbml_dfs.add_species_data(results_name, clean_species_data)
+
+     return sbml_dfs
+
+
  def features_to_pathway_species(
      feature_identifiers: pd.DataFrame,
      species_identifiers: pd.DataFrame,
      ontologies: set,
-     feature_id_var: str,
+     feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     expand_identifiers: bool = False,
+     identifier_delimiter: str = "/",
+     verbose: bool = False,
  ) -> pd.DataFrame:
      """
      Features to Pathway Species
@@ -31,29 +120,64 @@ def features_to_pathway_species(

      Parameters:
      feature_identifiers: pd.DataFrame
-         pd.Dataframe containing a "feature_id_var" variable used to match entries
+         pd.Dataframe containing a "feature_identifiers_var" variable used to match entries
      species_identifiers: pd.DataFrame
          A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
          generally using sbml_dfs_core.export_sbml_dfs()
      ontologies: set
          A set of ontologies used to match features to pathway species
-     feature_id_var: str
+     feature_identifiers_var: str
          Variable in "feature_identifiers" containing identifiers
+     expand_identifiers: bool, default=False
+         If True, split identifiers in feature_identifiers_var by identifier_delimiter and explode into multiple rows
+     identifier_delimiter: str, default="/"
+         Delimiter to use for splitting identifiers if expand_identifiers is True
+     verbose: bool, default=False
+         If True, log mapping statistics at the end of the function

      Returns:
      pathway_species: pd.DataFrame
          species_identifiers joined to feature_identifiers based on shared identifiers
      """

-     # map features to molecular features in the pathway
-     if feature_id_var not in feature_identifiers.columns.to_list():
+     # Check for identifier column
+     if feature_identifiers_var not in feature_identifiers.columns.to_list():
          raise ValueError(
-             f"{feature_id_var} must be a variable in 'feature_identifiers', "
+             f"{feature_identifiers_var} must be a variable in 'feature_identifiers', "
              f"possible variables are {', '.join(feature_identifiers.columns.tolist())}"
          )

+     # Respect or create feature_id column
+     feature_identifiers = _ensure_feature_id_var(feature_identifiers, feature_id_var)
+
+     # Optionally expand identifiers into multiple rows
+     if expand_identifiers:
+         # Count the number of expansions by counting delimiters
+         n_expansions = (
+             feature_identifiers[feature_identifiers_var]
+             .astype(str)
+             .str.count(identifier_delimiter)
+             .sum()
+         )
+         if n_expansions > 0:
+             logger.info(
+                 f"Expanding identifiers: {n_expansions} delimiters found in '{feature_identifiers_var}', will expand to more rows."
+             )
+
+         # Split, strip whitespace, and explode
+         feature_identifiers = feature_identifiers.copy()
+         feature_identifiers[feature_identifiers_var] = (
+             feature_identifiers[feature_identifiers_var]
+             .astype(str)
+             .str.split(identifier_delimiter)
+             .apply(lambda lst: [x.strip() for x in lst])
+         )
+         feature_identifiers = feature_identifiers.explode(
+             feature_identifiers_var, ignore_index=True
+         )
+
      # check identifiers table
-     _check_species_identifiers_table(species_identifiers)
+     identifiers._check_species_identifiers_table(species_identifiers)

      available_ontologies = set(species_identifiers[IDENTIFIERS.ONTOLOGY].tolist())
      unavailable_ontologies = ontologies.difference(available_ontologies)
@@ -80,7 +204,9 @@ def features_to_pathway_species(

      # map features to pathway species
      pathway_species = feature_identifiers.merge(
-         relevant_identifiers, left_on=feature_id_var, right_on=IDENTIFIERS.IDENTIFIER
+         relevant_identifiers,
+         left_on=feature_identifiers_var,
+         right_on=IDENTIFIERS.IDENTIFIER,
      )

      if pathway_species.shape[0] == 0:
@@ -90,12 +216,18 @@ def features_to_pathway_species(
          None

      # report the fraction of unmapped species
+     if verbose:
+         _log_feature_species_mapping_stats(pathway_species, feature_id_var)

      return pathway_species


  def edgelist_to_pathway_species(
-     formatted_edgelist: pd.DataFrame, species_identifiers: pd.DataFrame, ontologies: set
+     formatted_edgelist: pd.DataFrame,
+     species_identifiers: pd.DataFrame,
+     ontologies: set,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     verbose: bool = False,
  ) -> pd.DataFrame:
      """
      Edgelist to Pathway Species
@@ -110,6 +242,10 @@ def edgelist_to_pathway_species(
          sbml_dfs_core.export_sbml_dfs()
      ontologies: set
          A set of ontologies used to match features to pathway species
+     feature_id_var: str, default=FEATURE_ID_VAR_DEFAULT
+         Variable in "formatted_edgelist" containing feature ids
+     verbose: bool, default=False
+         Whether to print verbose output

      Returns:
      edges_on_pathway: pd.DataFrame
@@ -146,7 +282,7 @@ def edgelist_to_pathway_species(
          .drop_duplicates()
          .reset_index(drop=True)
          .to_frame()
-         .rename({0: "feature_id"}, axis=1)
+         .rename({0: feature_id_var}, axis=1)
      )

      # merge edgelist identifiers with pathway identifiers to map s_ids to identifiers
@@ -154,7 +290,8 @@ def edgelist_to_pathway_species(
          feature_identifiers=distinct_identifiers,
          species_identifiers=species_identifiers,
          ontologies=ontologies,
-         feature_id_var="feature_id",
+         feature_identifiers_var=feature_id_var,
+         verbose=verbose,
      )

      # add s_ids of both upstream and downstream edges to pathway
@@ -179,6 +316,348 @@ def edgelist_to_pathway_species(
      return edges_on_pathway


+ def match_features_to_wide_pathway_species(
+     wide_df: pd.DataFrame,
+     species_identifiers: pd.DataFrame,
+     ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
+     feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     verbose: bool = False,
+ ) -> pd.DataFrame:
+     """
+     Convert a wide-format DataFrame with multiple ontology columns to long format,
+     and match features to pathway species by ontology and identifier.
+
+     Parameters
+     ----------
+     wide_df : pd.DataFrame
+         DataFrame with ontology identifier columns and any number of results columns.
+         All non-ontology columns are treated as results.
+     species_identifiers : pd.DataFrame
+         DataFrame as required by features_to_pathway_species
+     ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
+         Either:
+         - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
+         - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
+         - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
+     feature_identifiers_var : str, default="identifier"
+         Name for the identifier column in the long format
+     feature_id_var : str, default=FEATURE_ID_VAR_DEFAULT
+         Name for the feature id column in the long format
+     verbose : bool, default=False
+         Whether to print verbose output
+
+     Returns
+     -------
+     pd.DataFrame
+         Output of match_by_ontology_and_identifier
+
+     Examples
+     --------
+     >>> # Example with auto-detected ontology columns and multiple results
+     >>> wide_df = pd.DataFrame({
+     ...     'uniprot': ['P12345', 'Q67890'],
+     ...     'chebi': ['15377', '16810'],
+     ...     'log2fc': [1.0, 2.0],
+     ...     'pvalue': [0.01, 0.05]
+     ... })
+     >>> result = match_features_to_wide_pathway_species(
+     ...     wide_df=wide_df,
+     ...     species_identifiers=species_identifiers
+     ... )
+
+     >>> # Example with custom ontology mapping
+     >>> wide_df = pd.DataFrame({
+     ...     'protein_id': ['P12345', 'Q67890'],
+     ...     'compound_id': ['15377', '16810'],
+     ...     'expression': [1.0, 2.0],
+     ...     'confidence': [0.8, 0.9]
+     ... })
+     >>> result = match_features_to_wide_pathway_species(
+     ...     wide_df=wide_df,
+     ...     species_identifiers=species_identifiers,
+     ...     ontologies={'protein_id': 'uniprot', 'compound_id': 'chebi'}
+     ... )
+     """
+     # Make a copy to avoid modifying the input
+     wide_df = wide_df.copy()
+
+     # Validate ontologies and get the set of ontology columns
+     ontology_cols = _validate_wide_ontologies(wide_df, ontologies)
+     melt_cols = list(ontology_cols)
+
+     # Apply renaming if a mapping is provided
+     if isinstance(ontologies, dict):
+         wide_df = wide_df.rename(columns=ontologies)
+
+     # Ensure feature_id column exists
+     wide_df = _ensure_feature_id_var(wide_df, feature_id_var)
+
+     # All non-ontology columns are treated as results
+     results_cols = list(set(wide_df.columns) - set(melt_cols))
+     if not results_cols:
+         raise ValueError("No results columns found in DataFrame")
+
+     logger.info(f"Using columns as results: {results_cols}")
+
+     # Melt ontology columns to long format, keeping all results columns
+     long_df = wide_df.melt(
+         id_vars=results_cols,
+         value_vars=melt_cols,
+         var_name=IDENTIFIERS.ONTOLOGY,
+         value_name=feature_identifiers_var,
+     ).dropna(subset=[feature_identifiers_var])
+
+     logger.debug(f"Final long format shape: {long_df.shape}")
+
+     # Call the matching function with the validated ontologies
+     out = match_by_ontology_and_identifier(
+         feature_identifiers=long_df,
+         species_identifiers=species_identifiers,
+         ontologies=ontology_cols,
+         feature_identifiers_var=feature_identifiers_var,
+     )
+
+     if verbose:
+         _log_feature_species_mapping_stats(out, feature_id_var)
+
+     return out
+
+
+ def match_by_ontology_and_identifier(
+     feature_identifiers: pd.DataFrame,
+     species_identifiers: pd.DataFrame,
+     ontologies: Union[str, Set[str], List[str]],
+     feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
+     verbose: bool = False,
+ ) -> pd.DataFrame:
+     """
+     Match features to pathway species based on both ontology and identifier matches.
+     Performs separate matching for each ontology and concatenates the results.
+
+     Parameters
+     ----------
+     feature_identifiers : pd.DataFrame
+         DataFrame containing feature identifiers and results.
+         Must have columns [ontology, feature_identifiers_var, results]
+     species_identifiers : pd.DataFrame
+         DataFrame containing species identifiers from pathway.
+         Must have columns [ontology, identifier]
+     ontologies : Union[str, Set[str], List[str]]
+         Ontologies to match on. Can be:
+         - A single ontology string
+         - A set of ontology strings
+         - A list of ontology strings
+     feature_identifiers_var : str, default="identifier"
+         Name of the identifier column in feature_identifiers
+     verbose : bool, default=False
+         Whether to print verbose output
+
+     Returns
+     -------
+     pd.DataFrame
+         Concatenated results of matching for each ontology.
+         Contains all columns from features_to_pathway_species()
+
+     Examples
+     --------
+     >>> # Match using a single ontology
+     >>> result = match_by_ontology_and_identifier(
+     ...     feature_identifiers=features_df,
+     ...     species_identifiers=species_df,
+     ...     ontologies="uniprot"
+     ... )
+
+     >>> # Match using multiple ontologies
+     >>> result = match_by_ontology_and_identifier(
+     ...     feature_identifiers=features_df,
+     ...     species_identifiers=species_df,
+     ...     ontologies={"uniprot", "chebi"}
+     ... )
+     """
+     # Convert string to set for consistent handling
+     if isinstance(ontologies, str):
+         ontologies = {ontologies}
+     elif isinstance(ontologies, list):
+         ontologies = set(ontologies)
+
+     # Validate ontologies
+     invalid_onts = ontologies - set(ONTOLOGIES_LIST)
+     if invalid_onts:
+         raise ValueError(
+             f"Invalid ontologies specified: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
+         )
+
+     # Initialize list to store results
+     matched_dfs = []
+
+     # Process each ontology separately
+     for ont in ontologies:
+         # Filter feature identifiers to current ontology and drop ontology column
+         ont_features = (
+             feature_identifiers[feature_identifiers[IDENTIFIERS.ONTOLOGY] == ont]
+             .drop(columns=[IDENTIFIERS.ONTOLOGY])
+             .copy()
+         )
+
+         if ont_features.empty:
+             logger.warning(f"No features found for ontology: {ont}")
+             continue
+
+         # Filter species identifiers to current ontology
+         ont_species = species_identifiers[
+             species_identifiers[IDENTIFIERS.ONTOLOGY] == ont
+         ].copy()
+
+         if ont_species.empty:
+             logger.warning(f"No species found for ontology: {ont}")
+             continue
+
+         logger.debug(
+             f"Matching {len(ont_features)} features to {len(ont_species)} species for ontology {ont}"
+         )
+
+         # Match features to species for this ontology
+         matched = features_to_pathway_species(
+             feature_identifiers=ont_features,
+             species_identifiers=ont_species,
+             ontologies={ont},
+             feature_identifiers_var=feature_identifiers_var,
+             verbose=verbose,
+         )
+
+         if matched.empty:
+             logger.warning(f"No matches found for ontology: {ont}")
+             continue
+
+         matched_dfs.append(matched)
+
+     if not matched_dfs:
+         logger.warning("No matches found for any ontology")
+         return pd.DataFrame()  # Return empty DataFrame with correct columns
+
+     # Combine results from all ontologies
+     result = pd.concat(matched_dfs, axis=0, ignore_index=True)
+
+     logger.info(
+         f"Found {len(result)} total matches across {len(matched_dfs)} ontologies"
+     )
+
+     return result
+
+
+ def resolve_matches(
+     matched_data: pd.DataFrame,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     index_col: str = SBML_DFS.S_ID,
+     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+     keep_id_col: bool = True,
+ ) -> pd.DataFrame:
+     """
+     Resolve many-to-1 and 1-to-many matches in matched data.
+
+     Parameters
+     ----------
+     matched_data : pd.DataFrame
+         DataFrame containing matched data with columns:
+         - feature_id_var: identifier column (e.g. feature_id)
+         - index_col: index column (e.g. s_id)
+         - other columns: data columns to be aggregated
+     feature_id_var : str, default="feature_id"
+         Name of the identifier column
+     index_col : str, default="s_id"
+         Name of the column to use as index
+     numeric_agg : str, default="weighted_mean"
+         Method to aggregate numeric columns:
+         - "weighted_mean": weighted by inverse of feature_id frequency (default)
+         - "mean": simple arithmetic mean
+         - "first": first value after sorting by feature_id_var (requires feature_id_var)
+         - "max": maximum value
+     keep_id_col : bool, default=True
+         Whether to keep and rollup the feature_id_var in the output.
+         If False, feature_id_var will be dropped from the output.
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame with resolved matches:
+         - Many-to-1: numeric columns are aggregated using specified method
+         - 1-to-many: adds a count column showing number of matches
+         - Index is set to index_col and named accordingly
+
+     Raises
+     ------
+     KeyError
+         If feature_id_var is not present in the DataFrame
+     TypeError
+         If DataFrame contains unsupported data types (boolean or datetime)
+     """
+     # Make a copy to avoid modifying input
+     df = matched_data.copy()
+
+     # Check for unsupported data types
+     unsupported_dtypes = df.select_dtypes(include=["bool", "datetime64"]).columns
+     if not unsupported_dtypes.empty:
+         raise TypeError(
+             f"Unsupported data types found in columns: {list(unsupported_dtypes)}. "
+             "Boolean and datetime columns are not supported."
+         )
+
+     # Always require feature_id_var
+     if feature_id_var not in df.columns:
+         raise KeyError(feature_id_var)
+
+     # Deduplicate by feature_id within each s_id using groupby and first BEFORE any further processing
+     df = df.groupby([index_col, feature_id_var], sort=False).first().reset_index()
+
+     # Use a unique temporary column name for weights
+     if RESOLVE_MATCHES_TMP_WEIGHT_COL in df.columns:
+         raise ValueError(
+             f"Temporary weight column name '{RESOLVE_MATCHES_TMP_WEIGHT_COL}' already exists in the input data. Please rename or remove this column and try again."
+         )
+
+     # Calculate weights if needed (after deduplication!)
+     if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
+         feature_counts = df[feature_id_var].value_counts()
+         df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = (
+             1 / feature_counts[df[feature_id_var]].values
+         )
+
+     # Set index for grouping
+     df = df.set_index(index_col)
+
+     # Use utility to split columns
+     always_non_numeric = [feature_id_var] if keep_id_col else []
+     numeric_cols, non_numeric_cols = _split_numeric_non_numeric_columns(
+         df, always_non_numeric=always_non_numeric
+     )
+
+     # Get aggregator function
+     numeric_aggregator = _get_numeric_aggregator(
+         method=numeric_agg, feature_id_var=feature_id_var
+     )
+     resolved = _aggregate_grouped_columns(
+         df,
+         numeric_cols,
+         non_numeric_cols,
+         numeric_aggregator,
+         feature_id_var=feature_id_var,
+         numeric_agg=numeric_agg,
+     )
+     # Add count of matches per feature_id
+     match_counts = matched_data.groupby(index_col)[feature_id_var].nunique()
+     resolved[f"{feature_id_var}_match_count"] = match_counts
+
+     # Drop feature_id_var if not keeping it
+     if not keep_id_col and feature_id_var in resolved.columns:
+         resolved = resolved.drop(columns=[feature_id_var])
+
+     # Ensure index is named consistently
+     resolved.index.name = index_col
+
+     return resolved
+
+
  def edgelist_to_scids(
      formatted_edgelist: pd.DataFrame,
      sbml_dfs: sbml_dfs_core.SBML_dfs,
@@ -210,7 +689,7 @@ def edgelist_to_scids(
          downstream species mapped to "sc_id_downstream"
      """

-     _check_species_identifiers_table(species_identifiers)
+     identifiers._check_species_identifiers_table(species_identifiers)

      # map edges onto pathway entities based on shared identifiers
      edges_on_pathway = edgelist_to_pathway_species(
@@ -294,7 +773,7 @@ def filter_to_direct_mechanistic_interactions(
      )

      # reduce to distinct sc_id pairs
-     sc_id_pairs = edgelist_w_scids[CPR_EDGELIST_REQ_VARS].drop_duplicates()
+     sc_id_pairs = edgelist_w_scids[list(CPR_EDGELIST_REQ_VARS)].drop_duplicates()

      # define all existing direct regulatory interactions
      pathway_interactions = pd.concat(
@@ -568,7 +1047,7 @@ def _edgelist_to_scids_if_needed(
      else:
          utils.match_pd_vars(edgelist, IDENTIFIER_EDGELIST_REQ_VARS).assert_present()

-         _check_species_identifiers_table(species_identifiers)
+         identifiers._check_species_identifiers_table(species_identifiers)

          edgelist_w_scids = edgelist_to_scids(
              edgelist,
@@ -580,18 +1059,295 @@ def _edgelist_to_scids_if_needed(
      return edgelist_w_scids


- def _check_species_identifiers_table(
-     species_identifiers: pd.DataFrame,
-     required_vars: set = SPECIES_IDENTIFIERS_REQUIRED_VARS,
+ def _validate_wide_ontologies(
+     wide_df: pd.DataFrame,
+     ontologies: Optional[Union[str, Set[str], Dict[str, str]]] = None,
+ ) -> Set[str]:
+     """
+     Validate ontology specifications against the wide DataFrame and ONTOLOGIES_LIST.
+
+     Parameters
+     ----------
+     wide_df : pd.DataFrame
+         DataFrame with one column per ontology and a results column
+     ontologies : Optional[Union[str, Set[str], Dict[str, str]]]
+         Either:
+         - String specifying a single ontology column
+         - Set of columns to treat as ontologies
+         - Dict mapping wide column names to ontology names
+         - None to automatically detect ontology columns based on ONTOLOGIES_LIST
+
+     Returns
+     -------
+     Set[str]
+         Set of validated ontology names. For dictionary mappings, returns the target ontology names.
+
+     Raises
+     ------
+     ValueError
+         If validation fails for any ontology specification or no valid ontologies are found
+     """
+     # Convert string input to set
+     if isinstance(ontologies, str):
+         ontologies = {ontologies}
+
+     # Get the set of ontology columns
+     if isinstance(ontologies, dict):
+         # Check source columns exist in DataFrame
+         missing_cols = set(ontologies.keys()) - set(wide_df.columns)
+         if missing_cols:
+             raise ValueError(f"Source columns not found in DataFrame: {missing_cols}")
+         # Validate target ontologies against ONTOLOGIES_LIST
+         invalid_onts = set(ontologies.values()) - set(ONTOLOGIES_LIST)
+         if invalid_onts:
+             raise ValueError(
+                 f"Invalid ontologies in mapping: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
+             )
+         # Return target ontology names instead of source column names
+         ontology_cols = set(ontologies.values())
+
+     elif isinstance(ontologies, set):
+         # Check specified columns exist in DataFrame
+         missing_cols = ontologies - set(wide_df.columns)
+         if missing_cols:
+             raise ValueError(
+                 f"Specified ontology columns not found in DataFrame: {missing_cols}"
+             )
+         # Validate specified ontologies against ONTOLOGIES_LIST
+         invalid_onts = ontologies - set(ONTOLOGIES_LIST)
+         if invalid_onts:
+             raise ValueError(
+                 f"Invalid ontologies in set: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
+             )
+         ontology_cols = ontologies
+
+     else:
+         # Auto-detect ontology columns by matching against ONTOLOGIES_LIST
+         ontology_cols = set(wide_df.columns) & set(ONTOLOGIES_LIST)
+         if not ontology_cols:
+             raise ValueError(
+                 f"No valid ontology columns found in DataFrame. Column names must match one of: {ONTOLOGIES_LIST}"
+             )
+         logger.info(f"Auto-detected ontology columns: {ontology_cols}")
+
+     logger.debug(f"Validated ontology columns: {ontology_cols}")
+     return ontology_cols
+
+
+ def _ensure_feature_id_var(
+     df: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
+ ) -> pd.DataFrame:
+     """
+     Ensure the DataFrame has a feature_id column, creating one if it doesn't exist.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         DataFrame to check/modify
+     feature_id_var : str, default=FEATURE_ID_VAR_DEFAULT
+         Name of the feature ID column
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame with guaranteed feature_id column
+     """
+     if feature_id_var not in df.columns:
+         logger.warning(f"No {feature_id_var} column found in DataFrame, creating one")
+         df = df.copy()
+         df[feature_id_var] = np.arange(len(df))
+     return df
+
+
+ def _get_numeric_aggregator(
+     method: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+ ) -> callable:
+     """
+     Get aggregation function for numeric columns with various methods.
+
+     Parameters
+     ----------
+     method : str, default="weighted_mean"
+         Aggregation method to use:
+         - "weighted_mean": weighted by inverse of feature_id frequency (default)
+         - "mean": simple arithmetic mean
+         - "first": first value after sorting by feature_id_var (requires feature_id_var)
+         - "max": maximum value
+     feature_id_var : str, default="feature_id"
+         Name of the column specifying a measured feature - used for sorting and weighting
+
+     Returns
+     -------
+     callable
+         Aggregation function to use with groupby
+
+     Raises
+     ------
+     ValueError
+         If method is not recognized
+     """
+
+     def weighted_mean(df: pd.DataFrame) -> float:
+         # Get values and weights for this group
+         values = df["value"]
+         weights = df["weight"]
+         # Weights are already normalized globally, just use them directly
+         return (values * weights).sum() / weights.sum()
+
+     def first_by_id(df: pd.DataFrame) -> float:
+         # Sort by feature_id and take first value
+         return df.sort_values(feature_id_var).iloc[0]["value"]
+
+     def simple_mean(series: pd.Series) -> float:
+         return series.mean()
+
+     def simple_max(series: pd.Series) -> float:
+         return series.max()
+
+     aggregators = {
+         RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN: weighted_mean,
+         RESOLVE_MATCHES_AGGREGATORS.MEAN: simple_mean,
+         RESOLVE_MATCHES_AGGREGATORS.FIRST: first_by_id,
+         RESOLVE_MATCHES_AGGREGATORS.MAX: simple_max,
+     }
+
+     if method not in aggregators:
+         raise ValueError(
+             f"Unknown aggregation method: {method}. Must be one of {list(aggregators.keys())}"
+         )
+
+     return aggregators[method]
+
+
+ def _split_numeric_non_numeric_columns(df: pd.DataFrame, always_non_numeric=None):
+     """
+     Utility to split DataFrame columns into numeric and non-numeric, always treating specified columns as non-numeric.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         The DataFrame to split.
+     always_non_numeric : list or set, optional
+         Columns to always treat as non-numeric (e.g., ['feature_id']).
+
+     Returns
+     -------
+     numeric_cols : pd.Index
+         Columns considered numeric (int64, float64, and not in always_non_numeric).
+     non_numeric_cols : pd.Index
+         Columns considered non-numeric (object, string, etc., plus always_non_numeric).
+     """
+     if always_non_numeric is None:
+         always_non_numeric = []
+     always_non_numeric = set(always_non_numeric)
+     numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.difference(
+         always_non_numeric
+     )
+     non_numeric_cols = df.columns.difference(numeric_cols)
+     return numeric_cols, non_numeric_cols
+
+
+ def _aggregate_grouped_columns(
+     df: pd.DataFrame,
+     numeric_cols,
+     non_numeric_cols,
+     numeric_aggregator,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+ ) -> pd.DataFrame:
+     """
+     Aggregate numeric and non-numeric columns for grouped DataFrame.
+     Assumes deduplication by feature_id within each s_id has already been performed.
+     Returns the combined DataFrame.
+     """
+     results = []
+
+     # Handle non-numeric columns
+     if len(non_numeric_cols) > 0:
+         non_numeric_agg = (
+             df[non_numeric_cols]
+             .groupby(level=0)
+             .agg(lambda x: ",".join(sorted(set(x.astype(str)))))
+         )
+         results.append(non_numeric_agg)
+     # Handle numeric columns
+     if len(numeric_cols) > 0:
+         numeric_results = {}
+         for col in numeric_cols:
+             if numeric_agg in [
+                 RESOLVE_MATCHES_AGGREGATORS.FIRST,
+                 RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+             ]:
+                 agg_df = pd.DataFrame(
+                     {"value": df[col], feature_id_var: df[feature_id_var]}
+                 )
+                 if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
+                     agg_df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = df[
+                         RESOLVE_MATCHES_TMP_WEIGHT_COL
+                     ]
+                 numeric_results[col] = agg_df.groupby(level=0).apply(
+                     lambda x: (
+                         numeric_aggregator(x)
+                         if numeric_agg != RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN
+                         else numeric_aggregator(
+                             x.rename(columns={RESOLVE_MATCHES_TMP_WEIGHT_COL: "weight"})
+                         )
+                     )
+                 )
+             else:
+                 numeric_results[col] = df[col].groupby(level=0).agg(numeric_aggregator)
+         numeric_agg_df = pd.DataFrame(numeric_results)
+         results.append(numeric_agg_df)
+     # Combine results
+     if results:
+         resolved = pd.concat(results, axis=1)
+     else:
+         resolved = pd.DataFrame(index=df.index)
+     return resolved
+
+
+ def _log_feature_species_mapping_stats(
+     pathway_species: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
  ):
-     missing_required_vars = required_vars.difference(
-         set(species_identifiers.columns.tolist())
+     """
+     Log statistics about the mapping between feature_id and s_id in the pathway_species DataFrame.
+     """
+
+     # Percent of feature_ids present one or more times in the output
+     n_feature_ids = pathway_species[feature_id_var].nunique()
+     n_input_feature_ids = (
+         pathway_species[feature_id_var].max() + 1
+         if feature_id_var in pathway_species.columns
+         else 0
      )
-     if len(missing_required_vars) > 0:
-         raise ValueError(
-             f"{len(missing_required_vars)} required variables "
-             "were missing from the species_identifiers table: "
-             f"{', '.join(missing_required_vars)}"
+     percent_present = (
+         100 * n_feature_ids / n_input_feature_ids if n_input_feature_ids else 0
+     )
+     logger.info(
+         f"{percent_present:.1f}% of feature_ids are present one or more times in the output ({n_feature_ids}/{n_input_feature_ids})"
+     )
+
+     # Number of times an s_id maps to 1+ feature_ids (with s_name)
+     s_id_counts = pathway_species.groupby(SBML_DFS.S_ID)[feature_id_var].nunique()
+     s_id_multi = s_id_counts[s_id_counts > 1]
+     logger.info(f"{len(s_id_multi)} s_id(s) map to more than one feature_id.")
+     if not s_id_multi.empty:
+         examples = pathway_species[
+             pathway_species[SBML_DFS.S_ID].isin(s_id_multi.index)
+         ][[SBML_DFS.S_ID, SBML_DFS.S_NAME, feature_id_var]]
+         logger.info(
+             f"Examples of s_id mapping to multiple feature_ids (showing up to 3):\n{examples.groupby([SBML_DFS.S_ID, SBML_DFS.S_NAME])[feature_id_var].apply(list).head(3)}"
          )

-     return None
+     # Number of times a feature_id maps to 1+ s_ids (with s_name)
+     feature_id_counts = pathway_species.groupby(feature_id_var)[SBML_DFS.S_ID].nunique()
+     feature_id_multi = feature_id_counts[feature_id_counts > 1]
+     logger.info(f"{len(feature_id_multi)} feature_id(s) map to more than one s_id.")
+     if not feature_id_multi.empty:
+         examples = pathway_species[
+             pathway_species[feature_id_var].isin(feature_id_multi.index)
+         ][[feature_id_var, SBML_DFS.S_ID, SBML_DFS.S_NAME]]
+         logger.info(
+             f"Examples of feature_id mapping to multiple s_ids (showing up to 3):\n{examples.groupby([feature_id_var])[[SBML_DFS.S_ID, SBML_DFS.S_NAME]].apply(lambda df: list(df.itertuples(index=False, name=None))).head(3)}"
+         )
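
Usage note: taken together, the new matching utilities in napistu/mechanism_matching.py form one workflow. match_features_to_wide_pathway_species() melts a wide results table and matches it to pathway species per ontology, resolve_matches() aggregates 1-to-many and many-to-1 matches, and bind_wide_results() wraps both and attaches the result as species_data. The following is a minimal sketch based on the signatures and docstrings above; the model object (sbml_dfs) and all column names and values are illustrative placeholders, not part of this diff.

import pandas as pd

from napistu import mechanism_matching

# Hypothetical wide results table: 'uniprot' and 'chebi' are auto-detected
# as ontology columns (entries in ONTOLOGIES_LIST); 'log2fc' and 'pvalue'
# are carried along as results.
wide_df = pd.DataFrame(
    {
        "uniprot": ["P12345", "Q67890"],
        "chebi": ["15377", "16810"],
        "log2fc": [1.0, 2.0],
        "pvalue": [0.01, 0.05],
    }
)

sbml_dfs = ...  # placeholder: an existing sbml_dfs_core.SBML_dfs model

# One-call version: with species_identifiers=None, identifiers are
# generated on the fly; degenerate matches are resolved by a weighted mean.
sbml_dfs = mechanism_matching.bind_wide_results(
    sbml_dfs=sbml_dfs,
    results_df=wide_df,
    results_name="my_results",
    numeric_agg="weighted_mean",  # default: weight by inverse feature frequency
    verbose=True,  # log 1-to-many and many-to-1 mapping statistics
)

# Roughly the same matching, step by step, using the lower-level functions:
species_identifiers = sbml_dfs.get_identifiers("species")
matched = mechanism_matching.match_features_to_wide_pathway_species(
    wide_df, species_identifiers
)
resolved = mechanism_matching.resolve_matches(matched)  # one row per s_id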