napistu 0.1.0__py3-none-any.whl → 0.2.4.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +1 -1
- napistu/consensus.py +1010 -513
- napistu/constants.py +24 -0
- napistu/gcs/constants.py +2 -2
- napistu/gcs/downloads.py +57 -25
- napistu/gcs/utils.py +21 -0
- napistu/identifiers.py +105 -6
- napistu/ingestion/constants.py +0 -1
- napistu/ingestion/obo.py +24 -8
- napistu/ingestion/psi_mi.py +20 -5
- napistu/ingestion/reactome.py +8 -32
- napistu/mcp/__init__.py +69 -0
- napistu/mcp/__main__.py +180 -0
- napistu/mcp/codebase.py +182 -0
- napistu/mcp/codebase_utils.py +298 -0
- napistu/mcp/constants.py +72 -0
- napistu/mcp/documentation.py +166 -0
- napistu/mcp/documentation_utils.py +235 -0
- napistu/mcp/execution.py +382 -0
- napistu/mcp/profiles.py +73 -0
- napistu/mcp/server.py +86 -0
- napistu/mcp/tutorials.py +124 -0
- napistu/mcp/tutorials_utils.py +230 -0
- napistu/mcp/utils.py +47 -0
- napistu/mechanism_matching.py +782 -26
- napistu/modify/constants.py +41 -0
- napistu/modify/curation.py +4 -1
- napistu/modify/gaps.py +243 -156
- napistu/modify/pathwayannot.py +26 -8
- napistu/network/neighborhoods.py +16 -7
- napistu/network/net_create.py +209 -54
- napistu/network/net_propagation.py +118 -0
- napistu/network/net_utils.py +1 -32
- napistu/rpy2/netcontextr.py +10 -7
- napistu/rpy2/rids.py +7 -5
- napistu/sbml_dfs_core.py +46 -29
- napistu/sbml_dfs_utils.py +37 -1
- napistu/source.py +8 -2
- napistu/utils.py +67 -8
- napistu-0.2.4.dev2.dist-info/METADATA +84 -0
- napistu-0.2.4.dev2.dist-info/RECORD +95 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/WHEEL +1 -1
- tests/conftest.py +11 -5
- tests/test_consensus.py +4 -1
- tests/test_gaps.py +127 -0
- tests/test_gcs.py +3 -2
- tests/test_igraph.py +14 -0
- tests/test_mcp_documentation_utils.py +13 -0
- tests/test_mechanism_matching.py +658 -0
- tests/test_net_propagation.py +89 -0
- tests/test_net_utils.py +83 -0
- tests/test_sbml.py +2 -0
- tests/{test_sbml_dfs_create.py → test_sbml_dfs_core.py} +68 -4
- tests/test_utils.py +81 -0
- napistu-0.1.0.dist-info/METADATA +0 -56
- napistu-0.1.0.dist-info/RECORD +0 -77
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/entry_points.txt +0 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/top_level.txt +0 -0
napistu/mechanism_matching.py
CHANGED
@@ -1,28 +1,117 @@
 from __future__ import annotations
 
 import logging
+from typing import Optional, Union, Set, Dict, List
 
 import igraph as ig
+import numpy as np
 import pandas as pd
+
+from napistu import identifiers
 from napistu import sbml_dfs_core
 from napistu import utils
 from napistu.constants import SBML_DFS
 from napistu.constants import CPR_EDGELIST
 from napistu.constants import CPR_EDGELIST_REQ_VARS
+from napistu.constants import FEATURE_ID_VAR_DEFAULT
+from napistu.constants import RESOLVE_MATCHES_AGGREGATORS
+from napistu.constants import RESOLVE_MATCHES_TMP_WEIGHT_COL
 from napistu.constants import IDENTIFIERS
 from napistu.constants import IDENTIFIER_EDGELIST_REQ_VARS
-from napistu.constants import
+from napistu.constants import ONTOLOGIES_LIST
 from napistu.network.constants import CPR_GRAPH_EDGES
 from napistu.network import paths
 
 logger = logging.getLogger(__name__)
 
 
+def bind_wide_results(
+    sbml_dfs: sbml_dfs_core.SBML_dfs,
+    results_df: pd.DataFrame,
+    results_name: str,
+    ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
+    dogmatic: bool = False,
+    species_identifiers: Optional[pd.DataFrame] = None,
+    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+    keep_id_col: bool = True,
+    verbose: bool = False,
+) -> sbml_dfs_core.SBML_dfs:
+    """
+    Binds wide results to a sbml_dfs object.
+
+    Take a table with molecular species-level attributes tied to systematic identifiers and match them to an sbml_dfs_model transferring these attributes to species_data
+
+    Parameters
+    ----------
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The sbml_dfs object to bind the results to.
+    results_df : pd.DataFrame
+        The table containing the results to bind.
+    results_name : str
+        The name of the results to bind.
+    ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
+        Either:
+        - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
+        - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
+        - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
+    dogmatic : bool
+        Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
+    species_identifiers : Optional[pd.DataFrame]
+        Systematic identifiers for the molecular species "sbml_dfs". If None this will be generate on-the-fly.
+    feature_id_var : str
+        The name of the column in the results_df that contains the feature identifiers. If this does not exist it will be created.
+    numeric_agg : str
+        The aggregation method to use for resolving degeneracy.
+    keep_id_col : bool
+        Whether to keep the identifier column in the results_df.
+    verbose : bool
+        Whether to log cases of 1-to-many and many-to-one mapping and to indicate the behavior for resolving degeneracy
+
+    Returns
+    -------
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The sbml_dfs object with the results bound.
+    """
+
+    species_identifiers = identifiers._prepare_species_identifiers(
+        sbml_dfs, dogmatic=dogmatic, species_identifiers=species_identifiers
+    )
+
+    # match
+    matched_s_ids_from_wide = match_features_to_wide_pathway_species(
+        results_df,
+        species_identifiers,
+        ontologies=ontologies,
+        feature_id_var=feature_id_var,
+        verbose=verbose,
+    )
+
+    disambiguated_matches = resolve_matches(
+        matched_data=matched_s_ids_from_wide,
+        feature_id_var=feature_id_var,
+        numeric_agg=numeric_agg,
+        keep_id_col=keep_id_col,
+    )
+
+    clean_species_data = utils.drop_extra_cols(
+        results_df, disambiguated_matches, always_include=[feature_id_var]
+    )
+
+    sbml_dfs.add_species_data(results_name, clean_species_data)
+
+    return sbml_dfs
+
+
 def features_to_pathway_species(
     feature_identifiers: pd.DataFrame,
     species_identifiers: pd.DataFrame,
     ontologies: set,
-
+    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
+    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+    expand_identifiers: bool = False,
+    identifier_delimiter: str = "/",
+    verbose: bool = False,
 ) -> pd.DataFrame:
     """
     Features to Pathway Species
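The new `bind_wide_results` entry point chains the pieces added elsewhere in this diff: match a wide results table to species, resolve degenerate matches, drop unmatched columns, and attach the table via `add_species_data`. A minimal usage sketch, assuming an already-built `SBML_dfs` model; the `attach_proteomics` wrapper, the table values, and the `"proteomics"` name are illustrative, not part of the package:

```python
import pandas as pd

from napistu import mechanism_matching


def attach_proteomics(sbml_dfs):
    """Sketch: bind a wide proteomics table to an existing SBML_dfs model."""
    results_df = pd.DataFrame(
        {
            "uniprot": ["P12345", "Q67890"],  # auto-detected as an ontology column
            "log2fc": [1.2, -0.8],
            "pvalue": [0.01, 0.20],
        }
    )
    # matches on uniprot, aggregates degenerate hits with the default
    # weighted mean, and stores the result under the name "proteomics"
    return mechanism_matching.bind_wide_results(
        sbml_dfs=sbml_dfs,
        results_df=results_df,
        results_name="proteomics",
        verbose=True,
    )
```

Since `add_species_data(results_name, ...)` stores the aggregated columns under `results_name`, downstream consumers of the model's species data can pick them up by that name.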
@@ -31,29 +120,64 @@ def features_to_pathway_species(
 
     Parameters:
     feature_identifiers: pd.DataFrame
-        pd.Dataframe containing a "
+        pd.Dataframe containing a "feature_identifiers_var" variable used to match entries
     species_identifiers: pd.DataFrame
         A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
         generally using sbml_dfs_core.export_sbml_dfs()
     ontologies: set
         A set of ontologies used to match features to pathway species
-
+    feature_identifiers_var: str
         Variable in "feature_identifiers" containing identifiers
+    expand_identifiers: bool, default=False
+        If True, split identifiers in feature_identifiers_var by identifier_delimiter and explode into multiple rows
+    identifier_delimiter: str, default="/"
+        Delimiter to use for splitting identifiers if expand_identifiers is True
+    verbose: bool, default=False
+        If True, log mapping statistics at the end of the function
 
     Returns:
     pathway_species: pd.DataFrame
         species_identifiers joined to feature_identifiers based on shared identifiers
     """
 
-    #
-    if
+    # Check for identifier column
+    if feature_identifiers_var not in feature_identifiers.columns.to_list():
         raise ValueError(
-            f"{
+            f"{feature_identifiers_var} must be a variable in 'feature_identifiers', "
             f"possible variables are {', '.join(feature_identifiers.columns.tolist())}"
         )
 
+    # Respect or create feature_id column
+    feature_identifiers = _ensure_feature_id_var(feature_identifiers, feature_id_var)
+
+    # Optionally expand identifiers into multiple rows
+    if expand_identifiers:
+        # Count the number of expansions by counting delimiters
+        n_expansions = (
+            feature_identifiers[feature_identifiers_var]
+            .astype(str)
+            .str.count(identifier_delimiter)
+            .sum()
+        )
+        if n_expansions > 0:
+            logger.info(
+                f"Expanding identifiers: {n_expansions} delimiters found in '{feature_identifiers_var}', will expand to more rows."
+            )
+
+        # Split, strip whitespace, and explode
+        feature_identifiers = feature_identifiers.copy()
+        feature_identifiers[feature_identifiers_var] = (
+            feature_identifiers[feature_identifiers_var]
+            .astype(str)
+            .str.split(identifier_delimiter)
+            .apply(lambda lst: [x.strip() for x in lst])
+        )
+        feature_identifiers = feature_identifiers.explode(
+            feature_identifiers_var, ignore_index=True
+        )
+
     # check identifiers table
-    _check_species_identifiers_table(species_identifiers)
+    identifiers._check_species_identifiers_table(species_identifiers)
 
     available_ontologies = set(species_identifiers[IDENTIFIERS.ONTOLOGY].tolist())
     unavailable_ontologies = ontologies.difference(available_ontologies)
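The `expand_identifiers` branch added above is plain pandas underneath: cast to string, split on the delimiter, strip whitespace, and explode to one row per identifier. A self-contained sketch of the same transformation, with illustrative column names and values:

```python
import pandas as pd

features = pd.DataFrame(
    {"identifier": ["P12345 / Q67890", "O11111"], "score": [1.0, 2.0]}
)

# split on "/", strip whitespace, then explode to one row per identifier
features["identifier"] = (
    features["identifier"]
    .astype(str)
    .str.split("/")
    .apply(lambda ids: [x.strip() for x in ids])
)
features = features.explode("identifier", ignore_index=True)
# three rows: P12345 and Q67890 (each with score 1.0), plus O11111 (score 2.0)
```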
@@ -80,7 +204,9 @@ def features_to_pathway_species(
 
     # map features to pathway species
     pathway_species = feature_identifiers.merge(
-        relevant_identifiers,
+        relevant_identifiers,
+        left_on=feature_identifiers_var,
+        right_on=IDENTIFIERS.IDENTIFIER,
     )
 
     if pathway_species.shape[0] == 0:
@@ -90,12 +216,18 @@
         None
 
     # report the fraction of unmapped species
+    if verbose:
+        _log_feature_species_mapping_stats(pathway_species, feature_id_var)
 
     return pathway_species
 
 
 def edgelist_to_pathway_species(
-    formatted_edgelist: pd.DataFrame,
+    formatted_edgelist: pd.DataFrame,
+    species_identifiers: pd.DataFrame,
+    ontologies: set,
+    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+    verbose: bool = False,
 ) -> pd.DataFrame:
     """
     Edgelist to Pathway Species
@@ -110,6 +242,10 @@ def edgelist_to_pathway_species(
         sbml_dfs_core.export_sbml_dfs()
     ontologies: set
         A set of ontologies used to match features to pathway species
+    feature_id_var: str, default=FEATURE_ID_VAR_DEFAULT
+        Variable in "formatted_edgelist" containing feature ids
+    verbose: bool, default=False
+        Whether to print verbose output
 
     Returns:
     edges_on_pathway: pd.DataFrame
@@ -146,7 +282,7 @@ def edgelist_to_pathway_species(
         .drop_duplicates()
         .reset_index(drop=True)
         .to_frame()
-        .rename({0:
+        .rename({0: feature_id_var}, axis=1)
     )
 
     # merge edgelist identifiers with pathway identifiers to map s_ids to identifiers
@@ -154,7 +290,8 @@
         feature_identifiers=distinct_identifiers,
         species_identifiers=species_identifiers,
         ontologies=ontologies,
-        feature_id_var
+        feature_identifiers_var=feature_id_var,
+        verbose=verbose,
     )
 
     # add s_ids of both upstream and downstream edges to pathway
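The `merge` in `features_to_pathway_species` now names its join keys explicitly, so a caller's `feature_identifiers_var` column no longer has to be literally named `identifier` to line up with the species table. A toy sketch of joining on differently named columns (all names here are illustrative):

```python
import pandas as pd

features = pd.DataFrame({"my_ids": ["P12345"], "log2fc": [1.0]})
species = pd.DataFrame({"identifier": ["P12345"], "s_id": ["S00001"]})

# explicit keys join my_ids against identifier; both columns are kept
matched = features.merge(species, left_on="my_ids", right_on="identifier")
```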
@@ -179,6 +316,348 @@ def edgelist_to_pathway_species(
     return edges_on_pathway
 
 
+def match_features_to_wide_pathway_species(
+    wide_df: pd.DataFrame,
+    species_identifiers: pd.DataFrame,
+    ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
+    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
+    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+    verbose: bool = False,
+) -> pd.DataFrame:
+    """
+    Convert a wide-format DataFrame with multiple ontology columns to long format,
+    and match features to pathway species by ontology and identifier.
+
+    Parameters
+    ----------
+    wide_df : pd.DataFrame
+        DataFrame with ontology identifier columns and any number of results columns.
+        All non-ontology columns are treated as results.
+    species_identifiers : pd.DataFrame
+        DataFrame as required by features_to_pathway_species
+    ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
+        Either:
+        - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
+        - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
+        - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
+    feature_identifiers_var : str, default="identifier"
+        Name for the identifier column in the long format
+    feature_id_var: str, default=FEATURE_ID_VAR_DEFAULT
+        Name for the feature id column in the long format
+    verbose : bool, default=False
+        Whether to print verbose output
+
+    Returns
+    -------
+    pd.DataFrame
+        Output of match_by_ontology_and_identifier
+
+    Examples
+    --------
+    >>> # Example with auto-detected ontology columns and multiple results
+    >>> wide_df = pd.DataFrame({
+    ...     'uniprot': ['P12345', 'Q67890'],
+    ...     'chebi': ['15377', '16810'],
+    ...     'log2fc': [1.0, 2.0],
+    ...     'pvalue': [0.01, 0.05]
+    ... })
+    >>> result = match_features_to_wide_pathway_species(
+    ...     wide_df=wide_df,
+    ...     species_identifiers=species_identifiers
+    ... )
+
+    >>> # Example with custom ontology mapping
+    >>> wide_df = pd.DataFrame({
+    ...     'protein_id': ['P12345', 'Q67890'],
+    ...     'compound_id': ['15377', '16810'],
+    ...     'expression': [1.0, 2.0],
+    ...     'confidence': [0.8, 0.9]
+    ... })
+    >>> result = match_features_to_wide_pathway_species(
+    ...     wide_df=wide_df,
+    ...     species_identifiers=species_identifiers,
+    ...     ontologies={'protein_id': 'uniprot', 'compound_id': 'chebi'}
+    ... )
+    """
+    # Make a copy to avoid modifying the input
+    wide_df = wide_df.copy()
+
+    # Validate ontologies and get the set of ontology columns
+    ontology_cols = _validate_wide_ontologies(wide_df, ontologies)
+    melt_cols = list(ontology_cols)
+
+    # Apply renaming if a mapping is provided
+    if isinstance(ontologies, dict):
+        wide_df = wide_df.rename(columns=ontologies)
+
+    # Ensure feature_id column exists
+    wide_df = _ensure_feature_id_var(wide_df, feature_id_var)
+
+    # All non-ontology columns are treated as results
+    results_cols = list(set(wide_df.columns) - set(melt_cols))
+    if not results_cols:
+        raise ValueError("No results columns found in DataFrame")
+
+    logger.info(f"Using columns as results: {results_cols}")
+
+    # Melt ontology columns to long format, keeping all results columns
+    long_df = wide_df.melt(
+        id_vars=results_cols,
+        value_vars=melt_cols,
+        var_name=IDENTIFIERS.ONTOLOGY,
+        value_name=feature_identifiers_var,
+    ).dropna(subset=[feature_identifiers_var])
+
+    logger.debug(f"Final long format shape: {long_df.shape}")
+
+    # Call the matching function with the validated ontologies
+    out = match_by_ontology_and_identifier(
+        feature_identifiers=long_df,
+        species_identifiers=species_identifiers,
+        ontologies=ontology_cols,
+        feature_identifiers_var=feature_identifiers_var,
+    )
+
+    if verbose:
+        _log_feature_species_mapping_stats(out, feature_id_var)
+
+    return out
+
+
+def match_by_ontology_and_identifier(
+    feature_identifiers: pd.DataFrame,
+    species_identifiers: pd.DataFrame,
+    ontologies: Union[str, Set[str], List[str]],
+    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
+    verbose: bool = False,
+) -> pd.DataFrame:
+    """
+    Match features to pathway species based on both ontology and identifier matches.
+    Performs separate matching for each ontology and concatenates the results.
+
+    Parameters
+    ----------
+    feature_identifiers : pd.DataFrame
+        DataFrame containing feature identifiers and results.
+        Must have columns [ontology, feature_identifiers_var, results]
+    species_identifiers : pd.DataFrame
+        DataFrame containing species identifiers from pathway.
+        Must have columns [ontology, identifier]
+    ontologies : Union[str, Set[str], List[str]]
+        Ontologies to match on. Can be:
+        - A single ontology string
+        - A set of ontology strings
+        - A list of ontology strings
+    feature_identifiers_var : str, default="identifier"
+        Name of the identifier column in feature_identifiers
+    verbose : bool, default=False
+        Whether to print verbose output
+
+    Returns
+    -------
+    pd.DataFrame
+        Concatenated results of matching for each ontology.
+        Contains all columns from features_to_pathway_species()
+
+    Examples
+    --------
+    >>> # Match using a single ontology
+    >>> result = match_by_ontology_and_identifier(
+    ...     feature_identifiers=features_df,
+    ...     species_identifiers=species_df,
+    ...     ontologies="uniprot"
+    ... )
+
+    >>> # Match using multiple ontologies
+    >>> result = match_by_ontology_and_identifier(
+    ...     feature_identifiers=features_df,
+    ...     species_identifiers=species_df,
+    ...     ontologies={"uniprot", "chebi"}
+    ... )
+    """
+    # Convert string to set for consistent handling
+    if isinstance(ontologies, str):
+        ontologies = {ontologies}
+    elif isinstance(ontologies, list):
+        ontologies = set(ontologies)
+
+    # Validate ontologies
+    invalid_onts = ontologies - set(ONTOLOGIES_LIST)
+    if invalid_onts:
+        raise ValueError(
+            f"Invalid ontologies specified: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
+        )
+
+    # Initialize list to store results
+    matched_dfs = []
+
+    # Process each ontology separately
+    for ont in ontologies:
+        # Filter feature identifiers to current ontology and drop ontology column
+        ont_features = (
+            feature_identifiers[feature_identifiers[IDENTIFIERS.ONTOLOGY] == ont]
+            .drop(columns=[IDENTIFIERS.ONTOLOGY])
+            .copy()
+        )
+
+        if ont_features.empty:
+            logger.warning(f"No features found for ontology: {ont}")
+            continue
+
+        # Filter species identifiers to current ontology
+        ont_species = species_identifiers[
+            species_identifiers[IDENTIFIERS.ONTOLOGY] == ont
+        ].copy()
+
+        if ont_species.empty:
+            logger.warning(f"No species found for ontology: {ont}")
+            continue
+
+        logger.debug(
+            f"Matching {len(ont_features)} features to {len(ont_species)} species for ontology {ont}"
+        )
+
+        # Match features to species for this ontology
+        matched = features_to_pathway_species(
+            feature_identifiers=ont_features,
+            species_identifiers=ont_species,
+            ontologies={ont},
+            feature_identifiers_var=feature_identifiers_var,
+            verbose=verbose,
+        )
+
+        if matched.empty:
+            logger.warning(f"No matches found for ontology: {ont}")
+            continue
+
+        matched_dfs.append(matched)
+
+    if not matched_dfs:
+        logger.warning("No matches found for any ontology")
+        return pd.DataFrame()  # Return empty DataFrame with correct columns
+
+    # Combine results from all ontologies
+    result = pd.concat(matched_dfs, axis=0, ignore_index=True)
+
+    logger.info(
+        f"Found {len(result)} total matches across {len(matched_dfs)} ontologies"
+    )
+
+    return result
+
+
+def resolve_matches(
+    matched_data: pd.DataFrame,
+    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+    index_col: str = SBML_DFS.S_ID,
+    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+    keep_id_col: bool = True,
+) -> pd.DataFrame:
+    """
+    Resolve many-to-1 and 1-to-many matches in matched data.
+
+    Parameters
+    ----------
+    matched_data : pd.DataFrame
+        DataFrame containing matched data with columns:
+        - feature_id_var: identifier column (e.g. feature_id)
+        - index_col: index column (e.g. s_id)
+        - other columns: data columns to be aggregated
+    feature_id_var : str, default="feature_id"
+        Name of the identifier column
+    index_col : str, default="s_id"
+        Name of the column to use as index
+    numeric_agg : str, default="weighted_mean"
+        Method to aggregate numeric columns:
+        - "weighted_mean": weighted by inverse of feature_id frequency (default)
+        - "mean": simple arithmetic mean
+        - "first": first value after sorting by feature_id_var (requires feature_id_var)
+        - "max": maximum value
+    keep_id_col : bool, default=True
+        Whether to keep and rollup the feature_id_var in the output.
+        If False, feature_id_var will be dropped from the output.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with resolved matches:
+        - Many-to-1: numeric columns are aggregated using specified method
+        - 1-to-many: adds a count column showing number of matches
+        - Index is set to index_col and named accordingly
+
+    Raises
+    ------
+    KeyError
+        If feature_id_var is not present in the DataFrame
+    TypeError
+        If DataFrame contains unsupported data types (boolean or datetime)
+    """
+    # Make a copy to avoid modifying input
+    df = matched_data.copy()
+
+    # Check for unsupported data types
+    unsupported_dtypes = df.select_dtypes(include=["bool", "datetime64"]).columns
+    if not unsupported_dtypes.empty:
+        raise TypeError(
+            f"Unsupported data types found in columns: {list(unsupported_dtypes)}. "
+            "Boolean and datetime columns are not supported."
+        )
+
+    # Always require feature_id_var
+    if feature_id_var not in df.columns:
+        raise KeyError(feature_id_var)
+
+    # Deduplicate by feature_id within each s_id using groupby and first BEFORE any further processing
+    df = df.groupby([index_col, feature_id_var], sort=False).first().reset_index()
+
+    # Use a unique temporary column name for weights
+    if RESOLVE_MATCHES_TMP_WEIGHT_COL in df.columns:
+        raise ValueError(
+            f"Temporary weight column name '{RESOLVE_MATCHES_TMP_WEIGHT_COL}' already exists in the input data. Please rename or remove this column and try again."
+        )
+
+    # Calculate weights if needed (after deduplication!)
+    if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
+        feature_counts = df[feature_id_var].value_counts()
+        df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = (
+            1 / feature_counts[df[feature_id_var]].values
+        )
+
+    # Set index for grouping
+    df = df.set_index(index_col)
+
+    # Use utility to split columns
+    always_non_numeric = [feature_id_var] if keep_id_col else []
+    numeric_cols, non_numeric_cols = _split_numeric_non_numeric_columns(
+        df, always_non_numeric=always_non_numeric
+    )
+
+    # Get aggregator function
+    numeric_aggregator = _get_numeric_aggregator(
+        method=numeric_agg, feature_id_var=feature_id_var
+    )
+    resolved = _aggregate_grouped_columns(
+        df,
+        numeric_cols,
+        non_numeric_cols,
+        numeric_aggregator,
+        feature_id_var=feature_id_var,
+        numeric_agg=numeric_agg,
+    )
+    # Add count of matches per feature_id
+    match_counts = matched_data.groupby(index_col)[feature_id_var].nunique()
+    resolved[f"{feature_id_var}_match_count"] = match_counts
+
+    # Drop feature_id_var if not keeping it
+    if not keep_id_col and feature_id_var in resolved.columns:
+        resolved = resolved.drop(columns=[feature_id_var])
+
+    # Ensure index is named consistently
+    resolved.index.name = index_col
+
+    return resolved
+
+
 def edgelist_to_scids(
     formatted_edgelist: pd.DataFrame,
     sbml_dfs: sbml_dfs_core.SBML_dfs,
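In `resolve_matches`, the default `weighted_mean` aggregation weights each row by the inverse of its `feature_id` frequency, so a feature that matched many species contributes less to each one. A standalone pandas sketch of that weighting, with toy values:

```python
import pandas as pd

matched = pd.DataFrame(
    {
        "s_id": ["S1", "S1", "S2"],
        "feature_id": [0, 1, 1],  # feature 1 matches two species
        "log2fc": [2.0, 1.0, 1.0],
    }
)

# weight = 1 / (number of rows sharing the feature_id)
counts = matched["feature_id"].value_counts()
matched["weight"] = 1 / counts[matched["feature_id"]].values

weighted = matched.groupby("s_id")[["log2fc", "weight"]].apply(
    lambda g: (g["log2fc"] * g["weight"]).sum() / g["weight"].sum()
)
# S1: (2.0 * 1.0 + 1.0 * 0.5) / 1.5 = 1.67; S2: 1.0
```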
@@ -210,7 +689,7 @@ def edgelist_to_scids(
         downstream species mapped to "sc_id_downstream"
     """
 
-    _check_species_identifiers_table(species_identifiers)
+    identifiers._check_species_identifiers_table(species_identifiers)
 
     # map edges onto pathway entities based on shared identifiers
     edges_on_pathway = edgelist_to_pathway_species(
@@ -294,7 +773,7 @@ def filter_to_direct_mechanistic_interactions(
     )
 
     # reduce to distinct sc_id pairs
-    sc_id_pairs = edgelist_w_scids[CPR_EDGELIST_REQ_VARS].drop_duplicates()
+    sc_id_pairs = edgelist_w_scids[list(CPR_EDGELIST_REQ_VARS)].drop_duplicates()
 
     # define all existing direct regulatory interactions
     pathway_interactions = pd.concat(
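The switch to `list(CPR_EDGELIST_REQ_VARS)` is a compatibility fix rather than a style change: recent pandas versions refuse sets as column indexers, so if `CPR_EDGELIST_REQ_VARS` is a set (as the wrapping suggests), the old line raises a `TypeError`. A minimal reproduction with a stand-in constant:

```python
import pandas as pd

REQ_VARS = {"sc_id_upstream", "sc_id_downstream"}  # stand-in for CPR_EDGELIST_REQ_VARS
df = pd.DataFrame({"sc_id_upstream": ["a"], "sc_id_downstream": ["b"]})

# df[REQ_VARS] raises TypeError on current pandas ("Passing a set as an indexer ...")
sc_id_pairs = df[list(REQ_VARS)].drop_duplicates()
```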
@@ -568,7 +1047,7 @@ def _edgelist_to_scids_if_needed(
     else:
         utils.match_pd_vars(edgelist, IDENTIFIER_EDGELIST_REQ_VARS).assert_present()
 
-        _check_species_identifiers_table(species_identifiers)
+        identifiers._check_species_identifiers_table(species_identifiers)
 
         edgelist_w_scids = edgelist_to_scids(
             edgelist,
@@ -580,18 +1059,295 @@ def _edgelist_to_scids_if_needed(
     return edgelist_w_scids
 
 
-def
-
-
+def _validate_wide_ontologies(
+    wide_df: pd.DataFrame,
+    ontologies: Optional[Union[str, Set[str], Dict[str, str]]] = None,
+) -> Set[str]:
+    """
+    Validate ontology specifications against the wide DataFrame and ONTOLOGIES_LIST.
+
+    Parameters
+    ----------
+    wide_df : pd.DataFrame
+        DataFrame with one column per ontology and a results column
+    ontologies : Optional[Union[str, Set[str], Dict[str, str]]]
+        Either:
+        - String specifying a single ontology column
+        - Set of columns to treat as ontologies
+        - Dict mapping wide column names to ontology names
+        - None to automatically detect ontology columns based on ONTOLOGIES_LIST
+
+    Returns
+    -------
+    Set[str]
+        Set of validated ontology names. For dictionary mappings, returns the target ontology names.
+
+    Raises
+    ------
+    ValueError
+        If validation fails for any ontology specification or no valid ontologies are found
+    """
+    # Convert string input to set
+    if isinstance(ontologies, str):
+        ontologies = {ontologies}
+
+    # Get the set of ontology columns
+    if isinstance(ontologies, dict):
+        # Check source columns exist in DataFrame
+        missing_cols = set(ontologies.keys()) - set(wide_df.columns)
+        if missing_cols:
+            raise ValueError(f"Source columns not found in DataFrame: {missing_cols}")
+        # Validate target ontologies against ONTOLOGIES_LIST
+        invalid_onts = set(ontologies.values()) - set(ONTOLOGIES_LIST)
+        if invalid_onts:
+            raise ValueError(
+                f"Invalid ontologies in mapping: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
+            )
+        # Return target ontology names instead of source column names
+        ontology_cols = set(ontologies.values())
+
+    elif isinstance(ontologies, set):
+        # Check specified columns exist in DataFrame
+        missing_cols = ontologies - set(wide_df.columns)
+        if missing_cols:
+            raise ValueError(
+                f"Specified ontology columns not found in DataFrame: {missing_cols}"
+            )
+        # Validate specified ontologies against ONTOLOGIES_LIST
+        invalid_onts = ontologies - set(ONTOLOGIES_LIST)
+        if invalid_onts:
+            raise ValueError(
+                f"Invalid ontologies in set: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
+            )
+        ontology_cols = ontologies
+
+    else:
+        # Auto-detect ontology columns by matching against ONTOLOGIES_LIST
+        ontology_cols = set(wide_df.columns) & set(ONTOLOGIES_LIST)
+        if not ontology_cols:
+            raise ValueError(
+                f"No valid ontology columns found in DataFrame. Column names must match one of: {ONTOLOGIES_LIST}"
+            )
+        logger.info(f"Auto-detected ontology columns: {ontology_cols}")
+
+    logger.debug(f"Validated ontology columns: {ontology_cols}")
+    return ontology_cols
+
+
+def _ensure_feature_id_var(
+    df: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
+) -> pd.DataFrame:
+    """
+    Ensure the DataFrame has a feature_id column, creating one if it doesn't exist.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame to check/modify
+    feature_id_var : str, default=FEATURE_ID_VAR_DEFAULT
+        Name of the feature ID column
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with guaranteed feature_id column
+    """
+    if feature_id_var not in df.columns:
+        logger.warning(f"No {feature_id_var} column found in DataFrame, creating one")
+        df = df.copy()
+        df[feature_id_var] = np.arange(len(df))
+    return df
+
+
+def _get_numeric_aggregator(
+    method: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+) -> callable:
+    """
+    Get aggregation function for numeric columns with various methods.
+
+    Parameters
+    ----------
+    method : str, default="weighted_mean"
+        Aggregation method to use:
+        - "weighted_mean": weighted by inverse of feature_id frequency (default)
+        - "mean": simple arithmetic mean
+        - "first": first value after sorting by feature_id_var (requires feature_id_var)
+        - "max": maximum value
+    feature_id_var : str, default="feature_id"
+        Name of the column specifying a measured feature - used for sorting and weighting
+
+    Returns
+    -------
+    callable
+        Aggregation function to use with groupby
+
+    Raises
+    ------
+    ValueError
+        If method is not recognized
+    """
+
+    def weighted_mean(df: pd.DataFrame) -> float:
+        # Get values and weights for this group
+        values = df["value"]
+        weights = df["weight"]
+        # Weights are already normalized globally, just use them directly
+        return (values * weights).sum() / weights.sum()
+
+    def first_by_id(df: pd.DataFrame) -> float:
+        # Sort by feature_id and take first value
+        return df.sort_values(feature_id_var).iloc[0]["value"]
+
+    def simple_mean(series: pd.Series) -> float:
+        return series.mean()
+
+    def simple_max(series: pd.Series) -> float:
+        return series.max()
+
+    aggregators = {
+        RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN: weighted_mean,
+        RESOLVE_MATCHES_AGGREGATORS.MEAN: simple_mean,
+        RESOLVE_MATCHES_AGGREGATORS.FIRST: first_by_id,
+        RESOLVE_MATCHES_AGGREGATORS.MAX: simple_max,
+    }
+
+    if method not in aggregators:
+        raise ValueError(
+            f"Unknown aggregation method: {method}. Must be one of {list(aggregators.keys())}"
+        )
+
+    return aggregators[method]
+
+
+def _split_numeric_non_numeric_columns(df: pd.DataFrame, always_non_numeric=None):
+    """
+    Utility to split DataFrame columns into numeric and non-numeric, always treating specified columns as non-numeric.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame to split.
+    always_non_numeric : list or set, optional
+        Columns to always treat as non-numeric (e.g., ['feature_id']).
+
+    Returns
+    -------
+    numeric_cols : pd.Index
+        Columns considered numeric (int64, float64, and not in always_non_numeric).
+    non_numeric_cols : pd.Index
+        Columns considered non-numeric (object, string, etc., plus always_non_numeric).
+    """
+    if always_non_numeric is None:
+        always_non_numeric = []
+    always_non_numeric = set(always_non_numeric)
+    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.difference(
+        always_non_numeric
+    )
+    non_numeric_cols = df.columns.difference(numeric_cols)
+    return numeric_cols, non_numeric_cols
+
+
+def _aggregate_grouped_columns(
+    df: pd.DataFrame,
+    numeric_cols,
+    non_numeric_cols,
+    numeric_aggregator,
+    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+) -> pd.DataFrame:
+    """
+    Aggregate numeric and non-numeric columns for grouped DataFrame.
+    Assumes deduplication by feature_id within each s_id has already been performed.
+    Returns the combined DataFrame.
+    """
+    results = []
+
+    # Handle non-numeric columns
+    if len(non_numeric_cols) > 0:
+        non_numeric_agg = (
+            df[non_numeric_cols]
+            .groupby(level=0)
+            .agg(lambda x: ",".join(sorted(set(x.astype(str)))))
+        )
+        results.append(non_numeric_agg)
+    # Handle numeric columns
+    if len(numeric_cols) > 0:
+        numeric_results = {}
+        for col in numeric_cols:
+            if numeric_agg in [
+                RESOLVE_MATCHES_AGGREGATORS.FIRST,
+                RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+            ]:
+                agg_df = pd.DataFrame(
+                    {"value": df[col], feature_id_var: df[feature_id_var]}
+                )
+                if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
+                    agg_df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = df[
+                        RESOLVE_MATCHES_TMP_WEIGHT_COL
+                    ]
+                numeric_results[col] = agg_df.groupby(level=0).apply(
+                    lambda x: (
+                        numeric_aggregator(x)
+                        if numeric_agg != RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN
+                        else numeric_aggregator(
+                            x.rename(columns={RESOLVE_MATCHES_TMP_WEIGHT_COL: "weight"})
+                        )
+                    )
+                )
+            else:
+                numeric_results[col] = df[col].groupby(level=0).agg(numeric_aggregator)
+        numeric_agg_df = pd.DataFrame(numeric_results)
+        results.append(numeric_agg_df)
+    # Combine results
+    if results:
+        resolved = pd.concat(results, axis=1)
+    else:
+        resolved = pd.DataFrame(index=df.index)
+    return resolved
+
+
+def _log_feature_species_mapping_stats(
+    pathway_species: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
 ):
-
-
+    """
+    Log statistics about the mapping between feature_id and s_id in the pathway_species DataFrame.
+    """
+
+    # Percent of feature_ids present one or more times in the output
+    n_feature_ids = pathway_species[feature_id_var].nunique()
+    n_input_feature_ids = (
+        pathway_species[feature_id_var].max() + 1
+        if feature_id_var in pathway_species.columns
+        else 0
     )
-
-
-
-
-
+    percent_present = (
+        100 * n_feature_ids / n_input_feature_ids if n_input_feature_ids else 0
+    )
+    logger.info(
+        f"{percent_present:.1f}% of feature_ids are present one or more times in the output ({n_feature_ids}/{n_input_feature_ids})"
+    )
+
+    # Number of times an s_id maps to 1+ feature_ids (with s_name)
+    s_id_counts = pathway_species.groupby(SBML_DFS.S_ID)[feature_id_var].nunique()
+    s_id_multi = s_id_counts[s_id_counts > 1]
+    logger.info(f"{len(s_id_multi)} s_id(s) map to more than one feature_id.")
+    if not s_id_multi.empty:
+        examples = pathway_species[
+            pathway_species[SBML_DFS.S_ID].isin(s_id_multi.index)
+        ][[SBML_DFS.S_ID, SBML_DFS.S_NAME, feature_id_var]]
+        logger.info(
+            f"Examples of s_id mapping to multiple feature_ids (showing up to 3):\n{examples.groupby([SBML_DFS.S_ID, SBML_DFS.S_NAME])[feature_id_var].apply(list).head(3)}"
         )
 
-
+    # Number of times a feature_id maps to 1+ s_ids (with s_name)
+    feature_id_counts = pathway_species.groupby(feature_id_var)[SBML_DFS.S_ID].nunique()
+    feature_id_multi = feature_id_counts[feature_id_counts > 1]
+    logger.info(f"{len(feature_id_multi)} feature_id(s) map to more than one s_id.")
+    if not feature_id_multi.empty:
+        examples = pathway_species[
+            pathway_species[feature_id_var].isin(feature_id_multi.index)
+        ][[feature_id_var, SBML_DFS.S_ID, SBML_DFS.S_NAME]]
+        logger.info(
+            f"Examples of feature_id mapping to multiple s_ids (showing up to 3):\n{examples.groupby([feature_id_var])[[SBML_DFS.S_ID, SBML_DFS.S_NAME]].apply(lambda df: list(df.itertuples(index=False, name=None))).head(3)}"
+        )
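Taken together, the new private helpers implement a small wide-to-long pipeline: `_validate_wide_ontologies` intersects the wide table's columns with the controlled vocabulary, and `match_features_to_wide_pathway_species` then melts those columns into (ontology, identifier) pairs before matching. A condensed sketch of those two steps, using a stand-in vocabulary list in place of `napistu.constants.ONTOLOGIES_LIST`:

```python
import pandas as pd

ONTOLOGIES_LIST = ["uniprot", "chebi", "ensembl_gene"]  # stand-in vocabulary

wide_df = pd.DataFrame(
    {
        "uniprot": ["P12345", None],
        "chebi": [None, "15377"],
        "log2fc": [1.0, 2.0],
    }
)

# the ontologies=None branch: auto-detect ontology columns
ontology_cols = set(wide_df.columns) & set(ONTOLOGIES_LIST)

# melt to long format and drop rows with no identifier
long_df = wide_df.melt(
    id_vars=[c for c in wide_df.columns if c not in ontology_cols],
    value_vars=list(ontology_cols),
    var_name="ontology",
    value_name="identifier",
).dropna(subset=["identifier"])
# two rows survive: (uniprot, P12345, log2fc=1.0) and (chebi, 15377, log2fc=2.0)
```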