napistu 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
napistu/consensus.py CHANGED
@@ -426,7 +426,7 @@ def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> s
426
426
 
427
427
  # get the sources of species in the consensus model
428
428
  consensus_sbmldf_tbl_var_sc = (
429
- source.unnest_sources(sbml_dfs.species, SBML_DFS.S_SOURCE, verbose=False)
429
+ source.unnest_sources(sbml_dfs.species, verbose=False)
430
430
  .reset_index()
431
431
  .sort_values([SOURCE_SPEC.NAME])
432
432
  )
@@ -504,12 +504,11 @@ def post_consensus_source_check(
504
504
  ) -> pd.DataFrame:
505
505
  """Provide sources of tables in a consensus model; the output df will be used to determine whether models are merged."""
506
506
 
507
- table_source = sbml_dfs.schema[table_name][SOURCE_SPEC.SOURCE]
508
- table_pk = sbml_dfs.schema[table_name]["pk"]
507
+ table_pk = sbml_dfs.schema[table_name][SCHEMA_DEFS.PK]
509
508
 
510
509
  sbml_dfs_tbl = getattr(sbml_dfs, table_name)
511
510
  sbml_dfs_tbl_pathway_source = (
512
- source.unnest_sources(sbml_dfs_tbl, table_source, verbose=False)
511
+ source.unnest_sources(sbml_dfs_tbl, verbose=False)
513
512
  .reset_index()
514
513
  .sort_values(["name"])
515
514
  )
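As a usage sketch (editorial illustration, not part of the diff): unnest_sources now looks up the source column from the table's own schema, so callers pass only the table. sbml_dfs below is assumed to be an existing SBML_dfs object.

    from napistu import source

    # previously: source.unnest_sources(sbml_dfs.species, SBML_DFS.S_SOURCE, verbose=False)
    species_sources = source.unnest_sources(sbml_dfs.species, verbose=False).reset_index()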
napistu/ingestion/constants.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
 
4
4
  from types import SimpleNamespace
5
5
 
6
+ from napistu.constants import SBOTERM_NAMES
6
7
 
7
8
  SPECIES_FULL_NAME_HUMAN = "Homo sapiens"
8
9
  SPECIES_FULL_NAME_MOUSE = "Mus musculus"
@@ -90,6 +91,56 @@ REACTOME_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathways.
90
91
  REACTOME_PATHWAY_INDEX_COLUMNS = ["file", "source", "species", "pathway_id", "name"]
91
92
  REACTOME_PATHWAY_LIST_COLUMNS = ["pathway_id", "name", "species"]
92
93
 
94
+ # REACTOME FI
95
+ REACTOME_FI_URL = "http://cpws.reactome.org/caBigR3WebApp2025/FIsInGene_04142025_with_annotations.txt.zip"
96
+
97
+ REACTOME_FI = SimpleNamespace(
98
+ GENE1="Gene1",
99
+ GENE2="Gene2",
100
+ ANNOTATION="Annotation",
101
+ DIRECTION="Direction",
102
+ SCORE="Score",
103
+ )
104
+
105
+ REACTOME_FI_DIRECTIONS = SimpleNamespace(
106
+ UNDIRECTED="-",
107
+ STIMULATED_BY="<-",
108
+ STIMULATES="->",
109
+ STIMULATES_AND_STIMULATED_BY="<->",
110
+ INHIBITED_BY="|-",
111
+ INHIBITS="-|",
112
+ INHIBITS_AND_INHIBITED_BY="|-|",
113
+ STIMULATES_AND_INHIBITED_BY="|->",
114
+ INHIBITS_AND_STIMULATED_BY="<-|",
115
+ )
116
+
117
+ VALID_REACTOME_FI_DIRECTIONS = REACTOME_FI_DIRECTIONS.__dict__.values()
118
+
119
+ REACTOME_FI_RULES_REVERSE = SimpleNamespace(
120
+ NAME_RULES={"catalyzed by": SBOTERM_NAMES.CATALYST},
121
+ DIRECTION_RULES={
122
+ REACTOME_FI_DIRECTIONS.STIMULATED_BY: SBOTERM_NAMES.STIMULATOR,
123
+ REACTOME_FI_DIRECTIONS.STIMULATES_AND_STIMULATED_BY: SBOTERM_NAMES.STIMULATOR,
124
+ REACTOME_FI_DIRECTIONS.INHIBITED_BY: SBOTERM_NAMES.INHIBITOR,
125
+ REACTOME_FI_DIRECTIONS.INHIBITS_AND_INHIBITED_BY: SBOTERM_NAMES.INHIBITOR,
126
+ REACTOME_FI_DIRECTIONS.STIMULATES_AND_INHIBITED_BY: SBOTERM_NAMES.INHIBITOR,
127
+ REACTOME_FI_DIRECTIONS.UNDIRECTED: SBOTERM_NAMES.INTERACTOR,
128
+ },
129
+ )
130
+
131
+ REACTOME_FI_RULES_FORWARD = SimpleNamespace(
132
+ NAME_RULES={"catalyze(;$)": SBOTERM_NAMES.CATALYST},
133
+ DIRECTION_RULES={
134
+ REACTOME_FI_DIRECTIONS.STIMULATES: SBOTERM_NAMES.STIMULATOR,
135
+ REACTOME_FI_DIRECTIONS.STIMULATES_AND_STIMULATED_BY: SBOTERM_NAMES.STIMULATOR,
136
+ REACTOME_FI_DIRECTIONS.STIMULATES_AND_INHIBITED_BY: SBOTERM_NAMES.STIMULATOR,
137
+ REACTOME_FI_DIRECTIONS.INHIBITS: SBOTERM_NAMES.INHIBITOR,
138
+ REACTOME_FI_DIRECTIONS.INHIBITS_AND_INHIBITED_BY: SBOTERM_NAMES.INHIBITOR,
139
+ REACTOME_FI_DIRECTIONS.INHIBITS_AND_STIMULATED_BY: SBOTERM_NAMES.INHIBITOR,
140
+ REACTOME_FI_DIRECTIONS.UNDIRECTED: SBOTERM_NAMES.INTERACTOR,
141
+ },
142
+ )
143
+
93
144
  # SBML
94
145
  SBML_DEFS = SimpleNamespace(
95
146
  ERROR_NUMBER="error_number",
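A minimal sketch (not part of the diff) of how the new rule namespaces are read: FI direction symbols resolve to SBO term names via the DIRECTION_RULES mappings, as _parse_reactome_fi_annotations does in the new module below.

    from napistu.ingestion.constants import REACTOME_FI_DIRECTIONS, REACTOME_FI_RULES_FORWARD

    direction = REACTOME_FI_DIRECTIONS.STIMULATES  # "->"
    # resolves to SBOTERM_NAMES.STIMULATOR under the forward rules
    sbo_term_name = REACTOME_FI_RULES_FORWARD.DIRECTION_RULES[direction]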
napistu/ingestion/reactom_fi.py ADDED
@@ -0,0 +1,208 @@
1
+ import logging
2
+ import pandas as pd
3
+
4
+ from napistu.identifiers import Identifiers
5
+ from napistu import utils
6
+ from napistu.ingestion.constants import (
7
+ REACTOME_FI,
8
+ REACTOME_FI_RULES_FORWARD,
9
+ REACTOME_FI_RULES_REVERSE,
10
+ REACTOME_FI_URL,
11
+ VALID_REACTOME_FI_DIRECTIONS,
12
+ )
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ def download_reactome_fi(target_uri: str, url: str = REACTOME_FI_URL) -> None:
19
+ """
20
+ Download the Reactome Functional Interactions (FI) dataset as a TSV file.
21
+
22
+ Parameters
23
+ ----------
24
+ target_uri : str
25
+ The URI where the Reactome FI data should be saved. Should end with .tsv
26
+ url : str, optional
27
+ URL to download the zipped Reactome functional interactions TSV from.
28
+ Defaults to REACTOME_FI_URL.
29
+
30
+ Returns
31
+ -------
32
+ None
33
+
34
+ Raises
35
+ ------
36
+ ValueError
37
+ If target_uri does not end with .tsv
38
+ """
39
+
40
+ if not target_uri.endswith(".tsv"):
41
+ raise ValueError(f"Target URI must end with .tsv, got {target_uri}")
42
+
43
+ file_ext = url.split(".")[-1]
44
+ target_filename = url.split("/")[-1].split(f".{file_ext}")[0]
45
+ logger.info("Start downloading proteinatlas %s to %s", url, target_uri)
46
+ # target_filename is the name of the file in the zip file which will be renamed to target_uri
47
+ utils.download_wget(url, target_uri, target_filename=target_filename)
48
+
49
+ return None
50
+
51
+
52
+ def format_reactome_fi_edgelist(interactions: pd.DataFrame):
53
+ """
54
+ Format the Reactome FI interactions DataFrame as an edgelist for network analysis.
55
+
56
+ Parameters
57
+ ----------
58
+ interactions : pd.DataFrame
59
+ DataFrame containing Reactome FI interactions.
60
+
61
+ Returns
62
+ -------
63
+ Dictionary of:
64
+
65
+ interaction_edgelist : pd.DataFrame
66
+ Table containing molecular interactions with columns:
67
+ - upstream_name : str, matches "s_name" from species_df
68
+ - downstream_name : str, matches "s_name" from species_df
69
+ - upstream_compartment : str, matches "c_name" from compartments_df
70
+ - downstream_compartment : str, matches "c_name" from compartments_df
71
+ - r_name : str, name for the interaction
72
+ - sbo_term : str, SBO term defining interaction type
73
+ - r_Identifiers : identifiers.Identifiers, supporting identifiers
74
+ - r_isreversible : bool, whether reaction is reversible
75
+ species_df : pd.DataFrame
76
+ Table defining molecular species with columns:
77
+ - s_name : str, name of molecular species
78
+ - s_Identifiers : identifiers.Identifiers, species identifiers
79
+ compartments_df : pd.DataFrame
80
+ Table defining compartments with columns:
81
+ - c_name : str, name of compartment
82
+ - c_Identifiers : identifiers.Identifiers, compartment identifiers
83
+
84
+ Notes
85
+ -----
86
+ This function is not yet implemented and will raise NotImplementedError.
87
+ """
88
+
89
+ raise NotImplementedError("TO DO - This function is incomplete")
90
+
91
+ formatted_annotations = _parse_reactome_fi_annotations(interactions)
92
+
93
+ # this join will expand some rows to 2 since the bidirectional relationships are captured as separate edges in Napistu
94
+ annotated_interactions = interactions.merge(
95
+ formatted_annotations,
96
+ on=[REACTOME_FI.ANNOTATION, REACTOME_FI.DIRECTION],
97
+ how="left",
98
+ )
99
+
100
+ # flip reverse entries so all relationships are forward or undirected
101
+ formatted_interactions = (
102
+ pd.concat(
103
+ [
104
+ annotated_interactions.query("polarity == 'forward'"),
105
+ (
106
+ annotated_interactions.query("polarity == 'reverse'").rename(
107
+ columns={
108
+ REACTOME_FI.GENE1: REACTOME_FI.GENE2,
109
+ REACTOME_FI.GENE2: REACTOME_FI.GENE1,
110
+ }
111
+ )
112
+ ),
113
+ ]
114
+ )[[REACTOME_FI.GENE1, REACTOME_FI.GENE2, "sbo_term_name", "Score"]]
115
+ # looks like they were already unique edges
116
+ .sort_values("Score", ascending=False)
117
+ .groupby([REACTOME_FI.GENE1, REACTOME_FI.GENE2])
118
+ .first()
119
+ )
120
+
121
+ fi_edgelist = (
122
+ formatted_interactions.reset_index()
123
+ .rename(
124
+ columns={
125
+ REACTOME_FI.GENE1: "upstream_name",
126
+ REACTOME_FI.GENE2: "downstream_name",
127
+ }
128
+ )
129
+ .assign(r_Identifiers=Identifiers([]))
130
+ )
131
+
132
+ return fi_edgelist
133
+
134
+
135
+ def _parse_reactome_fi_annotations(interactions: pd.DataFrame) -> pd.DataFrame:
136
+ """
137
+ Parse and annotate Reactome FI interaction types and directions using regex-based rules.
138
+
139
+ Parameters
140
+ ----------
141
+ interactions : pd.DataFrame
142
+ DataFrame containing Reactome FI interactions, with annotation and direction columns.
143
+
144
+ Returns
145
+ -------
146
+ pd.DataFrame
147
+ DataFrame with annotation, direction, SBO term name, and polarity for each unique annotation/direction pair.
148
+
149
+ Raises
150
+ ------
151
+ ValueError
152
+ If an annotation/direction pair cannot be matched to a rule or if invalid directions are found.
153
+ """
154
+
155
+ distinct_annotations = (
156
+ interactions[[REACTOME_FI.ANNOTATION, REACTOME_FI.DIRECTION]]
157
+ .drop_duplicates()
158
+ .reset_index(drop=True)
159
+ )
160
+ invalid_directions = distinct_annotations.loc[
161
+ ~distinct_annotations[REACTOME_FI.DIRECTION].isin(VALID_REACTOME_FI_DIRECTIONS),
162
+ "Direction",
163
+ ]
164
+ if len(invalid_directions) > 0:
165
+ raise ValueError(f"Invalid directions: {invalid_directions}")
166
+
167
+ annotations = list()
168
+ for _, vals in distinct_annotations.iterrows():
169
+ annot, direction = vals
170
+
171
+ forward_match = utils.match_regex_dict(
172
+ annot, REACTOME_FI_RULES_FORWARD.NAME_RULES
173
+ )
174
+ if not forward_match:
175
+ if direction in REACTOME_FI_RULES_FORWARD.DIRECTION_RULES:
176
+ forward_match = REACTOME_FI_RULES_FORWARD.DIRECTION_RULES[direction]
177
+
178
+ reverse_match = utils.match_regex_dict(
179
+ annot, REACTOME_FI_RULES_REVERSE.NAME_RULES
180
+ )
181
+ if not reverse_match:
182
+ if direction in REACTOME_FI_RULES_REVERSE.DIRECTION_RULES:
183
+ reverse_match = REACTOME_FI_RULES_REVERSE.DIRECTION_RULES[direction]
184
+
185
+ if not (forward_match or reverse_match):
186
+ raise ValueError(f"No match found for {annot} with direction {direction}")
187
+
188
+ if forward_match:
189
+ annotations.append(
190
+ {
191
+ REACTOME_FI.ANNOTATION: annot,
192
+ REACTOME_FI.DIRECTION: direction,
193
+ "sbo_term_name": forward_match,
194
+ "polarity": "forward",
195
+ }
196
+ )
197
+
198
+ if reverse_match:
199
+ annotations.append(
200
+ {
201
+ REACTOME_FI.ANNOTATION: annot,
202
+ REACTOME_FI.DIRECTION: direction,
203
+ "sbo_term_name": reverse_match,
204
+ "polarity": "reverse",
205
+ }
206
+ )
207
+
208
+ return pd.DataFrame(annotations)
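A hypothetical usage sketch of the new module (the target path is an assumption; format_reactome_fi_edgelist still raises NotImplementedError, as its docstring notes):

    import pandas as pd
    from napistu.ingestion import reactom_fi

    # downloads the zipped FI release and renames the inner TSV to the target path
    reactom_fi.download_reactome_fi("/tmp/reactome_fi.tsv")
    interactions = pd.read_csv("/tmp/reactome_fi.tsv", sep="\t")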
napistu/network/ng_utils.py CHANGED
@@ -66,7 +66,7 @@ def compartmentalize_species_pairs(
66
66
  Compartmentalize Shortest Paths
67
67
 
68
68
  For a set of origin and destination species pairs, consider each species in every
69
- compartment it operates in, separately.
69
+ compartment it operates in, separately.
70
70
 
71
71
  Parameters
72
72
  ----------
@@ -112,22 +112,42 @@ def compartmentalize_species_pairs(
112
112
 
113
113
 
114
114
  def get_minimal_sources_edges(
115
- vertices: pd.DataFrame, sbml_dfs: sbml_dfs_core.SBML_dfs
115
+ vertices: pd.DataFrame,
116
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
117
+ source_total_counts: Optional[pd.Series] = None,
116
118
  ) -> pd.DataFrame | None:
117
- """Assign edges to a set of sources."""
119
+ """
120
+ Assign edges to a set of sources.
121
+
122
+ Parameters
123
+ ----------
124
+ vertices: pd.DataFrame
125
+ A table of vertices.
126
+ sbml_dfs: sbml_dfs_core.SBML_dfs
127
+ A pathway model
128
+ source_total_counts: pd.Series
129
+ A series of the total counts of each source.
130
+
131
+ Returns
132
+ -------
133
+ edge_sources: pd.DataFrame
134
+ A table of edges and the sources they are assigned to.
135
+ """
136
+
118
137
  nodes = vertices["node"].tolist()
119
138
  present_reactions = sbml_dfs.reactions[sbml_dfs.reactions.index.isin(nodes)]
120
139
 
121
140
  if len(present_reactions) == 0:
122
141
  return None
123
142
 
124
- table_schema = sbml_dfs.schema[SBML_DFS.REACTIONS]
125
- source_df = source.unnest_sources(present_reactions, table_schema["source"])
143
+ source_df = source.unnest_sources(present_reactions)
126
144
 
127
145
  if source_df is None:
128
146
  return None
129
147
  else:
130
- edge_sources = source.greedy_set_coverge_of_sources(source_df, table_schema)
148
+ edge_sources = source.source_set_coverage(
149
+ source_df, source_total_counts, sbml_dfs
150
+ )
131
151
  return edge_sources.reset_index()[
132
152
  [SBML_DFS.R_ID, SOURCE_SPEC.PATHWAY_ID, SOURCE_SPEC.NAME]
133
153
  ]
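A sketch of the new calling convention (vertices and sbml_dfs are assumed to exist; vertices needs a "node" column): supplying per-pathway totals switches edge-source selection from raw pathway size to enrichment.

    from napistu import source
    from napistu.network import ng_utils

    source_total_counts = source.get_source_total_counts(sbml_dfs, "reactions")
    edge_sources = ng_utils.get_minimal_sources_edges(
        vertices, sbml_dfs, source_total_counts=source_total_counts
    )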
napistu/network/precompute.py CHANGED
@@ -110,6 +110,62 @@ def precompute_distances(
110
110
  return filtered_precomputed_distances
111
111
 
112
112
 
113
+ def filter_precomputed_distances_top_n(precomputed_distances, top_n=50):
114
+ """
115
+ Filter precomputed distances to only include the top-n pairs for each distance measure.
116
+
117
+ Parameters
118
+ ----------
119
+ precomputed_distances : pd.DataFrame
120
+ Precomputed distances.
121
+ top_n : int, optional
122
+ Top-n pairs to include for each distance measure.
123
+
124
+ Returns
125
+ -------
126
+ pd.DataFrame
127
+ Filtered precomputed distances.
128
+ """
129
+
130
+ # take the union of top-n for each distance measure; and from origin -> dest and dest -> origin
131
+ distance_vars = set(precomputed_distances.columns) - {
132
+ NAPISTU_EDGELIST.SC_ID_ORIGIN,
133
+ NAPISTU_EDGELIST.SC_ID_DEST,
134
+ }
135
+
136
+ valid_pairs = list()
137
+ for distance_var in distance_vars:
138
+ top_n_pairs_by_origin = (
139
+ precomputed_distances.sort_values(by=distance_var, ascending=False)
140
+ .groupby(NAPISTU_EDGELIST.SC_ID_ORIGIN)
141
+ .head(top_n)
142
+ )
143
+ top_n_pairs_by_dest = (
144
+ precomputed_distances.sort_values(by=distance_var, ascending=False)
145
+ .groupby(NAPISTU_EDGELIST.SC_ID_DEST)
146
+ .head(top_n)
147
+ )
148
+
149
+ valid_pairs.append(
150
+ top_n_pairs_by_origin[
151
+ [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
152
+ ]
153
+ )
154
+ valid_pairs.append(
155
+ top_n_pairs_by_dest[
156
+ [NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST]
157
+ ]
158
+ )
159
+
160
+ all_valid_pairs = pd.concat(valid_pairs).drop_duplicates()
161
+
162
+ return precomputed_distances.merge(
163
+ all_valid_pairs,
164
+ on=[NAPISTU_EDGELIST.SC_ID_ORIGIN, NAPISTU_EDGELIST.SC_ID_DEST],
165
+ how="inner",
166
+ )
167
+
168
+
113
169
  def _calculate_distances_subset(
114
170
  napistu_graph: NapistuGraph,
115
171
  vs_to_partition: pd.DataFrame,
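A usage sketch (precomputed_distances is assumed to be the output of precompute_distances): the union of the per-origin and per-destination top-n pairs is retained for each distance measure.

    from napistu.network import precompute

    filtered = precompute.filter_precomputed_distances_top_n(precomputed_distances, top_n=50)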
napistu/sbml_dfs_utils.py CHANGED
@@ -456,8 +456,14 @@ def infer_entity_type(df: pd.DataFrame) -> str:
456
456
  if entity_schema.get(SCHEMA_DEFS.PK) == df.index.name:
457
457
  return entity_type
458
458
 
459
- # Get DataFrame columns that are also primary keys
460
- df_columns = set(df.columns).intersection(primary_keys)
459
+ # Get DataFrame columns that are also primary keys, including index or MultiIndex names
460
+ index_names = []
461
+ if isinstance(df.index, pd.MultiIndex):
462
+ index_names = [name for name in df.index.names if name is not None]
463
+ elif df.index.name is not None:
464
+ index_names = [df.index.name]
465
+
466
+ df_columns = set(df.columns).union(index_names).intersection(primary_keys)
461
467
 
462
468
  # Check for exact match with primary key + foreign keys
463
469
  for entity_type, entity_schema in schema.items():
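A small illustration of the behavior this change enables (it mirrors the new test added later in this diff): primary keys carried as index or MultiIndex level names now count toward entity-type inference.

    import pandas as pd
    from napistu import sbml_dfs_utils
    from napistu.constants import SBML_DFS

    df = pd.DataFrame({"some_col": [1, 2]})
    df.index = pd.MultiIndex.from_tuples(
        [("rxn1", "a"), ("rxn2", "b")], names=[SBML_DFS.R_ID, "foo"]
    )
    assert sbml_dfs_utils.infer_entity_type(df) == SBML_DFS.REACTIONS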
napistu/source.py CHANGED
@@ -1,8 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import numpy as np
3
4
  import pandas as pd
5
+ from typing import Optional
6
+
4
7
  from napistu import indices
5
- from napistu.constants import SOURCE_SPEC
8
+ from napistu import sbml_dfs_core
9
+ from napistu import sbml_dfs_utils
10
+ from napistu.statistics import hypothesis_testing
11
+ from napistu.constants import SBML_DFS_SCHEMA, SCHEMA_DEFS, SOURCE_SPEC
6
12
 
7
13
 
8
14
  class Source:
@@ -41,11 +47,18 @@ class Source:
41
47
  Creates an empty source object. This is typically used when creating an SBML_dfs
42
48
  object from a single source.
43
49
  pw_index : indices.PWIndex
50
+ a pathway index object containing the pathway_id and other metadata
44
51
 
45
52
  Returns
46
53
  -------
47
54
  None.
48
55
 
56
+ Raises
57
+ ------
58
+ ValueError:
59
+ if pw_index is not an indices.PWIndex
60
+ ValueError:
61
+ if SOURCE_SPEC.MODEL is not present in source_df
49
62
  """
50
63
 
51
64
  if init is True:
@@ -101,8 +114,27 @@ def create_source_table(
101
114
  """
102
115
  Create Source Table
103
116
 
104
- Create a table with one row per "new_id" and a Source object created from the union
105
- of "old_id" Source objects
117
+ Create a table with one row per "new_id" and a Source object created from the union of "old_id" Source objects
118
+
119
+ Parameters
120
+ ----------
121
+ lookup_table: pd.Series
122
+ a pd.Series containing the index of the table to create a source table for
123
+ table_schema: dict
124
+ a dictionary containing the schema of the table to create a source table for
125
+ pw_index: indices.PWIndex
126
+ a pathway index object containing the pathway_id and other metadata
127
+
128
+ Returns
129
+ -------
130
+ source_table: pd.DataFrame
131
+ a pd.DataFrame for the target table, with one row per "new_id" and a
132
+ Source object created from the union of the "old_id" Source objects
133
+
134
+ Raises
135
+ ------
136
+ ValueError:
137
+ if SOURCE_SPEC.SOURCE is not present in table_schema
106
138
  """
107
139
 
108
140
  if SOURCE_SPEC.SOURCE not in table_schema.keys():
@@ -142,8 +174,27 @@ def merge_sources(source_list: list | pd.Series) -> Source:
142
174
 
143
175
  Merge a list of Source objects into a single Source object
144
176
 
177
+ Parameters
178
+ ----------
179
+ source_list: list | pd.Series
180
+ a list of Source objects or a pd.Series of Source objects
181
+
182
+ Returns
183
+ -------
184
+ source: Source
185
+ a Source object created from the union of the Source objects in source_list
186
+
187
+ Raises
188
+ ------
189
+ TypeError:
190
+ if source_list is not a list or pd.Series
145
191
  """
146
192
 
193
+ if not isinstance(source_list, (list, pd.Series)):
194
+ raise TypeError(
195
+ f"source_list must be a list or pd.Series, but was a {type(source_list).__name__}"
196
+ )
197
+
147
198
  # filter to non-empty sources
148
199
  # empty sources have only been initialized; a merge hasn't occurred
149
200
  existing_sources = [s.source is not None for s in source_list]
@@ -160,28 +211,35 @@ def merge_sources(source_list: list | pd.Series) -> Source:
160
211
  return Source(pd.concat(existing_source_list))
161
212
 
162
213
 
163
- def unnest_sources(
164
- source_table: pd.DataFrame, source_var: str, verbose: bool = False
165
- ) -> pd.DataFrame:
214
+ def unnest_sources(source_table: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
166
215
  """
167
216
  Unnest Sources
168
217
 
169
218
  Take a pd.DataFrame containing an array of Sources and
170
219
  return one-row per source.
171
220
 
172
- Parameters:
221
+ Parameters
222
+ ----------
173
223
  source_table: pd.DataFrame
174
224
  a table containing an array of Sources
175
- source_var: str
176
- variable containing Sources
225
+ verbose: bool
226
+ print progress
177
227
 
178
- Returns:
228
+ Returns
229
+ -------
179
230
  pd.DataFrame containing the index of source_table but expanded
180
231
  to include one row per source
181
232
 
182
233
  """
183
234
 
184
235
  sources = list()
236
+
237
+ table_type = sbml_dfs_utils.infer_entity_type(source_table)
238
+ source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
239
+ if SCHEMA_DEFS.SOURCE not in source_table_schema.keys():
240
+ raise ValueError(f"{table_type} does not have a source attribute")
241
+
242
+ source_var = source_table_schema[SCHEMA_DEFS.SOURCE]
185
243
  source_table_index = source_table.index.to_frame().reset_index(drop=True)
186
244
 
187
245
  for i in range(source_table.shape[0]):
@@ -216,53 +274,73 @@ def unnest_sources(
216
274
  return pd.concat(sources)
217
275
 
218
276
 
219
- def greedy_set_coverge_of_sources(
220
- source_df: pd.DataFrame, table_schema: dict
277
+ def source_set_coverage(
278
+ select_sources_df: pd.DataFrame,
279
+ source_total_counts: Optional[pd.Series] = None,
280
+ sbml_dfs: Optional[sbml_dfs_core.SBML_dfs] = None,
221
281
  ) -> pd.DataFrame:
222
282
  """
223
283
  Greedy Set Coverage of Sources
224
284
 
225
- Apply the greedy set coverge algorithm to find the minimal set of
226
- sources which cover all entries
285
+ Find the set of pathways covering `select_sources_df`. If `source_total_counts`
286
+ is provided, pathways will be selected iteratively based on statistical
287
+ enrichment; if it is not provided, the largest pathways
288
+ will be chosen iteratively.
227
289
 
228
- Parameters:
229
- source_df: pd.DataFrame
290
+ Parameters
291
+ ----------
292
+ select_sources_df: pd.DataFrame
230
293
  pd.Dataframe containing the index of source_table but expanded to
231
294
  include one row per source. As produced by source.unnest_sources()
232
-
233
- Returns:
295
+ source_total_counts: pd.Series
296
+ pd.Series containing the total counts of each source. As produced by
297
+ source.get_source_total_counts()
298
+ sbml_dfs: sbml_dfs_core.SBML_dfs
299
+ if `source_total_counts` is provided then `sbml_dfs` must be provided
300
+ to calculate the total number of entities in the table.
301
+
302
+ Returns
303
+ -------
234
304
  minimial_sources: [str]
235
305
  A list of pathway_ids of the minimal source set
236
306
 
237
307
  """
238
308
 
309
+ table_type = sbml_dfs_utils.infer_entity_type(select_sources_df)
310
+ pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
311
+
312
+ if source_total_counts is not None:
313
+ if sbml_dfs is None:
314
+ raise ValueError(
315
+ "If `source_total_counts` is provided, `sbml_dfs` must be provided to calculate the total number of entities in the table."
316
+ )
317
+ n_total_entities = sbml_dfs.get_table(table_type).shape[0]
318
+
239
319
  # rollup pathways with identical membership
240
- deduplicated_sources = _deduplicate_source_df(source_df, table_schema)
320
+ deduplicated_sources = _deduplicate_source_df(select_sources_df)
241
321
 
242
322
  unaccounted_for_members = deduplicated_sources
243
323
  retained_pathway_ids = []
244
-
245
324
  while unaccounted_for_members.shape[0] != 0:
246
325
  # find the pathway with the most members
247
- pathway_members = unaccounted_for_members.groupby(SOURCE_SPEC.PATHWAY_ID).size()
248
- top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
326
+
327
+ if source_total_counts is None:
328
+ top_pathway = _select_top_pathway_by_size(unaccounted_for_members)
329
+ else:
330
+ top_pathway = _select_top_pathway_by_enrichment(
331
+ unaccounted_for_members, source_total_counts, n_total_entities, pk
332
+ )
333
+
334
+ if top_pathway is None:
335
+ break
336
+
249
337
  retained_pathway_ids.append(top_pathway)
250
338
 
251
339
  # remove all members associated with the top pathway
252
- members_captured = (
253
- unaccounted_for_members[
254
- unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
255
- ]
256
- .index.get_level_values(table_schema["pk"])
257
- .tolist()
340
+ unaccounted_for_members = _update_unaccounted_for_members(
341
+ top_pathway, unaccounted_for_members
258
342
  )
259
343
 
260
- unaccounted_for_members = unaccounted_for_members[
261
- ~unaccounted_for_members.index.get_level_values(table_schema["pk"]).isin(
262
- members_captured
263
- )
264
- ]
265
-
266
344
  minimial_sources = deduplicated_sources[
267
345
  deduplicated_sources[SOURCE_SPEC.PATHWAY_ID].isin(retained_pathway_ids)
268
346
  ].sort_index()
@@ -270,9 +348,39 @@ def greedy_set_coverge_of_sources(
270
348
  return minimial_sources
271
349
 
272
350
 
273
- def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.DataFrame:
351
+ def get_source_total_counts(
352
+ sbml_dfs: sbml_dfs_core.SBML_dfs, entity_type: str
353
+ ) -> pd.Series:
354
+ """
355
+ Get the total counts of each source.
356
+
357
+ Parameters
358
+ ----------
359
+ sbml_dfs: sbml_dfs_core.SBML_dfs
360
+ sbml_dfs object containing the table to get the total counts of
361
+ entity_type: str
362
+ the type of entity to get the total counts of
363
+
364
+ Returns
365
+ -------
366
+ source_total_counts: pd.Series
367
+ pd.Series containing the total counts of each source.
368
+ """
369
+
370
+ all_sources_table = unnest_sources(sbml_dfs.get_table(entity_type))
371
+ source_total_counts = all_sources_table.value_counts(SOURCE_SPEC.PATHWAY_ID).rename(
372
+ "total_counts"
373
+ )
374
+
375
+ return source_total_counts
376
+
377
+
378
+ def _deduplicate_source_df(source_df: pd.DataFrame) -> pd.DataFrame:
274
379
  """Combine entries in a source table when multiple models have the same members."""
275
380
 
381
+ table_type = sbml_dfs_utils.infer_entity_type(source_df)
382
+ source_table_schema = SBML_DFS_SCHEMA.SCHEMA[table_type]
383
+
276
384
  # drop entries which are missing required attributes and throw an error if none are left
277
385
  REQUIRED_NON_NA_ATTRIBUTES = [SOURCE_SPEC.PATHWAY_ID]
278
386
  indexed_sources = (
@@ -296,7 +404,11 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.Da
296
404
  {
297
405
  SOURCE_SPEC.PATHWAY_ID: p,
298
406
  "membership_string": "_".join(
299
- set(indexed_sources.loc[[p]][table_schema["pk"]].tolist())
407
+ set(
408
+ indexed_sources.loc[[p]][
409
+ source_table_schema[SCHEMA_DEFS.PK]
410
+ ].tolist()
411
+ )
300
412
  ),
301
413
  }
302
414
  for p in pathways
@@ -320,16 +432,16 @@ def _deduplicate_source_df(source_df: pd.DataFrame, table_schema: dict) -> pd.Da
320
432
 
321
433
  merged_sources = pd.concat(
322
434
  [
323
- _collapse_by_membership_string(s, membership_categories, table_schema) # type: ignore
435
+ _collapse_by_membership_string(s, membership_categories, source_table_schema) # type: ignore
324
436
  for s in category_index.tolist()
325
437
  ]
326
438
  )
327
439
  merged_sources[SOURCE_SPEC.INDEX_NAME] = merged_sources.groupby(
328
- table_schema["pk"]
440
+ source_table_schema[SCHEMA_DEFS.PK]
329
441
  ).cumcount()
330
442
 
331
443
  return merged_sources.set_index(
332
- [table_schema["pk"], SOURCE_SPEC.INDEX_NAME]
444
+ [source_table_schema[SCHEMA_DEFS.PK], SOURCE_SPEC.INDEX_NAME]
333
445
  ).sort_index()
334
446
 
335
447
 
@@ -345,7 +457,10 @@ def _collapse_by_membership_string(
345
457
  return pd.DataFrame(
346
458
  [
347
459
  pd.concat(
348
- [pd.Series({table_schema["pk"]: ms}), collapsed_source_membership]
460
+ [
461
+ pd.Series({table_schema[SCHEMA_DEFS.PK]: ms}),
462
+ collapsed_source_membership,
463
+ ]
349
464
  )
350
465
  for ms in membership_string.split("_")
351
466
  ]
@@ -398,3 +513,91 @@ def _safe_source_merge(member_Sources: Source | list) -> Source:
398
513
  return merge_sources(member_Sources.tolist())
399
514
  else:
400
515
  raise TypeError("Expecting source.Source or pd.Series")
516
+
517
+
518
+ def _select_top_pathway_by_size(unaccounted_for_members: pd.DataFrame) -> str:
519
+
520
+ pathway_members = unaccounted_for_members.value_counts(SOURCE_SPEC.PATHWAY_ID)
521
+ top_pathway = pathway_members[pathway_members == max(pathway_members)].index[0]
522
+
523
+ return top_pathway
524
+
525
+
526
+ def _select_top_pathway_by_enrichment(
527
+ unaccounted_for_members: pd.DataFrame,
528
+ source_total_counts: pd.Series,
529
+ n_total_entities: int,
530
+ table_pk: str,
531
+ min_pw_size: int = 5,
532
+ ) -> str:
533
+
534
+ n_observed_entities = len(
535
+ unaccounted_for_members.index.get_level_values(table_pk).unique()
536
+ )
537
+ pathway_members = unaccounted_for_members.value_counts(
538
+ SOURCE_SPEC.PATHWAY_ID
539
+ ).rename("observed_members")
540
+
541
+ pathway_members = pathway_members.loc[pathway_members >= min_pw_size]
542
+ if pathway_members.shape[0] == 0:
543
+ return None
544
+
545
+ wide_contingency_table = (
546
+ pathway_members.to_frame()
547
+ .join(source_total_counts)
548
+ .assign(
549
+ missing_members=lambda x: x["total_counts"] - x["observed_members"],
550
+ observed_nonmembers=lambda x: n_observed_entities - x["observed_members"],
551
+ nonobserved_nonmembers=lambda x: n_total_entities
552
+ - x["observed_nonmembers"]
553
+ - x["missing_members"]
554
+ - x["observed_members"],
555
+ )
556
+ .drop(columns=["total_counts"])
557
+ )
558
+
559
+ # calculate enrichments using a fast vectorized normal approximation
560
+ odds_ratios, _ = hypothesis_testing.fisher_exact_vectorized(
561
+ wide_contingency_table["observed_members"],
562
+ wide_contingency_table["missing_members"],
563
+ wide_contingency_table["observed_nonmembers"],
564
+ wide_contingency_table["nonobserved_nonmembers"],
565
+ )
566
+
567
+ return pathway_members.index[np.argmax(odds_ratios)]
568
+
569
+
570
+ def _update_unaccounted_for_members(
571
+ top_pathway, unaccounted_for_members
572
+ ) -> pd.DataFrame:
573
+ """
574
+ Update the unaccounted for members dataframe by removing the members
575
+ associated with the top pathway.
576
+
577
+ Parameters
578
+ ----------
579
+ top_pathway: str
580
+ the pathway to remove from the unaccounted for members
581
+ unaccounted_for_members: pd.DataFrame
582
+ the dataframe of unaccounted for members
583
+
584
+ Returns
585
+ -------
586
+ unaccounted_for_members: pd.DataFrame
587
+ the dataframe of unaccounted for members with the top pathway removed
588
+ """
589
+
590
+ table_type = sbml_dfs_utils.infer_entity_type(unaccounted_for_members)
591
+ pk = SBML_DFS_SCHEMA.SCHEMA[table_type][SCHEMA_DEFS.PK]
592
+
593
+ members_captured = (
594
+ unaccounted_for_members[
595
+ unaccounted_for_members[SOURCE_SPEC.PATHWAY_ID] == top_pathway
596
+ ]
597
+ .index.get_level_values(pk)
598
+ .tolist()
599
+ )
600
+
601
+ return unaccounted_for_members[
602
+ ~unaccounted_for_members.index.get_level_values(pk).isin(members_captured)
603
+ ]
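A usage sketch of the reworked set-cover API (sbml_dfs is assumed to be an existing SBML_dfs object; the calls mirror the updated tests): coverage can be computed by pathway size alone, or by enrichment when per-pathway totals are supplied.

    from napistu import source

    source_df = source.unnest_sources(sbml_dfs.reactions)
    by_size = source.source_set_coverage(source_df)

    source_total_counts = source.get_source_total_counts(sbml_dfs, "reactions")
    by_enrichment = source.source_set_coverage(
        source_df, source_total_counts=source_total_counts, sbml_dfs=sbml_dfs
    )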
napistu/statistics/hypothesis_testing.py ADDED
@@ -0,0 +1,66 @@
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+ from scipy.stats import norm
5
+
6
+
7
+ def fisher_exact_vectorized(
8
+ observed_members: Union[list[int], np.ndarray],
9
+ missing_members: Union[list[int], np.ndarray],
10
+ observed_nonmembers: Union[list[int], np.ndarray],
11
+ nonobserved_nonmembers: Union[list[int], np.ndarray],
12
+ ) -> tuple[np.ndarray, np.ndarray]:
13
+ """
14
+ Fast vectorized one-tailed Fisher exact test using normal approximation.
15
+
16
+ Parameters:
17
+ -----------
18
+ observed_members, missing_members, observed_nonmembers, nonobserved_nonmembers : array-like
19
+ The four cells of the 2x2 contingency tables (must be non-negative)
20
+
21
+ Returns:
22
+ --------
23
+ odds_ratios : numpy array
24
+ Odds ratios for each test
25
+ p_values : numpy array
26
+ One-tailed p-values (tests for enrichment)
27
+ """
28
+ # Convert to numpy arrays
29
+ a = np.array(observed_members, dtype=float)
30
+ b = np.array(missing_members, dtype=float)
31
+ c = np.array(observed_nonmembers, dtype=float)
32
+ d = np.array(nonobserved_nonmembers, dtype=float)
33
+
34
+ # Check for negative values and raise error
35
+ if np.any((a < 0) | (b < 0) | (c < 0) | (d < 0)):
36
+ raise ValueError("All contingency table values must be non-negative")
37
+
38
+ # Calculate odds ratios
39
+ odds_ratios = np.divide(
40
+ a * d, b * c, out=np.full_like(a, np.inf, dtype=float), where=(b * c) != 0
41
+ )
42
+
43
+ # Normal approximation to hypergeometric distribution
44
+ n = a + b + c + d
45
+
46
+ # Avoid division by zero in expected value calculation
47
+ expected_a = np.divide(
48
+ (a + b) * (a + c), n, out=np.zeros_like(n, dtype=float), where=n != 0
49
+ )
50
+
51
+ # Variance calculation with protection against division by zero
52
+ var_a = np.divide(
53
+ (a + b) * (c + d) * (a + c) * (b + d),
54
+ n * n * (n - 1),
55
+ out=np.ones_like(n, dtype=float), # Default to 1 to avoid sqrt(0)
56
+ where=(n > 1),
57
+ )
58
+ var_a = np.maximum(var_a, 1e-10) # Ensure positive variance
59
+
60
+ # Continuity correction and z-score
61
+ z = (a - expected_a - 0.5) / np.sqrt(var_a)
62
+
63
+ # One-tailed p-value (upper tail for enrichment)
64
+ p_values = norm.sf(z) # 1 - norm.cdf(z)
65
+
66
+ return odds_ratios, p_values
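A quick worked example (mirroring the new tests): for the 2x2 table [[1, 9], [11, 3]] the odds ratio is (1*3)/(9*11) ≈ 0.030, and the one-tailed p-value comes from the normal approximation z = (a - E[a] - 0.5) / sqrt(Var[a]) with the hypergeometric mean and variance.

    from napistu.statistics import hypothesis_testing

    odds, p = hypothesis_testing.fisher_exact_vectorized([1], [9], [11], [3])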
napistu/utils.py CHANGED
@@ -14,7 +14,7 @@ import zipfile
14
14
  from contextlib import closing
15
15
  from itertools import starmap
16
16
  from textwrap import fill
17
- from typing import Any, List, Optional, Union
17
+ from typing import Any, Dict, Optional, List, Union
18
18
  from urllib.parse import urlparse
19
19
  from pathlib import Path
20
20
  from requests.adapters import HTTPAdapter
@@ -1131,6 +1131,28 @@ def safe_fill(x: str, fill_width: int = 15) -> str:
1131
1131
  return fill(x, fill_width)
1132
1132
 
1133
1133
 
1134
+ def match_regex_dict(s: str, regex_dict: Dict[str, Any]) -> Optional[Any]:
1135
+ """
1136
+ Apply each regex in regex_dict to the string s. If a regex matches, return its value.
1137
+ If no regex matches, return None.
1138
+
1139
+ Parameters
1140
+ ----------
1141
+ s : str
1142
+ The string to test.
1143
+ regex_dict : dict
1144
+ Dictionary where keys are regex patterns (str), and values are the values to return.
1145
+
1146
+ Returns
1147
+ -------
1148
+ The value associated with the first matching regex, or None if no match.
1149
+ """
1150
+ for pattern, value in regex_dict.items():
1151
+ if re.search(pattern, s):
1152
+ return value
1153
+ return None
1154
+
1155
+
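A minimal sketch of the new helper (the rule dictionary below is illustrative): patterns are tried in insertion order and the first re.search hit wins.

    from napistu import utils

    rules = {"catalyzed by": "catalyst", "phosphorylat": "modifier"}
    utils.match_regex_dict("A is catalyzed by B", rules)   # -> "catalyst"
    utils.match_regex_dict("A binds B", rules)             # -> None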
1134
1156
  def _add_nameness_score_wrapper(df, name_var, table_schema):
1135
1157
  """Call _add_nameness_score with default value."""
1136
1158
 
napistu-0.4.2.dist-info/METADATA → napistu-0.4.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: napistu
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: Connecting high-dimensional data to curated pathways
5
5
  Home-page: https://github.com/napistu/napistu-py
6
6
  Author: Sean Hackett
napistu-0.4.2.dist-info/RECORD → napistu-0.4.3.dist-info/RECORD RENAMED
@@ -1,13 +1,13 @@
1
1
  napistu/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
2
2
  napistu/__main__.py,sha256=xwlbh_0Ig3a-yG6BIJRiDPSN9R2HnX2pEBvlodlO6h4,29015
3
- napistu/consensus.py,sha256=xWXiqIM6ot-SSPJZXTrVpohbINSCkZXBtRi-5REfk_g,69897
3
+ napistu/consensus.py,sha256=SDw58vkDivzy5AiOQUnf5vUbFxmSrMGMMmptDMZhk0E,69807
4
4
  napistu/constants.py,sha256=8sp1l0cxu2rsnCrWBEEwhcBKvDtc4u0D0f_72zILLW0,13427
5
5
  napistu/identifiers.py,sha256=e2-nTVzr5AINa0y1ER9218bKXyF2kAeJ9At22S4Z00o,33914
6
6
  napistu/indices.py,sha256=Zjg3gE0JQ3T879lCPazYg-WXVE6hvcAr713ZKpJ32rk,9830
7
7
  napistu/sbml_dfs_core.py,sha256=s0OyoHs-AjOcbZu1d3KNkW_PI7Rxbhu5ZLpfQeO4iY8,72639
8
- napistu/sbml_dfs_utils.py,sha256=w5dFcJFDKnKDK9jxPOCuCW8IccxdXmyNmP9vCUhVdf8,46184
9
- napistu/source.py,sha256=UGpN70bqbC9gnKmM0ivSdQYim9hfzgABeXoQKzRr9oU,13646
10
- napistu/utils.py,sha256=PEAsLn7VGN8JlNJQcAMYpjF1gr2mWmb5IqBsypP9hi0,35768
8
+ napistu/sbml_dfs_utils.py,sha256=SOy1Ii2hDFOfQa7pFAJS9EfAmfBVD_sHvDJBVmCN_p8,46456
9
+ napistu/source.py,sha256=iDDKpN-4k_W_tyxEjqe_z-yPJv7uoFRRBhkiBtOH5C8,20416
10
+ napistu/utils.py,sha256=p2sJxTklmV30XS6hanJRjcdfgeaZpkULuMyQX3BPP0c,36404
11
11
  napistu/context/__init__.py,sha256=LQBEqipcHKK0E5UlDEg1ct-ymCs93IlUrUaH8BCevf0,242
12
12
  napistu/context/discretize.py,sha256=Qq7zg46F_I-PvQIT2_pEDQV7YEtUQCxKoRvT5Gu9QsE,15052
13
13
  napistu/context/filtering.py,sha256=l1oq-43ysSGqU9VmhTOO_pYT4DSMf20yxvktPC1MI0I,13696
@@ -17,13 +17,14 @@ napistu/gcs/downloads.py,sha256=SvGv9WYr_Vt3guzyz1QiAuBndeKPTBtWSFLj1-QbLf4,6348
17
17
  napistu/gcs/utils.py,sha256=eLSsvewWJdCguyj2k0ozUGP5BTemaE1PZg41Z3aY5kM,571
18
18
  napistu/ingestion/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
19
19
  napistu/ingestion/bigg.py,sha256=f65--8ARe248eYCUJpFMF284Wz53sLyFyBuwelxHmJA,4340
20
- napistu/ingestion/constants.py,sha256=9UP47VImZ11q0kz17N3EJg2155USqLewwNWyKpA-cbA,8089
20
+ napistu/ingestion/constants.py,sha256=jo3v8Z7Y_tNNhTmEcokVOh1HBJFAXc-Z38S4mG58qfo,10059
21
21
  napistu/ingestion/gtex.py,sha256=X0hSC1yrpf4xSJWFhpeNcnHwJzKDII2MvjfUqYA0JN8,3720
22
22
  napistu/ingestion/hpa.py,sha256=R27ExrryKQ4Crxv9ATXmBJCa-yd01TMOrDjkeBhIQac,5054
23
23
  napistu/ingestion/identifiers_etl.py,sha256=6ppDUA6lEZurdmVbiFLOUzphYbr-hndMhtqsQnq_yAc,5009
24
24
  napistu/ingestion/napistu_edgelist.py,sha256=4RLXsoIk_-Atu-Nqme_t1JpEpBET26VIY2Y_Hcd3sMw,3580
25
25
  napistu/ingestion/obo.py,sha256=AQkIPWbjA464Lma0tx91JucWkIwLjC7Jgv5VHGRTDkE,9601
26
26
  napistu/ingestion/psi_mi.py,sha256=5eJjm7XWogL9oTyGqR52kntHClLwLsTePKqCvUGyi-w,10111
27
+ napistu/ingestion/reactom_fi.py,sha256=hKdOY2wNtcNk6WlnHnNalryiXv6mtcWUiBW9isXPB0Y,6991
27
28
  napistu/ingestion/reactome.py,sha256=Hn9X-vDp4o_HK-OtaQvel3vJeZ8_TC1-4N2rruK9Oks,7099
28
29
  napistu/ingestion/sbml.py,sha256=l8Z98yWuOIRGns8G4UNnoQz7v_xmukZb_IZ_5ye34Ko,25296
29
30
  napistu/ingestion/string.py,sha256=go1WGTkoLJejX7GQWf9bFeInFGAw4jNSpS2B_Zr5f_s,11364
@@ -66,9 +67,9 @@ napistu/network/net_create.py,sha256=66kV_xoWnu4BVLaJZ1TAC7wBSsjPDqjoAXH-X9ShV3s
66
67
  napistu/network/net_create_utils.py,sha256=zajwaz2xAij_9fEnD77SgBw_EnNAnJ8jBCmmK2rk_bA,24672
67
68
  napistu/network/net_propagation.py,sha256=Il5nDOWh3nLz8gRhDFHGp2LxcvJ9C1twiSZjDeiZMUo,23490
68
69
  napistu/network/ng_core.py,sha256=dGnTUKR4WtnvaYMyIHqqF55FY4mJSa7wjA2LZ4cVB6U,11720
69
- napistu/network/ng_utils.py,sha256=c1tHXz_JcH01D5KovNQmRLTEVxpCkCe36otULq-liz8,15579
70
+ napistu/network/ng_utils.py,sha256=ahSm-8M2pV662V7MMVcGaoguBM55_y-F7LDmZSVp9ag,15951
70
71
  napistu/network/paths.py,sha256=r6LVKVvX7i3ctBA5r-xvHfpH5Zsd0VDHUCtin2iag20,17453
71
- napistu/network/precompute.py,sha256=ibL0ByY7Wp5kEfIG3LUDpQKdvAeQX0DNkT_46g2YrGc,8367
72
+ napistu/network/precompute.py,sha256=ARU2tktWnxFISaHAY8chpkg8pusZPv7TT5jSIB9eFF0,10081
72
73
  napistu/ontologies/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
73
74
  napistu/ontologies/constants.py,sha256=GyOFvezSxDK1VigATcruTKtNhjcYaid1ggulEf_HEtQ,4345
74
75
  napistu/ontologies/dogma.py,sha256=VVj6NKBgNym4SdOSu8g22OohALj7cbObhIJmdY2Sfy0,8860
@@ -84,8 +85,9 @@ napistu/scverse/__init__.py,sha256=Lgxr3iMQAkTzXE9BNz93CndNP5djzerLvmHM-D0PU3I,3
84
85
  napistu/scverse/constants.py,sha256=0iAkhyJUIeFGHdLLU3fCaEU1O3Oix4qAsxr3CxGTjVs,653
85
86
  napistu/scverse/loading.py,sha256=jqiE71XB-wdV50GyZrauFNY0Lai4bX9Fm2Gv80VR8t8,27016
86
87
  napistu/statistics/__init__.py,sha256=dFXAhIqlTLJMwowS4BUDT08-Vy3Q0u1L0CMCErSZT1Y,239
88
+ napistu/statistics/hypothesis_testing.py,sha256=k0mBFAMF0XHVcKwS26aPnEbq_FIUVwXU1gZ6cKfFbCk,2190
87
89
  napistu/statistics/quantiles.py,sha256=1-LnmVzC2CQWxCKUh0yi6YfKrbsZM1-kkD7nu2-aS5s,3042
88
- napistu-0.4.2.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
90
+ napistu-0.4.3.dist-info/licenses/LICENSE,sha256=kW8wVT__JWoHjl2BbbJDAZInWa9AxzJeR_uv6-i5x1g,1063
89
91
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
92
  tests/conftest.py,sha256=t-GHb0MvSsC-MyhkFpOy2K3t5fi7eaig_Rc2xEQC-t8,9678
91
93
  tests/test_consensus.py,sha256=Hzfrgp4SpkRDnEMVMD3f0UInSycndB8kKzC4wDDvRas,15076
@@ -114,7 +116,7 @@ tests/test_network_net_propagation.py,sha256=kZeDHD93iMrLVvxO4OyfRH5_vgsYeQyC40O
114
116
  tests/test_network_ng_core.py,sha256=w-iNBTtenennJhaLFauk952pEsk7W0-Fa8lPvIRqHyY,628
115
117
  tests/test_network_ng_utils.py,sha256=QVVuRnvCRfTSIlGdwQTIF9lr0wOwoc5gGeXAUY_AdgE,713
116
118
  tests/test_network_paths.py,sha256=TWZnxY5bF3m6gahcxcYJGrBIawh2-_vUcec1LyPmXV8,1686
117
- tests/test_network_precompute.py,sha256=zwJrKNC3s8rIrsyAQfQMYxbl8HZXUr7u09nMJ_K8jiU,9005
119
+ tests/test_network_precompute.py,sha256=IPr1KhtxBD0fXx_2TvZqnevrD-Iig35otb8yloRFpRc,10014
118
120
  tests/test_ontologies_genodexito.py,sha256=6fINyUiubHZqu7qxye09DQfJXw28ZMAJc3clPb-cCoY,2298
119
121
  tests/test_ontologies_id_tables.py,sha256=CpwpbmQvTc1BaVd6jbDKHAVE2etwN0vx93nC8jpnMlE,7265
120
122
  tests/test_ontologies_mygene.py,sha256=VkdRcKIWmcG6V-2dpfvsBiOJN5dO-j0RqZNxtJRcyBU,1583
@@ -124,18 +126,18 @@ tests/test_rpy2_callr.py,sha256=V4a-QH5krgYOQRgqzksMzIkGAFjBqKOAqgprxrH6bE0,2904
124
126
  tests/test_rpy2_init.py,sha256=T3gnxC1O7XNvYM2P4018ikpPPAy-kwQLm7Erj0RfA-4,5895
125
127
  tests/test_sbml.py,sha256=f25zj1NogYrmLluvBDboLameTuCiQ309433Qn3iPvhg,1483
126
128
  tests/test_sbml_dfs_core.py,sha256=nnLPpZTVtCznOBohk7CX67x6sMqktJWt-sZMWQKoaDs,26521
127
- tests/test_sbml_dfs_utils.py,sha256=gWIhzUEtQlOR9c1TiCyhlSAELmWnBSncn6vCEqH5hl0,11029
129
+ tests/test_sbml_dfs_utils.py,sha256=ZD9x2B81fsfYEjAV9wphHOR7ywjNcfvfw1LGNv4PxUA,11471
128
130
  tests/test_sbo.py,sha256=x_PENFaXYsrZIzOZu9cj_Wrej7i7SNGxgBYYvcigLs0,308
129
131
  tests/test_scverse_loading.py,sha256=bnU1lQSYYWhOAs0IIBoi4ZohqPokDQJ0n_rtkAfEyMU,29948
130
- tests/test_set_coverage.py,sha256=J-6m6LuOjcQa9pxRuWglSfJk4Ltm7kt_eOrn_Q-7P6Q,1604
131
- tests/test_source.py,sha256=hT0IlpexR5zP0OhWl5BBaho9d1aCYQlFZLwRIRRnw_Y,1969
132
+ tests/test_source.py,sha256=iV-Yyu8flhIGWF17SCL8msG2bjqwb9w2IZ694b0iZ-o,2985
133
+ tests/test_statistics_hypothesis_testing.py,sha256=qD-oS9zo5JlH-jdtiOrWAKI4nKFuZvvh6361_pFSpIs,2259
132
134
  tests/test_statistics_quantiles.py,sha256=yNDeqwgbP-1Rx3C_dLX_wnwT_Lr-iJWClmeKmElqmTE,4984
133
135
  tests/test_uncompartmentalize.py,sha256=nAk5kfAVLU9a2VWe2x2HYVcKqj-EnwmwddERIPRax8c,1289
134
136
  tests/test_utils.py,sha256=qPSpV-Q9b6vmdycgaDmQqtcvzKnAVnN9j5xJ9x-T6bg,23959
135
137
  tests/utils.py,sha256=SoWQ_5roJteFGcMaOeEiQ5ucwq3Z2Fa3AAs9iXHTsJY,749
136
138
  tests/test_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
- napistu-0.4.2.dist-info/METADATA,sha256=6P_9Mmno6pVu4Me-3QdcMtiGOhCcajTqm5LP_Hns4lI,4078
138
- napistu-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
139
- napistu-0.4.2.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
140
- napistu-0.4.2.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
141
- napistu-0.4.2.dist-info/RECORD,,
139
+ napistu-0.4.3.dist-info/METADATA,sha256=gV0a41vyQ52Ja15QyLSPGfeIJPj6oQRTC00HsxJjG88,4078
140
+ napistu-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
141
+ napistu-0.4.3.dist-info/entry_points.txt,sha256=_QnaPOvJNA3IltxmZgWIiBoen-L1bPYX18YQfC7oJgQ,41
142
+ napistu-0.4.3.dist-info/top_level.txt,sha256=Gpvk0a_PjrtqhYcQ9IDr3zR5LqpZ-uIHidQMIpjlvhY,14
143
+ napistu-0.4.3.dist-info/RECORD,,
tests/test_network_precompute.py CHANGED
@@ -276,3 +276,33 @@ def test_precomputed_distances_serialization():
276
276
  # Clean up the temporary file
277
277
  if os.path.exists(temp_path):
278
278
  os.remove(temp_path)
279
+
280
+
281
+ def test_filter_precomputed_distances_top_n_subset():
282
+ # Use a small top_n for a quick test
283
+ top_n = 5
284
+ filtered = precompute.filter_precomputed_distances_top_n(
285
+ precomputed_distances, top_n=top_n
286
+ )
287
+ # Check that the filtered DataFrame is a subset of the original
288
+ merged = filtered.merge(
289
+ precomputed_distances,
290
+ on=[
291
+ precompute.NAPISTU_EDGELIST.SC_ID_ORIGIN,
292
+ precompute.NAPISTU_EDGELIST.SC_ID_DEST,
293
+ ],
294
+ how="left",
295
+ indicator=True,
296
+ )
297
+ assert (
298
+ merged["_merge"] == "both"
299
+ ).all(), "Filtered rows must be present in the original DataFrame"
300
+ # Check that columns are preserved
301
+ assert set(
302
+ [
303
+ precompute.NAPISTU_EDGELIST.SC_ID_ORIGIN,
304
+ precompute.NAPISTU_EDGELIST.SC_ID_DEST,
305
+ ]
306
+ ).issubset(filtered.columns)
307
+ # Optionally, check that the number of rows is less than or equal to the input
308
+ assert filtered.shape[0] <= precomputed_distances.shape[0]
tests/test_sbml_dfs_utils.py CHANGED
@@ -334,3 +334,16 @@ def test_infer_entity_type_errors():
334
334
  ) # Two primary keys
335
335
  with pytest.raises(ValueError):
336
336
  sbml_dfs_utils.infer_entity_type(df)
337
+
338
+
339
+ def test_infer_entity_type_multindex_reactions():
340
+ # DataFrame with MultiIndex (r_id, foo), should infer as reactions
341
+ import pandas as pd
342
+ from napistu.constants import SBML_DFS
343
+
344
+ df = pd.DataFrame({"some_col": [1, 2]})
345
+ df.index = pd.MultiIndex.from_tuples(
346
+ [("rxn1", "a"), ("rxn2", "b")], names=[SBML_DFS.R_ID, "foo"]
347
+ )
348
+ result = sbml_dfs_utils.infer_entity_type(df)
349
+ assert result == SBML_DFS.REACTIONS
tests/test_source.py CHANGED
@@ -5,6 +5,8 @@ import os
5
5
  import pandas as pd
6
6
  from napistu import indices
7
7
  from napistu import source
8
+ from napistu.network import ng_utils
9
+ from napistu.constants import SBML_DFS
8
10
 
9
11
  test_path = os.path.abspath(os.path.join(__file__, os.pardir))
10
12
  test_data = os.path.join(test_path, "test_data")
@@ -58,10 +60,40 @@ def test_source_w_pwindex():
58
60
  assert source_obj.source.shape == (2, 8)
59
61
 
60
62
 
61
- ################################################
62
- # __main__
63
- ################################################
63
+ def test_get_minimal_source_edges(sbml_dfs_metabolism):
64
+ vertices = sbml_dfs_metabolism.reactions.reset_index().rename(
65
+ columns={SBML_DFS.R_ID: "node"}
66
+ )
67
+
68
+ minimal_source_edges = ng_utils.get_minimal_sources_edges(
69
+ vertices, sbml_dfs_metabolism
70
+ )
71
+ # print(minimal_source_edges.shape)
72
+ assert minimal_source_edges.shape == (87, 3)
73
+
74
+
75
+ def test_source_set_coverage(sbml_dfs_metabolism):
76
+
77
+ source_df = source.unnest_sources(sbml_dfs_metabolism.reactions)
78
+
79
+ # print(source_df.shape)
80
+ assert source_df.shape == (111, 7)
81
+
82
+ set_coverage = source.source_set_coverage(source_df)
83
+ # print(set_coverage.shape)
84
+ assert set_coverage.shape == (87, 6)
85
+
86
+
87
+ def test_source_set_coverage_enrichment(sbml_dfs_metabolism):
88
+
89
+ source_total_counts = source.get_source_total_counts(
90
+ sbml_dfs_metabolism, "reactions"
91
+ )
92
+
93
+ source_df = source.unnest_sources(sbml_dfs_metabolism.reactions).head(40)
94
+
95
+ set_coverage = source.source_set_coverage(
96
+ source_df, source_total_counts=source_total_counts, sbml_dfs=sbml_dfs_metabolism
97
+ )
64
98
 
65
- if __name__ == "__main__":
66
- test_source()
67
- test_source_w_pwindex()
99
+ assert set_coverage.shape == (30, 6)
tests/test_statistics_hypothesis_testing.py ADDED
@@ -0,0 +1,62 @@
1
+ import numpy as np
2
+ from scipy.stats import fisher_exact
3
+
4
+ from napistu.statistics import hypothesis_testing
5
+
6
+
7
+ def test_fisher_exact_vectorized_basic_and_vectorized():
8
+
9
+ # Classic Fisher's test example: [[1, 9], [11, 3]]
10
+ # a=1, b=9, c=11, d=3
11
+ odds, p = hypothesis_testing.fisher_exact_vectorized([1], [9], [11], [3])
12
+ # Odds ratio: (1*3)/(9*11) = 3/99 = 0.0303...
13
+ assert np.allclose(odds, [3 / 99])
14
+ assert p.shape == (1,)
15
+ assert (p >= 0).all() and (p <= 1).all()
16
+
17
+ # Vectorized: two tables
18
+ odds, p = hypothesis_testing.fisher_exact_vectorized(
19
+ [1, 2], [9, 8], [11, 10], [3, 4]
20
+ )
21
+ assert odds.shape == (2,)
22
+ assert p.shape == (2,)
23
+ # Check that odds ratios are correct
24
+ expected_odds = np.array([(1 * 3) / (9 * 11), (2 * 4) / (8 * 10)])
25
+ assert np.allclose(odds, expected_odds)
26
+ # P-values should be between 0 and 1
27
+ assert (p >= 0).all() and (p <= 1).all()
28
+
29
+
30
+ def test_fisher_exact_vectorized_vs_scipy():
31
+
32
+ # Define several 2x2 tables
33
+ tables = [
34
+ ([1], [9], [11], [3]),
35
+ ([5], [2], [8], [7]),
36
+ ([10], [10], [10], [10]),
37
+ ([0], [5], [5], [10]),
38
+ ([3], [7], [2], [8]),
39
+ ]
40
+ for a, b, c, d in tables:
41
+ odds_vec, p_vec = hypothesis_testing.fisher_exact_vectorized(a, b, c, d)
42
+ # Build the table for scipy
43
+ table = np.array([[a[0], b[0]], [c[0], d[0]]])
44
+ odds_scipy, p_scipy = fisher_exact(table, alternative="greater")
45
+ # Odds ratios should be nearly identical
46
+ assert np.allclose(odds_vec, [odds_scipy], rtol=1e-6, atol=1e-8)
47
+ # P-values should be close (normal approx vs exact)
48
+ assert np.allclose(
49
+ p_vec, [p_scipy], rtol=0.15, atol=1e-3
50
+ ) # allow some tolerance
51
+
52
+ # Also test vectorized input
53
+ a = [1, 5, 10, 0, 3]
54
+ b = [9, 2, 10, 5, 7]
55
+ c = [11, 8, 10, 5, 2]
56
+ d = [3, 7, 10, 10, 8]
57
+ odds_vec, p_vec = hypothesis_testing.fisher_exact_vectorized(a, b, c, d)
58
+ for i in range(len(a)):
59
+ table = np.array([[a[i], b[i]], [c[i], d[i]]])
60
+ odds_scipy, p_scipy = fisher_exact(table, alternative="greater")
61
+ assert np.allclose(odds_vec[i], odds_scipy, rtol=1e-6, atol=1e-8)
62
+ assert np.allclose(p_vec[i], p_scipy, rtol=0.15, atol=1e-3)
tests/test_set_coverage.py DELETED
@@ -1,50 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from napistu import source
4
- from napistu.network import ng_utils
5
-
6
-
7
- def test_get_minimal_source_edges(sbml_dfs_metabolism):
8
- vertices = sbml_dfs_metabolism.reactions.reset_index().rename(
9
- columns={"r_id": "node"}
10
- )
11
-
12
- minimal_source_edges = ng_utils.get_minimal_sources_edges(
13
- vertices, sbml_dfs_metabolism
14
- )
15
- # print(minimal_source_edges.shape)
16
- assert minimal_source_edges.shape == (87, 3)
17
-
18
-
19
- def test_greedy_set_coverge_of_sources(sbml_dfs_metabolism):
20
- table_schema = sbml_dfs_metabolism.schema["reactions"]
21
-
22
- source_df = source.unnest_sources(
23
- sbml_dfs_metabolism.reactions, source_var="r_Source"
24
- )
25
- # print(source_df.shape)
26
- assert source_df.shape == (111, 7)
27
-
28
- set_coverage = source.greedy_set_coverge_of_sources(source_df, table_schema)
29
- # print(set_coverage.shape)
30
- assert set_coverage.shape == (87, 6)
31
-
32
-
33
- ################################################
34
- # __main__
35
- ################################################
36
-
37
- if __name__ == "__main__":
38
- import os
39
- from napistu import indices
40
- from napistu import consensus
41
-
42
- test_path = os.path.abspath(os.path.join(__file__, os.pardir))
43
- test_data = os.path.join(test_path, "test_data")
44
-
45
- pw_index = indices.PWIndex(os.path.join(test_data, "pw_index_metabolism.tsv"))
46
- sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index)
47
- sbml_dfs_metabolism = consensus.construct_consensus_model(sbml_dfs_dict, pw_index)
48
-
49
- test_get_minimal_source_edges(sbml_dfs_metabolism)
50
- test_greedy_set_coverge_of_sources(sbml_dfs_metabolism)