napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/network/precompute.py
ADDED
@@ -0,0 +1,221 @@
from __future__ import annotations

import logging
import math

import igraph as ig
import numpy as np
import pandas as pd

from napistu.network import net_utils

logger = logging.getLogger(__name__)


def precompute_distances(
    cpr_graph: ig.Graph,
    max_steps: int = -1,
    max_score_q: float = 1.0,
    partition_size: int = 5000,
    weights_vars: list[str] = ["weights", "upstream_weights"],
) -> pd.DataFrame:
    """
    Pre-Compute Distances

    Parameters
    ----------
    cpr_graph: ig.Graph
        An igraph network model
    max_steps: int
        The maximum number of steps between pairs of species to save a distance
    max_score_q: float
        Retain up to the "max_score_q" quantile of all scores (smaller scores are better)
    partition_size: int
        The number of species to process together when computing distances. Decreasing this
        value will lower the overall memory footprint of the distance calculation.
    weights_vars: list
        One or more variables defining edge weights to use when calculating weighted
        shortest paths. Shortest paths will be calculated separately with each type of
        weights and used to construct path weights named "path_{weight_var}".

    Returns
    -------
    A pd.DataFrame containing:
    - sc_id_origin: origin node
    - sc_id_dest: destination node
    - path_length: minimum path length between the origin and the destination
    - path_weight*: minimum path weight between the origin and the destination (formed by
      summing the weights of individual edges); one such variable will exist for each
      weight specified in "weights_vars"
    """

    if max_steps == -1:
        max_steps = int(100000)

    # validate inputs
    if max_steps < 1:
        raise ValueError(f"max_steps must be >= 1, but was {max_steps}")

    if (max_score_q < 0) or (max_score_q > 1):
        raise ValueError(f"max_score_q must be between 0 and 1 but was {max_score_q}")

    # make sure weight variables exist
    net_utils._validate_edge_attributes(cpr_graph, weights_vars)

    # assign molecular species to partitions
    vs_to_partition = pd.DataFrame(
        {"sc_id": cpr_graph.vs["name"], "node_type": cpr_graph.vs["node_type"]}
    ).query("node_type == 'species'")

    n_partitions = math.ceil(vs_to_partition.shape[0] / partition_size)

    vs_to_partition["partition"] = vs_to_partition.index % n_partitions
    vs_to_partition = vs_to_partition.set_index("partition").sort_index()

    # iterate through all partitions of "from" nodes and find their shortest and lowest-weight paths
    unique_partitions = vs_to_partition.index.unique().tolist()

    precomputed_distances = (
        pd.concat(
            [
                _calculate_distances_subset(
                    cpr_graph,
                    vs_to_partition,
                    vs_to_partition.loc[uq_part],
                    weights_vars=weights_vars,
                )
                for uq_part in unique_partitions
            ]
        )
        .reset_index(drop=True)
        .query("sc_id_origin != sc_id_dest")
    )

    # filter by path length and/or weight
    filtered_precomputed_distances = _filter_precomputed_distances(
        precomputed_distances=precomputed_distances,
        max_steps=max_steps,
        max_score_q=max_score_q,
        path_weights_vars=["path_" + w for w in weights_vars],
    )

    return filtered_precomputed_distances


def _calculate_distances_subset(
    cpr_graph: ig.Graph,
    vs_to_partition: pd.DataFrame,
    one_partition: pd.DataFrame,
    weights_vars: list[str] = ["weights", "upstream_weights"],
) -> pd.DataFrame:
    """Calculate distances from a subset of vertices to all vertices."""

    d_steps = (
        pd.DataFrame(
            np.array(
                cpr_graph.distances(
                    source=one_partition["sc_id"], target=vs_to_partition["sc_id"]
                )
            ),
            index=one_partition["sc_id"].rename("sc_id_origin"),
            columns=vs_to_partition["sc_id"].rename("sc_id_dest"),
        )
        .reset_index()
        .melt("sc_id_origin", value_name="path_length")
        .replace([np.inf, -np.inf], np.nan, inplace=False)
        .dropna()
    )

    d_weights_list = list()
    for weight_type in weights_vars:
        d_weights_subset = (
            pd.DataFrame(
                np.array(
                    cpr_graph.distances(
                        source=one_partition["sc_id"],
                        target=vs_to_partition["sc_id"],
                        weights=weight_type,
                    )
                ),
                index=one_partition["sc_id"].rename("sc_id_origin"),
                columns=vs_to_partition["sc_id"].rename("sc_id_dest"),
            )
            .reset_index()
            .melt("sc_id_origin", value_name=f"path_{weight_type}")
            .replace([np.inf, -np.inf], np.nan, inplace=False)
            .dropna()
        )

        d_weights_list.append(d_weights_subset)

    d_weights = d_weights_list.pop()
    while len(d_weights_list) > 0:
        d_weights = d_weights.merge(d_weights_list.pop())

    # merge shortest-path distances by length and by weight
    # note: these may be different paths! e.g., a longer path may have a lower weight than a shorter one
    path_summaries = d_steps.merge(
        d_weights,
        left_on=["sc_id_origin", "sc_id_dest"],
        right_on=["sc_id_origin", "sc_id_dest"],
    )

    # return connected species
    return path_summaries


def _filter_precomputed_distances(
    precomputed_distances: pd.DataFrame,
    max_steps: float | int = np.inf,
    max_score_q: float = 1,
    path_weights_vars: list[str] = ["path_weights", "path_upstream_weights"],
) -> pd.DataFrame:
    """Filter precomputed distances by maximum steps and/or to low scores by quantile."""

    # filter by path lengths
    # .copy() so the .loc assignment below modifies a real frame rather than a view
    short_precomputed_distances = precomputed_distances[
        precomputed_distances["path_length"] <= max_steps
    ].copy()
    n_filtered_by_path_length = (
        precomputed_distances.shape[0] - short_precomputed_distances.shape[0]
    )
    if n_filtered_by_path_length > 0:
        logger.info(
            f"filtered {n_filtered_by_path_length} possible paths with length > {max_steps}"
        )

    # filter by path weights
    for wt_var in path_weights_vars:
        score_q_cutoff = np.quantile(short_precomputed_distances[wt_var], max_score_q)

        short_precomputed_distances.loc[
            short_precomputed_distances[wt_var] > score_q_cutoff, wt_var
        ] = np.nan

    valid_weights = short_precomputed_distances[path_weights_vars].dropna(how="all")

    low_weight_precomputed_distances = short_precomputed_distances[
        short_precomputed_distances.index.isin(valid_weights.index.tolist())
    ]

    n_filtered_by_low_weight = (
        short_precomputed_distances.shape[0] - low_weight_precomputed_distances.shape[0]
    )

    if n_filtered_by_low_weight > 0:
        logger.info(
            f"filtered {n_filtered_by_low_weight} possible paths with path weights greater"
        )
        logger.info(f"than the {max_score_q} quantile of the path weight distribution")

    weight_nan_summary = valid_weights.isnull().sum()
    if any(weight_nan_summary != 0):
        nan_summary = " and ".join(
            [
                f"{k} has {v} np.nan values"
                for k, v in weight_nan_summary.to_dict().items()
            ]
        )
        logger.info(nan_summary)

    return low_weight_precomputed_distances
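For orientation, here is a minimal sketch of calling precompute_distances. The three-node toy graph, its attribute values, and the import path napistu.network.precompute are illustrative assumptions by the editor, not content of the diff; the only requirements implied by the code above are vertex "name" and "node_type" attributes and edge attributes matching weights_vars.

# hypothetical usage sketch, not part of the package
import igraph as ig
from napistu.network.precompute import precompute_distances

g = ig.Graph(edges=[(0, 1), (1, 2)], directed=True)
g.vs["name"] = ["A", "B", "C"]
g.vs["node_type"] = ["species", "species", "species"]
g.es["weights"] = [0.5, 0.25]
g.es["upstream_weights"] = [0.5, 0.25]

# returns one row per connected, distinct species pair, with path_length,
# path_weights, and path_upstream_weights columns
distances = precompute_distances(g, max_steps=10, max_score_q=1.0)
print(distances[["sc_id_origin", "sc_id_dest", "path_length"]])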
napistu/rpy2/__init__.py
ADDED
@@ -0,0 +1,127 @@
from __future__ import annotations

import functools
import logging
import os
import sys

logger = logging.getLogger(__name__)

try:
    import rpy2  # noqa

    has_rpy2 = True

    from rpy2.robjects import conversion, default_converter  # noqa
    from rpy2.robjects.packages import importr  # noqa

except ImportError:
    has_rpy2 = False
    logger.warning(
        "rpy2 is not installed. "
        "Some functions will not work. "
        "Consider installing `cpr[rpy2]`."
    )
except Exception as e:
    has_rpy2 = False
    print(e)
    logger.warning("rpy2 initialization failed with an unrecognized exception.")


def warn_if_no_rpy2(func):
    @functools.wraps(func)
    def warn_if_no_rpy2_wrapper(*args, **kwargs):
        if not has_rpy2:
            raise ImportError(
                "This function requires `rpy2`. \n"
                "Please install `cpr` with the `rpy2` extra dependencies. \n"
                "For example: `pip install cpr[rpy2]`\n"
            )
        return func(*args, **kwargs)

    return warn_if_no_rpy2_wrapper


def rsession_info() -> None:
    # Report a summary of the R installation found by rpy2.
    # The default converters bundled with rpy2 are used for this step rather
    # than those bundled with rpy2_arrow, because rpy2_arrow requires the R
    # arrow package and can therefore be difficult to import without a valid
    # R setup.

    with conversion.localconverter(default_converter):
        base = importr("base")
        utils = importr("utils")

        lib_paths = base._libPaths()
        session_info = utils.sessionInfo()

        logger.warning(
            "An exception occurred when running some rpy2-related functionality\n"
            "Here is a summary of your R session\n"
            f"Using R version in {base.R_home()[0]}\n"
            ".libPaths ="
        )
        logger.warning("\n".join(lib_paths))
        logger.warning(f"sessionInfo = {session_info}")
        # suggest a fix; _r_homer_warning() logs its suggestions directly
        _r_homer_warning()

    return None


def _r_homer_warning() -> None:
    # Utility function to suggest installation directions for R
    # as part of rsession_info.

    is_conda = os.path.exists(os.path.join(sys.prefix, "conda-meta"))
    if is_conda:
        r_lib_path = os.path.join(sys.prefix, "lib", "R")
        if os.path.isdir(r_lib_path):
            logger.warning(
                "You seem to be working in a conda environment with R installed.\n"
                "If this version was not located by rpy2 then try to set R_HOME using:\n"
                f"os.environ['R_HOME'] = {r_lib_path}"
            )
        else:
            logger.warning(
                "You seem to be working in a conda environment but R is NOT installed.\n"
                "If this is the case then install R, the CPR R package, and the R arrow package into your\n"
                "conda environment and then set the R_HOME environment variable using:\n"
                "os.environ['R_HOME'] = <<PATH_TO_R_lib/R>>"
            )
    else:
        logger.warning(
            "If you don't have R installed or if your desired R library does not match the\n"
            "one above, then set your R_HOME environment variable using:\n"
            "os.environ['R_HOME'] = <<PATH_TO_lib/R>>"
        )

    return None


def report_r_exceptions(function):
    @functools.wraps(function)
    def report_r_exceptions_wrapper(*args, **kwargs):
        if not has_rpy2:
            raise ImportError(
                "This function requires `rpy2`. \n"
                "Please install `cpr` with the `rpy2` extra dependencies. \n"
                "For example: `pip install cpr[rpy2]`\n"
            )
        try:
            return function(*args, **kwargs)
        except Exception as e:
            # log the exception
            err = "There was an exception in " + function.__name__
            logger.warning(err)
            # report session info
            rsession_info()
            # re-raise the exception
            raise e

    return report_r_exceptions_wrapper
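As a usage sketch (editor-added, not from the package): warn_if_no_rpy2 turns a missing optional dependency into a clear ImportError at call time rather than at import time, so the rest of napistu stays importable without R. The function my_r_helper below is hypothetical.

# hypothetical usage sketch, not part of the package
from napistu.rpy2 import has_rpy2, warn_if_no_rpy2

@warn_if_no_rpy2
def my_r_helper():
    from rpy2.robjects.packages import importr
    return importr("stats")  # any R-touching work goes here

if has_rpy2:
    stats = my_r_helper()  # returns the R "stats" package
else:
    my_r_helper()  # raises ImportError suggesting `pip install cpr[rpy2]`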
napistu/rpy2/callr.py
ADDED
@@ -0,0 +1,168 @@
from __future__ import annotations

import pandas as pd
from napistu.rpy2 import has_rpy2
from napistu.rpy2 import report_r_exceptions
from napistu.rpy2 import rsession_info
from napistu.rpy2 import warn_if_no_rpy2

if has_rpy2:
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    from rpy2.robjects.packages import InstalledSTPackage, InstalledPackage
    import pyarrow

    # Loading rpy2_arrow checks whether the R arrow package can be found.
    # This is the first time a non-standard R package is loaded, so a bad
    # R setup can cause issues at this stage; rsession_info() adds some
    # helpful debugging information.
    try:
        import rpy2_arrow.arrow as pyra
    except Exception as e:
        rsession_info()
        raise e
    import rpy2.robjects.conversion
    import rpy2.rinterface
    import rpy2.robjects as ro


@warn_if_no_rpy2
@report_r_exceptions
def get_rcpr(
    r_paths: list[str] | None = None,
):
    """
    Get rcpr

    Gets the rcpr R package.

    Args:
        r_paths (list[str]):
            Paths to add to .libPaths() in R

    Returns:
        The rcpr R package
    """

    _ = get_rbase(r_paths)

    # connect to the rcpr R package
    rcpr = importr("rcpr")
    return rcpr


@warn_if_no_rpy2
@report_r_exceptions
def bioconductor_org_r_function(
    object_type: str, species: str, r_paths: list[str] | None = None
):
    """
    Bioconductor Organism R Function

    Calls "bioconductor_org_function" from the rcpr R package to pull a mapping object
    out of a species-specific library.

    Parameters:
        object_type (str):
            Type of function to call
        species (str):
            Species name
        r_paths (list[str]):
            Paths to add to .libPaths() in R. Alternatively, consider setting the
            R_HOME environment variable.

    Returns:
        pd.DataFrame or a function for non-tabular results
    """

    _ = get_rbase(r_paths)

    # connect to the rcpr R package
    cpr = importr("rcpr")

    results = cpr.bioconductor_org_function(object_type, species)

    return results


@report_r_exceptions
def get_rbase(
    r_paths: list[str] | None = None,
) -> InstalledSTPackage | InstalledPackage:
    """Get the base R package

    Args:
        r_paths (list[str], optional): Optional additional
            r_paths. Defaults to None.

    Returns:
        The base R package
    """
    base = importr("base")
    if r_paths is not None:
        base._libPaths(r_paths)
    return base


@warn_if_no_rpy2
@report_r_exceptions
def pandas_to_r_dataframe(df: pd.DataFrame) -> rpy2.robjects.DataFrame:
    """Convert a pandas dataframe to an R dataframe

    This uses rpy2-arrow to speed up the conversion
    by orders of magnitude.

    Args:
        df (pd.DataFrame): Pandas dataframe

    Returns:
        rpy2.robjects.DataFrame: R dataframe
    """
    conv = _get_py2rpy_pandas_conv()
    with (ro.default_converter + conv).context():
        r_df = ro.conversion.get_conversion().py2rpy(df)
    return r_df


@warn_if_no_rpy2
@report_r_exceptions
def r_dataframe_to_pandas(rdf: rpy2.robjects.DataFrame) -> pd.DataFrame:
    """Convert an R dataframe to a pandas dataframe

    Args:
        rdf (rpy2.robjects.DataFrame): R dataframe

    Returns:
        pd.DataFrame: Pandas dataframe
    """
    with (ro.default_converter + pandas2ri.converter).context():
        df = ro.conversion.get_conversion().rpy2py(rdf)
    return df


@warn_if_no_rpy2
@report_r_exceptions
def _get_py2rpy_pandas_conv():
    """Get the py2rpy arrow converter for pandas

    This is a high-performance converter using
    the rpy2-arrow functionality:
    https://rpy2.github.io/rpy2-arrow/version/main/html/index.html

    Returns:
        Callable: The converter function
    """
    base = get_rbase()
    # We use the converter included in rpy2-arrow as a template.
    conv = rpy2.robjects.conversion.Converter(
        "Pandas to data.frame", template=pyra.converter
    )

    @conv.py2rpy.register(pd.DataFrame)
    def py2rpy_pandas(dataf):
        pa_tbl = pyarrow.Table.from_pandas(dataf)
        # pa_tbl is a pyarrow Table, which the converter shipping
        # with rpy2-arrow knows how to handle
        return base.as_data_frame(pa_tbl)

    return conv
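A sketch of a pandas-to-R round trip using the two converters above (editor-added, not from the package); it assumes a working R installation with the R arrow package, since rpy2_arrow is loaded at import time. The toy dataframe is illustrative.

# hypothetical usage sketch, not part of the package
import pandas as pd
from napistu.rpy2.callr import pandas_to_r_dataframe, r_dataframe_to_pandas

df = pd.DataFrame({"gene": ["TP53", "MYC"], "score": [0.9, 0.4]})
r_df = pandas_to_r_dataframe(df)          # pandas -> Arrow -> R data.frame
round_trip = r_dataframe_to_pandas(r_df)  # R data.frame -> pandas
print(round_trip)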
napistu/rpy2/constants.py
ADDED
@@ -0,0 +1,101 @@
"""Module for rpy2 module-specific constants"""

# Contextualization
# Proteinatlas
from __future__ import annotations

from types import SimpleNamespace

from napistu.constants import ONTOLOGIES
from napistu.constants import MINI_SBO_FROM_NAME

# ontologies available for mapping via Bioconductor "org" packages as part of rpy2.rids;
# these are valid to map to and/or from when adding annotations to an SBML_dfs model
BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES = {
    ONTOLOGIES.ENSEMBL_GENE,
    ONTOLOGIES.ENSEMBL_TRANSCRIPT,
    ONTOLOGIES.ENSEMBL_PROTEIN,
    ONTOLOGIES.NCBI_ENTREZ_GENE,
    ONTOLOGIES.UNIPROT,
    ONTOLOGIES.GENE_NAME,
    ONTOLOGIES.SYMBOL,
}

# Bioconductor ontologies used for linking systematic identifiers
# (Entrez is not part of this list because it forms the gene index)
BIOC_DOGMATIC_MAPPING_ONTOLOGIES = {
    ONTOLOGIES.ENSEMBL_GENE,
    ONTOLOGIES.ENSEMBL_TRANSCRIPT,
    ONTOLOGIES.ENSEMBL_PROTEIN,
    ONTOLOGIES.UNIPROT,
    ONTOLOGIES.GENE_NAME,
    ONTOLOGIES.SYMBOL,
}
BIOC_PROTEIN_ONTOLOGIES = [ONTOLOGIES.UNIPROT, ONTOLOGIES.ENSEMBL_PROTEIN]
BIOC_GENE_ONTOLOGIES = [
    ONTOLOGIES.NCBI_ENTREZ_GENE,
    ONTOLOGIES.ENSEMBL_GENE,
    ONTOLOGIES.ENSEMBL_TRANSCRIPT,
]
BIOC_NAME_ONTOLOGIES = {
    ONTOLOGIES.GENE_NAME: 0,
    ONTOLOGIES.SYMBOL: 1,
    ONTOLOGIES.UNIPROT: 2,
    ONTOLOGIES.ENSEMBL_PROTEIN: 3,
}

# prefixes for Bioconductor mapping tables
BIOC_NOMENCLATURE = SimpleNamespace(
    CHR_TBL="CHR",
    ENSG_TBL="ENSEMBL",
    ENST_TBL="ENSEMBLTRANS",
    ENSP_TBL="ENSEMBLPROT",
    UNIPROT_TBL="UNIPROT",
    NAME_TBL="GENENAME",
    SYMBOL_TBL="SYMBOL",
    CHROMOSOME="chromosome",
    NCBI_ENTREZ_GENE="gene_id",
    ENSEMBL_GENE="ensembl_id",
    ENSEMBL_TRANSCRIPT="trans_id",
    ENSEMBL_PROTEIN="prot_id",
    UNIPROT="uniprot_id",
    GENE_NAME="gene_name",
    SYMBOL="symbol",
)

# netcontextr constants

COL_GENE = "gene"
COL_PROTEIN_1 = "protein1"
COL_PROTEIN_2 = "protein2"

FIELD_INTERACTIONS = "interactions"
FIELD_GENES = "genes"
FIELD_REACTIONS = "reactions"

# netcontextr reactions
COL_ROLE = "role"
COL_REACTION_ID = "reaction_id"
COL_STOICHIOMETRY = "stoi"

SBO_TERM_MAP = {
    "reactant": "substrate",
    "product": "product",
    "catalyst": "catalyst",
    "interactor": "interactor",
    "stimulator": "activator",
    "inhibitor": "inhibitor",
}

NETCONTEXTR_ONTOLOGY = "ensembl_gene"


def _map_sbo_identifiers() -> dict[str, str]:
    """Map SBO identifiers to netcontextr identifiers."""

    sbo_map = {MINI_SBO_FROM_NAME[k]: v for k, v in SBO_TERM_MAP.items()}

    return sbo_map


NETCONTEXTR_SBO_MAP = _map_sbo_identifiers()
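For illustration (editor-added, not from the package), these constants are plain lookups; NETCONTEXTR_SBO_MAP re-keys SBO_TERM_MAP by the SBO identifiers from MINI_SBO_FROM_NAME, whose exact identifier strings are not shown in this diff.

# hypothetical usage sketch, not part of the package
from napistu.rpy2.constants import BIOC_NOMENCLATURE, NETCONTEXTR_SBO_MAP, SBO_TERM_MAP

# role names as netcontextr expects them
print(SBO_TERM_MAP["stimulator"])             # "activator"
# keys are SBO identifiers, values are netcontextr roles
print(next(iter(NETCONTEXTR_SBO_MAP.items())))
# column name used for Entrez gene IDs in Bioconductor mapping tables
print(BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE)     # "gene_id"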