PyPI - napistu - Versions diffs - 0.1.0__py3-none-any.whl - Mend

napistu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

napistu/__init__.py +12 -0
napistu/__main__.py +867 -0
napistu/consensus.py +1557 -0
napistu/constants.py +500 -0
napistu/gcs/__init__.py +10 -0
napistu/gcs/constants.py +69 -0
napistu/gcs/downloads.py +180 -0
napistu/identifiers.py +805 -0
napistu/indices.py +227 -0
napistu/ingestion/__init__.py +10 -0
napistu/ingestion/bigg.py +146 -0
napistu/ingestion/constants.py +296 -0
napistu/ingestion/cpr_edgelist.py +106 -0
napistu/ingestion/identifiers_etl.py +148 -0
napistu/ingestion/obo.py +268 -0
napistu/ingestion/psi_mi.py +276 -0
napistu/ingestion/reactome.py +218 -0
napistu/ingestion/sbml.py +621 -0
napistu/ingestion/string.py +356 -0
napistu/ingestion/trrust.py +285 -0
napistu/ingestion/yeast.py +147 -0
napistu/mechanism_matching.py +597 -0
napistu/modify/__init__.py +10 -0
napistu/modify/constants.py +86 -0
napistu/modify/curation.py +628 -0
napistu/modify/gaps.py +635 -0
napistu/modify/pathwayannot.py +1381 -0
napistu/modify/uncompartmentalize.py +264 -0
napistu/network/__init__.py +10 -0
napistu/network/constants.py +117 -0
napistu/network/neighborhoods.py +1594 -0
napistu/network/net_create.py +1647 -0
napistu/network/net_utils.py +652 -0
napistu/network/paths.py +500 -0
napistu/network/precompute.py +221 -0
napistu/rpy2/__init__.py +127 -0
napistu/rpy2/callr.py +168 -0
napistu/rpy2/constants.py +101 -0
napistu/rpy2/netcontextr.py +464 -0
napistu/rpy2/rids.py +697 -0
napistu/sbml_dfs_core.py +2216 -0
napistu/sbml_dfs_utils.py +304 -0
napistu/source.py +394 -0
napistu/utils.py +943 -0
napistu-0.1.0.dist-info/METADATA +56 -0
napistu-0.1.0.dist-info/RECORD +77 -0
napistu-0.1.0.dist-info/WHEEL +5 -0
napistu-0.1.0.dist-info/entry_points.txt +2 -0
napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
napistu-0.1.0.dist-info/top_level.txt +2 -0
tests/__init__.py +0 -0
tests/conftest.py +83 -0
tests/test_consensus.py +255 -0
tests/test_constants.py +20 -0
tests/test_curation.py +134 -0
tests/test_data/__init__.py +0 -0
tests/test_edgelist.py +20 -0
tests/test_gcs.py +23 -0
tests/test_identifiers.py +151 -0
tests/test_igraph.py +353 -0
tests/test_indices.py +88 -0
tests/test_mechanism_matching.py +126 -0
tests/test_net_utils.py +66 -0
tests/test_netcontextr.py +105 -0
tests/test_obo.py +34 -0
tests/test_pathwayannot.py +95 -0
tests/test_precomputed_distances.py +222 -0
tests/test_rpy2.py +61 -0
tests/test_sbml.py +46 -0
tests/test_sbml_dfs_create.py +307 -0
tests/test_sbml_dfs_utils.py +22 -0
tests/test_sbo.py +11 -0
tests/test_set_coverage.py +50 -0
tests/test_source.py +67 -0
tests/test_uncompartmentalize.py +40 -0
tests/test_utils.py +487 -0
tests/utils.py +30 -0

napistu/ingestion/psi_mi.py ADDED Viewed

@@ -0,0 +1,276 @@
+from __future__ import annotations
+import logging
+import os
+import xml.etree.ElementTree as ET
+from typing import Any
+from napistu import utils
+from napistu.ingestion.constants import PSI_MI_INTACT_DEFAULT_OUTPUT_DIR
+from napistu.ingestion.constants import PSI_MI_INTACT_FTP_URL
+from napistu.ingestion.constants import PSI_MI_INTACT_SPECIES_TO_BASENAME
+from napistu.ingestion.constants import PSI_MI_INTACT_XML_NAMESPACE
+logger = logging.getLogger(__name__)
+def format_psi(
+    xml_path: str, xml_namespace: str = PSI_MI_INTACT_XML_NAMESPACE
+) -> list[dict[str, Any]]:
+    """
+    Format PSI 3.0
+    Format an .xml file containing molecular interactions following the PSI 3.0 format.
+    Args:
+        xml_path (str): path to a .xml file
+        xml_namespace (str): Namespace for the xml file
+    Returns:
+        entry_list (list): a list containing molecular interaction entry dicts of the format:
+            - source : dict containing the database that interactions were drawn from.
+            - experiment : a simple summary of the experimental design and the publication.
+            - interactor_list : list containing dictionaries annotating the molecules
+              (defined by their "interactor_id") involved in interactions.
+            - interactions_list : list containing dictionaries annotating molecular
+              interactions involving a set of "interactor_id"s.
+    """
+    if not os.path.isfile(xml_path):
+        raise FileNotFoundError(f"{xml_path} was not found")
+    et = ET.parse(xml_path)
+    # the root should be an entrySet if this is a PSI 3.0 file
+    entry_set = et.getroot()
+    assert entry_set.tag == PSI_MI_INTACT_XML_NAMESPACE + "entrySet"
+    entry_nodes = entry_set.findall(f"./{PSI_MI_INTACT_XML_NAMESPACE}entry")
+    logger.info(f"Processing {len(entry_nodes)} entries from {xml_path}")
+    formatted_entries = [_format_entry(an_entry) for an_entry in entry_nodes]
+    return formatted_entries
+def _download_intact_species(
+    species: str,
+    output_dir_path: str = PSI_MI_INTACT_DEFAULT_OUTPUT_DIR,
+    overwrite: bool = False,
+):
+    """
+    Download IntAct Species
+    Download the PSM-30 XML files from IntAct for a species of interest.
+    Args:
+        species (str): The species name (Genus species) to work with
+        output_dir_path (str): Local directory to create an unzip files into
+        overwrite (bool): Overwrite an existing output directory. Default: False
+    Returns:
+        None
+    """
+    if species not in PSI_MI_INTACT_SPECIES_TO_BASENAME.keys():
+        raise ValueError(
+            f"The provided species {species} did not match any of the species in INTACT_SPECIES_TO_BASENAME: "
+            f"{', '.join(PSI_MI_INTACT_SPECIES_TO_BASENAME.keys())}"
+        )
+    intact_species_url = os.path.join(
+        PSI_MI_INTACT_FTP_URL, f"{PSI_MI_INTACT_SPECIES_TO_BASENAME[species]}.zip"
+    )
+    logger.info(f"Downloading and unzipping {intact_species_url}")
+    utils.download_and_extract(
+        intact_species_url,
+        output_dir_path=output_dir_path,
+        download_method="ftp",
+        overwrite=overwrite,
+    )
+def _format_entry(an_entry) -> dict[str, Any]:
+    """Extract a single XML entry of interactors and interactions."""
+    assert an_entry.tag == PSI_MI_INTACT_XML_NAMESPACE + "entry"
+    entry_dict = {
+        "source": _format_entry_source(an_entry),
+        "experiment": _format_entry_experiment(an_entry),
+        "interactor_list": _format_entry_interactor_list(an_entry),
+        "interactions_list": _format_entry_interactions(an_entry),
+    }
+    return entry_dict
+def _format_entry_source(an_entry) -> dict[str, str]:
+    """Format the source describing the provenance of an XML entry."""
+    assert an_entry.tag == PSI_MI_INTACT_XML_NAMESPACE + "entry"
+    source_names = an_entry.find(
+        f".{PSI_MI_INTACT_XML_NAMESPACE}source/.{PSI_MI_INTACT_XML_NAMESPACE}names"
+    )
+    out = {
+        "short_label": source_names.find(
+            f".{PSI_MI_INTACT_XML_NAMESPACE}shortLabel"
+        ).text,
+        "full_name": source_names.find(f".{PSI_MI_INTACT_XML_NAMESPACE}fullName").text,
+    }
+    return out
+def _format_entry_experiment(an_entry) -> dict[str, str]:
+    """Format experiment-level information in an XML entry."""
+    assert an_entry.tag == PSI_MI_INTACT_XML_NAMESPACE + "entry"
+    experiment_info = an_entry.find(
+        f".{PSI_MI_INTACT_XML_NAMESPACE}experimentList/.{PSI_MI_INTACT_XML_NAMESPACE}experimentDescription"
+    )
+    primary_ref = experiment_info.find(
+        f".{PSI_MI_INTACT_XML_NAMESPACE}bibref/{PSI_MI_INTACT_XML_NAMESPACE}xref/{PSI_MI_INTACT_XML_NAMESPACE}primaryRef"
+    )
+    out = {
+        "experiment_name": experiment_info.find(
+            f".{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}fullName"
+        ).text,
+        "interaction_method": experiment_info.find(
+            f".{PSI_MI_INTACT_XML_NAMESPACE}interactionDetectionMethod/{PSI_MI_INTACT_XML_NAMESPACE}"
+            f"names/{PSI_MI_INTACT_XML_NAMESPACE}fullName"
+        ).text,
+        "primary_ref_db": primary_ref.attrib["db"],
+        "primary_ref_id": primary_ref.attrib["id"],
+    }
+    return out
+def _format_entry_interactor_list(an_entry) -> list[dict[str, Any]]:
+    """Format the molecular interactors in an XML entry."""
+    assert an_entry.tag == PSI_MI_INTACT_XML_NAMESPACE + "entry"
+    interactor_list = an_entry.find(f"./{PSI_MI_INTACT_XML_NAMESPACE}interactorList")
+    return [_format_entry_interactor(x) for x in interactor_list]
+def _format_entry_interactor(interactor) -> dict[str, Any]:
+    """Format a single molecular interactor in an interaction list XML node."""
+    assert interactor.tag == PSI_MI_INTACT_XML_NAMESPACE + "interactor"
+    # optional full name
+    interactor_name_node = interactor.find(
+        f"./{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}fullName"
+    )
+    if interactor_name_node is None:
+        interactor_name_value = ""  # type: ignore
+    else:
+        interactor_name_value = interactor_name_node.text  # type: ignore
+    interactor_aliases = [
+        {"alias_type": x.attrib["type"], "alias_value": x.text}
+        for x in interactor.findall(
+            f"./{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}alias"
+        )
+    ]  # type: ignore
+    out = {
+        "interactor_id": interactor.attrib["id"],
+        "interactor_label": interactor.find(
+            f"./{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}shortLabel"
+        ).text,
+        "interactor_name": interactor_name_value,
+        "interactor_aliases": interactor_aliases,
+        "interactor_xrefs": _format_entry_interactor_xrefs(interactor),
+    }
+    return out
+def _format_entry_interactor_xrefs(interactor) -> list[dict[str, str]]:
+    """Format the cross-references of a single interactor."""
+    assert interactor.tag == PSI_MI_INTACT_XML_NAMESPACE + "interactor"
+    xref_nodes = [
+        *[
+            interactor.find(
+                f"./{PSI_MI_INTACT_XML_NAMESPACE}xref/{PSI_MI_INTACT_XML_NAMESPACE}primaryRef"
+            )
+        ],
+        *interactor.findall(
+            f"./{PSI_MI_INTACT_XML_NAMESPACE}xref/{PSI_MI_INTACT_XML_NAMESPACE}secondaryRef"
+        ),
+    ]
+    out = [
+        {"tag": x.tag, "db": x.attrib["db"], "id": x.attrib["id"]} for x in xref_nodes
+    ]
+    return out
+def _format_entry_interactions(an_entry) -> list[dict[str, Any]]:
+    """Format the molecular interaction in an XML entry."""
+    assert an_entry.tag == PSI_MI_INTACT_XML_NAMESPACE + "entry"
+    interaction_list = an_entry.find(f"./{PSI_MI_INTACT_XML_NAMESPACE}interactionList")
+    interaction_dicts = [_format_entry_interaction(x) for x in interaction_list]
+    return interaction_dicts
+def _format_entry_interaction(interaction) -> dict[str, Any]:
+    """Format a single interaction in an XML interaction list."""
+    assert interaction.tag == PSI_MI_INTACT_XML_NAMESPACE + "interaction"
+    interaction_name = interaction.find(
+        f"./{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}shortLabel"
+    ).text
+    interaction_participants = interaction.findall(
+        f"./{PSI_MI_INTACT_XML_NAMESPACE}participantList/{PSI_MI_INTACT_XML_NAMESPACE}participant"
+    )
+    # iterate through particpants and format them as a list of dicts
+    interactors = [
+        _format_entry_interaction_participants(x) for x in interaction_participants
+    ]
+    out = {"interaction_name": interaction_name, "interactors": interactors}
+    return out
+def _format_entry_interaction_participants(interaction_participant) -> dict[str, str]:
+    """Format the participants in an XML interaction."""
+    assert interaction_participant.tag == PSI_MI_INTACT_XML_NAMESPACE + "participant"
+    out = {
+        "interactor_id": interaction_participant.attrib["id"],
+        "biological_role": interaction_participant.find(
+            f"./{PSI_MI_INTACT_XML_NAMESPACE}biologicalRole/{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}fullName"
+        ).text,
+        "experimental_role": interaction_participant.find(
+            f"./{PSI_MI_INTACT_XML_NAMESPACE}experimentalRoleList/{PSI_MI_INTACT_XML_NAMESPACE}experimentalRole/"
+            f"{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}fullName"
+        ).text,
+    }
+    return out

napistu/ingestion/reactome.py ADDED Viewed

@@ -0,0 +1,218 @@
+from __future__ import annotations
+import datetime
+import logging
+import os
+import random
+from io import StringIO
+from typing import Iterable
+import pandas as pd
+import requests
+from napistu import indices
+from napistu import sbml_dfs_core
+from napistu import utils
+from napistu.consensus import construct_consensus_model
+from napistu.consensus import construct_sbml_dfs_dict
+from napistu.ingestion.constants import REACTOME_PATHWAY_INDEX_COLUMNS
+from napistu.ingestion.constants import REACTOME_PATHWAY_LIST_COLUMNS
+from napistu.ingestion.constants import REACTOME_PATHWAYS_URL
+from napistu.ingestion.constants import REACTOME_SBGN_URL
+from napistu.ingestion.constants import REACTOME_SMBL_URL
+from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
+from fs import open_fs
+logger = logging.getLogger(__name__)
+def reactome_sbgn_download(output_dir_path: str, overwrite: bool = False):
+    """
+    Reactome SBGN Download
+    Download all human Reactome SBGN (systems biology graphical notation) files.
+    Args:
+        output_dir_path (str): Paths to a directory where .sbgn files should be saved.
+        overwrite (bool): Overwrite an existing output directory.
+    """
+    utils.download_and_extract(
+        REACTOME_SBGN_URL,
+        output_dir_path=output_dir_path,
+        overwrite=overwrite,
+    )
+    # create the pathway index
+    pw_index = _build_reactome_pw_index(
+        output_dir_path,
+        file_ext="sbgn",
+        # For sbgn only homo sapiens files are available
+        species_filter=(SPECIES_FULL_NAME_HUMAN,),
+    )
+    # save as tsv
+    out_fs = open_fs(output_dir_path)
+    with out_fs.open("pw_index.tsv", "wb") as index_path:
+        pw_index.to_csv(index_path, sep="\t", index=False)
+def reactome_sbml_download(output_dir_path: str, overwrite: bool = False):
+    """
+    Reactome SBML Download
+    Download Reactome SBML (systems biology markup language) for all reactome species.
+    Args:
+        output_dir_path (str): Paths to a directory where .sbml files should be saved.
+        overwrite (bool): Overwrite an existing output directory. Default: False
+    """
+    utils.download_and_extract(
+        REACTOME_SMBL_URL,
+        output_dir_path=output_dir_path,
+        overwrite=overwrite,
+    )
+    # create the pathway index
+    pw_index = _build_reactome_pw_index(output_dir_path, file_ext="sbml")
+    # save as tsv
+    out_fs = open_fs(output_dir_path)
+    with out_fs.open("pw_index.tsv", "wb") as index_path:
+        pw_index.to_csv(index_path, sep="\t", index=False)
+# Functions useful to integrate reactome pathways into a consensus
+def construct_reactome_consensus(
+    pw_index_inp: str | indices.PWIndex,
+    species: str | Iterable[str] | None = None,
+    outdir: str | None = None,
+    strict: bool = True,
+) -> sbml_dfs_core.SBML_dfs:
+    """Constructs a basic consensus model by merging all models from a pw_index
+    Args:
+        pw_index_inp (str | indices.PWIndex): PWIndex or uri pointing to PWIndex
+        species (str | Iterable[str] | None): one or more species to filter by. Default: no filtering
+        outdir (str | None, optional): output directory used to cache results. Defaults to None.
+        strict (bool): should failure of loading any given model throw an exception? If False a warning is thrown.
+    Returns:
+        sbml_dfs_core.SBML_dfs: A consensus SBML
+    """
+    if isinstance(pw_index_inp, str):
+        pw_index = indices.adapt_pw_index(pw_index_inp, species=species, outdir=outdir)
+    elif isinstance(pw_index_inp, indices.PWIndex):
+        pw_index = pw_index_inp
+    else:
+        raise ValueError("pw_index_inp needs to be a PWIndex or a str to a location.")
+    if outdir is not None:
+        construct_sbml_dfs_dict_fkt = utils.pickle_cache(
+            os.path.join(outdir, "model_pool.pkl")
+        )(construct_sbml_dfs_dict)
+        construct_consensus_model_fkt = utils.pickle_cache(
+            os.path.join(outdir, "consensus.pkl")
+        )(construct_consensus_model)
+    else:
+        construct_sbml_dfs_dict_fkt = construct_sbml_dfs_dict
+        construct_consensus_model_fkt = construct_consensus_model
+    sbml_dfs_dict = construct_sbml_dfs_dict_fkt(pw_index, strict)
+    consensus_model = construct_consensus_model_fkt(sbml_dfs_dict, pw_index)
+    return consensus_model
+def _build_reactome_pw_index(
+    output_dir: str,
+    file_ext: str,
+    species_filter: Iterable[str] | None = None,
+) -> pd.DataFrame:
+    """Build a reactome pathway index
+    Builds the index based on available files and cross-checkes it with the
+    expected reactome pathway list.
+    Args:
+        output_dir (str): File directory
+        file_ext (str): File extension
+        species_filter (Optional[Iterable[str]], optional): Filter the expected
+            pathway list based on a list of species. Eg in cases only one species available. Defaults to None.
+    Returns:
+        pd.DataFrame: pathway index
+    """
+    # create the pathway index
+    out_fs = open_fs(output_dir)
+    all_files = [os.path.basename(f.path) for f in out_fs.glob(f"**/*.{file_ext}")]
+    if len(all_files) == 0:
+        raise ValueError(f"Zero files in {output_dir} have the {file_ext} extension")
+    pw_index = pd.DataFrame({"file": all_files}).assign(source="Reactome")
+    pw_index["pathway_id"] = [os.path.splitext(x)[0] for x in pw_index["file"]]
+    # test before merging
+    pathway_list = _get_reactome_pathway_list()
+    if species_filter is not None:
+        pathway_list = pathway_list.loc[pathway_list["species"].isin(species_filter)]
+    _check_reactome_pw_index(pw_index, pathway_list)
+    pw_index = pw_index.merge(pathway_list)
+    pw_index = pw_index[REACTOME_PATHWAY_INDEX_COLUMNS]
+    pw_index["date"] = datetime.date.today().strftime("%Y%m%d")
+    return pw_index
+def _check_reactome_pw_index(pw_index: indices.PWIndex, reactome_pathway_list: list):
+    """Compare local files defined in the pathway index to a list of Reactome's pathways."""
+    # check extension in pw_index
+    extn = set([os.path.splitext(x)[1] for x in pw_index["file"]])
+    assert len(extn) == 1
+    assert len(extn.intersection(set([".sbgn", ".sbml"]))) == 1
+    extn_string = extn.pop()
+    local_reactome_pws = set(pw_index["pathway_id"])
+    remote_reactome_pws = set(reactome_pathway_list["pathway_id"])
+    extra_local = local_reactome_pws.difference(remote_reactome_pws)
+    if len(extra_local) != 0:
+        n_samples = min(5, len(extra_local))
+        local_str = ", ".join(random.sample(list(extra_local), n_samples))
+        logger.warning(
+            f"{len(extra_local)} Reactome {extn_string} files were detected "
+            "which are not found in reactome.get_reactome_pathway_list(). "
+            f"The include {local_str}. "
+            "These files will be excluded from the pathway index"
+        )
+    extra_remote = remote_reactome_pws.difference(local_reactome_pws)
+    if len(extra_remote) != 0:
+        n_samples = min(5, len(extra_remote))
+        remote_str = ", ".join(random.sample(list(extra_remote), n_samples))
+        logger.warning(
+            f"{len(extra_remote)} Reactome {extn_string} files were missing "
+            "which should be present based on reactome.get_reactome_pathway_list(). "
+            f"These include {remote_str}."
+        )
+    return None
+def _get_reactome_pathway_list():
+    """Reactome Pathway List
+    Produce a pd.DataFrame listing all pathways in reactome and their internal ids
+    Parameters:
+        None
+    Returns:
+        pd.DataFrame containing pathway_id, name and species
+    """
+    page = requests.get(REACTOME_PATHWAYS_URL)
+    if page.status_code != 200:
+        raise ValueError(
+            f"Reactome data could not be accessed at {REACTOME_PATHWAYS_URL}"
+        )
+    StringData = StringIO(page.content.decode())
+    df = pd.read_csv(StringData, sep="\t", names=REACTOME_PATHWAY_LIST_COLUMNS)
+    return df