PyPI - napistu - Versions diffs - 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

napistu/__main__.py +126 -96
napistu/constants.py +35 -41
napistu/context/__init__.py +10 -0
napistu/context/discretize.py +462 -0
napistu/context/filtering.py +387 -0
napistu/gcs/__init__.py +1 -1
napistu/identifiers.py +74 -15
napistu/indices.py +68 -0
napistu/ingestion/__init__.py +1 -1
napistu/ingestion/bigg.py +47 -62
napistu/ingestion/constants.py +18 -133
napistu/ingestion/gtex.py +113 -0
napistu/ingestion/hpa.py +147 -0
napistu/ingestion/sbml.py +0 -97
napistu/ingestion/string.py +2 -2
napistu/matching/__init__.py +10 -0
napistu/matching/constants.py +18 -0
napistu/matching/interactions.py +518 -0
napistu/matching/mount.py +529 -0
napistu/matching/species.py +510 -0
napistu/mcp/__init__.py +7 -4
napistu/mcp/__main__.py +128 -72
napistu/mcp/client.py +16 -25
napistu/mcp/codebase.py +201 -153
napistu/mcp/component_base.py +170 -0
napistu/mcp/config.py +223 -0
napistu/mcp/constants.py +45 -2
napistu/mcp/documentation.py +253 -136
napistu/mcp/documentation_utils.py +13 -48
napistu/mcp/execution.py +372 -305
napistu/mcp/health.py +49 -67
napistu/mcp/profiles.py +10 -6
napistu/mcp/server.py +161 -80
napistu/mcp/tutorials.py +139 -87
napistu/modify/__init__.py +1 -1
napistu/modify/gaps.py +1 -1
napistu/network/__init__.py +1 -1
napistu/network/constants.py +101 -34
napistu/network/data_handling.py +388 -0
napistu/network/ig_utils.py +351 -0
napistu/network/napistu_graph_core.py +354 -0
napistu/network/neighborhoods.py +40 -40
napistu/network/net_create.py +373 -309
napistu/network/net_propagation.py +47 -19
napistu/network/{net_utils.py → ng_utils.py} +124 -272
napistu/network/paths.py +67 -51
napistu/network/precompute.py +11 -11
napistu/ontologies/__init__.py +10 -0
napistu/ontologies/constants.py +129 -0
napistu/ontologies/dogma.py +243 -0
napistu/ontologies/genodexito.py +649 -0
napistu/ontologies/mygene.py +369 -0
napistu/ontologies/renaming.py +198 -0
napistu/rpy2/__init__.py +229 -86
napistu/rpy2/callr.py +47 -77
napistu/rpy2/constants.py +24 -23
napistu/rpy2/rids.py +61 -648
napistu/sbml_dfs_core.py +587 -222
napistu/scverse/__init__.py +15 -0
napistu/scverse/constants.py +28 -0
napistu/scverse/loading.py +727 -0
napistu/utils.py +118 -10
{napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
napistu-0.3.1.dist-info/RECORD +133 -0
tests/conftest.py +22 -0
tests/test_context_discretize.py +56 -0
tests/test_context_filtering.py +267 -0
tests/test_identifiers.py +100 -0
tests/test_indices.py +65 -0
tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
tests/test_matching_interactions.py +108 -0
tests/test_matching_mount.py +305 -0
tests/test_matching_species.py +394 -0
tests/test_mcp_config.py +193 -0
tests/test_mcp_documentation_utils.py +12 -3
tests/test_mcp_server.py +356 -0
tests/test_network_data_handling.py +397 -0
tests/test_network_ig_utils.py +23 -0
tests/test_network_neighborhoods.py +19 -0
tests/test_network_net_create.py +459 -0
tests/test_network_ng_utils.py +30 -0
tests/test_network_paths.py +56 -0
tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
tests/test_ontologies_genodexito.py +58 -0
tests/test_ontologies_mygene.py +39 -0
tests/test_ontologies_renaming.py +110 -0
tests/test_rpy2_callr.py +79 -0
tests/test_rpy2_init.py +151 -0
tests/test_sbml.py +0 -31
tests/test_sbml_dfs_core.py +134 -10
tests/test_scverse_loading.py +778 -0
tests/test_set_coverage.py +2 -2
tests/test_utils.py +121 -1
napistu/mechanism_matching.py +0 -1353
napistu/rpy2/netcontextr.py +0 -467
napistu-0.2.5.dev6.dist-info/RECORD +0 -97
tests/test_igraph.py +0 -367
tests/test_mechanism_matching.py +0 -784
tests/test_net_utils.py +0 -149
tests/test_netcontextr.py +0 -105
tests/test_rpy2.py +0 -61
napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
{napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
{napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
{napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
{napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
tests/{test_obo.py → test_ingestion_obo.py} +0 -0

napistu/ingestion/bigg.py CHANGED Viewed

@@ -1,21 +1,16 @@
 from __future__ import annotations
-import datetime
 import logging
 import os
 from typing import Iterable
-import pandas as pd
 from napistu import indices
 from napistu import sbml_dfs_core
 from napistu import utils
 from napistu.consensus import construct_sbml_dfs_dict
-from napistu.ingestion import sbml
-from napistu.ingestion.constants import BIGG_MODEL_FIELD_SPECIES
-from napistu.ingestion.constants import BIGG_MODEL_FIELD_URL
+from napistu.ontologies.renaming import rename_species_ontologies
 from napistu.ingestion.constants import BIGG_MODEL_KEYS
 from napistu.ingestion.constants import BIGG_MODEL_URLS
-from napistu.ingestion.constants import BIGG_RECON3D_FIELD_ANNOTATION
 from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
 from napistu.ingestion.constants import SPECIES_FULL_NAME_MOUSE
 from napistu.ingestion.constants import SPECIES_FULL_NAME_YEAST
@@ -40,33 +35,17 @@ def bigg_sbml_download(bg_pathway_root: str, overwrite: bool = False) -> None:
     """
     utils.initialize_dir(bg_pathway_root, overwrite)
-    bigg_models = {
-        BIGG_MODEL_KEYS[SPECIES_FULL_NAME_HUMAN]: {
-            BIGG_MODEL_FIELD_URL: BIGG_MODEL_URLS[SPECIES_FULL_NAME_HUMAN],
-            BIGG_MODEL_FIELD_SPECIES: SPECIES_FULL_NAME_HUMAN,
+    bigg_models_df = indices.create_pathway_index_df(
+        model_keys=BIGG_MODEL_KEYS,
+        model_urls=BIGG_MODEL_URLS,
+        model_species={
+            SPECIES_FULL_NAME_HUMAN: SPECIES_FULL_NAME_HUMAN,
+            SPECIES_FULL_NAME_MOUSE: SPECIES_FULL_NAME_MOUSE,
+            SPECIES_FULL_NAME_YEAST: SPECIES_FULL_NAME_YEAST,
         },
-        BIGG_MODEL_KEYS[SPECIES_FULL_NAME_MOUSE]: {
-            BIGG_MODEL_FIELD_URL: BIGG_MODEL_URLS[SPECIES_FULL_NAME_MOUSE],
-            BIGG_MODEL_FIELD_SPECIES: SPECIES_FULL_NAME_MOUSE,
-        },
-        BIGG_MODEL_KEYS[SPECIES_FULL_NAME_YEAST]: {
-            BIGG_MODEL_FIELD_URL: BIGG_MODEL_URLS[SPECIES_FULL_NAME_YEAST],
-            BIGG_MODEL_FIELD_SPECIES: SPECIES_FULL_NAME_YEAST,
-        },
-    }
-    bigg_models_df = pd.DataFrame(bigg_models).T
-    bigg_models_df["sbml_path"] = [
-        os.path.join(bg_pathway_root, k) + ".sbml"
-        for k in bigg_models_df.index.tolist()
-    ]
-    bigg_models_df["file"] = [os.path.basename(x) for x in bigg_models_df["sbml_path"]]
-    # add other attributes which will be used in the pw_index
-    bigg_models_df["date"] = datetime.date.today().strftime("%Y%m%d")
-    bigg_models_df.index = bigg_models_df.index.rename("pathway_id")
-    bigg_models_df = bigg_models_df.reset_index()
-    bigg_models_df["name"] = bigg_models_df["pathway_id"]
-    bigg_models_df = bigg_models_df.assign(source="BiGG")
+        base_path=bg_pathway_root,
+        source_name="BiGG",
+    )
     with open_fs(bg_pathway_root, create=True) as bg_fs:
         for _, row in bigg_models_df.iterrows():
@@ -84,41 +63,46 @@ def bigg_sbml_download(bg_pathway_root: str, overwrite: bool = False) -> None:
     return None
-def annotate_recon(raw_model_path: str, annotated_model_path: str) -> None:
-    """Annotate Recon3D
-    Add compartment annotations to Recon3D so it can be merged with other pathways
-    """
-    logger.warning(
-        "add_sbml_annotations is deprecated and maybe removed in a future version of rcpr; "
-        "we are now adding these annotation during ingestion by sbml.sbml_df_from_sbml() rather "
-        "than directly appending them to the raw .sbml"
-    )
-    recon_3d_annotations = pd.DataFrame(BIGG_RECON3D_FIELD_ANNOTATION)
-    sbml_model = sbml.SBML(raw_model_path)
-    sbml.add_sbml_annotations(
-        sbml_model, recon_3d_annotations, save_path=annotated_model_path
-    )
-    return None
 def construct_bigg_consensus(
     pw_index_inp: str | indices.PWIndex,
     species: str | Iterable[str] | None = None,
     outdir: str | None = None,
 ) -> sbml_dfs_core.SBML_dfs:
-    """Constructs a BiGG SBML DFs Pathway Representation
-    Attention: curently this does work only for a singly model. Integraiton of multiple
-    models is not supported yet in BiGG.
-    Args:
-        pw_index_inp (str | indices.PWIndex): PWIndex or uri pointing to PWIndex
-        species (str | Iterable[str] | None): one or more species to filter by. Default: no filtering
-        outdir (str | None, optional): output directory used to cache results. Defaults to None.
-    Returns:
-        sbml_dfs_core.SBML_dfs: A consensus SBML
+    """Construct a BiGG SBML DFs pathway representation.
+    Parameters
+    ----------
+    pw_index_inp : str or indices.PWIndex
+        PWIndex object or URI pointing to PWIndex
+    species : str or Iterable[str] or None, optional
+        One or more species to filter by, by default None (no filtering)
+    outdir : str or None, optional
+        Output directory used to cache results, by default None
+    Returns
+    -------
+    sbml_dfs_core.SBML_dfs
+        A consensus SBML representation
+    Notes
+    -----
+    Currently this only works for a single model. Integration of multiple
+    models is not yet supported in BiGG.
+    The function:
+    1. Loads/validates the pathway index
+    2. Constructs SBML DFs dictionary
+    3. Processes the single model:
+        - Infers compartmentalization for species without location
+        - Names compartmentalized species
+        - Validates the final model
+    Raises
+    ------
+    ValueError
+        If pw_index_inp is neither a PWIndex nor a string
+    NotImplementedError
+        If attempting to merge multiple models
     """
     if isinstance(pw_index_inp, str):
         pw_index = indices.adapt_pw_index(pw_index_inp, species=species, outdir=outdir)
@@ -142,5 +126,6 @@ def construct_bigg_consensus(
     # fix missing compartimentalization
     model = sbml_dfs_core.infer_uncompartmentalized_species_location(model)
     model = sbml_dfs_core.name_compartmentalized_species(model)
+    rename_species_ontologies(model)
     model.validate()
     return model

napistu/ingestion/constants.py CHANGED Viewed

@@ -3,12 +3,30 @@ from __future__ import annotations
 from types import SimpleNamespace
 SPECIES_FULL_NAME_HUMAN = "Homo sapiens"
 SPECIES_FULL_NAME_MOUSE = "Mus musculus"
 SPECIES_FULL_NAME_YEAST = "Saccharomyces cerevisiae"
 SPECIES_FULL_NAME_RAT = "Rattus norvegicus"
 SPECIES_FULL_NAME_WORM = "Caenorhabditis elegans"
+PROTEINATLAS_SUBCELL_LOC_URL = (
+    "https://www.proteinatlas.org/download/tsv/subcellular_location.tsv.zip"
+)
+PROTEINATLAS_DEFS = SimpleNamespace(
+    GO_ID="GO id",
+    GENE="Gene",
+)
+# GTEx
+GTEX_RNASEQ_EXPRESSION_URL = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
+GTEX_DEFS = SimpleNamespace(
+    NAME="Name",
+    DESCRIPTION="Description",
+)
 # BIGG
 BIGG_MODEL_URLS = {
@@ -29,134 +47,6 @@ BIGG_RECON3D_FIELD_ID = "id"
 BIGG_RECON3D_FIELD_TYPE = "type"
 BIGG_RECON3D_FIELD_URI = "uri"
-BIGG_RECON3D_ID_C = "c"
-BIGG_RECON3D_ID_L = "l"
-BIGG_RECON3D_ID_E = "e"
-BIGG_RECON3D_ID_M = "m"
-BIGG_RECON3D_ID_R = "r"
-BIGG_RECON3D_ID_X = "x"
-BIGG_RECON3D_ID_N = "n"
-BIGG_RECON3D_ID_I = "i"
-BIGG_RECON3D_TYPE_COMPARTMENT = "compartment"
-BIGG_RECON3D_FIELD_ANNOTATION = [
-    {
-        # cytosol
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005829",
-    },
-    {
-        # cytoplasm
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005737",
-    },
-    {
-        # plasma membrane
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005886",
-    },
-    {
-        # lysosome lumen
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_L,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0043202",
-    },
-    {
-        # lysosomal membrane
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_L,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005765",
-    },
-    {
-        # mitochondrial intermembrane space
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_M,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005758",
-    },
-    {
-        # mitochondrial outer membrane
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_M,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005741",
-    },
-    {
-        # ER membrane
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_R,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005789",
-    },
-    {
-        # ER lumen
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_R,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005788",
-    },
-    {
-        # extracellular region
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_E,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005576",
-    },
-    {
-        # peroxosomal membrane
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_X,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005778",
-    },
-    {
-        # peroxosomal matrix
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_X,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005782",
-    },
-    {
-        # nucleolus
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005730",
-    },
-    {
-        # nuclear envelope
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005635",
-    },
-    {
-        # nucleoplasm
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005654",
-    },
-    {
-        # golgi membrane
-        BIGG_RECON3D_FIELD_ID: "g",
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0000139",
-    },
-    {
-        # golgi lumen
-        BIGG_RECON3D_FIELD_ID: "g",
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005796",
-    },
-    {
-        # mitochondrial matrix
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_I,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005759",
-    },
-    {
-        # mitochondrial inner membrane
-        BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_I,
-        BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
-        BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005743",
-    },
-]
 # IDENTIFIERS ETL
 IDENTIFIERS_ETL_YEAST_URL = "https://www.uniprot.org/docs/yeast.txt"
 IDENTIFIERS_ETL_SBO_URL = (
@@ -239,11 +129,6 @@ SBML_COMPARTMENTALIZED_SPECIES_DICT_SOURCE = "sc_Source"
 SBML_REACTION_ATTR_GET_GENE_PRODUCT = "getGeneProduct"
-SBML_ANNOTATION_METHOD_GET_SPECIES = "getSpecies"
-SBML_ANNOTATION_METHOD_GET_COMPARTMENT = "getCompartment"
-SBML_ANNOTATION_METHOD_GET_REACTION = "getReaction"
 # STRING
 STRING_URL_EXPRESSIONS = {
     "interactions": "https://stringdb-static.org/download/protein.links.full.v{version}/{taxid}.protein.links.full.v{version}.txt.gz",

napistu/ingestion/gtex.py ADDED Viewed

@@ -0,0 +1,113 @@
+from __future__ import annotations
+import logging
+import pandas as pd
+from fs import open_fs
+from napistu import utils
+from napistu.constants import ONTOLOGIES
+from napistu.ingestion.constants import GTEX_DEFS, GTEX_RNASEQ_EXPRESSION_URL
+logger = logging.getLogger(__name__)
+def download_gtex_rnaseq(
+    target_uri: str, url: str = GTEX_RNASEQ_EXPRESSION_URL
+) -> None:
+    """Download GTEx RNA-seq expression data.
+    Parameters
+    ----------
+    target_uri : str
+        The URI where the GTEx data should be saved
+    url : str, optional
+        URL to download the GTEx RNA-seq expression data from.
+        Defaults to GTEX_RNASEQ_EXPRESSION_URL.
+    Returns
+    -------
+    None
+    Notes
+    -----
+    Downloads GTEx RNA-seq expression data (median TPM per gene per tissue) from the
+    specified URL and saves it to the target URI. By default, downloads from GTEx
+    Analysis V8 data (dbGaP Accession phs000424.v8.p2).
+    """
+    logger.info("Start downloading gtex %s to %s", url, target_uri)
+    utils.download_wget(url, target_uri)
+def load_and_clean_gtex_data(gtex_data_path: str) -> pd.DataFrame:
+    """Load and format GTEx tissue specific expression data.
+    This function loads tissue-specific expression data from GTEx (median value per gene per tissue).
+    Parameters
+    ----------
+    gtex_data_path : str
+        Path to GTEx tissue specific expression data (medians)
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing all the information from the GTEx file with standardized column names:
+        - ensembl_gene_id: Ensembl gene ID without version number
+        - ensembl_geneTranscript_id: Original GTEx hybrid gene/transcript ID
+        - Description: Gene description/symbol
+        - Multiple tissue columns with median TPM values
+    Notes
+    -----
+    The function:
+    1. Skips the first 2 lines of the GTEx file (header info)
+    2. Creates clean Ensembl gene IDs by removing version numbers
+    3. Renames columns for clarity
+    4. Reorders columns to put ID and description columns first
+    Raises
+    ------
+    FileNotFoundError
+        If the input file does not exist
+    """
+    # Check file exists
+    base_path, file_name = utils.get_source_base_and_path(gtex_data_path)
+    logger.info("Loading GTEx tissue specific expression data")
+    # Read the TSV file using pandas, skipping first 2 lines
+    with open_fs(base_path) as base_fs:
+        with base_fs.open(file_name, "rb") as f:
+            gtex_expression_data = pd.read_csv(
+                f, sep="\t", skiprows=2, dtype=str, na_values=[""], keep_default_na=True
+            )
+    # Create ensembl_gene_id by removing version numbers from Name column
+    gtex_expression_data[ONTOLOGIES.ENSEMBL_GENE] = gtex_expression_data[
+        GTEX_DEFS.NAME
+    ].str.replace(r"\.[0-9]+$", "", regex=True)
+    # Rename Name column to be more informative
+    gtex_expression_data = gtex_expression_data.rename(
+        columns={
+            GTEX_DEFS.NAME: ONTOLOGIES.ENSEMBL_GENE_VERSION,
+            GTEX_DEFS.DESCRIPTION: ONTOLOGIES.SYMBOL,
+        }
+    )
+    # Reorder columns to put ID and description columns first
+    first_cols = [
+        ONTOLOGIES.ENSEMBL_GENE,
+        ONTOLOGIES.ENSEMBL_GENE_VERSION,
+        ONTOLOGIES.SYMBOL,
+    ]
+    other_cols = [col for col in gtex_expression_data.columns if col not in first_cols]
+    gtex_expression_data = gtex_expression_data[first_cols + other_cols]
+    # Convert tissue columns to numeric
+    numeric_cols = [col for col in other_cols if col not in first_cols]
+    gtex_expression_data[numeric_cols] = gtex_expression_data[numeric_cols].apply(
+        pd.to_numeric, errors="coerce"
+    )
+    return gtex_expression_data

napistu/ingestion/hpa.py ADDED Viewed

@@ -0,0 +1,147 @@
+from __future__ import annotations
+import logging
+import pandas as pd
+from napistu import utils
+from fs import open_fs
+from napistu.constants import ONTOLOGIES
+from napistu.ingestion.constants import PROTEINATLAS_SUBCELL_LOC_URL, PROTEINATLAS_DEFS
+logger = logging.getLogger(__name__)
+def download_hpa_data(target_uri: str, url: str = PROTEINATLAS_SUBCELL_LOC_URL) -> None:
+    """Download protein localization data from the Human Protein Atlas.
+    Parameters
+    ----------
+    target_uri : str
+        The URI where the HPA data should be saved. Should end with .tsv
+    url : str, optional
+        URL to download the zipped protein atlas subcellular localization tsv from.
+        Defaults to PROTEINATLAS_SUBCELL_LOC_URL.
+    Returns
+    -------
+    None
+    Notes
+    -----
+    Downloads the subcellular localization data from the Human Protein Atlas and saves
+    it to the specified target URI. The data is downloaded from the official HPA website
+    as a ZIP file and automatically unzipped to extract the TSV.
+    Raises
+    ------
+    ValueError
+        If target_uri does not end with .tsv
+    """
+    if not target_uri.endswith(".tsv"):
+        raise ValueError(f"Target URI must end with .tsv, got {target_uri}")
+    file_ext = url.split(".")[-1]
+    target_filename = url.split("/")[-1].split(f".{file_ext}")[0]
+    logger.info("Start downloading proteinatlas %s to %s", url, target_uri)
+    # target_filename is the name of the file in the zip file which will be renamed to target_uri
+    utils.download_wget(url, target_uri, target_filename=target_filename)
+    return None
+def load_and_clean_hpa_data(hpa_data_path: str) -> pd.DataFrame:
+    """Load and format Human Protein Atlas subcellular localization data.
+    Parameters
+    ----------
+    hpa_data_path : str
+        Path to HPA subcellular localization data TSV file
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with genes as rows and GO terms as columns. Each cell
+        is a binary value (0 or 1) indicating whether that gene (row) is found in that
+        compartment (column). Genes with no compartment annotations are filtered out.
+    Notes
+    -----
+    This function loads subcellular localization data from the Human Protein Atlas
+    and creates a binary matrix where rows are genes and columns are GO terms,
+    with 1 indicating that a gene is localized to that compartment and 0 indicating
+    it is not.
+    The function filters out genes that have no compartment annotations and logs
+    information about the number of genes filtered and the final matrix dimensions.
+    Raises
+    ------
+    FileNotFoundError
+        If the input file does not exist
+    ValueError
+        If no gene-compartment associations are found in the data
+    """
+    # Check file exists
+    base_path, file_name = utils.get_source_base_and_path(hpa_data_path)
+    logger.info("Loading Human Protein Atlas subcellular localization data")
+    # Read the TSV file using pandas
+    with open_fs(base_path) as base_fs:
+        with base_fs.open(file_name, "rb") as f:
+            protein_subcellular_localizations = pd.read_csv(
+                f, sep="\t", dtype=str, na_values=[""], keep_default_na=True
+            )
+    # Rename Gene column to be more informative
+    protein_subcellular_localizations = protein_subcellular_localizations.rename(
+        columns={PROTEINATLAS_DEFS.GENE: ONTOLOGIES.ENSEMBL_GENE}
+    )
+    # Convert GO id column to lists
+    def _split_go_terms(go_terms):
+        if pd.isna(go_terms):
+            return []
+        return go_terms.split(";")
+    # Create a list of all gene-GO term pairs
+    gene_go_pairs = []
+    for _, row in protein_subcellular_localizations.iterrows():
+        go_terms = _split_go_terms(row[PROTEINATLAS_DEFS.GO_ID])
+        for term in go_terms:
+            gene_go_pairs.append(
+                {
+                    ONTOLOGIES.ENSEMBL_GENE: row[ONTOLOGIES.ENSEMBL_GENE],
+                    ONTOLOGIES.GO: term,
+                }
+            )
+    # Convert to DataFrame and pivot to create binary matrix
+    gene_go_df = pd.DataFrame(gene_go_pairs)
+    if len(gene_go_df) == 0:
+        raise ValueError("No gene-compartment associations found in the data")
+    localization_matrix = pd.crosstab(
+        gene_go_df[ONTOLOGIES.ENSEMBL_GENE], gene_go_df[ONTOLOGIES.GO]
+    ).astype(int)
+    # Log number of genes without compartments that were filtered
+    n_total_genes = len(
+        protein_subcellular_localizations[ONTOLOGIES.ENSEMBL_GENE].unique()
+    )
+    n_genes_with_compartments = len(localization_matrix)
+    n_filtered = n_total_genes - n_genes_with_compartments
+    if n_filtered > 0:
+        logger.debug(
+            "Filtered out %d genes with no compartment annotations (from %d total genes)",
+            n_filtered,
+            n_total_genes,
+        )
+    logger.info(
+        "Created localization matrix with shape %d genes x %d compartments",
+        localization_matrix.shape[0],
+        localization_matrix.shape[1],
+    )
+    return localization_matrix

napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl

napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl