napistu 0.3.6.tar.gz → 0.3.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. {napistu-0.3.6 → napistu-0.3.7}/PKG-INFO +1 -1
  2. {napistu-0.3.6 → napistu-0.3.7}/setup.cfg +1 -1
  3. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/__main__.py +20 -9
  4. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/consensus.py +19 -25
  5. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/constants.py +90 -64
  6. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/indices.py +3 -1
  7. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/sbml.py +298 -295
  8. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/string.py +14 -18
  9. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/trrust.py +22 -27
  10. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/matching/species.py +1 -1
  11. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ontologies/genodexito.py +5 -1
  12. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ontologies/renaming.py +4 -0
  13. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/sbml_dfs_core.py +127 -64
  14. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/sbml_dfs_utils.py +4 -0
  15. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/utils.py +52 -41
  16. {napistu-0.3.6 → napistu-0.3.7}/src/napistu.egg-info/PKG-INFO +1 -1
  17. {napistu-0.3.6 → napistu-0.3.7}/src/tests/conftest.py +70 -13
  18. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_consensus.py +74 -5
  19. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_gaps.py +26 -15
  20. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_network_net_create.py +1 -1
  21. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_network_precompute.py +1 -1
  22. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_ontologies_renaming.py +28 -24
  23. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_sbml_dfs_core.py +165 -15
  24. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_utils.py +19 -0
  25. {napistu-0.3.6 → napistu-0.3.7}/LICENSE +0 -0
  26. {napistu-0.3.6 → napistu-0.3.7}/README.md +0 -0
  27. {napistu-0.3.6 → napistu-0.3.7}/pyproject.toml +0 -0
  28. {napistu-0.3.6 → napistu-0.3.7}/setup.py +0 -0
  29. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/__init__.py +0 -0
  30. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/context/__init__.py +0 -0
  31. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/context/discretize.py +0 -0
  32. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/context/filtering.py +0 -0
  33. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/gcs/__init__.py +0 -0
  34. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/gcs/constants.py +0 -0
  35. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/gcs/downloads.py +0 -0
  36. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/gcs/utils.py +0 -0
  37. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/identifiers.py +0 -0
  38. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/__init__.py +0 -0
  39. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/bigg.py +0 -0
  40. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/constants.py +0 -0
  41. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/gtex.py +0 -0
  42. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/hpa.py +0 -0
  43. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/identifiers_etl.py +0 -0
  44. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/napistu_edgelist.py +0 -0
  45. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/obo.py +0 -0
  46. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/psi_mi.py +0 -0
  47. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/reactome.py +0 -0
  48. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ingestion/yeast.py +0 -0
  49. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/matching/__init__.py +0 -0
  50. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/matching/constants.py +0 -0
  51. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/matching/interactions.py +0 -0
  52. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/matching/mount.py +0 -0
  53. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/__init__.py +0 -0
  54. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/__main__.py +0 -0
  55. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/client.py +0 -0
  56. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/codebase.py +0 -0
  57. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/codebase_utils.py +0 -0
  58. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/component_base.py +0 -0
  59. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/config.py +0 -0
  60. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/constants.py +0 -0
  61. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/documentation.py +0 -0
  62. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/documentation_utils.py +0 -0
  63. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/execution.py +0 -0
  64. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/health.py +0 -0
  65. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/profiles.py +0 -0
  66. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/server.py +0 -0
  67. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/tutorials.py +0 -0
  68. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/tutorials_utils.py +0 -0
  69. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/mcp/utils.py +0 -0
  70. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/modify/__init__.py +0 -0
  71. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/modify/constants.py +0 -0
  72. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/modify/curation.py +0 -0
  73. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/modify/gaps.py +0 -0
  74. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/modify/pathwayannot.py +0 -0
  75. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/modify/uncompartmentalize.py +0 -0
  76. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/__init__.py +0 -0
  77. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/constants.py +0 -0
  78. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/data_handling.py +0 -0
  79. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/ig_utils.py +0 -0
  80. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/napistu_graph_core.py +0 -0
  81. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/neighborhoods.py +0 -0
  82. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/net_create.py +0 -0
  83. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/net_propagation.py +0 -0
  84. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/ng_utils.py +0 -0
  85. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/paths.py +0 -0
  86. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/network/precompute.py +0 -0
  87. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ontologies/__init__.py +0 -0
  88. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ontologies/constants.py +0 -0
  89. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ontologies/dogma.py +0 -0
  90. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/ontologies/mygene.py +0 -0
  91. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/rpy2/__init__.py +0 -0
  92. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/rpy2/callr.py +0 -0
  93. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/rpy2/constants.py +0 -0
  94. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/rpy2/rids.py +0 -0
  95. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/scverse/__init__.py +0 -0
  96. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/scverse/constants.py +0 -0
  97. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/scverse/loading.py +0 -0
  98. {napistu-0.3.6 → napistu-0.3.7}/src/napistu/source.py +0 -0
  99. {napistu-0.3.6 → napistu-0.3.7}/src/napistu.egg-info/SOURCES.txt +0 -0
  100. {napistu-0.3.6 → napistu-0.3.7}/src/napistu.egg-info/dependency_links.txt +0 -0
  101. {napistu-0.3.6 → napistu-0.3.7}/src/napistu.egg-info/entry_points.txt +0 -0
  102. {napistu-0.3.6 → napistu-0.3.7}/src/napistu.egg-info/requires.txt +0 -0
  103. {napistu-0.3.6 → napistu-0.3.7}/src/napistu.egg-info/top_level.txt +0 -0
  104. {napistu-0.3.6 → napistu-0.3.7}/src/tests/__init__.py +0 -0
  105. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_constants.py +0 -0
  106. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_context_discretize.py +0 -0
  107. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_context_filtering.py +0 -0
  108. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_curation.py +0 -0
  109. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_data/__init__.py +0 -0
  110. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_gcs.py +0 -0
  111. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_identifiers.py +0 -0
  112. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_indices.py +0 -0
  113. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_ingestion_napistu_edgelist.py +0 -0
  114. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_ingestion_obo.py +0 -0
  115. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_matching_interactions.py +0 -0
  116. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_matching_mount.py +0 -0
  117. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_matching_species.py +0 -0
  118. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_mcp_config.py +0 -0
  119. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_mcp_documentation_utils.py +0 -0
  120. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_mcp_server.py +0 -0
  121. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_network_data_handling.py +0 -0
  122. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_network_ig_utils.py +0 -0
  123. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_network_neighborhoods.py +0 -0
  124. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_network_net_propagation.py +0 -0
  125. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_network_ng_utils.py +0 -0
  126. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_network_paths.py +0 -0
  127. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_ontologies_genodexito.py +0 -0
  128. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_ontologies_mygene.py +0 -0
  129. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_pathwayannot.py +0 -0
  130. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_rpy2_callr.py +0 -0
  131. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_rpy2_init.py +0 -0
  132. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_sbml.py +0 -0
  133. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_sbml_dfs_utils.py +0 -0
  134. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_sbo.py +0 -0
  135. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_scverse_loading.py +0 -0
  136. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_set_coverage.py +0 -0
  137. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_source.py +0 -0
  138. {napistu-0.3.6 → napistu-0.3.7}/src/tests/test_uncompartmentalize.py +0 -0
  139. {napistu-0.3.6 → napistu-0.3.7}/src/tests/utils.py +0 -0
{napistu-0.3.6 → napistu-0.3.7}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: napistu
-Version: 0.3.6
+Version: 0.3.7
 Summary: Connecting high-dimensional data to curated pathways
 Home-page: https://github.com/napistu/napistu-py
 Author: Sean Hackett
{napistu-0.3.6 → napistu-0.3.7}/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = napistu
-version = 0.3.6
+version = 0.3.7
 description = Connecting high-dimensional data to curated pathways
 long_description = file: README.md
 long_description_content_type = text/markdown
{napistu-0.3.6 → napistu-0.3.7}/src/napistu/__main__.py
@@ -12,7 +12,7 @@ import click_logging
 import napistu
 import igraph as ig
 import pandas as pd
-from napistu import consensus as cpr_consensus
+from napistu import consensus as napistu_consensus
 from napistu import indices
 from napistu import sbml_dfs_core
 from napistu import utils
@@ -65,7 +65,7 @@ def ingestion():
     "--overwrite", "-o", is_flag=True, default=False, help="Overwrite existing files?"
 )
 @click_logging.simple_verbosity_option(logger)
-def load_reactome(base_folder: str, overwrite=True):
+def ingest_reactome(base_folder: str, overwrite=True):
     logger.info("Start downloading Reactome to %s", base_folder)
     reactome.reactome_sbml_download(f"{base_folder}/sbml", overwrite=overwrite)
 
@@ -76,7 +76,7 @@ def load_reactome(base_folder: str, overwrite=True):
     "--overwrite", "-o", is_flag=True, default=False, help="Overwrite existing files?"
 )
 @click_logging.simple_verbosity_option(logger)
-def load_bigg(base_folder: str, overwrite: bool):
+def ingest_bigg(base_folder: str, overwrite: bool):
     logger.info("Start downloading Bigg to %s", base_folder)
     bigg.bigg_sbml_download(base_folder, overwrite)
 
@@ -84,7 +84,7 @@ def load_bigg(base_folder: str, overwrite: bool):
 @ingestion.command(name="trrust")
 @click.argument("target_uri", type=str)
 @click_logging.simple_verbosity_option(logger)
-def load_ttrust(target_uri: str):
+def ingest_ttrust(target_uri: str):
     logger.info("Start downloading TRRUST to %s", target_uri)
     trrust.download_trrust(target_uri)
 
@@ -98,7 +98,7 @@ def load_ttrust(target_uri: str):
     help="URL to download the zipped protein atlas subcellular localization tsv from.",
 )
 @click_logging.simple_verbosity_option(logger)
-def load_proteinatlas_subcell(target_uri: str, url: str):
+def ingest_proteinatlas_subcell(target_uri: str, url: str):
     hpa.download_hpa_data(target_uri, url)
 
 
@@ -111,7 +111,7 @@ def load_proteinatlas_subcell(target_uri: str, url: str):
     help="URL to download the gtex file from.",
 )
 @click_logging.simple_verbosity_option(logger)
-def load_gtex_rnaseq(target_uri: str, url: str):
+def ingest_gtex_rnaseq(target_uri: str, url: str):
     gtex.download_gtex_rnaseq(target_uri, url)
 
 
@@ -124,7 +124,7 @@ def load_gtex_rnaseq(target_uri: str, url: str):
     help="Species name (e.g., Homo sapiens).",
 )
 @click_logging.simple_verbosity_option(logger)
-def load_string_db(target_uri: str, species: str):
+def ingest_string_db(target_uri: str, species: str):
    string.download_string(target_uri, species)
 
 
@@ -137,7 +137,7 @@ def load_string_db(target_uri: str, species: str):
     help="Species name (e.g., Homo sapiens).",
 )
 @click_logging.simple_verbosity_option(logger)
-def load_string_aliases(target_uri: str, species: str):
+def ingest_string_aliases(target_uri: str, species: str):
     string.download_string_aliases(target_uri, species)
 
 
@@ -289,7 +289,7 @@ def create_consensus(
     )
     pw_index_df["species"] = "unknown"
     pw_index = indices.PWIndex(pw_index=pw_index_df, validate_paths=False)
-    consensus_model = cpr_consensus.construct_consensus_model(
+    consensus_model = napistu_consensus.construct_consensus_model(
         sbml_dfs_dict, pw_index, dogmatic
     )
     utils.save_pickle(output_model_uri, consensus_model)
@@ -855,6 +855,17 @@ def copy_uri(input_uri, output_uri, is_file=True):
     utils.copy_uri(input_uri, output_uri, is_file=is_file)
 
 
+@helpers.command(name="validate_sbml_dfs")
+@click.argument("input_uri", type=str)
+@click_logging.simple_verbosity_option(logger)
+def validate_sbml_dfs(input_uri):
+    """Validate a sbml_dfs object"""
+    sbml_dfs = utils.load_pickle(input_uri)
+    sbml_dfs.validate()
+
+    logger.info(f"Successfully validated: {input_uri}")
+
+
 @click.group()
 def stats():
     """Various functions to calculate network statistics
{napistu-0.3.6 → napistu-0.3.7}/src/napistu/consensus.py
@@ -15,10 +15,13 @@ from napistu import source
 from napistu import utils
 from napistu.ingestion import sbml
 
+from napistu.constants import SCHEMA_DEFS
 from napistu.constants import SBML_DFS
+from napistu.constants import SBML_DFS_SCHEMA
 from napistu.constants import IDENTIFIERS
 from napistu.constants import SOURCE_SPEC
 from napistu.constants import BQB_DEFINING_ATTRS
+from napistu.constants import VALID_BQB_TERMS
 
 logger = logging.getLogger(__name__)
 # set the level to show logger.info message
@@ -137,8 +140,7 @@ def unnest_SBML_df(
     """
 
     # check that all sbml_dfs have the same schema
-    _test_same_schema(sbml_dfs_dict)
-    table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
+    table_schema = SBML_DFS_SCHEMA.SCHEMA[table]
 
     df_list = [
         getattr(sbml_dfs_dict[x], table).assign(model=x) for x in sbml_dfs_dict.keys()
@@ -192,7 +194,7 @@ def construct_meta_entities_identifiers(
     agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=table)
 
     # since all sbml_dfs have the same schema pull out one schema for reference
-    table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
+    table_schema = SBML_DFS_SCHEMA.SCHEMA[table]
 
     # update foreign keys using provided lookup tables
     if "fk" in table_schema.keys():
@@ -244,6 +246,8 @@ def reduce_to_consensus_ids(
         Series mapping the index of the aggregated entities to new consensus IDs.
     """
     # Step 1: Build consensus identifiers to create clusters of equivalent entities
+    table_name = table_schema[SCHEMA_DEFS.TABLE]
+    logger.debug(f"Building consensus identifiers for {table_name}")
     indexed_cluster, cluster_consensus_identifiers = build_consensus_identifiers(
         sbml_df, table_schema, defining_biological_qualifiers
     )
@@ -252,25 +256,28 @@ def reduce_to_consensus_ids(
     agg_table_harmonized = sbml_df.join(indexed_cluster)
 
     # Step 3: Create lookup table for entity IDs
+    logger.debug(f"Creating lookup table for {table_name}")
     lookup_table = _create_entity_lookup_table(agg_table_harmonized, table_schema)
 
     # Step 4: Add nameness scores to help select representative names
     agg_table_harmonized = utils._add_nameness_score_wrapper(
-        agg_table_harmonized, "label", table_schema
+        agg_table_harmonized, SCHEMA_DEFS.LABEL, table_schema
     )
 
     # Step 5: Prepare the consensus table with one row per unique entity
+    logger.debug(f"Preparing consensus table for {table_name}")
     new_id_table = _prepare_consensus_table(
         agg_table_harmonized, table_schema, cluster_consensus_identifiers
     )
 
     # Step 6: Add source information if required
-    if "source" in table_schema.keys():
+    if SCHEMA_DEFS.SOURCE in table_schema.keys():
         new_id_table = _add_consensus_sources(
             new_id_table, agg_table_harmonized, lookup_table, table_schema, pw_index
         )
 
     # Step 7: Validate the resulting table
+    logger.debug(f"Validating consensus table for {table_name}")
     _validate_consensus_table(new_id_table, sbml_df)
 
     return new_id_table, lookup_table
@@ -667,7 +674,7 @@ def construct_meta_entities_members(
     defined_by_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[defined_by]
 
     # Step 2: Prepare the member table and validate its structure
-    agg_tbl, defining_fk = _prepare_member_table(
+    agg_tbl, _ = _prepare_member_table(
         sbml_dfs_dict,
         defined_by,
         defined_lookup_tables,
@@ -681,9 +688,7 @@ def construct_meta_entities_members(
     membership_lookup = _create_membership_lookup(agg_tbl, table_schema)
 
     # Step 4: Create consensus entities and lookup table
-    consensus_entities, lookup_table = _create_entity_consensus(
-        membership_lookup, table_schema
-    )
+    _, lookup_table = _create_entity_consensus(membership_lookup, table_schema)
 
     # Step 5: Log merger information
     report_consensus_merges(
@@ -1507,6 +1512,11 @@ def _filter_identifiers_by_qualifier(
     pd.DataFrame
         Filtered identifiers
     """
+
+    invalid_bqbs = set(meta_identifiers[IDENTIFIERS.BQB]) - set(VALID_BQB_TERMS)
+    if len(invalid_bqbs) > 0:
+        logger.warning(f"Invalid biological qualifiers: {invalid_bqbs}")
+
     valid_identifiers = meta_identifiers.copy()
     return valid_identifiers[
         meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
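
Note: the block added to _filter_identifiers_by_qualifier only warns about unknown qualifiers; the rows returned are filtered exactly as before. A standalone sketch of the same check on a toy frame (the example data is made up; the package uses logger.warning rather than print):

import pandas as pd

from napistu.constants import BQB, IDENTIFIERS, VALID_BQB_TERMS

meta_identifiers = pd.DataFrame({IDENTIFIERS.BQB: [BQB.IS, "BQB_TYPO"]})
invalid_bqbs = set(meta_identifiers[IDENTIFIERS.BQB]) - set(VALID_BQB_TERMS)
if len(invalid_bqbs) > 0:
    print(f"Invalid biological qualifiers: {invalid_bqbs}")  # {'BQB_TYPO'}
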
@@ -2034,22 +2044,6 @@ def _merge_entity_data_report_mismatches(
     return None
 
 
-def _test_same_schema(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
-    """
-    Ensure that all sbml_dfs in the dict have the same schema
-    """
-
-    if len(sbml_dfs_dict) != 0:
-        # extract all schemas
-        schema_list = [sbml_dfs_dict[x].schema for x in sbml_dfs_dict.keys()]
-        # if multiple entries are present then are they the same?
-        if len(sbml_dfs_dict) > 1:
-            if not all([x == schema_list[0] for x in schema_list]):
-                raise ValueError("sbml_df schemas were not identical")
-
-    return None
-
-
 def _create_member_string(x: list[str]) -> str:
     x.sort()
     return "_".join(x)
{napistu-0.3.6 → napistu-0.3.7}/src/napistu/constants.py
@@ -55,28 +55,49 @@ SBML_DFS = SimpleNamespace(
     SBO_TERM="sbo_term",
 )
 
+SCHEMA_DEFS = SimpleNamespace(
+    TABLE="table",
+    PK="pk",
+    FK="fk",
+    LABEL="label",
+    ID="id",
+    SOURCE="source",
+    VARS="vars",
+)
+
 SBML_DFS_SCHEMA = SimpleNamespace(
     SCHEMA={
         SBML_DFS.COMPARTMENTS: {
-            "pk": SBML_DFS.C_ID,
-            "label": SBML_DFS.C_NAME,
-            "id": SBML_DFS.C_IDENTIFIERS,
-            "source": SBML_DFS.C_SOURCE,
-            "vars": [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE],
+            SCHEMA_DEFS.TABLE: SBML_DFS.COMPARTMENTS,
+            SCHEMA_DEFS.PK: SBML_DFS.C_ID,
+            SCHEMA_DEFS.LABEL: SBML_DFS.C_NAME,
+            SCHEMA_DEFS.ID: SBML_DFS.C_IDENTIFIERS,
+            SCHEMA_DEFS.SOURCE: SBML_DFS.C_SOURCE,
+            SCHEMA_DEFS.VARS: [
+                SBML_DFS.C_NAME,
+                SBML_DFS.C_IDENTIFIERS,
+                SBML_DFS.C_SOURCE,
+            ],
         },
         SBML_DFS.SPECIES: {
-            "pk": SBML_DFS.S_ID,
-            "label": SBML_DFS.S_NAME,
-            "id": SBML_DFS.S_IDENTIFIERS,
-            "source": SBML_DFS.S_SOURCE,
-            "vars": [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE],
+            SCHEMA_DEFS.TABLE: SBML_DFS.SPECIES,
+            SCHEMA_DEFS.PK: SBML_DFS.S_ID,
+            SCHEMA_DEFS.LABEL: SBML_DFS.S_NAME,
+            SCHEMA_DEFS.ID: SBML_DFS.S_IDENTIFIERS,
+            SCHEMA_DEFS.SOURCE: SBML_DFS.S_SOURCE,
+            SCHEMA_DEFS.VARS: [
+                SBML_DFS.S_NAME,
+                SBML_DFS.S_IDENTIFIERS,
+                SBML_DFS.S_SOURCE,
+            ],
         },
         SBML_DFS.COMPARTMENTALIZED_SPECIES: {
-            "pk": SBML_DFS.SC_ID,
-            "label": SBML_DFS.SC_NAME,
-            "fk": [SBML_DFS.S_ID, SBML_DFS.C_ID],
-            "source": SBML_DFS.SC_SOURCE,
-            "vars": [
+            SCHEMA_DEFS.TABLE: SBML_DFS.COMPARTMENTALIZED_SPECIES,
+            SCHEMA_DEFS.PK: SBML_DFS.SC_ID,
+            SCHEMA_DEFS.LABEL: SBML_DFS.SC_NAME,
+            SCHEMA_DEFS.FK: [SBML_DFS.S_ID, SBML_DFS.C_ID],
+            SCHEMA_DEFS.SOURCE: SBML_DFS.SC_SOURCE,
+            SCHEMA_DEFS.VARS: [
                 SBML_DFS.SC_NAME,
                 SBML_DFS.S_ID,
                 SBML_DFS.C_ID,
@@ -84,11 +105,12 @@ SBML_DFS_SCHEMA = SimpleNamespace(
             ],
         },
         SBML_DFS.REACTIONS: {
-            "pk": SBML_DFS.R_ID,
-            "label": SBML_DFS.R_NAME,
-            "id": SBML_DFS.R_IDENTIFIERS,
-            "source": SBML_DFS.R_SOURCE,
-            "vars": [
+            SCHEMA_DEFS.TABLE: SBML_DFS.REACTIONS,
+            SCHEMA_DEFS.PK: SBML_DFS.R_ID,
+            SCHEMA_DEFS.LABEL: SBML_DFS.R_NAME,
+            SCHEMA_DEFS.ID: SBML_DFS.R_IDENTIFIERS,
+            SCHEMA_DEFS.SOURCE: SBML_DFS.R_SOURCE,
+            SCHEMA_DEFS.VARS: [
                 SBML_DFS.R_NAME,
                 SBML_DFS.R_IDENTIFIERS,
                 SBML_DFS.R_SOURCE,
@@ -96,9 +118,10 @@ SBML_DFS_SCHEMA = SimpleNamespace(
             ],
         },
         SBML_DFS.REACTION_SPECIES: {
-            "pk": SBML_DFS.RSC_ID,
-            "fk": [SBML_DFS.R_ID, SBML_DFS.SC_ID],
-            "vars": [
+            SCHEMA_DEFS.TABLE: SBML_DFS.REACTION_SPECIES,
+            SCHEMA_DEFS.PK: SBML_DFS.RSC_ID,
+            SCHEMA_DEFS.FK: [SBML_DFS.R_ID, SBML_DFS.SC_ID],
+            SCHEMA_DEFS.VARS: [
                 SBML_DFS.R_ID,
                 SBML_DFS.SC_ID,
                 SBML_DFS.STOICHIOMETRY,
@@ -129,10 +152,10 @@ ENTITIES_TO_ENTITY_DATA = {
 REQUIRED_REACTION_FROMEDGELIST_COLUMNS = [
     "sc_id_up",
     "sc_id_down",
-    "sbo_term",
-    "r_name",
-    "r_Identifiers",
-    "r_isreversible",
+    SBML_DFS.SBO_TERM,
+    SBML_DFS.R_NAME,
+    SBML_DFS.R_IDENTIFIERS,
+    SBML_DFS.R_ISREVERSIBLE,
 ]
 
 NAPISTU_STANDARD_OUTPUTS = SimpleNamespace(
@@ -155,20 +178,6 @@ INTERACTION_EDGELIST_EXPECTED_VARS = {
     SBML_DFS.R_ISREVERSIBLE,
 }
 
-BQB_PRIORITIES = pd.DataFrame(
-    [{"bqb": "BQB_IS", "bqb_rank": 1}, {"bqb": "BQB_HAS_PART", "bqb_rank": 2}]
-)
-
-ONTOLOGY_PRIORITIES = pd.DataFrame(
-    [
-        {"ontology": "reactome", "ontology_rank": 1},
-        {"ontology": "ensembl_gene", "ontology_rank": 2},
-        {"ontology": "chebi", "ontology_rank": 3},
-        {"ontology": "uniprot", "ontology_rank": 4},
-        {"ontology": "go", "ontology_rank": 5},
-    ]
-)
-
 # SBML
 # Biological qualifiers
 # Biomodels qualifiers
@@ -189,16 +198,33 @@ BQB = SimpleNamespace(
     UNKNOWN="BQB_UNKNOWN",
 )
 
+VALID_BQB_TERMS = [
+    BQB.IS,
+    BQB.HAS_PART,
+    BQB.IS_PART_OF,
+    BQB.IS_VERSION_OF,
+    BQB.HAS_VERSION,
+    BQB.IS_HOMOLOG_TO,
+    BQB.IS_DESCRIBED_BY,
+    BQB.IS_ENCODED_BY,
+    BQB.ENCODES,
+    BQB.OCCURS_IN,
+    BQB.HAS_PROPERTY,
+    BQB.IS_PROPERTY_OF,
+    BQB.HAS_TAXON,
+    BQB.UNKNOWN,
+]
+
 # molecules are distinctly defined by these BQB terms
-BQB_DEFINING_ATTRS = ["BQB_IS", "IS_HOMOLOG_TO"]
+BQB_DEFINING_ATTRS = [BQB.IS, BQB.IS_HOMOLOG_TO]
 
 # a looser convention which will aggregate genes, transcripts, and proteins
 # if they are linked with the appropriate bioqualifiers
 BQB_DEFINING_ATTRS_LOOSE = [
-    "BQB_IS",
-    "IS_HOMOLOG_TO",
-    "BQB_IS_ENCODED_BY",
-    "BQB_ENCODES",
+    BQB.IS,
+    BQB.IS_HOMOLOG_TO,
+    BQB.IS_ENCODED_BY,
+    BQB.ENCODES,
 ]
 
 # identifiers
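
Note: moving BQB_DEFINING_ATTRS and BQB_DEFINING_ATTRS_LOOSE from string literals onto the BQB namespace also keeps them consistent with the new VALID_BQB_TERMS list (the 0.3.6 literals included "IS_HOMOLOG_TO" without the BQB_ prefix). A hypothetical sanity check, not part of the package's test suite:

from napistu.constants import BQB_DEFINING_ATTRS, BQB_DEFINING_ATTRS_LOOSE, VALID_BQB_TERMS

# both defining-attribute conventions should be subsets of the valid qualifier terms
assert set(BQB_DEFINING_ATTRS) <= set(VALID_BQB_TERMS)
assert set(BQB_DEFINING_ATTRS_LOOSE) <= set(VALID_BQB_TERMS)
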
@@ -206,6 +232,13 @@ IDENTIFIERS = SimpleNamespace(
     ONTOLOGY="ontology", IDENTIFIER="identifier", BQB="bqb", URL="url"
 )
 
+BQB_PRIORITIES = pd.DataFrame(
+    [
+        {IDENTIFIERS.BQB: BQB.IS, "bqb_rank": 1},
+        {IDENTIFIERS.BQB: BQB.HAS_PART, "bqb_rank": 2},
+    ]
+)
+
 IDENTIFIERS_REQUIRED_VARS = {
     IDENTIFIERS.ONTOLOGY,
     IDENTIFIERS.IDENTIFIER,
@@ -217,26 +250,9 @@ SPECIES_IDENTIFIERS_REQUIRED_VARS = IDENTIFIERS_REQUIRED_VARS | {
     SBML_DFS.S_NAME,
 }
 
-BIOLOGICAL_QUALIFIERS = [
-    "BQB_IS",
-    "BQB_HAS_PART",
-    "BQB_IS_PART_OF",
-    "BQB_IS_VERSION_OF",
-    "BQB_HAS_VERSION",
-    "BQB_IS_HOMOLOG_TO",
-    "BQB_IS_DESCRIBED_BY",
-    "BQB_IS_ENCODED_BY",
-    "BQB_ENCODES",
-    "BQB_OCCURS_IN",
-    "BQB_HAS_PROPERTY",
-    "BQB_IS_PROPERTY_OF",
-    "BQB_HAS_TAXON",
-    "BQB_UNKNOWN",
-]
-
 
 def get_biological_qualifier_codes():
-    bio_qualifier_codes = {getattr(libsbml, bqb): bqb for bqb in BIOLOGICAL_QUALIFIERS}
+    bio_qualifier_codes = {getattr(libsbml, bqb): bqb for bqb in VALID_BQB_TERMS}
 
     return bio_qualifier_codes
 
@@ -409,6 +425,16 @@ ONTOLOGY_SPECIES_ALIASES = {
     ONTOLOGIES.UNIPROT: {"Uniprot"},
 }
 
+ONTOLOGY_PRIORITIES = pd.DataFrame(
+    [
+        {"ontology": ONTOLOGIES.REACTOME, "ontology_rank": 1},
+        {"ontology": ONTOLOGIES.ENSEMBL_GENE, "ontology_rank": 2},
+        {"ontology": ONTOLOGIES.CHEBI, "ontology_rank": 3},
+        {"ontology": ONTOLOGIES.UNIPROT, "ontology_rank": 4},
+        {"ontology": ONTOLOGIES.GO, "ontology_rank": 5},
+    ]
+)
+
 ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY = {
     "G": ONTOLOGIES.ENSEMBL_GENE,
     "T": ONTOLOGIES.ENSEMBL_TRANSCRIPT,
{napistu-0.3.6 → napistu-0.3.7}/src/napistu/indices.py
@@ -266,6 +266,7 @@ def adapt_pw_index(
     source: str | PWIndex,
     species: str | Iterable[str] | None,
     outdir: str | None = None,
+    update_index: bool = False,
 ) -> PWIndex:
     """Adapts a pw_index
 
@@ -288,8 +289,9 @@ def adapt_pw_index(
         raise ValueError("'source' needs to be str or PWIndex")
     pw_index.filter(species=species)
 
-    if outdir is not None:
+    if outdir is not None and update_index:
        with open_fs(outdir, create=True) as fs:
            with fs.open("pw_index.tsv", "w") as f:
                pw_index.index.to_csv(f, sep="\t")
+
    return pw_index
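
Note: with the new update_index flag (default False), adapt_pw_index no longer rewrites pw_index.tsv just because outdir is provided; the write now requires both. A hedged usage sketch (the source path, species, and output directory are placeholders):

from napistu import indices

pw_index = indices.adapt_pw_index(
    "reactome/pw_index.tsv",  # placeholder path, or a PWIndex instance
    species="Homo sapiens",
    outdir="reactome",        # only written back when update_index=True
    update_index=True,
)
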