PyPI - napistu - Versions diffs - 0.4.0__tar.gz → 0.4.1__tar.gz - Mend

napistu 0.4.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (144) hide show

{napistu-0.4.0/src/napistu.egg-info → napistu-0.4.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: napistu
-Version: 0.4.0
+Version: 0.4.1
 Summary: Connecting high-dimensional data to curated pathways
 Home-page: https://github.com/napistu/napistu-py
 Author: Sean Hackett
@@ -61,7 +61,12 @@ Dynamic: license-file
 # Napistu Python Library
+[![PyPI version](https://badge.fury.io/py/napistu.svg)](https://badge.fury.io/py/napistu)
 [![Documentation Status](https://readthedocs.org/projects/napistu/badge/?version=latest)](https://napistu.readthedocs.io/en/latest/?badge=latest)
+[![CI](https://github.com/napistu/napistu-py/actions/workflows/ci.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/ci.yml)
+[![Release](https://github.com/napistu/napistu-py/actions/workflows/release.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/release.yml)
+[![Deploy to Cloud Run](https://github.com/napistu/napistu-py/actions/workflows/deploy.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/deploy.yml)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 This Python package hosts the majority of the algorithmic code for the [Napistu project](https://github.com/napistu/napistu).

{napistu-0.4.0 → napistu-0.4.1}/README.md RENAMED Viewed

@@ -1,6 +1,11 @@
 # Napistu Python Library
+[![PyPI version](https://badge.fury.io/py/napistu.svg)](https://badge.fury.io/py/napistu)
 [![Documentation Status](https://readthedocs.org/projects/napistu/badge/?version=latest)](https://napistu.readthedocs.io/en/latest/?badge=latest)
+[![CI](https://github.com/napistu/napistu-py/actions/workflows/ci.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/ci.yml)
+[![Release](https://github.com/napistu/napistu-py/actions/workflows/release.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/release.yml)
+[![Deploy to Cloud Run](https://github.com/napistu/napistu-py/actions/workflows/deploy.yml/badge.svg)](https://github.com/napistu/napistu-py/actions/workflows/deploy.yml)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 This Python package hosts the majority of the algorithmic code for the [Napistu project](https://github.com/napistu/napistu).

{napistu-0.4.0 → napistu-0.4.1}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = napistu
-version = 0.4.0
+version = 0.4.1
 description = Connecting high-dimensional data to curated pathways
 long_description = file: README.md
 long_description_content_type = text/markdown

{napistu-0.4.0 → napistu-0.4.1}/src/napistu/constants.py RENAMED Viewed

@@ -402,12 +402,14 @@ ONTOLOGIES = SimpleNamespace(
     ENSEMBL_PROTEIN_VERSION="ensembl_protein_version",
     GENE_NAME="gene_name",
     GO="go",
+    KEGG="kegg",
     MIRBASE="mirbase",
     NCBI_ENTREZ_GENE="ncbi_entrez_gene",
     PHAROS="pharos",
     REACTOME="reactome",
     SYMBOL="symbol",
     UNIPROT="uniprot",
+    WIKIPATHWAYS="wikipathways",
 )
 ONTOLOGIES_LIST = list(ONTOLOGIES.__dict__.values())

{napistu-0.4.0 → napistu-0.4.1}/src/napistu/gcs/constants.py RENAMED Viewed

@@ -5,17 +5,17 @@ from types import SimpleNamespace
 GCS_SUBASSET_NAMES = SimpleNamespace(
     SBML_DFS="sbml_dfs",
-    IDENTIFIERS="identifiers",
-    REGULATORY_GRAPH="regulatory_graph",
+    NAPISTU_GRAPH="napistu_graph",
+    SPECIES_IDENTIFIERS="species_identifiers",
     REGULATORY_DISTANCES="regulatory_distances",
 )
 GCS_FILETYPES = SimpleNamespace(
     SBML_DFS="sbml_dfs.pkl",
-    IDENTIFIERS="identifiers.tsv",
-    REGULATORY_GRAPH="regulatory_graph.pkl",
-    REGULATORY_DISTANCES="regulatory_distances.json",
+    NAPISTU_GRAPH="napistu_graph.pkl",
+    SPECIES_IDENTIFIERS="species_identifiers.tsv",
+    REGULATORY_DISTANCES="regulatory_distances.parquet",
 )
@@ -27,8 +27,8 @@ GCS_ASSETS = SimpleNamespace(
             "file": "test_pathway.tar.gz",
             "subassets": {
                 GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
-                GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
-                GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
+                GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
+                GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
                 GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
             },
             "public_url": "https://storage.googleapis.com/shackett-napistu-public/test_pathway.tar.gz",
@@ -37,8 +37,8 @@ GCS_ASSETS = SimpleNamespace(
             "file": "human_consensus.tar.gz",
             "subassets": {
                 GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
-                GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
-                GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
+                GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
+                GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
             },
             "public_url": "https://storage.googleapis.com/shackett-napistu-public/human_consensus.tar.gz",
         },
@@ -46,8 +46,8 @@ GCS_ASSETS = SimpleNamespace(
             "file": "human_consensus_w_distances.tar.gz",
             "subassets": {
                 GCS_SUBASSET_NAMES.SBML_DFS: GCS_FILETYPES.SBML_DFS,
-                GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
-                GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
+                GCS_SUBASSET_NAMES.NAPISTU_GRAPH: GCS_FILETYPES.NAPISTU_GRAPH,
+                GCS_SUBASSET_NAMES.SPECIES_IDENTIFIERS: GCS_FILETYPES.SPECIES_IDENTIFIERS,
                 GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
             },
             "public_url": "https://storage.googleapis.com/calico-cpr-public/human_consensus_w_distances.tar.gz",

napistu-0.4.1/src/napistu/ontologies/id_tables.py ADDED Viewed

@@ -0,0 +1,282 @@
+import logging
+from typing import Optional, Union, Set
+import pandas as pd
+from napistu import sbml_dfs_utils
+from napistu.constants import (
+    BQB,
+    BQB_DEFINING_ATTRS_LOOSE,
+    IDENTIFIERS,
+    SBML_DFS_SCHEMA,
+    SCHEMA_DEFS,
+    VALID_BQB_TERMS,
+)
+from napistu import utils
+logger = logging.getLogger(__name__)
+def filter_id_table(
+    id_table: pd.DataFrame,
+    identifiers: Optional[Union[str, list, set]] = None,
+    ontologies: Optional[Union[str, list, set]] = None,
+    bqbs: Optional[Union[str, list, set]] = BQB_DEFINING_ATTRS_LOOSE + [BQB.HAS_PART],
+) -> pd.DataFrame:
+    """
+    Filter an identifier table by identifiers, ontologies, and BQB terms for a given entity type.
+    Parameters
+    ----------
+    id_table : pd.DataFrame
+        DataFrame containing identifier mappings to be filtered.
+    identifiers : str, list, set, or None, optional
+        Identifiers to filter by. If None, no filtering is applied on identifiers.
+    ontologies : str, list, set, or None, optional
+        Ontologies to filter by. If None, no filtering is applied on ontologies.
+    bqbs : str, list, set, or None, optional
+        BQB terms to filter by. If None, no filtering is applied on BQB terms. Default is [BQB.IS, BQB.HAS_PART].
+    Returns
+    -------
+    pd.DataFrame
+        Filtered DataFrame containing only rows matching the specified criteria.
+    Raises
+    ------
+    ValueError
+        If the id_table or filter values are invalid, or required columns are missing.
+    """
+    entity_type = sbml_dfs_utils.infer_entity_type(id_table)
+    _validate_id_table(id_table, entity_type)
+    # bqbs
+    if bqbs is not None:
+        bqbs = _sanitize_id_table_bqbs(bqbs, id_table)
+        id_table = id_table.query("bqb in @bqbs")
+    # ontologies
+    if ontologies is not None:
+        ontologies = _sanitize_id_table_ontologies(ontologies, id_table)
+        id_table = id_table.query("ontology in @ontologies")
+    # identifiers
+    if identifiers is not None:
+        identifiers = _sanitize_id_table_identifiers(identifiers, id_table)
+        id_table = id_table.query("identifier in @identifiers")
+    # return the filtered id_table
+    return id_table
+def _validate_id_table(id_table: pd.DataFrame, entity_type: str) -> None:
+    """
+    Validate that the id_table contains the required columns and matches the schema for the given entity_type.
+    Parameters
+    ----------
+    id_table : pd.DataFrame
+        DataFrame containing identifier mappings for a given entity type.
+    entity_type : str
+        The type of entity (e.g., 'species', 'reactions') to validate against the schema.
+    Returns
+    -------
+    None
+    Raises
+    ------
+    ValueError
+        If entity_type is not present in the schema, or if required columns are missing in id_table.
+    """
+    schema = SBML_DFS_SCHEMA.SCHEMA
+    if entity_type not in schema.keys():
+        raise ValueError(
+            f"{entity_type} does not match a table in the SBML_dfs object. The tables "
+            f"which are present are {', '.join(schema.keys())}"
+        )
+    entity_table_attrs = schema[entity_type]
+    if SCHEMA_DEFS.ID not in entity_table_attrs.keys():
+        raise ValueError(f"{entity_type} does not have an 'id' attribute")
+    entity_pk = entity_table_attrs[SCHEMA_DEFS.PK]
+    utils.match_pd_vars(
+        id_table,
+        req_vars={
+            entity_pk,
+            IDENTIFIERS.ONTOLOGY,
+            IDENTIFIERS.IDENTIFIER,
+            IDENTIFIERS.URL,
+            IDENTIFIERS.BQB,
+        },
+        allow_series=False,
+    ).assert_present()
+    return None
+def _sanitize_id_table_values(
+    values: Union[str, list, set],
+    id_table: pd.DataFrame,
+    column_name: str,
+    valid_values: Optional[Set[str]] = None,
+    value_type_name: str = None,
+) -> set:
+    """
+    Generic function to sanitize and validate values against an id_table column.
+    Parameters
+    ----------
+    values : str, list, or set
+        Values to sanitize and validate. Can be a single string, list of strings,
+        or set of strings.
+    id_table : pd.DataFrame
+        DataFrame containing the reference data to validate against.
+    column_name : str
+        Name of the column in id_table to check values against.
+    valid_values : set of str, optional
+        Optional set of globally valid values for additional validation
+        (e.g., VALID_BQB_TERMS). If provided, values must be a subset of this set.
+    value_type_name : str, optional
+        Human-readable name for the value type used in error messages.
+        If None, defaults to column_name.
+    Returns
+    -------
+    set
+        Set of sanitized and validated values.
+    Raises
+    ------
+    ValueError
+        If values is not a string, list, or set.
+        If any values are not in valid_values (when provided).
+        If none of the requested values are present in the id_table.
+    Warnings
+    --------
+    Logs a warning if some (but not all) requested values are missing from id_table.
+    """
+    if value_type_name is None:
+        value_type_name = column_name
+    # Convert to set
+    if isinstance(values, str):
+        values = {values}
+    elif isinstance(values, list):
+        values = set(values)
+    elif isinstance(values, set):
+        pass
+    else:
+        raise ValueError(
+            f"{value_type_name} must be a string, a set, or list, got {type(values).__name__}"
+        )
+    # Check against global valid values if provided
+    if valid_values is not None:
+        invalid_values = values.difference(valid_values)
+        if len(invalid_values) > 0:
+            raise ValueError(
+                f"The following {value_type_name} are not valid: {', '.join(invalid_values)}.\n"
+                f"Valid {value_type_name} are {', '.join(valid_values)}"
+            )
+    # Check against values present in the id_table
+    available_values = set(id_table[column_name].unique())
+    missing_values = values.difference(available_values)
+    if len(missing_values) == len(values):
+        raise ValueError(
+            f"None of the requested {value_type_name} are present in the id_table: {', '.join(missing_values)}.\n"
+            f"The included {value_type_name} are {', '.join(available_values)}"
+        )
+    elif len(missing_values) > 0:
+        logger.warning(
+            f"The following {value_type_name} are not present in the id_table: {', '.join(missing_values)}.\n"
+            f"The included {value_type_name} are {', '.join(available_values)}"
+        )
+    return values
+def _sanitize_id_table_ontologies(
+    ontologies: Union[str, list, set], id_table: pd.DataFrame
+) -> set:
+    """
+    Sanitize and validate ontologies against the id_table.
+    Parameters
+    ----------
+    ontologies : str, list, or set
+        Ontology names to validate.
+    id_table : pd.DataFrame
+        DataFrame containing ontology reference data.
+    Returns
+    -------
+    set
+        Set of validated ontology names.
+    """
+    return _sanitize_id_table_values(
+        values=ontologies,
+        id_table=id_table,
+        column_name=IDENTIFIERS.ONTOLOGY,
+        value_type_name="ontologies",
+    )
+def _sanitize_id_table_bqbs(bqbs: Union[str, list, set], id_table: pd.DataFrame) -> set:
+    """
+    Sanitize and validate BQBs against the id_table.
+    Parameters
+    ----------
+    bqbs : str, list, or set
+        BQB terms to validate.
+    id_table : pd.DataFrame
+        DataFrame containing BQB reference data.
+    Returns
+    -------
+    set
+        Set of validated BQB terms.
+    """
+    return _sanitize_id_table_values(
+        values=bqbs,
+        id_table=id_table,
+        column_name=IDENTIFIERS.BQB,
+        valid_values=VALID_BQB_TERMS,
+        value_type_name="bqbs",
+    )
+def _sanitize_id_table_identifiers(
+    identifiers: Union[str, list, set], id_table: pd.DataFrame
+) -> set:
+    """
+    Sanitize and validate identifiers against the id_table.
+    Parameters
+    ----------
+    identifiers : str, list, or set
+        Identifier values to validate.
+    id_table : pd.DataFrame
+        DataFrame containing identifier reference data.
+    Returns
+    -------
+    set
+        Set of validated identifiers.
+    """
+    return _sanitize_id_table_values(
+        values=identifiers,
+        id_table=id_table,
+        column_name=IDENTIFIERS.IDENTIFIER,
+        value_type_name="identifiers",
+    )

{napistu-0.4.0 → napistu-0.4.1}/src/napistu/sbml_dfs_core.py RENAMED Viewed

@@ -19,17 +19,23 @@ from napistu import sbml_dfs_utils
 from napistu import source
 from napistu import utils
 from napistu.ingestion import sbml
-from napistu.constants import SBML_DFS
-from napistu.constants import SBML_DFS_SCHEMA
-from napistu.constants import IDENTIFIERS
-from napistu.constants import NAPISTU_STANDARD_OUTPUTS
-from napistu.constants import BQB_PRIORITIES
-from napistu.constants import ONTOLOGY_PRIORITIES
-from napistu.constants import MINI_SBO_FROM_NAME
-from napistu.constants import MINI_SBO_TO_NAME
-from napistu.constants import SBOTERM_NAMES
-from napistu.constants import ENTITIES_W_DATA
-from napistu.constants import ENTITIES_TO_ENTITY_DATA
+from napistu.ontologies import id_tables
+from napistu.constants import (
+    BQB,
+    BQB_DEFINING_ATTRS_LOOSE,
+    BQB_PRIORITIES,
+    ENTITIES_W_DATA,
+    ENTITIES_TO_ENTITY_DATA,
+    IDENTIFIERS,
+    MINI_SBO_FROM_NAME,
+    MINI_SBO_TO_NAME,
+    NAPISTU_STANDARD_OUTPUTS,
+    ONTOLOGY_PRIORITIES,
+    SBML_DFS,
+    SBML_DFS_SCHEMA,
+    SBOTERM_NAMES,
+    SCHEMA_DEFS,
+)
 logger = logging.getLogger(__name__)
@@ -101,7 +107,7 @@ class SBML_dfs:
         Remove a reactions data table by label.
     remove_species_data(label)
         Remove a species data table by label.
-    search_by_ids(ids, entity_type, identifiers_df, ontologies=None)
+    search_by_ids(id_table, identifiers=None, ontologies=None, bqbs=None)
         Find entities and identifiers matching a set of query IDs.
     search_by_name(name, entity_type, partial_match=True)
         Find entities by exact or partial name match.
@@ -455,12 +461,12 @@ class SBML_dfs:
         ValueError
             If id_type is invalid or identifiers are malformed
         """
-        selected_table = self.get_table(id_type, {"id"})
+        selected_table = self.get_table(id_type, {SCHEMA_DEFS.ID})
         schema = SBML_DFS_SCHEMA.SCHEMA
         identifiers_dict = dict()
         for sysid in selected_table.index:
-            id_entry = selected_table[schema[id_type]["id"]][sysid]
+            id_entry = selected_table[schema[id_type][SCHEMA_DEFS.ID]][sysid]
             if isinstance(id_entry, identifiers.Identifiers):
                 identifiers_dict[sysid] = pd.DataFrame(id_entry.ids)
@@ -473,16 +479,16 @@ class SBML_dfs:
                 )
         if not identifiers_dict:
             # Return empty DataFrame with expected columns if nothing found
-            return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
+            return pd.DataFrame(columns=[schema[id_type][SCHEMA_DEFS.PK], "entry"])
         identifiers_tbl = pd.concat(identifiers_dict)
-        identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
+        identifiers_tbl.index.names = [schema[id_type][SCHEMA_DEFS.PK], "entry"]
         identifiers_tbl = identifiers_tbl.reset_index()
         named_identifiers = identifiers_tbl.merge(
-            selected_table.drop(schema[id_type]["id"], axis=1),
-            left_on=schema[id_type]["pk"],
+            selected_table.drop(schema[id_type][SCHEMA_DEFS.ID], axis=1),
+            left_on=schema[id_type][SCHEMA_DEFS.PK],
             right_index=True,
         )
@@ -1163,24 +1169,25 @@ class SBML_dfs:
     def search_by_ids(
         self,
-        ids: list[str],
-        entity_type: str,
-        identifiers_df: pd.DataFrame,
-        ontologies: None | set[str] = None,
+        id_table: pd.DataFrame,
+        identifiers: Optional[Union[str, list, set]] = None,
+        ontologies: Optional[Union[str, list, set]] = None,
+        bqbs: Optional[Union[str, list, set]] = BQB_DEFINING_ATTRS_LOOSE
+        + [BQB.HAS_PART],
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
         """
         Find entities and identifiers matching a set of query IDs.
         Parameters
         ----------
-        ids : List[str]
-            List of identifiers to search for
-        entity_type : str
-            Type of entity to search (e.g., 'species', 'reactions')
-        identifiers_df : pd.DataFrame
+        id_table : pd.DataFrame
             DataFrame containing identifier mappings
-        ontologies : Optional[Set[str]], optional
-            Set of ontologies to filter by, by default None
+        identifiers : Optional[Union[str, list, set]], optional
+            Identifiers to filter by, by default None
+        ontologies : Optional[Union[str, list, set]], optional
+            Ontologies to filter by, by default None
+        bqbs : Optional[Union[str, list, set]], optional
+            BQB terms to filter by, by default [BQB.IS, BQB.HAS_PART]
         Returns
         -------
@@ -1196,42 +1203,25 @@ class SBML_dfs:
             If ontologies is not a set
         """
         # validate inputs
-        entity_table = self.get_table(entity_type, required_attributes={"id"})
-        entity_pk = self.schema[entity_type]["pk"]
-        utils.match_pd_vars(
-            identifiers_df,
-            req_vars={
-                entity_pk,
-                IDENTIFIERS.ONTOLOGY,
-                IDENTIFIERS.IDENTIFIER,
-                IDENTIFIERS.URL,
-                IDENTIFIERS.BQB,
-            },
-            allow_series=False,
-        ).assert_present()
-        if ontologies is not None:
-            if not isinstance(ontologies, set):
-                # for clarity this should not be reachable based on type hints
-                raise TypeError(
-                    f"ontologies must be a set, but got {type(ontologies).__name__}"
-                )
-            ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
-            invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
-            if len(invalid_ontologies) > 0:
-                raise ValueError(
-                    f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
-                    f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
-                )
-            # fitler to just to identifiers matchign the ontologies of interest
-            identifiers_df = identifiers_df.query("ontology in @ontologies")
+        entity_type = sbml_dfs_utils.infer_entity_type(id_table)
+        entity_table = self.get_table(entity_type, required_attributes={SCHEMA_DEFS.ID})
+        entity_pk = self.schema[entity_type][SCHEMA_DEFS.PK]
-        matching_identifiers = identifiers_df.loc[
-            identifiers_df["identifier"].isin(ids)
-        ]
-        entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
+        matching_identifiers = id_tables.filter_id_table(
+            id_table=id_table, identifiers=identifiers, ontologies=ontologies, bqbs=bqbs
+        )
+        matching_keys = matching_identifiers[entity_pk].tolist()
+        entity_subset = entity_table.loc[matching_keys]
+        if matching_identifiers.shape[0] != entity_subset.shape[0]:
+            raise ValueError(
+                f"Some identifiers did not match to an entity for {entity_type}. "
+                "This suggests that the identifiers and sbml_dfs are not in sync. "
+                "Please create new identifiers with sbml_dfs.get_characteristic_species_ids() "
+                "or sbml_dfs.get_identifiers()."
+            )
         return entity_subset, matching_identifiers

{napistu-0.4.0 → napistu-0.4.1}/src/napistu/sbml_dfs_utils.py RENAMED Viewed

@@ -14,24 +14,29 @@ from napistu import utils
 from napistu import identifiers
 from napistu import indices
-from napistu.constants import BQB
-from napistu.constants import SBML_DFS
-from napistu.constants import SBML_DFS_SCHEMA
-from napistu.constants import IDENTIFIERS
-from napistu.constants import BQB_DEFINING_ATTRS
-from napistu.constants import BQB_DEFINING_ATTRS_LOOSE
-from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
-from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
-from napistu.constants import SBO_ROLES_DEFS
-from napistu.constants import MINI_SBO_FROM_NAME
-from napistu.constants import MINI_SBO_TO_NAME
-from napistu.constants import SBO_NAME_TO_ROLE
-from napistu.constants import ONTOLOGIES
-from napistu.constants import VALID_SBO_TERM_NAMES
-from napistu.constants import VALID_SBO_TERMS
-from napistu.ingestion.constants import VALID_COMPARTMENTS
-from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
-from napistu.ingestion.constants import GENERIC_COMPARTMENT
+from napistu.constants import (
+    BQB,
+    BQB_DEFINING_ATTRS,
+    BQB_DEFINING_ATTRS_LOOSE,
+    SBML_DFS,
+    SBML_DFS_SCHEMA,
+    SCHEMA_DEFS,
+    IDENTIFIERS,
+    INTERACTION_EDGELIST_EXPECTED_VARS,
+    ONTOLOGIES,
+    MINI_SBO_FROM_NAME,
+    MINI_SBO_TO_NAME,
+    REQUIRED_REACTION_FROMEDGELIST_COLUMNS,
+    SBO_ROLES_DEFS,
+    SBO_NAME_TO_ROLE,
+    VALID_SBO_TERM_NAMES,
+    VALID_SBO_TERMS,
+)
+from napistu.ingestion.constants import (
+    COMPARTMENTS_GO_TERMS,
+    GENERIC_COMPARTMENT,
+    VALID_COMPARTMENTS,
+)
 logger = logging.getLogger(__name__)
@@ -418,6 +423,65 @@ def id_formatter_inv(ids: list[str]) -> list[int]:
     return id_val
+def infer_entity_type(df: pd.DataFrame) -> str:
+    """
+    Infer the entity type of a DataFrame based on its structure and schema.
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame to analyze
+    Returns
+    -------
+    str
+        The inferred entity type name
+    Raises
+    ------
+    ValueError
+        If no entity type can be determined
+    """
+    schema = SBML_DFS_SCHEMA.SCHEMA
+    # Get all primary keys
+    primary_keys = [
+        entity_schema.get(SCHEMA_DEFS.PK) for entity_schema in schema.values()
+    ]
+    primary_keys = [pk for pk in primary_keys if pk is not None]
+    # Check if index matches a primary key
+    if df.index.name in primary_keys:
+        for entity_type, entity_schema in schema.items():
+            if entity_schema.get(SCHEMA_DEFS.PK) == df.index.name:
+                return entity_type
+    # Get DataFrame columns that are also primary keys
+    df_columns = set(df.columns).intersection(primary_keys)
+    # Check for exact match with primary key + foreign keys
+    for entity_type, entity_schema in schema.items():
+        expected_keys = set()
+        # Add primary key
+        pk = entity_schema.get(SCHEMA_DEFS.PK)
+        if pk:
+            expected_keys.add(pk)
+        # Add foreign keys
+        fks = entity_schema.get(SCHEMA_DEFS.FK, [])
+        expected_keys.update(fks)
+        # Check for exact match
+        if df_columns == expected_keys:
+            return entity_type
+    # No match found
+    raise ValueError(
+        f"No entity type matches DataFrame with columns: {sorted(df_columns)}"
+    )
 def match_entitydata_index_to_entity(
     entity_data_dict: dict,
     an_entity_data_type: str,

napistu 0.4.0__tar.gz → 0.4.1__tar.gz

napistu 0.4.0__tar.gz → 0.4.1__tar.gz