napistu 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff compares the publicly released contents of these two package versions as they appear in their public registry, and is provided for informational purposes only.
- napistu/__main__.py +20 -9
- napistu/consensus.py +19 -25
- napistu/constants.py +90 -64
- napistu/indices.py +3 -1
- napistu/ingestion/sbml.py +298 -295
- napistu/ingestion/string.py +14 -18
- napistu/ingestion/trrust.py +22 -27
- napistu/matching/species.py +1 -1
- napistu/ontologies/genodexito.py +5 -1
- napistu/ontologies/renaming.py +4 -0
- napistu/sbml_dfs_core.py +127 -64
- napistu/sbml_dfs_utils.py +4 -0
- napistu/utils.py +52 -41
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/METADATA +1 -1
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/RECORD +27 -27
- tests/conftest.py +70 -13
- tests/test_consensus.py +74 -5
- tests/test_gaps.py +26 -15
- tests/test_network_net_create.py +1 -1
- tests/test_network_precompute.py +1 -1
- tests/test_ontologies_renaming.py +28 -24
- tests/test_sbml_dfs_core.py +165 -15
- tests/test_utils.py +19 -0
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/WHEEL +0 -0
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/top_level.txt +0 -0
napistu/ingestion/string.py
CHANGED
@@ -8,15 +8,11 @@ from napistu import sbml_dfs_core
 from napistu import sbml_dfs_utils
 from napistu import source
 from napistu import utils
+from napistu.ingestion import napistu_edgelist
 from napistu.constants import BQB
 from napistu.constants import MINI_SBO_FROM_NAME
-from napistu.
-from napistu.
-from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
-from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
+from napistu.constants import ONTOLOGIES
+from napistu.constants import SBML_DFS
 from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
 from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
 from napistu.ingestion.constants import STRING_PROTEIN_ID
@@ -137,10 +133,10 @@ def convert_string_to_sbml_dfs(
 
     # define identifier mapping from aliases to use:
     alias_to_identifier = {
-        "Ensembl_gene": (
-        "Ensembl_transcript": (
-        "Ensembl_translation": (
-        "Ensembl_UniProt_AC": (
+        "Ensembl_gene": (ONTOLOGIES.ENSEMBL_GENE, BQB.IS_ENCODED_BY),
+        "Ensembl_transcript": (ONTOLOGIES.ENSEMBL_TRANSCRIPT, BQB.IS_ENCODED_BY),
+        "Ensembl_translation": (ONTOLOGIES.ENSEMBL_PROTEIN, BQB.IS),
+        "Ensembl_UniProt_AC": (ONTOLOGIES.UNIPROT, BQB.IS),
     }
 
     # filter aliases to only keep required ones
@@ -276,17 +272,17 @@ def _build_species_df(
     species_df = (
         pd.Series(
             list(set(edgelist[source_col]).union(edgelist[target_col])),
-            name=
+            name=SBML_DFS.S_NAME,
         )
         .to_frame()
-        .set_index(
+        .set_index(SBML_DFS.S_NAME, drop=False)
         .apply(
             _get_identifiers,
             alias_to_identifier=alias_to_identifier,
             dat_alias=aliases,
             axis=1,
         )
-        .rename(
+        .rename(SBML_DFS.S_IDENTIFIERS)
         .reset_index()
     )
     return species_df
@@ -312,8 +308,8 @@ def _build_interactor_edgelist(
         **{
             STRING_UPSTREAM_COMPARTMENT: compartment,
             STRING_DOWNSTREAM_COMPARTMENT: compartment,
-
-
+            SBML_DFS.SBO_TERM: sbo_interactor,
+            SBML_DFS.R_IDENTIFIERS: lambda x: identifiers.Identifiers([]),
         }
     )
     if add_reverse_interactions:
@@ -336,10 +332,10 @@ def _build_interactor_edgelist(
     )
 
     interaction_edgelist = dat
-    interaction_edgelist[
+    interaction_edgelist[SBML_DFS.R_NAME] = _build_string_reaction_name(
         dat[STRING_UPSTREAM_NAME], dat[STRING_DOWNSTREAM_NAME]
     )
-    interaction_edgelist[
+    interaction_edgelist[SBML_DFS.R_ISREVERSIBLE] = True
 
     return interaction_edgelist
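The rewritten `alias_to_identifier` mapping also encodes the biology: gene- and transcript-level aliases carry `BQB.IS_ENCODED_BY` (a protein species is encoded by its gene or transcript), while protein-level aliases carry `BQB.IS`. A minimal sketch of how such a mapping is consumed, assuming the `ONTOLOGIES`/`BQB` constants resolve to plain strings:

```python
from napistu.constants import BQB, ONTOLOGIES

# 0.3.7 maps each STRING alias source to an (ontology, qualifier) pair
alias_to_identifier = {
    "Ensembl_gene": (ONTOLOGIES.ENSEMBL_GENE, BQB.IS_ENCODED_BY),
    "Ensembl_transcript": (ONTOLOGIES.ENSEMBL_TRANSCRIPT, BQB.IS_ENCODED_BY),
    "Ensembl_translation": (ONTOLOGIES.ENSEMBL_PROTEIN, BQB.IS),
    "Ensembl_UniProt_AC": (ONTOLOGIES.UNIPROT, BQB.IS),
}

# each alias row contributes one qualified identifier to a species' Identifiers object
ontology, qualifier = alias_to_identifier["Ensembl_gene"]
```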
napistu/ingestion/trrust.py
CHANGED
@@ -8,16 +8,11 @@ from napistu import identifiers
 from napistu import sbml_dfs_core
 from napistu import source
 from napistu import utils
+from napistu.constants import BQB
+from napistu.constants import IDENTIFIERS
 from napistu.constants import MINI_SBO_FROM_NAME
 from napistu.constants import SBOTERM_NAMES
-from napistu.
-from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_NAME
-from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
-from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
+from napistu.constants import SBML_DFS
 from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
 from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
 from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
@@ -81,16 +76,16 @@ def convert_trrust_to_sbml_dfs(
     species_df = (
         pd.DataFrame(
             {
-
+                SBML_DFS.S_NAME: list(
                     {*edge_summaries_df["from"], *edge_summaries_df["to"]}
                 )
             }
         )
         .merge(
-            uniprot_2_symbol.rename({TRRUST_SYMBOL:
+            uniprot_2_symbol.rename({TRRUST_SYMBOL: SBML_DFS.S_NAME}, axis=1),
             how="left",
         )
-        .set_index(
+        .set_index(SBML_DFS.S_NAME)
     )
 
     # create Identifiers objects for all species with uniprot IDs
@@ -106,14 +101,14 @@ def convert_trrust_to_sbml_dfs(
         [
             identifiers.Identifiers(
                 [
-                    identifiers.format_uri(uri=x, biological_qualifier_type=
-                    for x in species_w_ids.loc[[ind]][
+                    identifiers.format_uri(uri=x, biological_qualifier_type=BQB.IS)
+                    for x in species_w_ids.loc[[ind]][IDENTIFIERS.URL].tolist()
                 ]
             )
             for ind in species_w_ids.index.unique()
         ],
         index=species_w_ids.index.unique(),
-    ).rename(
+    ).rename(SBML_DFS.S_IDENTIFIERS)
 
     # just retain s_name and s_Identifiers
     # this just needs a source object which will be added later
@@ -124,21 +119,21 @@ def convert_trrust_to_sbml_dfs(
         .merge(
             species_w_ids_series,
             how="left",
-            left_on=
+            left_on=SBML_DFS.S_NAME,
             right_index=True,
         )
         .reset_index(drop=True)
     )
     # stub genes with missing IDs
-    species_df[
+    species_df[SBML_DFS.S_IDENTIFIERS] = species_df[SBML_DFS.S_IDENTIFIERS].fillna(  # type: ignore
        value=identifiers.Identifiers([])
    )
 
    # define distinct compartments
    compartments_df = pd.DataFrame(
        {
-
-
+            SBML_DFS.C_NAME: TRRUST_COMPARTMENT_NUCLEOPLASM,
+            SBML_DFS.C_IDENTIFIERS: identifiers.Identifiers(
                [
                    identifiers.format_uri(
                        uri=identifiers.create_uri_url(
@@ -159,7 +154,7 @@ def convert_trrust_to_sbml_dfs(
         upstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
         downstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
     )
-    gene_gene_identifier_edgelist[
+    gene_gene_identifier_edgelist[SBML_DFS.R_NAME] = [
         f"{x} {y} of {z}"
         for x, y, z in zip(
             gene_gene_identifier_edgelist[STRING_UPSTREAM_NAME],
@@ -171,15 +166,15 @@ def convert_trrust_to_sbml_dfs(
     # convert relationships to SBO terms
     interaction_edgelist = gene_gene_identifier_edgelist.replace(
         {"sign": MINI_SBO_FROM_NAME}
-    ).rename({"sign":
+    ).rename({"sign": SBML_DFS.SBO_TERM}, axis=1)
 
     # format pubmed identifiers of interactions
-    interaction_edgelist[
+    interaction_edgelist[SBML_DFS.R_IDENTIFIERS] = [
         _format_pubmed_for_interactions(x) for x in interaction_edgelist["reference"]
     ]
 
     # directionality: by default, set r_isreversible to False for TRRUST data
-    interaction_edgelist[
+    interaction_edgelist[SBML_DFS.R_ISREVERSIBLE] = False
 
     # reduce to essential variables
     interaction_edgelist = interaction_edgelist[
@@ -188,10 +183,10 @@ def convert_trrust_to_sbml_dfs(
             STRING_DOWNSTREAM_NAME,
             STRING_UPSTREAM_COMPARTMENT,
             STRING_DOWNSTREAM_COMPARTMENT,
-
-
-
-
+            SBML_DFS.R_NAME,
+            SBML_DFS.SBO_TERM,
+            SBML_DFS.R_IDENTIFIERS,
+            SBML_DFS.R_ISREVERSIBLE,
         ]
     ]
 
@@ -277,7 +272,7 @@ def _format_pubmed_for_interactions(pubmed_set):
         url = identifiers.create_uri_url(ontology="pubmed", identifier=p, strict=False)
         if url is not None:
             valid_url = identifiers.format_uri(
-                uri=url, biological_qualifier_type=
+                uri=url, biological_qualifier_type=BQB.IS_DESCRIBED_BY
             )
 
         ids.append(valid_url)
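`_format_pubmed_for_interactions` now spells out `BQB.IS_DESCRIBED_BY` rather than an imported alias. A minimal sketch of the reference-formatting path, using only the calls visible in this diff (the PubMed ID is a placeholder):

```python
from napistu import identifiers
from napistu.constants import BQB

# build a qualified PubMed reference for a regulatory interaction
url = identifiers.create_uri_url(ontology="pubmed", identifier="12345678", strict=False)
if url is not None:
    valid_url = identifiers.format_uri(
        uri=url, biological_qualifier_type=BQB.IS_DESCRIBED_BY
    )
```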
napistu/matching/species.py
CHANGED
@@ -33,7 +33,7 @@ def features_to_pathway_species(
         pd.Dataframe containing a "feature_identifiers_var" variable used to match entries
     species_identifiers: pd.DataFrame
         A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
-        generally using
+        generally using sbml_dfs.export_sbml_dfs()
     ontologies: set
         A set of ontologies used to match features to pathway species
     feature_identifiers_var: str
napistu/ontologies/genodexito.py
CHANGED
@@ -356,7 +356,7 @@ class Genodexito:
             )
             logger.debug(
                 f"{ids.shape[0] - expanded_ids.shape[0]} "
-                "ids are not included in expanded ids"
+                "ids are not included in expanded ids. These will be filled with empty Identifiers"
             )
         else:
             matched_expanded_ids = expanded_ids
@@ -364,6 +364,10 @@ class Genodexito:
         updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
             pd.DataFrame(matched_expanded_ids)
         )
+        # fill missing attributes with empty Identifiers
+        updated_ids[SBML_DFS.S_IDENTIFIERS] = updated_ids[
+            SBML_DFS.S_IDENTIFIERS
+        ].fillna(identifiers.Identifiers([]))
 
         setattr(sbml_dfs, "species", updated_ids)
 
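This guard (mirrored in `napistu/ontologies/renaming.py` below) matters because the identifier update is a join: species that matched nothing in `matched_expanded_ids` would otherwise carry `NaN` in `s_identifiers`, which the new `SBML_dfs._validate_identifiers()` check rejects. A small self-contained sketch of the failure mode and the fix (the species names are illustrative):

```python
import pandas as pd

from napistu import identifiers
from napistu.constants import SBML_DFS

# after a join, species with no expanded identifiers end up as NaN
species = pd.DataFrame({SBML_DFS.S_NAME: ["TP53", "MYC"]})
expanded = pd.DataFrame(
    {SBML_DFS.S_IDENTIFIERS: [identifiers.Identifiers([])]}, index=[0]
)
updated = species.join(expanded)  # row 1 (MYC) now has a NaN s_identifiers entry

# the 0.3.7 fix: stub unmatched species with an empty Identifiers object
updated[SBML_DFS.S_IDENTIFIERS] = updated[SBML_DFS.S_IDENTIFIERS].fillna(
    identifiers.Identifiers([])
)
```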
napistu/ontologies/renaming.py
CHANGED
@@ -72,6 +72,10 @@ def rename_species_ontologies(
     updated_species = sbml_dfs.species.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
         pd.DataFrame(species_identifiers)
     )
+    # fill missing attributes with empty Identifiers
+    updated_species[SBML_DFS.S_IDENTIFIERS] = updated_species[
+        SBML_DFS.S_IDENTIFIERS
+    ].fillna(identifiers.Identifiers([]))
 
     setattr(sbml_dfs, "species", updated_species)
 
napistu/sbml_dfs_core.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import copy
 import logging
 import re
 from typing import Any
@@ -64,6 +65,8 @@ class SBML_dfs:
         Add a new reactions data table to the model with validation.
     add_species_data(label, data)
         Add a new species data table to the model with validation.
+    copy()
+        Return a deep copy of the SBML_dfs object.
     export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
         Export the SBML_dfs model and its tables to files in a specified directory.
     get_characteristic_species_ids(dogmatic=True)
@@ -114,7 +117,6 @@ class SBML_dfs:
     Private/Hidden Methods (alphabetical, appear after public methods)
     -----------------------------------------------------------------
     _attempt_resolve(e)
-    _check_pk_fk_correspondence()
     _find_underspecified_reactions_by_scids(sc_ids)
     _get_unused_cspecies()
     _get_unused_species()
@@ -123,9 +125,12 @@ class SBML_dfs:
     _remove_species(s_ids)
     _remove_unused_cspecies()
     _remove_unused_species()
+    _validate_identifiers()
+    _validate_pk_fk_correspondence()
     _validate_r_ids(r_ids)
     _validate_reaction_species()
     _validate_reactions_data(reactions_data_table)
+    _validate_sources()
     _validate_species_data(species_data_table)
     _validate_table(table_name)
     """
@@ -255,6 +260,17 @@ class SBML_dfs:
         )
         self.species_data[label] = data
 
+    def copy(self):
+        """
+        Return a deep copy of the SBML_dfs object.
+
+        Returns
+        -------
+        SBML_dfs
+            A deep copy of the current SBML_dfs object.
+        """
+        return copy.deepcopy(self)
+
     def export_sbml_dfs(
         self,
         model_prefix: str,
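`copy()` is a thin wrapper over `copy.deepcopy`, so mutating the copy leaves the original model untouched. A hedged usage sketch, assuming an existing `SBML_dfs` instance named `sbml_dfs` with more than ten species:

```python
# work on a throwaway copy; the original tables are unaffected
working = sbml_dfs.copy()
working.species = working.species.head(10)
assert len(working.species) != len(sbml_dfs.species)
```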
@@ -440,7 +456,7 @@
             If id_type is invalid or identifiers are malformed
         """
         selected_table = self.get_table(id_type, {"id"})
-        schema =
+        schema = SBML_DFS_SCHEMA.SCHEMA
 
         identifiers_dict = dict()
         for sysid in selected_table.index:
@@ -458,6 +474,7 @@
         if not identifiers_dict:
             # Return empty DataFrame with expected columns if nothing found
             return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
+
         identifiers_tbl = pd.concat(identifiers_dict)
 
         identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
@@ -1382,7 +1399,7 @@
         self._validate_table(table)
 
         # check whether pks and fks agree
-        self.
+        self._validate_pk_fk_correspondence()
 
         # check optional data tables:
         for k, v in self.species_data.items():
@@ -1400,6 +1417,10 @@
         # validate reaction_species sbo_terms and stoi
         self._validate_reaction_species()
 
+        # validate identifiers and sources
+        self._validate_identifiers()
+        self._validate_sources()
+
     def validate_and_resolve(self):
         """
         Validate and attempt to automatically fix common issues.
@@ -1455,67 +1476,6 @@
             )
             raise e
 
-    def _check_pk_fk_correspondence(self):
-        """
-        Check whether primary keys and foreign keys agree for all tables in the schema.
-        Raises ValueError if any correspondence fails.
-        """
-
-        pk_df = pd.DataFrame(
-            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
-        )
-
-        fk_df = (
-            pd.DataFrame(
-                [
-                    {"fk_table": k, "fk": v["fk"]}
-                    for k, v in self.schema.items()
-                    if "fk" in v.keys()
-                ]
-            )
-            .set_index("fk_table")["fk"]
-            .apply(pd.Series)
-            .reset_index()
-            .melt(id_vars="fk_table")
-            .drop(["variable"], axis=1)
-            .rename(columns={"value": "key"})
-        )
-
-        pk_fk_correspondences = pk_df.merge(fk_df)
-
-        for i in range(0, pk_fk_correspondences.shape[0]):
-            pk_table_keys = set(
-                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
-            )
-            if None in pk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['pk_table'][i]} had "
-                    "missing values in its index"
-                )
-
-            fk_table_keys = set(
-                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
-                    :, pk_fk_correspondences["key"][i]
-                ]
-            )
-            if None in fk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['fk_table'][i]} included "
-                    f"missing {pk_fk_correspondences['key'][i]} values"
-                )
-
-            # all foreign keys need to match a primary key
-            extra_fks = fk_table_keys.difference(pk_table_keys)
-            if len(extra_fks) != 0:
-                raise ValueError(
-                    f"{len(extra_fks)} distinct "
-                    f"{pk_fk_correspondences['key'][i]} values were"
-                    f" found in {pk_fk_correspondences['fk_table'][i]} "
-                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
-                    " All foreign keys must have a matching primary key.\n\n"
-                    f"Extra key are: {', '.join(extra_fks)}"
-                )
-
     def _find_underspecified_reactions_by_scids(
         self, sc_ids: Iterable[str]
     ) -> set[str]:
@@ -1640,6 +1600,88 @@ class SBML_dfs:
         s_ids = self._get_unused_species()
         self._remove_species(s_ids)
 
+    def _validate_identifiers(self):
+        """
+        Validate identifiers in the model
+
+        Iterates through all tables and checks if the identifier columns are valid.
+
+        Raises:
+            ValueError: missing identifiers in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "id" not in SCHEMA[table].keys():
+                continue
+            id_series = self.get_table(table)[SCHEMA[table]["id"]]
+            if id_series.isna().sum() > 0:
+                missing_ids = id_series[id_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_ids)} missing ids: {missing_ids}"
+                )
+
+    def _validate_pk_fk_correspondence(self):
+        """
+        Check whether primary keys and foreign keys agree for all tables in the schema.
+        Raises ValueError if any correspondence fails.
+        """
+
+        pk_df = pd.DataFrame(
+            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
+        )
+
+        fk_df = (
+            pd.DataFrame(
+                [
+                    {"fk_table": k, "fk": v["fk"]}
+                    for k, v in self.schema.items()
+                    if "fk" in v.keys()
+                ]
+            )
+            .set_index("fk_table")["fk"]
+            .apply(pd.Series)
+            .reset_index()
+            .melt(id_vars="fk_table")
+            .drop(["variable"], axis=1)
+            .rename(columns={"value": "key"})
+        )
+
+        pk_fk_correspondences = pk_df.merge(fk_df)
+
+        for i in range(0, pk_fk_correspondences.shape[0]):
+            pk_table_keys = set(
+                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
+            )
+            if None in pk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['pk_table'][i]} had "
+                    "missing values in its index"
+                )
+
+            fk_table_keys = set(
+                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
+                    :, pk_fk_correspondences["key"][i]
+                ]
+            )
+            if None in fk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['fk_table'][i]} included "
+                    f"missing {pk_fk_correspondences['key'][i]} values"
+                )
+
+            # all foreign keys need to match a primary key
+            extra_fks = fk_table_keys.difference(pk_table_keys)
+            if len(extra_fks) != 0:
+                raise ValueError(
+                    f"{len(extra_fks)} distinct "
+                    f"{pk_fk_correspondences['key'][i]} values were"
+                    f" found in {pk_fk_correspondences['fk_table'][i]} "
+                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
+                    " All foreign keys must have a matching primary key.\n\n"
+                    f"Extra key are: {', '.join(extra_fks)}"
+                )
+
     def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
 
         if isinstance(r_ids, str):
@@ -1694,6 +1736,27 @@ class SBML_dfs:
         """
         sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
 
+    def _validate_sources(self):
+        """
+        Validate sources in the model
+
+        Iterates through all tables and checks if the source columns are valid.
+
+        Raises:
+            ValueError: missing sources in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "source" not in SCHEMA[table].keys():
+                continue
+            source_series = self.get_table(table)[SCHEMA[table]["source"]]
+            if source_series.isna().sum() > 0:
+                missing_sources = source_series[source_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_sources)} missing sources: {missing_sources}"
+                )
+
     def _validate_species_data(self, species_data_table: pd.DataFrame):
         """Validates species data attribute
 
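With `_validate_identifiers()` and `_validate_sources()` wired into `validate()`, models carrying `NaN` identifier or source entries now fail fast instead of erroring downstream. A hedged sketch of what callers see, again assuming an `SBML_dfs` instance `sbml_dfs` (the ids in the message are illustrative):

```python
try:
    sbml_dfs.validate()
except ValueError as e:
    # e.g. "species has 2 missing ids: Index(['S00001', 'S00042'], dtype='object')"
    print(e)
```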
napistu/sbml_dfs_utils.py
CHANGED
@@ -559,6 +559,10 @@ def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
 
     N_invalid_ids = sum(id_table[id_var].isna())
     if N_invalid_ids != 0:
+
+        print("Rows with missing identifiers:")
+        print(id_table.loc[id_table[id_var].isna(), id_var])
+
         raise ValueError(
             f'{N_invalid_ids} entries in "id_table" were missing',
             "entries with no identifiers should still include an Identifiers object",
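The added diagnostic is plain pandas: before raising, `unnest_identifiers` prints the offending rows so the failing entries can be located. The same mask can be reproduced standalone (the frame below is illustrative, with `object()` standing in for an `Identifiers` instance):

```python
import pandas as pd

id_table = pd.DataFrame(
    {"s_id": ["S1", "S2", "S3"], "s_Identifiers": [object(), None, None]}
)
id_var = "s_Identifiers"

# rows whose Identifiers entry is missing; upstream code should have stubbed
# these with an empty Identifiers object
print(id_table.loc[id_table[id_var].isna(), id_var])
```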
napistu/utils.py
CHANGED
@@ -810,50 +810,15 @@ def drop_extra_cols(
     return df_out.loc[:, ordered_cols]
 
 
-def
-    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
-) -> pd.DataFrame:
+def update_pathological_names(names: pd.Series, prefix: str) -> pd.Series:
     """
-
-
-    Parameters
-    ----------
-    left_df : pd.DataFrame
-        Left DataFrame for merge
-    right_df : pd.DataFrame
-        Right DataFrame for merge
-    merge_context : str
-        Description of the merge operation for logging
-    **merge_kwargs : dict
-        Additional keyword arguments passed to pd.merge
+    Update pathological names in a pandas Series.
 
-
-    -------
-    pd.DataFrame
-        Merged DataFrame with overwritten columns removed
+    Add a prefix to the names if they are all numeric.
     """
-
-
-
-    # Ensure we're using the correct suffixes
-    merge_kwargs["suffixes"] = ("_old", "")
-
-    # Perform merge
-    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
-
-    # Check for and log any overwritten columns
-    new_cols = merged_df.columns.tolist()
-    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
-    if overwritten_cols:
-        logger.warning(
-            f"The following columns were overwritten during {merge_context} merge and their original values "
-            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
-        )
-        # Drop the old columns
-        cols_to_drop = [col + "_old" for col in overwritten_cols]
-        merged_df = merged_df.drop(columns=cols_to_drop)
-
-    return merged_df
+    if names.apply(lambda x: x.isdigit()).all():
+        names = names.apply(lambda x: f"{prefix}{x}")
+    return names
 
 
 def format_identifiers_as_edgelist(
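`update_pathological_names` replaces `_merge_and_log_overwrites` at this position (the merge helper moves to the bottom of the module, below). Its behavior follows directly from the body shown: the prefix is applied only when every name is numeric. An illustrative check:

```python
import pandas as pd

from napistu.utils import update_pathological_names

# all-numeric names are prefixed so they read as labels rather than positions
update_pathological_names(pd.Series(["1", "2", "3"]), "reaction_")
# -> reaction_1, reaction_2, reaction_3

# if any name is non-numeric, the series is returned unchanged
update_pathological_names(pd.Series(["1", "TP53"]), "reaction_")
# -> 1, TP53
```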
@@ -1108,3 +1073,49 @@ def _add_nameness_score(df, name_var):
 
     df.loc[:, "nameness_score"] = df[name_var].apply(score_nameness)
     return df
+
+
+def _merge_and_log_overwrites(
+    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
+) -> pd.DataFrame:
+    """
+    Merge two DataFrames and log any column overwrites.
+
+    Parameters
+    ----------
+    left_df : pd.DataFrame
+        Left DataFrame for merge
+    right_df : pd.DataFrame
+        Right DataFrame for merge
+    merge_context : str
+        Description of the merge operation for logging
+    **merge_kwargs : dict
+        Additional keyword arguments passed to pd.merge
+
+    Returns
+    -------
+    pd.DataFrame
+        Merged DataFrame with overwritten columns removed
+    """
+    # Track original columns
+    original_cols = left_df.columns.tolist()
+
+    # Ensure we're using the correct suffixes
+    merge_kwargs["suffixes"] = ("_old", "")
+
+    # Perform merge
+    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
+
+    # Check for and log any overwritten columns
+    new_cols = merged_df.columns.tolist()
+    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
+    if overwritten_cols:
+        logger.warning(
+            f"The following columns were overwritten during {merge_context} merge and their original values "
+            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
+        )
+        # Drop the old columns
+        cols_to_drop = [col + "_old" for col in overwritten_cols]
+        merged_df = merged_df.drop(columns=cols_to_drop)
+
+    return merged_df
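The relocated `_merge_and_log_overwrites` forces `suffixes=("_old", "")` so right-hand columns win any collision, then logs and drops the shadowed left-hand copies. A small usage sketch (it is a private helper, so the import is for illustration only):

```python
import pandas as pd

from napistu.utils import _merge_and_log_overwrites

left = pd.DataFrame({"s_id": ["S1"], "label": ["old"]})
right = pd.DataFrame({"s_id": ["S1"], "label": ["new"]})

# "label" collides: a warning is logged and the "label_old" copy is dropped
merged = _merge_and_log_overwrites(left, right, "example", on="s_id")
print(merged)  # the surviving label column holds "new"
```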