napistu 0.1.0__py3-none-any.whl → 0.2.4.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. napistu/__init__.py +1 -1
  2. napistu/consensus.py +1010 -513
  3. napistu/constants.py +24 -0
  4. napistu/gcs/constants.py +2 -2
  5. napistu/gcs/downloads.py +57 -25
  6. napistu/gcs/utils.py +21 -0
  7. napistu/identifiers.py +105 -6
  8. napistu/ingestion/constants.py +0 -1
  9. napistu/ingestion/obo.py +24 -8
  10. napistu/ingestion/psi_mi.py +20 -5
  11. napistu/ingestion/reactome.py +8 -32
  12. napistu/mcp/__init__.py +69 -0
  13. napistu/mcp/__main__.py +180 -0
  14. napistu/mcp/codebase.py +182 -0
  15. napistu/mcp/codebase_utils.py +298 -0
  16. napistu/mcp/constants.py +72 -0
  17. napistu/mcp/documentation.py +166 -0
  18. napistu/mcp/documentation_utils.py +235 -0
  19. napistu/mcp/execution.py +382 -0
  20. napistu/mcp/profiles.py +73 -0
  21. napistu/mcp/server.py +86 -0
  22. napistu/mcp/tutorials.py +124 -0
  23. napistu/mcp/tutorials_utils.py +230 -0
  24. napistu/mcp/utils.py +47 -0
  25. napistu/mechanism_matching.py +782 -26
  26. napistu/modify/constants.py +41 -0
  27. napistu/modify/curation.py +4 -1
  28. napistu/modify/gaps.py +243 -156
  29. napistu/modify/pathwayannot.py +26 -8
  30. napistu/network/neighborhoods.py +16 -7
  31. napistu/network/net_create.py +209 -54
  32. napistu/network/net_propagation.py +118 -0
  33. napistu/network/net_utils.py +1 -32
  34. napistu/rpy2/netcontextr.py +10 -7
  35. napistu/rpy2/rids.py +7 -5
  36. napistu/sbml_dfs_core.py +46 -29
  37. napistu/sbml_dfs_utils.py +37 -1
  38. napistu/source.py +8 -2
  39. napistu/utils.py +67 -8
  40. napistu-0.2.4.dev2.dist-info/METADATA +84 -0
  41. napistu-0.2.4.dev2.dist-info/RECORD +95 -0
  42. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/WHEEL +1 -1
  43. tests/conftest.py +11 -5
  44. tests/test_consensus.py +4 -1
  45. tests/test_gaps.py +127 -0
  46. tests/test_gcs.py +3 -2
  47. tests/test_igraph.py +14 -0
  48. tests/test_mcp_documentation_utils.py +13 -0
  49. tests/test_mechanism_matching.py +658 -0
  50. tests/test_net_propagation.py +89 -0
  51. tests/test_net_utils.py +83 -0
  52. tests/test_sbml.py +2 -0
  53. tests/{test_sbml_dfs_create.py → test_sbml_dfs_core.py} +68 -4
  54. tests/test_utils.py +81 -0
  55. napistu-0.1.0.dist-info/METADATA +0 -56
  56. napistu-0.1.0.dist-info/RECORD +0 -77
  57. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/entry_points.txt +0 -0
  58. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/licenses/LICENSE +0 -0
  59. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/top_level.txt +0 -0
napistu/consensus.py CHANGED
@@ -31,111 +31,43 @@ def construct_consensus_model(
     dogmatic: bool = True,
 ) -> sbml_dfs_core.SBML_dfs:
     """
-    Construct Consensus Model
+    Construct a Consensus Model by merging shared entities across pathway models.
 
-    Turn a dictionary of pathway models into a single consensus model by merging shared entities.
+    This function takes a dictionary of pathway models and merges shared entities (compartments, species, reactions, etc.)
+    into a single consensus model, using a set of rules for entity identity and merging.
 
-    Parameters:
-    ----------
-    sbml_dfs_dict: dict{cpr.SBML_dfs}
-        A dictionary of SBML_dfs from different models
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    dogmatic: bool
-        If True then try to preserve genes, transcript, and proteins as separate species. If False
-        then try to merge them.
-
-    Returns:
+    Parameters
     ----------
-    A cpr.SBML_dfs object containing the consensus model
-
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    pw_index : indices.PWIndex
+        An index of all tables being aggregated, used for cross-referencing entities.
+    dogmatic : bool, default=True
+        If True, preserve genes, transcripts, and proteins as separate species. If False, merge them when possible.
+
+    Returns
+    -------
+    sbml_dfs_core.SBML_dfs
+        A consensus SBML_dfs object containing the merged model.
     """
-
+    # Validate inputs
     logger.info("Reporting possible issues in component models")
     _check_sbml_dfs_dict(sbml_dfs_dict)
     assert isinstance(pw_index, indices.PWIndex)
-    # select valid BQB attributes based on dogmatic flag
-    defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)
-
-    logger.info("Defining compartments based on unique ids")
-    comp_consensus_entities, comp_lookup_table = construct_meta_entities_identifiers(
-        sbml_dfs_dict=sbml_dfs_dict, pw_index=pw_index, table="compartments"
-    )
-
-    logger.info("Defining species based on unique ids")
-    spec_consensus_entities, spec_lookup_table = construct_meta_entities_identifiers(
-        sbml_dfs_dict=sbml_dfs_dict,
-        pw_index=pw_index,
-        table=SBML_DFS.SPECIES,
-        defining_biological_qualifiers=defining_biological_qualifiers,
-    )
-
-    logger.info(
-        "Defining compartmentalized species based on unique species x compartments"
-    )
-    compspec_consensus_instances, compspec_lookup_table = construct_meta_entities_fk(
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.COMPARTMENTALIZED_SPECIES,
-        fk_lookup_tables={
-            SBML_DFS.C_ID: comp_lookup_table,
-            SBML_DFS.S_ID: spec_lookup_table,
-        },
-    )
-
-    logger.info(
-        "Define reactions based on membership of identical compartmentalized species"
-    )
-    rxn_consensus_species, rxn_lookup_table = construct_meta_entities_members(
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.REACTIONS,
-        defined_by=SBML_DFS.REACTION_SPECIES,
-        defined_lookup_tables={SBML_DFS.SC_ID: compspec_lookup_table},
-        defining_attrs=[SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
-    )
 
-    logger.info("Annotating reversibility based on merged reactions")
-    rxn_consensus_species = _resolve_reversibility(
-        sbml_dfs_dict, rxn_consensus_species, rxn_lookup_table
-    )
+    # Select valid BQB attributes based on dogmatic flag
+    defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)
 
-    # define reaction species with species
-    logger.info("Define reaction species based on reactions")
-    rxnspec_consensus_instances, rxnspec_lookup_table = construct_meta_entities_fk(
-        sbml_dfs_dict,
-        pw_index,
-        table=SBML_DFS.REACTION_SPECIES,
-        fk_lookup_tables={
-            SBML_DFS.R_ID: rxn_lookup_table,
-            SBML_DFS.SC_ID: compspec_lookup_table,
-        },
-        # retain species with different roles
-        extra_defining_attrs=[SBML_DFS.SBO_TERM],
+    # Step 1: Create consensus entities for all primary tables
+    consensus_entities, lookup_tables = _create_consensus_entities(
+        sbml_dfs_dict, pw_index, defining_biological_qualifiers
     )
 
-    sbml_tbl_dict = {
-        SBML_DFS.COMPARTMENTS: comp_consensus_entities,
-        SBML_DFS.SPECIES: spec_consensus_entities,
-        SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_consensus_instances,
-        SBML_DFS.REACTIONS: rxn_consensus_species,
-        SBML_DFS.REACTION_SPECIES: rxnspec_consensus_instances,
-    }
-
-    sbml_dfs = sbml_dfs_core.SBML_dfs(sbml_tbl_dict)  # type: ignore
+    # Step 2: Create the consensus SBML_dfs object
+    sbml_dfs = sbml_dfs_core.SBML_dfs(consensus_entities)  # type: ignore
 
-    # add species and reactions data from component models
-    consensus_species_data = merge_entity_data(
-        sbml_dfs_dict, lookup_table=spec_lookup_table, table=SBML_DFS.SPECIES
-    )
-    for k in consensus_species_data.keys():
-        sbml_dfs.add_species_data(k, consensus_species_data[k])
-
-    consensus_reactions_data = merge_entity_data(
-        sbml_dfs_dict, lookup_table=rxn_lookup_table, table=SBML_DFS.REACTIONS
-    )
-    for k in consensus_reactions_data.keys():
-        sbml_dfs.add_reactions_data(k, consensus_reactions_data[k])
+    # Step 3: Add entity data from component models
+    sbml_dfs = _add_entity_data(sbml_dfs, sbml_dfs_dict, lookup_tables)
 
     return sbml_dfs
 
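For orientation, the refactored pipeline can be driven end to end as in the minimal sketch below. The index file path and the `pw_index.index` attribute are illustrative assumptions for this example, not APIs confirmed by this diff:

    from napistu import consensus, indices

    # hypothetical pathway index file listing the source models
    pw_index = indices.PWIndex("data/pw_index.tsv")

    # parse every model referenced by the index into an SBML_dfs object
    sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index.index)

    # merge shared compartments, species, and reactions into one model
    consensus_model = consensus.construct_consensus_model(
        sbml_dfs_dict, pw_index, dogmatic=True
    )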
@@ -144,18 +76,22 @@ def construct_sbml_dfs_dict(
     pw_index: pd.DataFrame, strict: bool = True
 ) -> dict[str, sbml_dfs_core.SBML_dfs]:
     """
-    Construct SBML DFs Dict
-
-    Convert all models in the pathway index into SBML_dfs and add them to a dict.
-
-    Parameters:
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    strict (bool): if set to `false` errorenous files are skipped with warning. Default: True
+    Construct a dictionary of SBML_dfs objects from a pathway index.
 
-    Returns:
-    dict(sbml_dfs_core.SBML_dfs)
+    This function converts all models in the pathway index into SBML_dfs objects and adds them to a dictionary.
+    Optionally, it can skip erroneous files with a warning instead of raising an error.
 
+    Parameters
+    ----------
+    pw_index : pd.DataFrame
+        An index of all tables being aggregated, containing model metadata and file paths.
+    strict : bool, default=True
+        If True, raise an error on any file that cannot be loaded. If False, skip erroneous files with a warning.
+
+    Returns
+    -------
+    dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary mapping model names to SBML_dfs objects.
     """
 
     sbml_dfs_dict = dict()
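Per the new docstring, the only behavioral knob here is `strict`; a minimal sketch of the non-strict mode, reusing the objects from the previous example:

    # with strict=False, files that fail to parse are skipped with a logged
    # warning rather than aborting the whole aggregation
    sbml_dfs_dict = consensus.construct_sbml_dfs_dict(pw_index.index, strict=False)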
@@ -182,18 +118,22 @@ def unnest_SBML_df(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], table: str
 ) -> pd.DataFrame:
     """
-    Unnest SBML_dfs
+    Unnest and concatenate a specific table from multiple SBML_dfs models.
 
-    Merge corresponding tables from a set of models
-
-    sbml_dfs_dict: dict{cpr.SBML_dfs}
-        A dictionary of SBML_dfs from different models
-    table: str
-        A table to aggregate (e.g., species, reactions, compartments)
-
-    Returns:
-    pd.Dataframe, a table with a multindex of model and an entity_id
+    This function merges corresponding tables from a set of models into a single DataFrame,
+    adding the model name as an index level.
 
+    Parameters
+    ----------
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    table : str
+        The name of the table to aggregate (e.g., 'species', 'reactions', 'compartments').
+
+    Returns
+    -------
+    pd.DataFrame
+        A concatenated table with a MultiIndex of model and entity ID.
     """
 
     # check that all sbml_dfs have the same schema
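The "MultiIndex of model and entity ID" that unnest_SBML_df returns corresponds to the standard pandas concat-with-keys pattern; a self-contained sketch in plain pandas (illustrative, not the package's internal code):

    import pandas as pd

    species_a = pd.DataFrame({"s_name": ["ATP"]}, index=pd.Index(["S1"], name="s_id"))
    species_b = pd.DataFrame({"s_name": ["ADP"]}, index=pd.Index(["S1"], name="s_id"))

    # concatenating with keys adds the model name as the outer index level
    combined = pd.concat({"model_a": species_a, "model_b": species_b}, names=["model"])
    print(combined.index.names)  # ['model', 's_id']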
@@ -222,31 +162,30 @@ def construct_meta_entities_identifiers(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.DataFrame, pd.Series]:
     """
-    Construct Meta Entities Defined by Identifiers
-
-    Aggregating across one entity type for a set of pathway models merge entities which share identifiers
+    Construct meta-entities by merging entities across models that share identifiers.
 
-    Parameters:
-    ----------
-    sbml_df_dict (dict{"model": cpr.SBML_dfs}):
-        A dictionary of cpr.SBML_dfs
-    pw_index (indices.PWIndex):
-        An index of all tables being aggregated
-    table (str):
-        A table/entity set from the sbml_dfs to work-with
-    fk_lookup_tables (dict):
-        Dictionary containing lookup tables for all foreign keys used by the table
-    defining_biological_qualifiers (list[str]):
-        BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
-        permissive settings could merge homologs, different forms of the same gene.
+    Aggregates a single entity type from a set of pathway models and merges entities that share identifiers
+    (as defined by the provided biological qualifiers).
 
-    Returns:
+    Parameters
     ----------
-    new_id_table: pd.DataFrame
-        Matching the schema of one of the tables within sbml_df_dict
-    lookup_table: pd.Series
-        Matches the index of the aggregated entities to new_ids
-
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        A dictionary of SBML_dfs objects from different models, keyed by model name.
+    pw_index : indices.PWIndex
+        An index of all tables being aggregated.
+    table : str
+        The name of the table/entity set to aggregate (e.g., 'species', 'compartments').
+    fk_lookup_tables : dict, optional
+        Dictionary containing lookup tables for all foreign keys used by the table (default: empty dict).
+    defining_biological_qualifiers : list[str], optional
+        List of BQB codes which define distinct entities. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    new_id_table : pd.DataFrame
+        Table matching the schema of one of the input models, with merged entities.
+    lookup_table : pd.Series
+        Series mapping the index of the aggregated entities to new consensus IDs.
     """
 
     # combine sbml_dfs by adding model to the index and concatinating all dfs
@@ -281,96 +220,58 @@ def reduce_to_consensus_ids(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.DataFrame, pd.Series]:
     """
-    Reduce to Consensus
-
-    Reduce a table of entities to unique entries based on identifiers.
+    Reduce a table of entities to unique entries based on consensus identifiers.
 
-    Parameters:
-    ----------
-    sbml_df: pd.DataFrame
-        One type of entity from sbml_dfs_dict expanded to include
-        model its index, as produced by unnest_SBML_df(sbml_dfs_dict)
-    table_schema: dict
-        Schema for the table sbml_df
-    pw_index: indices.PWIndex
-        An index of all tables being aggregated
-    defining_biological_qualifiers: list(str)
-        A list of biological qualifier types which define distinct entities
+    This function clusters entities that share identifiers (as defined by the provided biological qualifiers)
+    and produces a new table of unique entities, along with a lookup table mapping original entities to consensus IDs.
 
-    Returns:
+    Parameters
     ----------
-    new_id_table: pd.DataFrame
-        Matching the schema of one of the tables within sbml_df_dict
-    lookup_table: pd.Series
-        Matches the index of the aggregated entities to new_ids
+    sbml_df : pd.DataFrame
+        Table of entities from multiple models, with model in the index (as produced by unnest_SBML_df).
+    table_schema : dict
+        Schema for the table being reduced.
+    pw_index : indices.PWIndex, optional
+        An index of all tables being aggregated (default: None).
+    defining_biological_qualifiers : list[str], optional
+        List of biological qualifier types which define distinct entities. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    new_id_table : pd.DataFrame
+        Table matching the schema of one of the input models, with merged entities.
+    lookup_table : pd.Series
+        Series mapping the index of the aggregated entities to new consensus IDs.
     """
-
+    # Step 1: Build consensus identifiers to create clusters of equivalent entities
     indexed_cluster, cluster_consensus_identifiers = build_consensus_identifiers(
         sbml_df, table_schema, defining_biological_qualifiers
     )
 
-    # add cluster to reduce non-identifier attributes
+    # Step 2: Join cluster information to the original table
     agg_table_harmonized = sbml_df.join(indexed_cluster)
-    # create a new numbering schema off of cluster #s and id type
-    # print(agg_table_harmonized["cluster"])
-    # print(table_schema["pk"])
-
-    agg_table_harmonized["new_id"] = sbml_dfs_utils.id_formatter(
-        agg_table_harmonized["cluster"], table_schema["pk"]
-    )
 
-    lookup_table = agg_table_harmonized["new_id"]
+    # Step 3: Create lookup table for entity IDs
+    lookup_table = _create_entity_lookup_table(agg_table_harmonized, table_schema)
 
-    # add nameness_score as a measure of how-readable a possible name would be
-    # (this will help to select names which are more human readable after the merge)
+    # Step 4: Add nameness scores to help select representative names
     agg_table_harmonized = utils._add_nameness_score_wrapper(
         agg_table_harmonized, "label", table_schema
     )
 
-    # reduce to one row per new_id and set as the primary key of the source table
-    agg_table_reduced = (
-        agg_table_harmonized.reset_index(drop=True)
-        .sort_values(["nameness_score"])
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])
-        .first()
-        .drop("nameness_score", axis=1)
-    )
-
-    new_id_table = (
-        agg_table_reduced.drop(table_schema["id"], axis=1)
-        .merge(cluster_consensus_identifiers, left_on="cluster", right_index=True)
-        .drop("cluster", axis=1)
+    # Step 5: Prepare the consensus table with one row per unique entity
+    new_id_table = _prepare_consensus_table(
+        agg_table_harmonized, table_schema, cluster_consensus_identifiers
     )
 
+    # Step 6: Add source information if required
     if "source" in table_schema.keys():
-        if type(pw_index) is not indices.PWIndex:
-            raise ValueError(
-                f"pw_index must be provided as a indices.PWIndex if there is a source but was type {type(pw_index)}"
-            )
-
-        # track the model(s) that each entity came from
-        new_sources = create_consensus_sources(
-            agg_table_harmonized, lookup_table, table_schema, pw_index
-        )
-        assert isinstance(new_sources, pd.Series)
-
-        new_id_table = new_id_table.drop(
-            table_schema[SOURCE_SPEC.SOURCE], axis=1
-        ).merge(new_sources, left_index=True, right_index=True)
-
-    # check that the index name and variables match the source
-    if set(sbml_df.index.names).difference({SOURCE_SPEC.MODEL}) != set(
-        new_id_table.index.names
-    ):
-        raise ValueError(
-            "The newly constructed id table's index does not match the inputs"
+        new_id_table = _add_consensus_sources(
+            new_id_table, agg_table_harmonized, lookup_table, table_schema, pw_index
         )
 
-    if set(sbml_df) != set(new_id_table.columns):
-        raise ValueError(
-            "The newly constructed id table's variables do not match the inputs"
-        )
+    # Step 7: Validate the resulting table
+    _validate_consensus_table(new_id_table, sbml_df)
 
     return new_id_table, lookup_table
 
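The lookup table built in Steps 2-3 simply gives every member of a cluster the same formatted consensus ID. A toy illustration in plain pandas (the zero-padded formatter below is a hypothetical stand-in for sbml_dfs_utils.id_formatter, which is not reproduced here):

    import pandas as pd

    clustered = pd.DataFrame(
        {"cluster": [0, 0, 1]},
        index=pd.MultiIndex.from_tuples(
            [("model_a", "S1"), ("model_b", "S1"), ("model_b", "S2")],
            names=["model", "s_id"],
        ),
    )

    # entities sharing a cluster receive the same new primary key
    clustered["new_id"] = ["S" + str(c).zfill(8) for c in clustered["cluster"]]
    lookup_table = clustered["new_id"]  # maps (model, old id) -> consensus id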
@@ -381,163 +282,85 @@ def build_consensus_identifiers(
     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
 ) -> tuple[pd.Series, pd.DataFrame]:
     """
-    Build Consensus Identifiers
-
-    Take a set of entities spanning multiple models and find all unique entities.
+    Build consensus identifiers by clustering entities that share biological identifiers.
 
-    Defining attributes provided in defining_biological_qualifiers will
-    be used for grouping; other identifiers will be added back at the end.
+    This function takes a set of entities spanning multiple models and finds all unique entities
+    by grouping them according to the provided biological qualifiers. It returns a mapping from
+    original entities to clusters and a DataFrame of consensus identifier objects for each cluster.
 
-    Parameters:
-    ----------
-    sbml_df: pd.DataFrame
-        One type of entity from sbml_dfs_dict expanded to include model its index,
-        as produced by unnest_SBML_df(sbml_dfs_dict)
-    table_schema: dict
-        Schema for the table sbml_df
-    defining_biological_qualifiers: [str]
-        A list of biological qualifier types which should be used for grouping
-
-    Returns:
+    Parameters
     ----------
-    indexed_cluster: pd.Series
-        Maps the index from sbml_df onto a set of clusters which define unique entities
-    cluster_consensus_identifiers_df: pd.DataFrame
-        Maps an index of clusters onto a consensus cpr.identifiers.Identifiers object
+    sbml_df : pd.DataFrame
+        Table of entities from multiple models, with model in the index (as produced by unnest_SBML_df).
+    table_schema : dict
+        Schema for the table being processed.
+    defining_biological_qualifiers : list[str], optional
+        List of biological qualifier types to use for grouping. Defaults to BQB_DEFINING_ATTRS.
+
+    Returns
+    -------
+    indexed_cluster : pd.Series
+        Series mapping the index from sbml_df onto a set of clusters which define unique entities.
+    cluster_consensus_identifiers_df : pd.DataFrame
+        DataFrame mapping clusters to consensus identifiers (Identifiers objects).
     """
-
-    # create a table which is one row per entry
+    # Step 1: Extract and validate identifiers
     meta_identifiers = sbml_dfs_utils.unnest_identifiers(sbml_df, table_schema["id"])
-    # check the identifiers for missing attributes
     _validate_meta_identifiers(meta_identifiers)
 
-    # remove some biological qualifier types types to avoid over-grouping
-
-    valid_identifiers = meta_identifiers.copy()
-    valid_identifiers = valid_identifiers[
-        meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
-    ]
-
-    # catch entries which no longer have any identifiers
-    # add a dummy identifier to these which will still uniquely tag them
-
-    filtered_entries = sbml_df.reset_index().merge(
-        valid_identifiers.reset_index(),
-        left_on=sbml_df.index.names,
-        right_on=sbml_df.index.names,
-        how="outer",
-    )[sbml_df.index.names + [IDENTIFIERS.IDENTIFIER]]
-    filtered_entries = filtered_entries[
-        filtered_entries[IDENTIFIERS.IDENTIFIER].isnull()
-    ]
-    if filtered_entries.shape[0] != 0:
-        logger.warning(
-            f"{filtered_entries.shape[0]} entries didn't possess identifiers and thus cannot be merged"
-        )
-
-    filtered_entries[SOURCE_SPEC.ENTRY] = 0
-    filtered_entries[IDENTIFIERS.ONTOLOGY] = "none"
-    filtered_entries[IDENTIFIERS.ONTOLOGY] = [
-        "dummy_value_" + str(val)
-        for val in random.sample(range(1, 100000000), filtered_entries.shape[0])
-    ]
-    filtered_entries[IDENTIFIERS.URL] = None
-    filtered_entries[IDENTIFIERS.BQB] = None
-
-    filtered_entries = filtered_entries.set_index(
-        sbml_df.index.names + [SOURCE_SPEC.ENTRY]
-    )
-
-    valid_identifiers = pd.concat([valid_identifiers, filtered_entries])
-
-    # combine multi-index into a single variable; combine ontology + identifiers as a single variable
-    valid_identifiers = utils.format_identifiers_as_edgelist(
-        valid_identifiers, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    # Step 2: Filter identifiers by biological qualifier type
+    valid_identifiers = _filter_identifiers_by_qualifier(
+        meta_identifiers, defining_biological_qualifiers
    )
 
-    # create a unique tag for a species from the original index
-    indexed_species_tags = (
-        valid_identifiers.reset_index()
-        .set_index(valid_identifiers.index.names, drop=False)[sbml_df.index.names]
-        .astype(str)
-        .apply("__".join, axis=1)
-    )
-    valid_identifiers.loc[:, "model_spec"] = indexed_species_tags
+    # Step 3: Handle entries that don't have identifiers
+    valid_identifiers = _handle_entries_without_identifiers(sbml_df, valid_identifiers)
 
-    # convert index-identifier edge list into a network
-    # doing this will allow any entities with matching ontologies to be
-    # added to the same cluster so that they can be merged
-    id_edgelist = pd.concat(
-        [
-            valid_identifiers[["ind", "id"]],
-            # add id-ind edges so that identifiers corresponding to the same entity are grouped
-            # these entries will be discarded when merging the results back in by "ind"
-            valid_identifiers[["model_spec", "id"]].rename(
-                columns={"model_spec": "ind"}
-            ),
-        ]
-    )
+    # Step 4: Prepare edgelist for clustering
+    id_edgelist = _prepare_identifier_edgelist(valid_identifiers, sbml_df)
 
-    # aggregate index entries which have overlapping identifiers
-    # using a greedy graph-based approach
+    # Step 5: Cluster entities based on shared identifiers
     ind_clusters = utils.find_weakly_connected_subgraphs(id_edgelist)
 
-    # add clusters to identifier entries
-    valid_identifiers = valid_identifiers.reset_index().merge(ind_clusters)
-
-    # all entries for the same (model, id) will have the same cluster so convert back to
-    # sbml_df index to facilitate join
-    indexed_cluster = valid_identifiers.groupby(sbml_df.index.names).first()["cluster"]
-
-    # combine equivalent entries into a single Identifiers object
-    # include identifiers which were filtered by bqb
-
-    all_cluster_identifiers = meta_identifiers.reset_index().merge(
-        indexed_cluster, left_on=sbml_df.index.names, right_index=True
+    # Step 6: Map entity indices to clusters
+    valid_identifiers_with_clusters = valid_identifiers.reset_index().merge(
+        ind_clusters
     )
+    indexed_cluster = valid_identifiers_with_clusters.groupby(
+        sbml_df.index.names
+    ).first()["cluster"]
 
-    cluster_consensus_identifiers = {
-        k: identifiers.Identifiers(
-            list(
-                v[
-                    [
-                        IDENTIFIERS.ONTOLOGY,
-                        IDENTIFIERS.IDENTIFIER,
-                        IDENTIFIERS.URL,
-                        IDENTIFIERS.BQB,
-                    ]
-                ]
-                .T.to_dict()
-                .values()
-            )
-        )
-        for k, v in all_cluster_identifiers.groupby("cluster")
-    }
-
-    # recover clusters which don't have any identifiers
-    catchup_clusters = {
-        c: identifiers.Identifiers(list())
-        for c in set(ind_clusters["cluster"].tolist()).difference(
-            cluster_consensus_identifiers
-        )
-    }
-    cluster_consensus_identifiers = {
-        **cluster_consensus_identifiers,
-        **catchup_clusters,
-    }
-
-    cluster_consensus_identifiers_df = pd.DataFrame(
-        cluster_consensus_identifiers, index=[table_schema["id"]]
-    ).T
-    cluster_consensus_identifiers_df.index.name = "cluster"
+    # Step 7: Create consensus identifiers for each cluster
+    cluster_consensus_identifiers_df = _create_cluster_identifiers(
+        meta_identifiers, indexed_cluster, sbml_df, ind_clusters, table_schema
+    )
 
     return indexed_cluster, cluster_consensus_identifiers_df
 
 
 def pre_consensus_ontology_check(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
-):
-    """Check for shared ontologies across source models."""
+) -> tuple[list, pd.DataFrame]:
+    """
+    Check for shared ontologies across source models for a given table.
+
+    For compartments, species, or reactions tables, this function returns the set of ontologies
+    shared among all SBML_dfs in the input dictionary, as well as a DataFrame summarizing ontologies per model.
+
+    Parameters
+    ----------
+    sbml_dfs_dict : dict[str, sbml_dfs_core.SBML_dfs]
+        Dictionary of SBML_dfs objects from different models, keyed by model name.
+    tablename : str
+        Name of the table to check (should be one of 'compartments', 'species', or 'reactions').
+
+    Returns
+    -------
+    shared_onto_list : list
+        List of ontologies shared by all models for the specified table.
+    sbml_dict_onto_df : pd.DataFrame
+        DataFrame summarizing ontologies present in each model for the specified table.
+    """
 
     # tablename: compartments/species/reactions tables with Identifiers
     # returns shared ontologies among sbml_dfs in sbml_dfs_dict for
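Steps 4-6 above amount to a connected-components computation over an entity-identifier edgelist: any two entities linked through a shared ontology/identifier pair fall into one cluster. A minimal sketch using networkx as an illustrative substitute for utils.find_weakly_connected_subgraphs (names and identifiers below are made up):

    import networkx as nx

    # edges link each tagged entity ("ind") to its ontology::identifier ("id")
    id_edgelist = [
        ("model_a__S1", "uniprot::P01112"),
        ("model_b__S9", "uniprot::P01112"),  # shares an identifier with model_a__S1
        ("model_b__S2", "chebi::15422"),
    ]

    g = nx.Graph()
    g.add_edges_from(id_edgelist)

    # entities sharing any identifier land in the same component (cluster)
    clusters = {
        node: i for i, comp in enumerate(nx.connected_components(g)) for node in comp
    }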
@@ -572,23 +395,23 @@ def pre_consensus_ontology_check(
     return shared_onto_list, sbml_dict_onto_df
 
 
-def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
-    """Flag cases where meta identifers are totally missing or BQB codes are not included"""
+def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
+    """
+    Check and return the set of ontologies shared by different sources in a consensus model's species table.
 
-    if meta_identifiers.shape[0] == 0:
-        raise ValueError(
-            '"meta_identifiers" was empty; some identifiers should be present'
-        )
-
-    n_null = sum(meta_identifiers["bqb"].isnull())
-    if n_null > 0:
-        msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
-        logger.warn(msg)
-
-    return None
+    This function examines the species table in a consensus SBML_dfs object, determines the ontologies
+    present for each source model, and returns the intersection of ontologies shared by all sources.
 
+    Parameters
+    ----------
+    sbml_dfs : sbml_dfs_core.SBML_dfs
+        The consensus SBML_dfs object containing merged species from multiple models.
 
-def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
+    Returns
+    -------
+    set[str]
+        Set of ontology terms shared by all sources in the consensus model's species table.
+    """
     # Checking the ontology in "species" shared by different sources in a consensus model
     # returns a set of shared ontologies by different sources
 
@@ -636,27 +459,6 @@ def post_consensus_species_ontology_check(sbml_dfs: sbml_dfs_core.SBML_dfs) -> set[str]:
     return shared_onto_set
 
 
-def _update_foreign_keys(
-    agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
-) -> pd.DataFrame:
-    """Update one or more foreign keys based on old-to-new foreign key lookup table(s)."""
-
-    for fk in table_schema["fk"]:
-        updated_fks = (
-            agg_tbl[fk]
-            .reset_index()
-            .merge(
-                fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
-            )
-            .drop(fk, axis=1)
-            .rename(columns={"new_id": fk})
-            .set_index(["model", table_schema["pk"]])
-        )
-        agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)
-
-    return agg_tbl
-
-
 def pre_consensus_compartment_check(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs], tablename: str
 ) -> tuple[list, dict]:
@@ -855,146 +657,57 @@ def construct_meta_entities_members(
         Matching the schema of one of the tables within sbml_df_dict
     lookup_table: pd.Series
         Matches the index of the aggregated entities to new_ids
-
     """
-
     logger.info(
         f"Merging {table} based on identical membership ({' + '.join(defining_attrs)})"
     )
 
-    # combine sbml_dfs by adding model to the index and concatinating all dfs
-    agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=defined_by)
-
-    # to debug and see names of species
-    # comp_species = unnest_SBML_df(sbml_dfs_dict, table="compartmentalized_species")
-    # agg_tbl = agg_tbl.merge(comp_species, left_on = ["model", "sc_id"], right_index = True )
-
-    # since all sbml_dfs have the same schema pull out one schema for reference
+    # Step 1: Get schemas for both tables
     table_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[table]
     defined_by_schema = sbml_dfs_dict[list(sbml_dfs_dict.keys())[0]].schema[defined_by]
 
-    # update ids using previously created lookup tables
-    for k in defined_lookup_tables.keys():
-        agg_tbl = (
-            agg_tbl.merge(
-                defined_lookup_tables[k],
-                left_on=[SOURCE_SPEC.MODEL, k],
-                right_index=True,
-            )
-            .drop(k, axis=1)
-            .rename(columns={"new_id": k})
-        )
-
-    # create a set of species x compartment instances for each reaction
-    defining_fk = set(defined_by_schema["fk"]).difference({table_schema["pk"]})
-
-    if (
-        len(defining_fk) != 1
-        or len(defining_fk.intersection(set(defined_by_schema["fk"]))) != 1
-    ):
-        raise ValueError(
-            f"A foreign key could not be found in {defined_by} which was a primary key in {table}"
-        )
-    else:
-        defining_fk = list(defining_fk)[0]
-
-    # define what it is to be a unique member based on a combination of defining_attrs
-    valid_defining_attrs = agg_tbl.columns.values.tolist()
-    invalid_defining_attrs = [
-        x for x in defining_attrs if x not in valid_defining_attrs
-    ]
-
-    if len(invalid_defining_attrs) != 0:
-        raise ValueError(
-            f"{', '.join(invalid_defining_attrs)} was not found; "
-            f"valid defining_attrs are {', '.join(valid_defining_attrs)}"
-        )
-
-    # create unique members
-    agg_tbl["member"] = agg_tbl[defining_attrs].astype(str).apply("__".join, axis=1)
-
-    # members are aggregated by reaction
-    membership_df = (
-        agg_tbl.reset_index()
-        .groupby(["model", table_schema["pk"]])
-        .agg(membership=("member", lambda x: (list(set(x)))))
+    # Step 2: Prepare the member table and validate its structure
+    agg_tbl, defining_fk = _prepare_member_table(
+        sbml_dfs_dict,
+        defined_by,
+        defined_lookup_tables,
+        table_schema,
+        defined_by_schema,
+        defining_attrs,
+        table,
     )
 
-    # check whether members are duplicated within a given group
-    # suggesting that distinct entities have been coerced into
-    # the same entity
-    for i in range(membership_df.shape[0]):
-        members = membership_df["membership"].iloc[i]
-        if len(members) != len(set(members)):
-            _ = agg_tbl.reset_index().merge(
-                membership_df.iloc[i : i + 1],
-                how="inner",
-                left_on=[SOURCE_SPEC.MODEL, table_schema["pk"]],
-                right_index=True,
-            )
-
-            raise ValueError(
-                "Members were duplicated suggesting overmerging in the source "
-            )
-
-    membership_df["member_string"] = [
-        _create_member_string(x) for x in membership_df["membership"]
-    ]
-
-    membership_lookup = membership_df.reset_index()
+    # Step 3: Create lookup table for entity membership
+    membership_lookup = _create_membership_lookup(agg_tbl, table_schema)
 
-    consensus_entities = membership_lookup.groupby("member_string").first()
-    consensus_entities["new_id"] = sbml_dfs_utils.id_formatter(
-        range(consensus_entities.shape[0]), table_schema["pk"]
+    # Step 4: Create consensus entities and lookup table
+    consensus_entities, lookup_table = _create_entity_consensus(
+        membership_lookup, table_schema
     )
 
-    lookup_table = membership_lookup.merge(
-        consensus_entities["new_id"], left_on="member_string", right_index=True
-    ).set_index([SOURCE_SPEC.MODEL, table_schema["pk"]])["new_id"]
-
-    # logging merges that occurred
+    # Step 5: Log merger information
     report_consensus_merges(
         lookup_table, table_schema, sbml_dfs_dict=sbml_dfs_dict, n_example_merges=5
     )
 
+    # Step 6: Get primary entity table and merge identifiers
     agg_primary_table = unnest_SBML_df(sbml_dfs_dict, table=table)
 
-    # add nameness_score as a measure of how-readable a possible name would be
-    # (this will help to select names which are more human readable after the merge)
-    agg_primary_table = utils._add_nameness_score_wrapper(
-        agg_primary_table, "label", table_schema
-    )
-
-    new_id_table = (
-        agg_primary_table.join(lookup_table)
-        .reset_index(drop=True)
-        .sort_values(["nameness_score"])
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])
-        .first()[table_schema["vars"]]
-    )
-
-    # merge identifiers
     logger.info(f"Merging {table} identifiers")
-    indexed_old_identifiers = (
-        agg_primary_table.join(lookup_table)
-        .reset_index(drop=True)
-        .rename(columns={"new_id": table_schema["pk"]})
-        .groupby(table_schema["pk"])[table_schema["id"]]
+    updated_identifiers = _merge_entity_identifiers(
+        agg_primary_table, lookup_table, table_schema
    )
 
-    # combine merged identifiers into single identifier objects indexed by new id
-    updated_identifiers = indexed_old_identifiers.agg(identifiers.merge_identifiers)
-
-    # add merged identifiers back to new_id table overwriting existing ids
-    new_id_table = new_id_table.drop(table_schema["id"], axis=1).merge(
-        updated_identifiers, left_index=True, right_index=True
+    # Step 7: Create consensus table with merged entities
+    new_id_table = _create_consensus_table(
+        agg_primary_table, lookup_table, updated_identifiers, table_schema
    )
 
+    # Step 8: Add source information if present
     if "source" in table_schema.keys():
         logger.info(f"Merging {table} sources")
 
-        # track the model(s) that each entity came from
+        # Track the model(s) that each entity came from
         new_sources = create_consensus_sources(
             agg_primary_table.merge(lookup_table, left_index=True, right_index=True),
             lookup_table,
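The "identical membership" criterion used here reduces each reaction to a canonical member string built from its defining attributes; reactions whose strings match are merged. A toy version of the idea (illustrative only; the real string construction lives in _create_member_string, which is not shown in this diff):

    import pandas as pd

    reaction_species = pd.DataFrame(
        {
            "r_id": ["R1", "R1", "R2", "R2"],
            "sc_id": ["SC1", "SC2", "SC1", "SC2"],
            "stoichiometry": [-1, 1, -1, 1],
        }
    )

    # each participant becomes "sc_id__stoichiometry"
    reaction_species["member"] = (
        reaction_species[["sc_id", "stoichiometry"]].astype(str).apply("__".join, axis=1)
    )

    # reactions with identical sorted participant sets collapse together
    member_strings = reaction_species.groupby("r_id")["member"].agg(
        lambda m: "_".join(sorted(m))
    )
    print(member_strings["R1"] == member_strings["R2"])  # True -> R1 merges with R2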
@@ -1190,6 +903,163 @@ def report_consensus_merges(
     return None
 
 
+def _create_entity_lookup_table(
+    agg_table_harmonized: pd.DataFrame, table_schema: dict
+) -> pd.Series:
+    """
+    Create a lookup table mapping original entity IDs to new consensus IDs.
+
+    Parameters:
+    ----------
+    agg_table_harmonized: pd.DataFrame
+        Table with cluster assignments for each entity
+    table_schema: dict
+        Schema for the table
+
+    Returns:
+    ----------
+    pd.Series
+        Lookup table mapping old entity IDs to new consensus IDs
+    """
+    # Create a new ID based on cluster number and entity type
+    agg_table_harmonized["new_id"] = sbml_dfs_utils.id_formatter(
+        agg_table_harmonized["cluster"], table_schema["pk"]
+    )
+
+    # Return the lookup series
+    return agg_table_harmonized["new_id"]
+
+
+def _prepare_consensus_table(
+    agg_table_harmonized: pd.DataFrame,
+    table_schema: dict,
+    cluster_consensus_identifiers: pd.DataFrame,
+) -> pd.DataFrame:
+    """
+    Prepare a consensus table with one row per unique entity.
+
+    Parameters:
+    ----------
+    agg_table_harmonized: pd.DataFrame
+        Table with nameness scores and cluster assignments
+    table_schema: dict
+        Schema for the table
+    cluster_consensus_identifiers: pd.DataFrame
+        Consensus identifiers for each cluster
+
+    Returns:
+    ----------
+    pd.DataFrame
+        New consensus table with merged entities
+    """
+    # Sort by nameness score and keep one row per new entity ID
+    agg_table_reduced = (
+        agg_table_harmonized.reset_index(drop=True)
+        .sort_values(["nameness_score"])
+        .rename(columns={"new_id": table_schema["pk"]})
+        .groupby(table_schema["pk"])
+        .first()
+        .drop("nameness_score", axis=1)
+    )
+
+    # Join in the consensus identifiers and drop the temporary cluster column
+    new_id_table = (
+        agg_table_reduced.drop(table_schema["id"], axis=1)
+        .merge(cluster_consensus_identifiers, left_on="cluster", right_index=True)
+        .drop("cluster", axis=1)
+    )
+
+    return new_id_table
+
+
+def _add_consensus_sources(
+    new_id_table: pd.DataFrame,
+    agg_table_harmonized: pd.DataFrame,
+    lookup_table: pd.Series,
+    table_schema: dict,
+    pw_index: indices.PWIndex | None,
+) -> pd.DataFrame:
+    """
+    Add source information to the consensus table.
+
+    Parameters:
+    ----------
+    new_id_table: pd.DataFrame
+        Consensus table without source information
+    agg_table_harmonized: pd.DataFrame
+        Original table with cluster assignments
+    lookup_table: pd.Series
+        Maps old IDs to new consensus IDs
+    table_schema: dict
+        Schema for the table
+    pw_index: indices.PWIndex | None
+        An index of all tables being aggregated
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Consensus table with source information added
+    """
+    if type(pw_index) is not indices.PWIndex:
+        raise ValueError(
+            f"pw_index must be provided as a indices.PWIndex if there is a source but was type {type(pw_index)}"
+        )
+
+    # Track the model(s) that each entity came from
+    new_sources = create_consensus_sources(
+        agg_table_harmonized, lookup_table, table_schema, pw_index
+    )
+    assert isinstance(new_sources, pd.Series)
+
+    # Add the sources to the consensus table
+    updated_table = new_id_table.drop(table_schema[SOURCE_SPEC.SOURCE], axis=1).merge(
+        new_sources, left_index=True, right_index=True
+    )
+
+    return updated_table
+
+
+def _validate_consensus_table(
+    new_id_table: pd.DataFrame, sbml_df: pd.DataFrame
+) -> None:
+    """
+    Validate that the new consensus table has the same structure as the original.
+
+    Parameters:
+    ----------
+    new_id_table: pd.DataFrame
+        Newly created consensus table
+    sbml_df: pd.DataFrame
+        Original table from which consensus was built
+
+    Raises:
+    ------
+    ValueError
+        If index names or columns don't match
+    """
+    # Check that the index names match
+    if set(sbml_df.index.names).difference({SOURCE_SPEC.MODEL}) != set(
+        new_id_table.index.names
+    ):
+        raise ValueError(
+            f"The newly constructed id table's index does not match the inputs.\n"
+            f"Expected index names: {sbml_df.index.names}\n"
+            f"Actual index names: {new_id_table.index.names}"
+        )
+
+    # Check that the columns match
+    if set(sbml_df) != set(new_id_table.columns):
+        missing_in_new = set(sbml_df) - set(new_id_table.columns)
+        extra_in_new = set(new_id_table.columns) - set(sbml_df)
+        raise ValueError(
+            "The newly constructed id table's variables do not match the inputs.\n"
+            f"Expected columns: {list(sbml_df.columns)}\n"
+            f"Actual columns: {list(new_id_table.columns)}\n"
+            f"Missing in new: {missing_in_new}\n"
+            f"Extra in new: {extra_in_new}"
+        )
+
+
 def merge_entity_data(
     sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
     lookup_table: pd.Series,
@@ -1232,35 +1102,619 @@ def merge_entity_data(
1232
1102
  return entity_data
1233
1103
 
1234
1104
 
1235
- def _check_sbml_dfs_dict(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
1236
- """Check models in SBML_dfs for problems which can be reported up-front
1237
-
1238
- Args:
1239
- sbml_dfs_dict (dict(pd.DataFrame)): a dict of sbml_dfs models;
1240
- primarily used as an input for construct_consensus_model
1241
-
1242
- Returns:
1243
- None
1244
-
1105
+ def _create_consensus_entities(
1106
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
1107
+ pw_index: indices.PWIndex,
1108
+ defining_biological_qualifiers: list[str],
1109
+ ) -> tuple[dict, dict]:
1245
1110
  """
1111
+ Create consensus entities for all primary tables in the model.
1246
1112
 
1247
- for k, v in sbml_dfs_dict.items():
1248
- _check_sbml_dfs(sbml_dfs=v, model_label=k)
1249
- return None
1113
+ This helper function creates consensus compartments, species, compartmentalized species,
1114
+ reactions, and reaction species by finding shared entities across source models.
1250
1115
 
1116
+ Parameters:
1117
+ ----------
1118
+ sbml_dfs_dict: dict{cpr.SBML_dfs}
1119
+ A dictionary of SBML_dfs from different models
1120
+ pw_index: indices.PWIndex
1121
+ An index of all tables being aggregated
1122
+ defining_biological_qualifiers: list[str]
1123
+ Biological qualifier terms that define distinct entities
1251
1124
 
1252
- def _check_sbml_dfs(
1253
- sbml_dfs: sbml_dfs_core.SBML_dfs, model_label: str, N_examples: int | str = 5
1254
- ) -> None:
1255
- """Check SBML_dfs for identifiers which are associated with different entities before a merge."""
1125
+ Returns:
1126
+ ----------
1127
+ tuple:
1128
+ - dict of consensus entities tables
1129
+ - dict of lookup tables
1130
+ """
1131
+ # Step 1: Compartments
1132
+ logger.info("Defining compartments based on unique ids")
1133
+ comp_consensus_entities, comp_lookup_table = construct_meta_entities_identifiers(
1134
+ sbml_dfs_dict=sbml_dfs_dict, pw_index=pw_index, table="compartments"
1135
+ )
1256
1136
 
1257
- ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
1258
- defining_ids = ids[ids[IDENTIFIERS.BQB].isin(BQB_DEFINING_ATTRS)]
1137
+ # Step 2: Species
1138
+ logger.info("Defining species based on unique ids")
1139
+ spec_consensus_entities, spec_lookup_table = construct_meta_entities_identifiers(
1140
+ sbml_dfs_dict=sbml_dfs_dict,
1141
+ pw_index=pw_index,
1142
+ table=SBML_DFS.SPECIES,
1143
+ defining_biological_qualifiers=defining_biological_qualifiers,
1144
+ )
1259
1145
 
1260
- defining_identifier_counts = defining_ids.value_counts(
1261
- [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
1146
+ # Step 3: Compartmentalized species
1147
+ logger.info(
1148
+ "Defining compartmentalized species based on unique species x compartments"
1262
1149
  )
1263
- degenerate_defining_identities = (
1150
+ compspec_consensus_instances, compspec_lookup_table = construct_meta_entities_fk(
1151
+ sbml_dfs_dict,
1152
+ pw_index,
1153
+ table=SBML_DFS.COMPARTMENTALIZED_SPECIES,
1154
+ fk_lookup_tables={
1155
+ SBML_DFS.C_ID: comp_lookup_table,
1156
+ SBML_DFS.S_ID: spec_lookup_table,
1157
+ },
1158
+ )
1159
+
1160
+ # Step 4: Reactions
1161
+ logger.info(
1162
+ "Define reactions based on membership of identical compartmentalized species"
1163
+ )
1164
+ rxn_consensus_species, rxn_lookup_table = construct_meta_entities_members(
1165
+ sbml_dfs_dict,
1166
+ pw_index,
1167
+ table=SBML_DFS.REACTIONS,
1168
+ defined_by=SBML_DFS.REACTION_SPECIES,
1169
+ defined_lookup_tables={SBML_DFS.SC_ID: compspec_lookup_table},
1170
+ defining_attrs=[SBML_DFS.SC_ID, SBML_DFS.STOICHIOMETRY],
1171
+ )
1172
+
1173
+ logger.info("Annotating reversibility based on merged reactions")
1174
+ rxn_consensus_species = _resolve_reversibility(
1175
+ sbml_dfs_dict, rxn_consensus_species, rxn_lookup_table
1176
+ )
1177
+
1178
+ # Step 5: Reaction species
1179
+ logger.info("Define reaction species based on reactions")
1180
+ rxnspec_consensus_instances, rxnspec_lookup_table = construct_meta_entities_fk(
1181
+ sbml_dfs_dict,
1182
+ pw_index,
1183
+ table=SBML_DFS.REACTION_SPECIES,
1184
+ fk_lookup_tables={
1185
+ SBML_DFS.R_ID: rxn_lookup_table,
1186
+ SBML_DFS.SC_ID: compspec_lookup_table,
1187
+ },
1188
+ # retain species with different roles
1189
+ extra_defining_attrs=[SBML_DFS.SBO_TERM],
1190
+ )
1191
+
1192
+ consensus_entities = {
1193
+ SBML_DFS.COMPARTMENTS: comp_consensus_entities,
1194
+ SBML_DFS.SPECIES: spec_consensus_entities,
1195
+ SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_consensus_instances,
1196
+ SBML_DFS.REACTIONS: rxn_consensus_species,
1197
+ SBML_DFS.REACTION_SPECIES: rxnspec_consensus_instances,
1198
+ }
1199
+
1200
+ lookup_tables = {
1201
+ SBML_DFS.COMPARTMENTS: comp_lookup_table,
1202
+ SBML_DFS.SPECIES: spec_lookup_table,
1203
+ SBML_DFS.COMPARTMENTALIZED_SPECIES: compspec_lookup_table,
1204
+ SBML_DFS.REACTIONS: rxn_lookup_table,
1205
+ SBML_DFS.REACTION_SPECIES: rxnspec_lookup_table,
1206
+ }
1207
+
1208
+ return consensus_entities, lookup_tables
1209
+
1210
+
1211
+ def _add_entity_data(
1212
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
1213
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
1214
+ lookup_tables: dict,
1215
+ ) -> sbml_dfs_core.SBML_dfs:
1216
+ """
1217
+ Add entity data from component models to the consensus model.
1218
+
1219
+ Parameters:
1220
+ ----------
1221
+ sbml_dfs: sbml_dfs_core.SBML_dfs
1222
+ The consensus model being built
1223
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]
1224
+ A dictionary of SBML_dfs from different models
1225
+ lookup_tables: dict
1226
+ Dictionary of lookup tables for translating between old and new entity IDs
1227
+
1228
+ Returns:
1229
+ ----------
1230
+ sbml_dfs_core.SBML_dfs
1231
+ The updated consensus model
1232
+ """
1233
+ # Add species data
1234
+ consensus_species_data = merge_entity_data(
1235
+ sbml_dfs_dict,
1236
+ lookup_table=lookup_tables[SBML_DFS.SPECIES],
1237
+ table=SBML_DFS.SPECIES,
1238
+ )
1239
+ for k in consensus_species_data.keys():
1240
+ sbml_dfs.add_species_data(k, consensus_species_data[k])
1241
+
1242
+ # Add reactions data
1243
+ consensus_reactions_data = merge_entity_data(
1244
+ sbml_dfs_dict,
1245
+ lookup_table=lookup_tables[SBML_DFS.REACTIONS],
1246
+ table=SBML_DFS.REACTIONS,
1247
+ )
1248
+ for k in consensus_reactions_data.keys():
1249
+ sbml_dfs.add_reactions_data(k, consensus_reactions_data[k])
1250
+
1251
+ return sbml_dfs
1252
+
1253
+
1254
+ def _prepare_member_table(
1255
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs],
1256
+ defined_by: str,
1257
+ defined_lookup_tables: dict,
1258
+ table_schema: dict,
1259
+ defined_by_schema: dict,
1260
+ defining_attrs: list[str],
1261
+ table: str = SBML_DFS.REACTIONS,
1262
+ ) -> tuple[pd.DataFrame, str]:
1263
+ """
1264
+ Prepare a table of members and validate their structure.
1265
+
1266
+ Parameters:
1267
+ ----------
1268
+ sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]
1269
+ Dictionary of SBML_dfs from different models
1270
+ defined_by: str
1271
+ Name of the table whose entries define membership
1272
+ defined_lookup_tables: dict
1273
+ Lookup tables for updating IDs
1274
+ table_schema: dict
1275
+ Schema for the main table
1276
+ defined_by_schema: dict
1277
+ Schema for the defining table
1278
+ defining_attrs: list[str]
1279
+ Attributes that define a unique member
1280
+ table: str
1281
+ Name of the main table (default: REACTIONS)
1282
+
1283
+ Returns:
1284
+ ----------
1285
+ tuple:
1286
+ - Updated aggregated table with member strings
1287
+ - Name of the foreign key
1288
+ """
1289
+ # Combine models into a single table
1290
+ agg_tbl = unnest_SBML_df(sbml_dfs_dict, table=defined_by)
1291
+
1292
+ # Update IDs using previously created lookup tables
1293
+ for k in defined_lookup_tables.keys():
1294
+ agg_tbl = (
1295
+ agg_tbl.merge(
1296
+ defined_lookup_tables[k],
1297
+ left_on=[SOURCE_SPEC.MODEL, k],
1298
+ right_index=True,
1299
+ )
1300
+ .drop(k, axis=1)
1301
+ .rename(columns={"new_id": k})
1302
+ )
1303
+
1304
+ # Identify the foreign key
1305
+ defining_fk = set(defined_by_schema["fk"]).difference({table_schema["pk"]})
1306
+
1307
+ if (
1308
+ len(defining_fk) != 1
1309
+ or len(defining_fk.intersection(set(defined_by_schema["fk"]))) != 1
1310
+ ):
1311
+ raise ValueError(
1312
+ f"A foreign key could not be found in {defined_by} which was a primary key in {table}"
1313
+ )
1314
+ else:
1315
+ defining_fk = list(defining_fk)[0]
1316
+
1317
+ # Validate defining attributes
1318
+ valid_defining_attrs = agg_tbl.columns.values.tolist()
1319
+ invalid_defining_attrs = [
1320
+ x for x in defining_attrs if x not in valid_defining_attrs
1321
+ ]
1322
+
1323
+ if len(invalid_defining_attrs) != 0:
1324
+ raise ValueError(
1325
+ f"{', '.join(invalid_defining_attrs)} was not found; "
1326
+ f"valid defining_attrs are {', '.join(valid_defining_attrs)}"
1327
+ )
1328
+
1329
+ # Create unique member strings
1330
+ agg_tbl["member"] = agg_tbl[defining_attrs].astype(str).apply("__".join, axis=1)
1331
+
1332
+ return agg_tbl, defining_fk
1333
+
1334
+
1335
+ def _create_membership_lookup(
1336
+ agg_tbl: pd.DataFrame, table_schema: dict
1337
+ ) -> pd.DataFrame:
1338
+ """
1339
+ Create a lookup table for entity membership.
1340
+
1341
+ Parameters:
1342
+ ----------
1343
+ agg_tbl: pd.DataFrame
1344
+ Table with member information
1345
+ table_schema: dict
1346
+ Schema for the table
1347
+
1348
+ Returns:
1349
+ ----------
1350
+ pd.DataFrame
1351
+ Lookup table mapping entity IDs to member strings
1352
+ """
1353
+ # Group members by entity
1354
+ membership_df = (
1355
+ agg_tbl.reset_index()
1356
+ .groupby(["model", table_schema["pk"]])
1357
+ .agg(membership=("member", lambda x: (list(set(x)))))
1358
+ )
1359
+
1360
+ # Check for duplicated members within an entity
1361
+ for i in range(membership_df.shape[0]):
1362
+ members = membership_df["membership"].iloc[i]
1363
+ if len(members) != len(set(members)):
1364
+ raise ValueError(
1365
+ "Members were duplicated suggesting overmerging in the source"
1366
+ )
1367
+
1368
+ # Convert membership lists to strings for comparison
1369
+ membership_df["member_string"] = [
1370
+ _create_member_string(x) for x in membership_df["membership"]
1371
+ ]
1372
+
1373
+ return membership_df.reset_index()
1374
+
1375
+
1376
+ def _create_entity_consensus(
1377
+ membership_lookup: pd.DataFrame, table_schema: dict
1378
+ ) -> tuple[pd.DataFrame, pd.Series]:
1379
+ """
1380
+ Create consensus entities based on membership.
1381
+
1382
+ Parameters:
1383
+ ----------
1384
+ membership_lookup: pd.DataFrame
1385
+ Table mapping entities to their member strings
1386
+ table_schema: dict
1387
+ Schema for the table
1388
+
1389
+ Returns:
1390
+ ----------
1391
+ tuple:
1392
+ - Consensus entities DataFrame
1393
+ - Lookup table mapping old IDs to new IDs
1394
+ """
1395
+ # Group by member string to find entities with identical members
1396
+ consensus_entities = membership_lookup.groupby("member_string").first()
1397
+
1398
+ # Create new IDs for the consensus entities
1399
+ consensus_entities["new_id"] = sbml_dfs_utils.id_formatter(
1400
+ range(consensus_entities.shape[0]), table_schema["pk"]
1401
+ )
1402
+
1403
+ # Create lookup table mapping original entities to consensus entities
1404
+ lookup_table = membership_lookup.merge(
1405
+ consensus_entities["new_id"], left_on="member_string", right_index=True
1406
+ ).set_index([SOURCE_SPEC.MODEL, table_schema["pk"]])["new_id"]
1407
+
1408
+ return consensus_entities, lookup_table
1409
+
1410
+
1411
+ def _merge_entity_identifiers(
1412
+ agg_primary_table: pd.DataFrame, lookup_table: pd.Series, table_schema: dict
1413
+ ) -> pd.Series:
1414
+ """
1415
+ Merge identifiers from multiple entities.
1416
+
1417
+ Parameters:
1418
+ ----------
1419
+ agg_primary_table: pd.DataFrame
1420
+ Table of entities
1421
+ lookup_table: pd.Series
1422
+ Lookup table mapping old IDs to new IDs
1423
+ table_schema: dict
1424
+ Schema for the table
1425
+
1426
+ Returns:
1427
+ ----------
1428
+ pd.Series
1429
+ Series mapping new IDs to merged identifier objects
1430
+ """
1431
+ # Combine entities with the same consensus ID
1432
+ indexed_old_identifiers = (
1433
+ agg_primary_table.join(lookup_table)
1434
+ .reset_index(drop=True)
1435
+ .rename(columns={"new_id": table_schema["pk"]})
1436
+ .groupby(table_schema["pk"])[table_schema["id"]]
1437
+ )
1438
+
1439
+ # Merge identifier objects
1440
+ return indexed_old_identifiers.agg(identifiers.merge_identifiers)
1441
+
1442
+
1443
+def _create_consensus_table(
+    agg_primary_table: pd.DataFrame,
+    lookup_table: pd.Series,
+    updated_identifiers: pd.Series,
+    table_schema: dict,
+) -> pd.DataFrame:
+    """
+    Create a consensus table with merged entities.
+
+    Parameters:
+    ----------
+    agg_primary_table: pd.DataFrame
+        Table of entities
+    lookup_table: pd.Series
+        Lookup table mapping old IDs to new IDs
+    updated_identifiers: pd.Series
+        Series mapping new IDs to merged identifier objects
+    table_schema: dict
+        Schema for the table
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Consensus table with one row per unique entity
+    """
+    # Add nameness scores to help select representative names
+    agg_primary_table_scored = utils._add_nameness_score_wrapper(
+        agg_primary_table, "label", table_schema
+    )
+
+    # Create a table with one row per consensus entity
+    new_id_table = (
+        agg_primary_table_scored.join(lookup_table)
+        .reset_index(drop=True)
+        .sort_values(["nameness_score"])
+        .rename(columns={"new_id": table_schema["pk"]})
+        .groupby(table_schema["pk"])
+        .first()[table_schema["vars"]]
+    )
+
+    # Replace identifiers with merged versions
+    new_id_table = new_id_table.drop(table_schema["id"], axis=1).merge(
+        updated_identifiers, left_index=True, right_index=True
+    )
+
+    return new_id_table
+
+
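The sort-then-first idiom carries the weight here: after scoring, the most name-like label wins for each consensus entity. A toy version of just that idiom (the scoring itself lives in utils._add_nameness_score_wrapper, which is not shown in this diff; the scores below are hypothetical and lower is assumed to mean more name-like, consistent with the ascending sort):

import pandas as pd

labels = pd.DataFrame(
    {
        "s_id": ["S0001", "S0001"],
        "s_name": ["epidermal growth factor receptor", "EGFR"],
        "nameness_score": [0.7, 0.1],  # hypothetical; lower = more name-like
    }
)

best = labels.sort_values("nameness_score").groupby("s_id").first()
# best.loc["S0001", "s_name"] == "EGFR"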
+def _filter_identifiers_by_qualifier(
+    meta_identifiers: pd.DataFrame, defining_biological_qualifiers: list[str]
+) -> pd.DataFrame:
+    """
+    Filter identifiers to only include those with specific biological qualifiers.
+
+    Parameters:
+    ----------
+    meta_identifiers: pd.DataFrame
+        Table of identifiers
+    defining_biological_qualifiers: list[str]
+        List of biological qualifier types to keep
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Filtered identifiers
+    """
+    valid_identifiers = meta_identifiers.copy()
+    return valid_identifiers[
+        meta_identifiers[IDENTIFIERS.BQB].isin(defining_biological_qualifiers)
+    ]
+
+
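This filter embodies a central merging rule: only "defining" qualifiers (e.g. BQB_IS) establish identity, while contextual qualifiers (e.g. BQB_OCCURS_IN) must not trigger merges. A compact illustration with literal qualifier strings and invented column names (the real code reads both from constants):

import pandas as pd

ids = pd.DataFrame(
    {
        "identifier": ["P01112", "GO:0005886", "15422"],
        "bqb": ["BQB_IS", "BQB_OCCURS_IN", "BQB_IS"],
    }
)

defining = ids[ids["bqb"].isin(["BQB_IS"])]
# GO:0005886 (a location annotation) is dropped, so two species that merely
# share a compartment are never merged into one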
+def _handle_entries_without_identifiers(
+    sbml_df: pd.DataFrame, valid_identifiers: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Handle entities that don't have identifiers by adding dummy identifiers.
+
+    Parameters:
+    ----------
+    sbml_df: pd.DataFrame
+        Original table of entities
+    valid_identifiers: pd.DataFrame
+        Table of identifiers that passed filtering
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Valid identifiers with dummy entries added
+    """
+    # Find entries which no longer have any identifiers
+    filtered_entries = sbml_df.reset_index().merge(
+        valid_identifiers.reset_index(),
+        left_on=sbml_df.index.names,
+        right_on=sbml_df.index.names,
+        how="outer",
+    )[sbml_df.index.names + [IDENTIFIERS.IDENTIFIER]]
+
+    filtered_entries = filtered_entries[
+        filtered_entries[IDENTIFIERS.IDENTIFIER].isnull()
+    ]
+
+    if filtered_entries.shape[0] == 0:
+        return valid_identifiers
+
+    # Add dummy identifiers to these entries
+    logger.warning(
+        f"{filtered_entries.shape[0]} entries didn't possess identifiers and thus cannot be merged"
+    )
+
+    filtered_entries[SOURCE_SPEC.ENTRY] = 0
+    filtered_entries[IDENTIFIERS.ONTOLOGY] = "none"
+    filtered_entries[IDENTIFIERS.IDENTIFIER] = [
+        "dummy_value_" + str(val)
+        for val in random.sample(range(1, 100000000), filtered_entries.shape[0])
+    ]
+    filtered_entries[IDENTIFIERS.URL] = None
+    filtered_entries[IDENTIFIERS.BQB] = None
+
+    filtered_entries = filtered_entries.set_index(
+        sbml_df.index.names + [SOURCE_SPEC.ENTRY]
+    )
+
+    # Combine original valid identifiers with dummy identifiers
+    return pd.concat([valid_identifiers, filtered_entries])
+
+
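The dummy values matter: each identifier-less entity receives a unique throwaway identifier, so these entities survive the merge intact rather than all collapsing onto a shared null. The guarantee comes from sampling without replacement:

import random

random.seed(0)  # seeding only for a reproducible illustration
dummies = ["dummy_value_" + str(v) for v in random.sample(range(1, 100000000), 3)]
assert len(set(dummies)) == 3  # sampled without replacement: all distinct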
+def _prepare_identifier_edgelist(
+    valid_identifiers: pd.DataFrame, sbml_df: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Prepare an edgelist for clustering identifiers.
+
+    Parameters:
+    ----------
+    valid_identifiers: pd.DataFrame
+        Table of identifiers
+    sbml_df: pd.DataFrame
+        Original table of entities
+
+    Returns:
+    ----------
+    pd.DataFrame
+        Edgelist connecting entities to their identifiers
+    """
+    # Format identifiers as edgelist
+    formatted_identifiers = utils.format_identifiers_as_edgelist(
+        valid_identifiers, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    )
+
+    # Create a unique tag for each entity from the original index
+    indexed_species_tags = (
+        formatted_identifiers.reset_index()
+        .set_index(formatted_identifiers.index.names, drop=False)[sbml_df.index.names]
+        .astype(str)
+        .apply("__".join, axis=1)
+    )
+    formatted_identifiers.loc[:, "model_spec"] = indexed_species_tags
+
+    # Create edgelist that connects entities to identifiers
+    id_edgelist = pd.concat(
+        [
+            formatted_identifiers[["ind", "id"]],
+            # Add edges connecting model-specific instances to their identifiers
+            formatted_identifiers[["model_spec", "id"]].rename(
+                columns={"model_spec": "ind"}
+            ),
+        ]
+    )
+
+    return id_edgelist
+
+
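This hunk only builds the edgelist; the clustering happens downstream and is not part of this diff. As a sketch of how such an entity-identifier edgelist resolves into merge groups, connected components over the bipartite graph do the job (python-igraph shown here as an assumed stand-in for napistu's actual clustering call; node labels are invented):

import igraph as ig

edges = [
    ("model1__S1", "uniprot__P01112"),
    ("model2__S9", "uniprot__P01112"),  # the shared identifier bridges both models
    ("model1__S2", "chebi__15422"),
]

g = ig.Graph.TupleList(edges, directed=False)
components = g.connected_components()
# model1__S1 and model2__S9 land in one component -> one consensus species;
# model1__S2 stays alone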
+def _create_cluster_identifiers(
+    meta_identifiers: pd.DataFrame,
+    indexed_cluster: pd.Series,
+    sbml_df: pd.DataFrame,
+    ind_clusters: pd.DataFrame,
+    table_schema: dict,
+) -> pd.DataFrame:
+    """
+    Create identifier objects for each cluster.
+
+    Parameters
+    ----------
+    meta_identifiers : pd.DataFrame
+        All identifiers (including those filtered out by BQB)
+    indexed_cluster : pd.Series
+        Maps entity indices to cluster IDs
+    sbml_df : pd.DataFrame
+        Original table of entities
+    ind_clusters : pd.DataFrame
+        Cluster assignments from graph algorithm
+    table_schema : dict
+        Schema for the table, used to determine the correct identifier column name
+
+    Returns
+    -------
+    pd.DataFrame
+        Table mapping clusters to their consensus identifiers, with the identifier column named according to the schema
+    """
+    # Combine all identifiers with cluster assignments
+    all_cluster_identifiers = meta_identifiers.reset_index().merge(
+        indexed_cluster, left_on=sbml_df.index.names, right_index=True
+    )
+
+    # Create an Identifiers object for each cluster
+    cluster_consensus_identifiers = {
+        k: identifiers.Identifiers(
+            list(
+                v[
+                    [
+                        IDENTIFIERS.ONTOLOGY,
+                        IDENTIFIERS.IDENTIFIER,
+                        IDENTIFIERS.URL,
+                        IDENTIFIERS.BQB,
+                    ]
+                ]
+                .T.to_dict()
+                .values()
+            )
+        )
+        for k, v in all_cluster_identifiers.groupby("cluster")
+    }
+
+    # Handle clusters that don't have any identifiers
+    catchup_clusters = {
+        c: identifiers.Identifiers(list())
+        for c in set(ind_clusters["cluster"].tolist()).difference(
+            cluster_consensus_identifiers
+        )
+    }
+    cluster_consensus_identifiers = {
+        **cluster_consensus_identifiers,
+        **catchup_clusters,
+    }
+
+    # Convert to DataFrame with correct column name
+    id_col = table_schema["id"]
+    cluster_consensus_identifiers_df = pd.DataFrame(
+        cluster_consensus_identifiers, index=[id_col]
+    ).T
+    cluster_consensus_identifiers_df.index.name = "cluster"
+    return cluster_consensus_identifiers_df
+
+
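One subtlety worth spelling out: clusters formed purely from dummy-identifier entities contribute no rows to the groupby, so they would vanish without the catch-up step. The set-difference-then-merge pattern in isolation (plain lists stand in for Identifiers objects):

clusters_with_ids = {0: ["uniprot:P01112"], 2: ["chebi:15422"]}
all_clusters = {0, 1, 2}

# cluster 1 had no identifiers; give it an explicit empty entry
catchup = {c: [] for c in all_clusters.difference(clusters_with_ids)}
merged = {**clusters_with_ids, **catchup}
assert set(merged) == all_clusters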
+def _check_sbml_dfs_dict(sbml_dfs_dict: dict[str, sbml_dfs_core.SBML_dfs]) -> None:
+    """Check models in SBML_dfs for problems which can be reported up-front
+
+    Args:
+        sbml_dfs_dict (dict[str, sbml_dfs_core.SBML_dfs]): a dict of sbml_dfs models;
+            primarily used as an input for construct_consensus_model
+
+    Returns:
+        None
+
+    """
+
+    for k, v in sbml_dfs_dict.items():
+        _check_sbml_dfs(sbml_dfs=v, model_label=k)
+    return None
+
+
+def _check_sbml_dfs(
+    sbml_dfs: sbml_dfs_core.SBML_dfs, model_label: str, N_examples: int | str = 5
+) -> None:
+    """Check SBML_dfs for identifiers which are associated with different entities before a merge."""
+
+    ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+    defining_ids = ids[ids[IDENTIFIERS.BQB].isin(BQB_DEFINING_ATTRS)]
+
+    defining_identifier_counts = defining_ids.value_counts(
+        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    )
+    degenerate_defining_identities = (
         defining_identifier_counts[defining_identifier_counts > 1]
         .rename("N")
         .reset_index()
@@ -1314,9 +1768,46 @@ def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
     return None
 
 
+def _validate_meta_identifiers(meta_identifiers: pd.DataFrame) -> None:
+    """Flag cases where meta identifiers are totally missing or BQB codes are not included"""
+
+    if meta_identifiers.shape[0] == 0:
+        raise ValueError(
+            '"meta_identifiers" was empty; some identifiers should be present'
+        )
+
+    n_null = sum(meta_identifiers["bqb"].isnull())
+    if n_null > 0:
+        msg = f"{n_null} identifiers were missing a bqb code and will not be mergeable"
+        logger.warning(msg)
+
+    return None
+
+
+def _update_foreign_keys(
+    agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
+) -> pd.DataFrame:
+    for fk in table_schema["fk"]:
+        updated_fks = (
+            agg_tbl[fk]
+            .reset_index()
+            .merge(
+                fk_lookup_tables[fk], left_on=[SOURCE_SPEC.MODEL, fk], right_index=True
+            )
+            .drop(fk, axis=1)
+            .rename(columns={"new_id": fk})
+            .set_index(["model", table_schema["pk"]])
+        )
+        agg_tbl = agg_tbl.drop(columns=fk).join(updated_fks)
+
+    return agg_tbl
+
+
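To see what _update_foreign_keys accomplishes, consider compartmentalized species whose c_id compartment keys must follow compartments that were just merged. A pared-down sketch using a plain map in place of the per-model merge (the sc_id/c_id names follow napistu's schema conventions, but the data and lookup are invented):

import pandas as pd

# old-to-new compartment lookup produced by the compartment consensus step
lookup = pd.Series({"comp_a": "C0001", "comp_b": "C0001"}, name="new_id")

compartmentalized = pd.DataFrame(
    {"sc_id": ["SC1", "SC2"], "c_id": ["comp_a", "comp_b"]}
)
compartmentalized["c_id"] = compartmentalized["c_id"].map(lookup)
# both rows now reference the consensus compartment C0001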
 def _update_foreign_keys(
     agg_tbl: pd.DataFrame, table_schema: dict, fk_lookup_tables: dict
 ) -> pd.DataFrame:
+    """Update one or more foreign keys based on old-to-new foreign key lookup table(s)."""
+
     for fk in table_schema["fk"]:
         updated_fks = (
             agg_tbl[fk]
@@ -1378,8 +1869,14 @@ def _resolve_reversibility(
         SBML_DFS.R_ISREVERSIBLE, axis=1
     ).join(r_id_reversibility)
 
-    assert rxns_w_reversibility.shape[0] == rxn_consensus_species.shape[0]
-    assert all(rxns_w_reversibility[SBML_DFS.R_ISREVERSIBLE].isin([True, False]))
+    if rxns_w_reversibility.shape[0] != rxn_consensus_species.shape[0]:
+        raise ValueError(
+            "rxns_w_reversibility and rxn_consensus_species must have the same number of rows"
+        )
+    if not all(rxns_w_reversibility[SBML_DFS.R_ISREVERSIBLE].isin([True, False])):
+        raise ValueError(
+            "All rxns_w_reversibility[R_ISREVERSIBLE] must be True or False"
+        )
 
     return rxns_w_reversibility
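A note on this final hunk: swapping asserts for explicit ValueError raises keeps the consistency checks active even under python -O, which strips assert statements at runtime. A minimal contrast (check_rows is a hypothetical helper, not part of the package):

def check_rows(a_rows: int, b_rows: int) -> None:
    # an assert here would vanish under `python -O`; the raise never does
    if a_rows != b_rows:
        raise ValueError("row counts must match")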