napistu 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
napistu/sbml_dfs_core.py CHANGED
@@ -32,7 +32,6 @@ from napistu.constants import SBOTERM_NAMES
32
32
  from napistu.constants import SBO_ROLES_DEFS
33
33
  from napistu.constants import ENTITIES_W_DATA
34
34
  from napistu.constants import ENTITIES_TO_ENTITY_DATA
35
- from napistu.constants import CHARACTERISTIC_COMPLEX_ONTOLOGIES
36
35
  from napistu.ingestion.constants import GENERIC_COMPARTMENT
37
36
  from napistu.ingestion.constants import COMPARTMENT_ALIASES
38
37
  from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
@@ -1471,12 +1470,6 @@ def filter_to_characteristic_species_ids(
1471
1470
  # add components within modestly sized protein complexes
1472
1471
  # look at HAS_PART IDs
1473
1472
  bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
1474
- # filter to genes
1475
- bqb_has_parts_species = bqb_has_parts_species[
1476
- bqb_has_parts_species[IDENTIFIERS.ONTOLOGY].isin(
1477
- CHARACTERISTIC_COMPLEX_ONTOLOGIES
1478
- )
1479
- ]
1480
1473
 
1481
1474
  # number of species in a complex
1482
1475
  n_species_components = bqb_has_parts_species.value_counts(
@@ -1488,38 +1481,10 @@ def filter_to_characteristic_species_ids(
1488
1481
  ].index.get_level_values(SBML_DFS.S_ID)
1489
1482
  )
1490
1483
 
1491
- # number of complexes a species is part of
1492
- n_complexes_involvedin = bqb_has_parts_species.value_counts(
1493
- [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
1494
- )
1495
- promiscuous_component_identifiers_index = n_complexes_involvedin[
1496
- n_complexes_involvedin > max_promiscuity
1497
- ].index
1498
- promiscuous_component_identifiers = pd.Series(
1499
- data=[True] * len(promiscuous_component_identifiers_index),
1500
- index=promiscuous_component_identifiers_index,
1501
- name="is_shared_component",
1502
- dtype=bool,
1503
- )
1504
-
1505
- if len(promiscuous_component_identifiers) == 0:
1506
- # no complexes to filter
1507
- return species_ids
1508
-
1509
- filtered_bqb_has_parts = bqb_has_parts_species.merge(
1510
- promiscuous_component_identifiers,
1511
- left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
1512
- right_index=True,
1513
- how="left",
1484
+ filtered_bqb_has_parts = _filter_promiscuous_components(
1485
+ bqb_has_parts_species, max_promiscuity
1514
1486
  )
1515
1487
 
1516
- filtered_bqb_has_parts["is_shared_component"] = filtered_bqb_has_parts[
1517
- "is_shared_component"
1518
- ].fillna(False)
1519
- # drop identifiers shared as components across many species
1520
- filtered_bqb_has_parts = filtered_bqb_has_parts[
1521
- ~filtered_bqb_has_parts["is_shared_component"]
1522
- ].drop(["is_shared_component"], axis=1)
1523
1488
  # drop species parts if there are many components
1524
1489
  filtered_bqb_has_parts = filtered_bqb_has_parts[
1525
1490
  ~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
@@ -1887,550 +1852,199 @@ def sbml_dfs_from_edgelist(
1887
1852
  keep_reactions_data: bool | str = False,
1888
1853
  ) -> SBML_dfs:
1889
1854
  """
1890
- Create SBML_dfs from Edgelist
1891
-
1892
- Combine a set of interactions into an sbml.SBML_dfs mechanistic model
1855
+ Create SBML_dfs from interaction edgelist.
1893
1856
 
1894
- Parameters:
1895
- interaction_edgelist (pd.DataFrame): A table containing interactions:
1896
- - upstream_name (str): matching "s_name" from "species_df"
1897
- - downstream_name (str): matching "s_name" from "species_df"
1898
- - upstream_compartment (str): compartment of "upstream_name"
1899
- with names matching "c_name" from "compartments_df"
1900
- - downstream_compartment (str): compartment of "downstream_name"
1901
- with names matching "c_name" from "compartments_df"
1902
- - r_name (str): a name for the interaction
1903
- - sbo_term (str): sbo term defining the type of
1904
- molecular interaction (see MINI_SBO_FROM_NAME)
1905
- - r_Identifiers (identifiers.Identifiers): identifiers
1906
- supporting the interaction (e.g., pubmed ids)
1907
- - r_isreversible (bool): Is this reaction reversible?
1908
- If True, the reaction is reversible
1909
- By default, the interactions of TRRUST networks are irreversible, and reversible for STRING networks
1910
- species_df (pd.DataFrame): A table defining unique molecular
1911
- species participating in "interaction_edgelist":
1912
- - s_name (str): name of molecular species
1913
- - s_Identifiers (identifiers.Identifiers): identifiers
1914
- defining the species
1915
- compartments_df (pd.DataFrame): A table defining compartments
1916
- where interactions are occurring "interaction_edgelist":
1917
- - c_name (str): name of compartment
1918
- - c_Identifiers (identifiers.Identifiers):
1919
- identifiers defining the compartment (see
1920
- bigg.annotate_recon() for a set of names > go categories)
1921
- interaction_source (source.Source): A source object
1922
- which will tie model entities to the interaction source
1923
- upstream_stoichiometry (int): stoichiometry of
1924
- upstream species in reaction
1925
- downstream_stoichiometry (int): stoichiometry of
1926
- downstream species in reaction
1927
- downstream_sbo_name (str): sbo term defining the
1928
- type of molecular interaction for the downstream reactand
1929
- (see MINI_SBO_FROM_NAME)
1930
- keep_species_data (bool | str): Should species data
1931
- be kept in the model? If True, all species data will be kept
1932
- and saved as "species_data" in the SBML_dfs. The label will be 'source'
1933
- If False, no species data will be kept.
1934
- If a string: label for the species data to be kept.
1935
- keep_reactions_data (bool | str): Should reaction data be kept in the model?
1936
- If True, all reaction data will be kept and saved
1937
- as "reactions_data" in the SBML_dfs. The label will be 'source'.
1938
- If False, no reaction data will be kept.
1939
- If a string: label for the reaction data to be kept.
1940
-
1941
- Returns:
1942
- sbml.SBML_dfs
1857
+ Combines a set of molecular interactions into a mechanistic SBML_dfs model
1858
+ by processing interaction data, species information, and compartment definitions.
1943
1859
 
1860
+ Parameters
1861
+ ----------
1862
+ interaction_edgelist : pd.DataFrame
1863
+ Table containing molecular interactions with columns:
1864
+ - upstream_name : str, matches "s_name" from species_df
1865
+ - downstream_name : str, matches "s_name" from species_df
1866
+ - upstream_compartment : str, matches "c_name" from compartments_df
1867
+ - downstream_compartment : str, matches "c_name" from compartments_df
1868
+ - r_name : str, name for the interaction
1869
+ - sbo_term : str, SBO term defining interaction type
1870
+ - r_Identifiers : identifiers.Identifiers, supporting identifiers
1871
+ - r_isreversible : bool, whether reaction is reversible
1872
+ species_df : pd.DataFrame
1873
+ Table defining molecular species with columns:
1874
+ - s_name : str, name of molecular species
1875
+ - s_Identifiers : identifiers.Identifiers, species identifiers
1876
+ compartments_df : pd.DataFrame
1877
+ Table defining compartments with columns:
1878
+ - c_name : str, name of compartment
1879
+ - c_Identifiers : identifiers.Identifiers, compartment identifiers
1880
+ interaction_source : source.Source
1881
+ Source object linking model entities to interaction source
1882
+ upstream_stoichiometry : int, default 0
1883
+ Stoichiometry of upstream species in reactions
1884
+ downstream_stoichiometry : int, default 1
1885
+ Stoichiometry of downstream species in reactions
1886
+ downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
1887
+ SBO term for downstream reactant type
1888
+ keep_species_data : bool or str, default False
1889
+ Whether to preserve extra species columns. If True, saves as 'source' label.
1890
+ If string, uses as custom label. If False, discards extra data.
1891
+ keep_reactions_data : bool or str, default False
1892
+ Whether to preserve extra reaction columns. If True, saves as 'source' label.
1893
+ If string, uses as custom label. If False, discards extra data.
1894
+
1895
+ Returns
1896
+ -------
1897
+ SBML_dfs
1898
+ Validated SBML data structure containing compartments, species,
1899
+ compartmentalized species, reactions, and reaction species tables.
1944
1900
  """
1901
+ # 1. Validate inputs
1902
+ _edgelist_validate_inputs(interaction_edgelist, species_df, compartments_df)
1945
1903
 
1946
- # check input dfs for required variables
1947
- _sbml_dfs_from_edgelist_validate_inputs(
1948
- interaction_edgelist, species_df, compartments_df
1904
+ # 2. Identify which extra columns to preserve
1905
+ extra_columns = _edgelist_identify_extra_columns(
1906
+ interaction_edgelist, species_df, keep_reactions_data, keep_species_data
1949
1907
  )
1950
1908
 
1951
- # Identify extra columns in the input data.
1952
- # if keep_reactions_data is True, this will be added
1953
- # as `reaction_data`
1954
- interaction_edgelist_required_vars = {
1955
- "upstream_name",
1956
- "downstream_name",
1957
- "upstream_compartment",
1958
- "downstream_compartment",
1959
- SBML_DFS.R_NAME,
1960
- SBML_DFS.SBO_TERM,
1961
- SBML_DFS.R_IDENTIFIERS,
1962
- SBML_DFS.R_ISREVERSIBLE,
1963
- }
1964
- if keep_reactions_data is not False:
1965
- extra_reactions_columns = [
1966
- c
1967
- for c in interaction_edgelist.columns
1968
- if c not in interaction_edgelist_required_vars
1969
- ]
1970
- else:
1971
- extra_reactions_columns = []
1972
- # Extra species columns
1973
- if keep_species_data is not False:
1974
- extra_species_columns = [
1975
- c
1976
- for c in species_df.columns
1977
- if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
1978
- ]
1979
- else:
1980
- extra_species_columns = []
1981
-
1982
- # format compartments
1983
- compartments_df[SBML_DFS.C_SOURCE] = interaction_source
1984
- compartments_df[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
1985
- range(compartments_df.shape[0]), SBML_DFS.C_ID
1909
+ # 3. Process compartments and species tables
1910
+ processed_compartments = _edgelist_process_compartments(
1911
+ compartments_df, interaction_source
1912
+ )
1913
+ processed_species, species_data = _edgelist_process_species(
1914
+ species_df, interaction_source, extra_columns["species"]
1986
1915
  )
1987
- compartments_df = compartments_df.set_index(SBML_DFS.C_ID)[
1988
- [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
1989
- ]
1990
1916
 
1991
- # format species
1992
- species_df[SBML_DFS.S_SOURCE] = interaction_source
1993
- species_df[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
1994
- range(species_df.shape[0]), SBML_DFS.S_ID
1917
+ # 4. Create compartmentalized species
1918
+ comp_species = _edgelist_create_compartmentalized_species(
1919
+ interaction_edgelist,
1920
+ processed_species,
1921
+ processed_compartments,
1922
+ interaction_source,
1995
1923
  )
1996
1924
 
1997
- required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
1998
- species_df = species_df.set_index(SBML_DFS.S_ID)[
1999
- required_cols + extra_species_columns
2000
- ]
2001
- # Keep extra columns to save them as extra data
2002
- species_data = species_df[extra_species_columns]
2003
- # Remove extra columns
2004
- species_df = species_df[required_cols]
1925
+ # 5. Create reactions and reaction species
1926
+ reactions, reaction_species, reactions_data = (
1927
+ _edgelist_create_reactions_and_species(
1928
+ interaction_edgelist,
1929
+ comp_species,
1930
+ processed_species,
1931
+ processed_compartments,
1932
+ interaction_source,
1933
+ upstream_stoichiometry,
1934
+ downstream_stoichiometry,
1935
+ downstream_sbo_name,
1936
+ extra_columns["reactions"],
1937
+ )
1938
+ )
2005
1939
 
2006
- # create compartmentalized species
1940
+ # 6. Assemble final SBML_dfs object
1941
+ sbml_model = _edgelist_assemble_sbml_model(
1942
+ processed_compartments,
1943
+ processed_species,
1944
+ comp_species,
1945
+ reactions,
1946
+ reaction_species,
1947
+ species_data,
1948
+ reactions_data,
1949
+ keep_species_data,
1950
+ keep_reactions_data,
1951
+ extra_columns,
1952
+ )
2007
1953
 
2008
- # define all distinct upstream and downstream compartmentalized species
2009
- comp_species = pd.concat(
2010
- [
2011
- interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
2012
- {
2013
- "upstream_name": SBML_DFS.S_NAME,
2014
- "upstream_compartment": SBML_DFS.C_NAME,
2015
- },
2016
- axis=1,
2017
- ),
2018
- interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
2019
- {
2020
- "downstream_name": SBML_DFS.S_NAME,
2021
- "downstream_compartment": SBML_DFS.C_NAME,
2022
- },
2023
- axis=1,
2024
- ),
2025
- ]
2026
- ).drop_duplicates()
1954
+ return sbml_model
2027
1955
 
2028
- # merge to add species and compartments primary keys
2029
- comp_species_w_ids = comp_species.merge(
2030
- species_df[SBML_DFS.S_NAME].reset_index(),
2031
- how="left",
2032
- left_on=SBML_DFS.S_NAME,
2033
- right_on=SBML_DFS.S_NAME,
2034
- ).merge(
2035
- compartments_df[SBML_DFS.C_NAME].reset_index(),
2036
- how="left",
2037
- left_on=SBML_DFS.C_NAME,
2038
- right_on=SBML_DFS.C_NAME,
2039
- )
1956
+ return sbml_model
2040
1957
 
2041
- # check whether all species and compartments exist
2042
- _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
2043
1958
 
2044
- # name compounds
2045
- comp_species_w_ids[SBML_DFS.SC_NAME] = [
2046
- f"{s} [{c}]"
2047
- for s, c in zip(
2048
- comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
2049
- )
2050
- ]
2051
- # add source object
2052
- comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
2053
- # name index
2054
- comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
2055
- range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
2056
- )
2057
- comp_species_w_ids = comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
2058
- [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
2059
- ]
1959
+ def species_type_types(x):
1960
+ """Assign a high-level molecule type to a molecular species"""
2060
1961
 
2061
- # create reactions
1962
+ if isinstance(x, identifiers.Identifiers):
1963
+ if x.filter(["chebi"]):
1964
+ return "metabolite"
1965
+ elif x.filter(["molodex"]):
1966
+ return "drug"
1967
+ else:
1968
+ return "protein"
1969
+ else:
1970
+ return "unknown"
2062
1971
 
2063
- # create a from cs_species -> to cs_species edgelist
2064
- # interaction_edgelist
2065
- comp_species_w_names = (
2066
- comp_species_w_ids.reset_index()
2067
- .merge(species_df[SBML_DFS.S_NAME].reset_index())
2068
- .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
2069
- )
2070
1972
 
2071
- interaction_edgelist_w_cspecies = interaction_edgelist.merge(
2072
- comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
2073
- {
2074
- SBML_DFS.SC_ID: "sc_id_up",
2075
- SBML_DFS.S_NAME: "upstream_name",
2076
- SBML_DFS.C_NAME: "upstream_compartment",
2077
- },
2078
- axis=1,
2079
- ),
2080
- how="left",
2081
- ).merge(
2082
- comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
1973
+ def stub_ids(ids):
1974
+ if len(ids) == 0:
1975
+ return pd.DataFrame(
2083
1976
  {
2084
- SBML_DFS.SC_ID: "sc_id_down",
2085
- SBML_DFS.S_NAME: "downstream_name",
2086
- SBML_DFS.C_NAME: "downstream_compartment",
2087
- },
2088
- axis=1,
2089
- ),
2090
- how="left",
2091
- )[
2092
- REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
2093
- ]
2094
-
2095
- # some extra checks
2096
- if interaction_edgelist.shape[0] != interaction_edgelist_w_cspecies.shape[0]:
2097
- raise ValueError(
2098
- "Merging compartmentalized species to interaction_edgelist"
2099
- " resulted in an increase in the tables from "
2100
- f"{interaction_edgelist.shape[0]} to "
2101
- f"{interaction_edgelist_w_cspecies.shape[0]} indicating"
2102
- " a 1-many join which should have been 1-1"
1977
+ IDENTIFIERS.ONTOLOGY: [None],
1978
+ IDENTIFIERS.IDENTIFIER: [None],
1979
+ IDENTIFIERS.URL: [None],
1980
+ IDENTIFIERS.BQB: [None],
1981
+ }
2103
1982
  )
1983
+ else:
1984
+ return pd.DataFrame(ids)
2104
1985
 
2105
- # create one reaction per interaction
2106
- interaction_edgelist_w_cspecies[SBML_DFS.R_SOURCE] = interaction_source
2107
- interaction_edgelist_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
2108
- range(interaction_edgelist_w_cspecies.shape[0]), SBML_DFS.R_ID
2109
- )
2110
1986
 
2111
- reactions_df_columns = [
2112
- SBML_DFS.R_NAME,
2113
- SBML_DFS.R_IDENTIFIERS,
2114
- SBML_DFS.R_SOURCE,
2115
- SBML_DFS.R_ISREVERSIBLE,
2116
- ]
2117
- reactions_df = interaction_edgelist_w_cspecies.copy().set_index(SBML_DFS.R_ID)[
2118
- reactions_df_columns + extra_reactions_columns
2119
- ]
2120
- # Keep extra columns to save them as extra data
2121
- reactions_data = reactions_df[extra_reactions_columns]
2122
- reactions_df = reactions_df[reactions_df_columns]
1987
+ def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
1988
+ """
1989
+ Add an sbo_role column to the reaction_species table.
2123
1990
 
2124
- # define upstream and downstream comp species as reaction species
2125
- reaction_species_df = pd.concat(
2126
- [
2127
- # upstream interactions are defined by sbo_term and should generally
2128
- # be modifiers/stimulator/inhibitor/interactor
2129
- interaction_edgelist_w_cspecies[["sc_id_up", "sbo_term", "r_id"]]
2130
- .assign(stoichiometry=upstream_stoichiometry)
2131
- .rename({"sc_id_up": "sc_id"}, axis=1),
2132
- # downstream interactions indicate some modification of the state
2133
- # of the species and hence are defined as product
2134
- interaction_edgelist_w_cspecies[["sc_id_down", "r_id"]]
2135
- .assign(
2136
- stoichiometry=downstream_stoichiometry,
2137
- sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
2138
- )
2139
- .rename({"sc_id_down": "sc_id"}, axis=1),
2140
- ]
2141
- )
2142
- reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
2143
- range(reaction_species_df.shape[0]), "rsc_id"
2144
- )
2145
- reaction_species_df = reaction_species_df.set_index("rsc_id")
1991
+ The sbo_role column is a string column that contains the SBO role of the reaction species.
1992
+ The values in the sbo_role column are taken from the sbo_term column.
2146
1993
 
2147
- # form sbml_dfs object
2148
- sbml_tbl_dict: MutableMapping[str, pd.DataFrame | dict[str, pd.DataFrame]] = {
2149
- "compartments": compartments_df,
2150
- "species": species_df,
2151
- "compartmentalized_species": comp_species_w_ids,
2152
- "reactions": reactions_df,
2153
- "reaction_species": reaction_species_df,
2154
- }
2155
- if len(extra_reactions_columns) > 0:
2156
- if isinstance(keep_reactions_data, str):
2157
- reactions_data_label = keep_reactions_data
2158
- else:
2159
- reactions_data_label = "source"
2160
- sbml_tbl_dict["reactions_data"] = {reactions_data_label: reactions_data}
1994
+ The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
1995
+ """
2161
1996
 
2162
- if len(extra_species_columns) > 0:
2163
- if isinstance(keep_species_data, str):
2164
- species_data_label = keep_species_data
2165
- else:
2166
- species_data_label = "source"
2167
- sbml_tbl_dict["species_data"] = {species_data_label: species_data}
1997
+ validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
2168
1998
 
2169
- sbml_model = SBML_dfs(sbml_tbl_dict)
2170
- sbml_model.validate()
1999
+ reaction_species = (
2000
+ reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
2001
+ .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
2002
+ .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
2003
+ )
2171
2004
 
2172
- return sbml_model
2005
+ undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
2006
+ SBO_NAME_TO_ROLE.values()
2007
+ )
2008
+ if len(undefined_roles) > 0:
2009
+ logger.warning(
2010
+ f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
2011
+ )
2012
+ mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
2013
+ reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
2173
2014
 
2015
+ return reaction_species
2174
2016
 
2175
- def _sbml_dfs_from_edgelist_validate_inputs(
2176
- interaction_edgelist: pd.DataFrame,
2177
- species_df: pd.DataFrame,
2178
- compartments_df: pd.DataFrame,
2179
- ) -> None:
2180
- """Check that the inputs for creating an SBML_dfs from an edgelist are appropriate."""
2181
2017
 
2182
- # check compartments
2183
- compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
2184
- compartments_df_columns = set(compartments_df.columns.tolist())
2185
- missing_required_fields = compartments_df_expected_vars.difference(
2186
- compartments_df_columns
2187
- )
2188
- if len(missing_required_fields) > 0:
2018
+ def find_underspecified_reactions(
2019
+ reaction_species_w_roles: pd.DataFrame,
2020
+ ) -> pd.DataFrame:
2021
+
2022
+ # check that both sbo_role and "new" are present
2023
+ if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
2189
2024
  raise ValueError(
2190
- f"{', '.join(missing_required_fields)} are required variables"
2191
- ' in "compartments_df" but were not present in the input file.'
2025
+ "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
2192
2026
  )
2193
-
2194
- # check species
2195
- species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
2196
- species_df_columns = set(species_df.columns.tolist())
2197
- missing_required_fields = species_df_expected_vars.difference(species_df_columns)
2198
- if len(missing_required_fields) > 0:
2027
+ if "new" not in reaction_species_w_roles.columns:
2199
2028
  raise ValueError(
2200
- f"{', '.join(missing_required_fields)} are required"
2201
- ' variables in "species_df" but were not present '
2202
- "in the input file."
2029
+ "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2203
2030
  )
2204
-
2205
- # check interactions
2206
- interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
2207
- missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
2208
- interaction_edgelist_columns
2209
- )
2210
- if len(missing_required_fields) > 0:
2031
+ # check that new is a boolean column
2032
+ if reaction_species_w_roles["new"].dtype != bool:
2211
2033
  raise ValueError(
2212
- f"{', '.join(missing_required_fields)} are required "
2213
- 'variables in "interaction_edgelist" but were not '
2214
- "present in the input file."
2034
+ "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2215
2035
  )
2216
2036
 
2217
- return None
2037
+ reactions_with_lost_defining_members = set(
2038
+ reaction_species_w_roles.query("~new")
2039
+ .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
2040
+ .tolist()
2041
+ )
2218
2042
 
2219
-
2220
- def _sbml_dfs_from_edgelist_check_cspecies_merge(
2221
- merged_species: pd.DataFrame, original_species: pd.DataFrame
2222
- ) -> None:
2223
- """Check for a mismatch between the provided species data and species implied by the edgelist."""
2224
-
2225
- # check for 1-many merge
2226
- if merged_species.shape[0] != original_species.shape[0]:
2227
- raise ValueError(
2228
- "Merging compartmentalized species to species_df"
2229
- " and compartments_df by names resulted in an "
2230
- f"increase in the tables from {original_species.shape[0]}"
2231
- f" to {merged_species.shape[0]} indicating that names were"
2232
- " not unique"
2233
- )
2234
-
2235
- # check for missing species and compartments
2236
- missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
2237
- SBML_DFS.C_NAME
2238
- ].unique()
2239
- if len(missing_compartments) >= 1:
2240
- raise ValueError(
2241
- f"{len(missing_compartments)} compartments were present in"
2242
- ' "interaction_edgelist" but not "compartments_df":'
2243
- f" {', '.join(missing_compartments)}"
2244
- )
2245
-
2246
- missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
2247
- SBML_DFS.S_NAME
2248
- ].unique()
2249
- if len(missing_species) >= 1:
2250
- raise ValueError(
2251
- f"{len(missing_species)} species were present in "
2252
- '"interaction_edgelist" but not "species_df":'
2253
- f" {', '.join(missing_species)}"
2254
- )
2255
-
2256
- return None
2257
-
2258
-
2259
- def _stub_compartments(
2260
- stubbed_compartment: str = GENERIC_COMPARTMENT,
2261
- ) -> pd.DataFrame:
2262
- """Stub Compartments
2263
-
2264
- Create a compartments table with only a single compartment
2265
-
2266
- Args:
2267
- stubbed_compartment (str): the name of a compartment which should match the
2268
- keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS
2269
-
2270
- Returns:
2271
- compartments_df (pd.DataFrame): compartments dataframe
2272
- """
2273
-
2274
- if stubbed_compartment not in COMPARTMENT_ALIASES.keys():
2275
- raise ValueError(
2276
- f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
2277
- )
2278
-
2279
- if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
2280
- raise ValueError(
2281
- f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
2282
- )
2283
-
2284
- stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
2285
-
2286
- formatted_uri = identifiers.format_uri(
2287
- uri=identifiers.create_uri_url(
2288
- ontology=ONTOLOGIES.GO,
2289
- identifier=stubbed_compartment_id,
2290
- ),
2291
- biological_qualifier_type=BQB.IS,
2292
- )
2293
-
2294
- compartments_df = pd.DataFrame(
2295
- {
2296
- SBML_DFS.C_NAME: [stubbed_compartment],
2297
- SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
2298
- }
2299
- )
2300
- compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID) # type: ignore
2301
- compartments_df.index.name = SBML_DFS.C_ID
2302
-
2303
- return compartments_df
2304
-
2305
-
2306
- def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
2307
- """Validates a table against a reference
2308
-
2309
- This check if the table has the same index, no duplicates in the index
2310
- and that all values in the index are in the reference table.
2311
-
2312
- Args:
2313
- data_table (pd.DataFrame): a table with data that should
2314
- match the reference
2315
- ref_table (pd.DataFrame): a reference table
2316
-
2317
- Raises:
2318
- ValueError: not same index name
2319
- ValueError: index contains duplicates
2320
- ValueError: index not subset of index of reactions table
2321
- """
2322
- ref_index_name = ref_table.index.name
2323
- if data_table.index.name != ref_index_name:
2324
- raise ValueError(
2325
- "the index name for reaction data table was not"
2326
- f" {ref_index_name}: {data_table.index.name}"
2327
- )
2328
- ids = data_table.index
2329
- if any(ids.duplicated()):
2330
- raise ValueError(
2331
- "the index for reaction data table " "contained duplicate values"
2332
- )
2333
- if not all(ids.isin(ref_table.index)):
2334
- raise ValueError(
2335
- "the index for reaction data table contained values"
2336
- " not found in the reactions table"
2337
- )
2338
- if not isinstance(data_table, pd.DataFrame):
2339
- raise TypeError(
2340
- f"The data table was type {type(data_table).__name__}"
2341
- " but must be a pd.DataFrame"
2342
- )
2343
-
2344
-
2345
- def species_type_types(x):
2346
- """Assign a high-level molecule type to a molecular species"""
2347
-
2348
- if isinstance(x, identifiers.Identifiers):
2349
- if x.filter(["chebi"]):
2350
- return "metabolite"
2351
- elif x.filter(["molodex"]):
2352
- return "drug"
2353
- else:
2354
- return "protein"
2355
- else:
2356
- return "unknown"
2357
-
2358
-
2359
- def stub_ids(ids):
2360
- if len(ids) == 0:
2361
- return pd.DataFrame(
2362
- {
2363
- IDENTIFIERS.ONTOLOGY: [None],
2364
- IDENTIFIERS.IDENTIFIER: [None],
2365
- IDENTIFIERS.URL: [None],
2366
- IDENTIFIERS.BQB: [None],
2367
- }
2368
- )
2369
- else:
2370
- return pd.DataFrame(ids)
2371
-
2372
-
2373
- def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
2374
- """
2375
- Add an sbo_role column to the reaction_species table.
2376
-
2377
- The sbo_role column is a string column that contains the SBO role of the reaction species.
2378
- The values in the sbo_role column are taken from the sbo_term column.
2379
-
2380
- The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
2381
- """
2382
-
2383
- validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
2384
-
2385
- reaction_species = (
2386
- reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
2387
- .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
2388
- .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
2389
- )
2390
-
2391
- undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
2392
- SBO_NAME_TO_ROLE.values()
2393
- )
2394
- if len(undefined_roles) > 0:
2395
- logger.warning(
2396
- f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
2397
- )
2398
- mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
2399
- reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
2400
-
2401
- return reaction_species
2402
-
2403
-
2404
- def find_underspecified_reactions(
2405
- reaction_species_w_roles: pd.DataFrame,
2406
- ) -> pd.DataFrame:
2407
-
2408
- # check that both sbo_role and "new" are present
2409
- if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
2410
- raise ValueError(
2411
- "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
2412
- )
2413
- if "new" not in reaction_species_w_roles.columns:
2414
- raise ValueError(
2415
- "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2416
- )
2417
- # check that new is a boolean column
2418
- if reaction_species_w_roles["new"].dtype != bool:
2419
- raise ValueError(
2420
- "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2421
- )
2422
-
2423
- reactions_with_lost_defining_members = set(
2424
- reaction_species_w_roles.query("~new")
2425
- .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
2426
- .tolist()
2427
- )
2428
-
2429
- N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
2430
- if N_reactions_with_lost_defining_members > 0:
2431
- logger.info(
2432
- f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
2433
- )
2043
+ N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
2044
+ if N_reactions_with_lost_defining_members > 0:
2045
+ logger.info(
2046
+ f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
2047
+ )
2434
2048
 
2435
2049
  # find the cases where all "new" values for a given (r_id, sbo_term) are False
2436
2050
  reactions_with_lost_requirements = set(
@@ -2595,3 +2209,594 @@ def _perform_sbml_dfs_table_validation(
2595
2209
  # check for empty table
2596
2210
  if table_data.shape[0] == 0:
2597
2211
  raise ValueError(f"{table_name} contained no entries")
2212
+
2213
+
2214
def _filter_promiscuous_components(
    bqb_has_parts_species: pd.DataFrame, max_promiscuity: int
) -> pd.DataFrame:
    """
    Drop component identifiers which are shared by too many species.

    Parameters
    ----------
    bqb_has_parts_species : pd.DataFrame
        Table of HAS_PART identifiers; must contain the ontology and
        identifier columns named by ``IDENTIFIERS``.
    max_promiscuity : int
        An (ontology, identifier) pair occurring in more than this many rows
        is treated as a shared component and removed.

    Returns
    -------
    pd.DataFrame
        The input table itself when nothing exceeds the threshold, otherwise
        a filtered table without the rows of the shared components.
    """
    shared_flag = "is_shared_component"
    component_keys = [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]

    # how many rows (i.e., complexes) each component identifier shows up in
    occurrences = bqb_has_parts_species.value_counts(component_keys)
    overused_components = occurrences[occurrences > max_promiscuity].index

    # nothing to remove - hand back the input untouched
    if len(overused_components) == 0:
        return bqb_has_parts_species

    shared_lookup = pd.Series(
        data=[True] * len(overused_components),
        index=overused_components,
        name=shared_flag,
        dtype=bool,
    )

    flagged = bqb_has_parts_species.merge(
        shared_lookup,
        left_on=component_keys,
        right_index=True,
        how="left",
    )

    # rows without a match come back as missing values; treat them as not shared
    flagged[shared_flag] = flagged[shared_flag].astype("boolean").fillna(False)

    # drop identifiers shared as components across many species
    retained = flagged[~flagged[shared_flag]]
    return retained.drop([shared_flag], axis=1)
2251
+
2252
+
2253
def _edgelist_validate_inputs(
    interaction_edgelist: pd.DataFrame,
    species_df: pd.DataFrame,
    compartments_df: pd.DataFrame,
) -> None:
    """
    Validate input DataFrames have required columns.

    The tables are checked in the order compartments, species, interactions
    and the first table with missing columns triggers the error.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data to validate
    species_df : pd.DataFrame
        Species data to validate
    compartments_df : pd.DataFrame
        Compartments data to validate

    Raises
    ------
    ValueError
        If a table lacks one or more of its required columns. The missing
        column names are listed in sorted order so the message is stable.
    """
    # (label used in the message, table, required columns)
    checks = (
        (
            "compartments_df",
            compartments_df,
            {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS},
        ),
        (
            "species_df",
            species_df,
            {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS},
        ),
        (
            "interaction_edgelist",
            interaction_edgelist,
            INTERACTION_EDGELIST_EXPECTED_VARS,
        ),
    )

    for table_label, table, expected_vars in checks:
        missing_required_fields = set(expected_vars).difference(table.columns)
        if missing_required_fields:
            # sort: iterating a set directly gives an arbitrary order
            raise ValueError(
                f"{', '.join(sorted(missing_required_fields))} are required variables"
                f' in "{table_label}" but were not present in the input file.'
            )

    return None
2307
+
2308
+
2309
def _edgelist_identify_extra_columns(
    interaction_edgelist, species_df, keep_reactions_data, keep_species_data
):
    """
    Find the non-required columns of the inputs which should be carried along.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data containing potential extra columns
    species_df : pd.DataFrame
        Species data containing potential extra columns
    keep_reactions_data : bool or str
        Anything other than the literal ``False`` keeps extra reaction columns
    keep_species_data : bool or str
        Anything other than the literal ``False`` keeps extra species columns

    Returns
    -------
    dict
        Dictionary with 'reactions' and 'species' keys containing lists of extra column names
    """
    extras = {"reactions": [], "species": []}

    if keep_reactions_data is not False:
        extras["reactions"] = [
            column
            for column in interaction_edgelist.columns
            if column not in INTERACTION_EDGELIST_EXPECTED_VARS
        ]

    if keep_species_data is not False:
        core_species_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
        extras["species"] = [
            column for column in species_df.columns if column not in core_species_vars
        ]

    return extras
2349
+
2350
+
2351
def _edgelist_process_compartments(compartments_df, interaction_source):
    """
    Attach a source and generated IDs to the compartments table.

    Parameters
    ----------
    compartments_df : pd.DataFrame
        Raw compartments data
    interaction_source : source.Source
        Source object to assign to compartments

    Returns
    -------
    pd.DataFrame
        Name, identifiers and source columns, indexed by compartment ID.
        The input table is copied rather than modified.
    """
    output_columns = [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]

    formatted = compartments_df.copy()
    formatted[SBML_DFS.C_SOURCE] = interaction_source

    # IDs are derived from the row position
    n_compartments = formatted.shape[0]
    formatted[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
        range(n_compartments), SBML_DFS.C_ID
    )

    indexed = formatted.set_index(SBML_DFS.C_ID)
    return indexed[output_columns]
2375
+
2376
+
2377
def _edgelist_process_species(species_df, interaction_source, extra_species_columns):
    """
    Attach a source and generated IDs to the species table and split off extra data.

    Parameters
    ----------
    species_df : pd.DataFrame
        Raw species data
    interaction_source : source.Source
        Source object to assign to species
    extra_species_columns : list
        Names of extra columns to preserve separately

    Returns
    -------
    tuple of pd.DataFrame
        ``(species, species_data)``: the core species table (name, identifiers,
        source) and a table of the extra columns, both indexed by species ID.
    """
    core_columns = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]

    annotated = species_df.copy()
    annotated[SBML_DFS.S_SOURCE] = interaction_source
    # IDs are derived from the row position
    annotated[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
        range(annotated.shape[0]), SBML_DFS.S_ID
    )

    by_species_id = annotated.set_index(SBML_DFS.S_ID)
    by_species_id = by_species_id[core_columns + extra_species_columns]

    # core table first, extra data second
    return by_species_id[core_columns], by_species_id[extra_species_columns]
2411
+
2412
+
2413
def _edgelist_create_compartmentalized_species(
    interaction_edgelist, species_df, compartments_df, interaction_source
):
    """
    Build the compartmentalized species table implied by the interactions.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data containing species-compartment combinations
    species_df : pd.DataFrame
        Processed species data with IDs
    compartments_df : pd.DataFrame
        Processed compartments data with IDs
    interaction_source : source.Source
        Source object to assign to compartmentalized species

    Returns
    -------
    pd.DataFrame
        One row per distinct (species, compartment) pair, named
        "<species> [<compartment>]" and indexed by compartmentalized species ID
    """
    endpoint_columns = (
        ("upstream_name", "upstream_compartment"),
        ("downstream_name", "downstream_compartment"),
    )

    # stack both ends of every interaction under common column names
    endpoints = [
        interaction_edgelist[[name_col, compartment_col]].rename(
            {name_col: SBML_DFS.S_NAME, compartment_col: SBML_DFS.C_NAME},
            axis=1,
        )
        for name_col, compartment_col in endpoint_columns
    ]
    distinct_pairs = pd.concat(endpoints).drop_duplicates()

    # look up species IDs, then compartment IDs, by name
    species_lookup = species_df[SBML_DFS.S_NAME].reset_index()
    pairs_w_ids = distinct_pairs.merge(species_lookup, how="left", on=SBML_DFS.S_NAME)
    compartment_lookup = compartments_df[SBML_DFS.C_NAME].reset_index()
    pairs_w_ids = pairs_w_ids.merge(
        compartment_lookup, how="left", on=SBML_DFS.C_NAME
    )

    # raises if names were ambiguous or absent from the lookup tables
    _sbml_dfs_from_edgelist_check_cspecies_merge(pairs_w_ids, distinct_pairs)

    species_names = pairs_w_ids[SBML_DFS.S_NAME]
    compartment_names = pairs_w_ids[SBML_DFS.C_NAME]
    pairs_w_ids[SBML_DFS.SC_NAME] = [
        f"{s} [{c}]" for s, c in zip(species_names, compartment_names)
    ]
    pairs_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
    pairs_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
        range(pairs_w_ids.shape[0]), SBML_DFS.SC_ID
    )

    output_columns = [
        SBML_DFS.SC_NAME,
        SBML_DFS.S_ID,
        SBML_DFS.C_ID,
        SBML_DFS.SC_SOURCE,
    ]
    return pairs_w_ids.set_index(SBML_DFS.SC_ID)[output_columns]
2480
+
2481
+
2482
def _edgelist_create_reactions_and_species(
    interaction_edgelist,
    comp_species,
    species_df,
    compartments_df,
    interaction_source,
    upstream_stoichiometry,
    downstream_stoichiometry,
    downstream_sbo_name,
    extra_reactions_columns,
):
    """
    Turn each interaction into a reaction with an upstream and a downstream member.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Original interaction data
    comp_species : pd.DataFrame
        Compartmentalized species with IDs
    species_df : pd.DataFrame
        Processed species data with IDs
    compartments_df : pd.DataFrame
        Processed compartments data with IDs
    interaction_source : source.Source
        Source object for reactions
    upstream_stoichiometry : int
        Stoichiometry for upstream species
    downstream_stoichiometry : int
        Stoichiometry for downstream species
    downstream_sbo_name : str
        SBO term name for downstream species (looked up in MINI_SBO_FROM_NAME)
    extra_reactions_columns : list
        Names of extra columns to preserve

    Returns
    -------
    tuple
        (reactions_df, reaction_species_df, reactions_data)

    Raises
    ------
    ValueError
        If attaching compartmentalized species IDs changes the number of rows
    """
    # name-keyed lookup of compartmentalized species IDs
    cspecies_lookup = (
        comp_species.reset_index()
        .merge(species_df[SBML_DFS.S_NAME].reset_index())
        .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
    )[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]]

    upstream_lookup = cspecies_lookup.rename(
        {
            SBML_DFS.SC_ID: "sc_id_up",
            SBML_DFS.S_NAME: "upstream_name",
            SBML_DFS.C_NAME: "upstream_compartment",
        },
        axis=1,
    )
    downstream_lookup = cspecies_lookup.rename(
        {
            SBML_DFS.SC_ID: "sc_id_down",
            SBML_DFS.S_NAME: "downstream_name",
            SBML_DFS.C_NAME: "downstream_compartment",
        },
        axis=1,
    )

    # merges join on the shared (name, compartment) columns of each side
    kept_columns = REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
    interaction_w_cspecies = interaction_edgelist.merge(
        upstream_lookup, how="left"
    ).merge(downstream_lookup, how="left")[kept_columns]

    # a changed row count means the lookup was not one-to-one
    if interaction_edgelist.shape[0] != interaction_w_cspecies.shape[0]:
        raise ValueError(
            f"Merging compartmentalized species resulted in row count change "
            f"from {interaction_edgelist.shape[0]} to {interaction_w_cspecies.shape[0]}"
        )

    # reaction IDs must exist before either output table is derived
    interaction_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
        range(interaction_w_cspecies.shape[0]), SBML_DFS.R_ID
    )

    # reactions table (+ extra data), built on a copy so the source column
    # does not leak into the table used for reaction species below
    reactions_columns = [
        SBML_DFS.R_NAME,
        SBML_DFS.R_IDENTIFIERS,
        SBML_DFS.R_SOURCE,
        SBML_DFS.R_ISREVERSIBLE,
    ]
    sourced_interactions = interaction_w_cspecies.copy()
    sourced_interactions[SBML_DFS.R_SOURCE] = interaction_source
    reactions_w_extras = sourced_interactions.set_index(SBML_DFS.R_ID)[
        reactions_columns + extra_reactions_columns
    ]
    reactions_data = reactions_w_extras[extra_reactions_columns]
    reactions_df = reactions_w_extras[reactions_columns]

    # upstream members keep the interaction's own sbo_term
    upstream_members = (
        interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
        .assign(stoichiometry=upstream_stoichiometry)
        .rename({"sc_id_up": "sc_id"}, axis=1)
    )
    # downstream members all share the term named by downstream_sbo_name
    downstream_members = (
        interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
        .assign(
            stoichiometry=downstream_stoichiometry,
            sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
        )
        .rename({"sc_id_down": "sc_id"}, axis=1)
    )

    reaction_species_df = pd.concat([upstream_members, downstream_members])
    reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
        range(reaction_species_df.shape[0]), "rsc_id"
    )
    reaction_species_df = reaction_species_df.set_index("rsc_id")

    return reactions_df, reaction_species_df, reactions_data
2608
+
2609
+
2610
def _edgelist_assemble_sbml_model(
    compartments,
    species,
    comp_species,
    reactions,
    reaction_species,
    species_data,
    reactions_data,
    keep_species_data,
    keep_reactions_data,
    extra_columns,
):
    """
    Bundle the processed tables into an SBML_dfs object and validate it.

    Parameters
    ----------
    compartments : pd.DataFrame
        Processed compartments data
    species : pd.DataFrame
        Processed species data
    comp_species : pd.DataFrame
        Compartmentalized species data
    reactions : pd.DataFrame
        Reactions data
    reaction_species : pd.DataFrame
        Reaction species relationships
    species_data : pd.DataFrame
        Extra species data to include
    reactions_data : pd.DataFrame
        Extra reactions data to include
    keep_species_data : bool or str
        Label for species extra data; non-string values fall back to "source"
    keep_reactions_data : bool or str
        Label for reactions extra data; non-string values fall back to "source"
    extra_columns : dict
        Dictionary containing lists of extra column names under the
        'reactions' and 'species' keys

    Returns
    -------
    SBML_dfs
        Validated SBML data structure
    """
    tables = {
        "compartments": compartments,
        "species": species,
        "compartmentalized_species": comp_species,
        "reactions": reactions,
        "reaction_species": reaction_species,
    }

    # (key in extra_columns, key in the model dict, label flag, extra table)
    optional_tables = (
        ("reactions", "reactions_data", keep_reactions_data, reactions_data),
        ("species", "species_data", keep_species_data, species_data),
    )
    for entity, table_key, keep_flag, extra_table in optional_tables:
        # extra data is only attached when there are extra columns to hold
        if len(extra_columns[entity]) > 0:
            data_label = keep_flag if isinstance(keep_flag, str) else "source"
            tables[table_key] = {data_label: extra_table}

    model = SBML_dfs(tables)
    model.validate()

    return model
2678
+
2679
+
2680
def _sbml_dfs_from_edgelist_check_cspecies_merge(
    merged_species: pd.DataFrame, original_species: pd.DataFrame
) -> None:
    """
    Check for a mismatch between the provided species data and species implied by the edgelist.

    Parameters
    ----------
    merged_species : pd.DataFrame
        Compartmentalized species after species and compartment IDs were
        merged in by name
    original_species : pd.DataFrame
        Compartmentalized species before the merge

    Raises
    ------
    ValueError
        If the merge changed the number of rows, or if any row is left
        without a compartment ID or a species ID
    """
    # check for 1-many merge
    n_original = original_species.shape[0]
    n_merged = merged_species.shape[0]
    if n_merged != n_original:
        raise ValueError(
            "Merging compartmentalized species to species_df"
            " and compartments_df by names resulted in an "
            f"increase in the tables from {n_original}"
            f" to {n_merged} indicating that names were"
            " not unique"
        )

    # check for missing species and compartments
    # names are passed through str() so that non-string names (e.g., numbers
    # or NaN) are reported instead of breaking the join with a TypeError
    missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
        SBML_DFS.C_NAME
    ].unique()
    if len(missing_compartments) >= 1:
        raise ValueError(
            f"{len(missing_compartments)} compartments were present in"
            ' "interaction_edgelist" but not "compartments_df":'
            f" {', '.join(map(str, missing_compartments))}"
        )

    missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
        SBML_DFS.S_NAME
    ].unique()
    if len(missing_species) >= 1:
        raise ValueError(
            f"{len(missing_species)} species were present in "
            '"interaction_edgelist" but not "species_df":'
            f" {', '.join(map(str, missing_species))}"
        )

    return None
2717
+
2718
+
2719
def _stub_compartments(
    stubbed_compartment: str = GENERIC_COMPARTMENT,
) -> pd.DataFrame:
    """Stub Compartments

    Build a single-row compartments table for one named compartment,
    identified by its GO term.

    Args:
        stubbed_compartment (str): the name of the compartment; it must be a
            key of both COMPARTMENT_ALIASES and COMPARTMENTS_GO_TERMS

    Returns:
        compartments_df (pd.DataFrame): compartments dataframe with name and
            identifiers columns, indexed by compartment ID

    Raises:
        ValueError: the compartment is missing from either lookup
    """

    if stubbed_compartment not in COMPARTMENT_ALIASES:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
        )

    if stubbed_compartment not in COMPARTMENTS_GO_TERMS:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
        )

    go_term = COMPARTMENTS_GO_TERMS[stubbed_compartment]

    # describe the compartment as "is <GO term>"
    go_uri = identifiers.create_uri_url(
        ontology=ONTOLOGIES.GO,
        identifier=go_term,
    )
    formatted_uri = identifiers.format_uri(
        uri=go_uri,
        biological_qualifier_type=BQB.IS,
    )

    compartment_record = {
        SBML_DFS.C_NAME: [stubbed_compartment],
        SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
    }
    compartments_df = pd.DataFrame(compartment_record)
    compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID)  # type: ignore
    compartments_df.index.name = SBML_DFS.C_ID

    return compartments_df
2764
+
2765
+
2766
def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
    """Validates a table against a reference

    This checks that the table is a DataFrame, has the same index name as
    the reference, has no duplicates in the index and that all values in the
    index are in the reference table.

    Args:
        data_table (pd.DataFrame): a table with data that should
            match the reference
        ref_table (pd.DataFrame): a reference table

    Raises:
        TypeError: data_table is not a pd.DataFrame
        ValueError: not same index name
        ValueError: index contains duplicates
        ValueError: index not subset of index of reactions table
    """
    # type check comes first: every later check dereferences data_table.index,
    # so a non-DataFrame would otherwise fail with an unrelated error
    if not isinstance(data_table, pd.DataFrame):
        raise TypeError(
            f"The data table was type {type(data_table).__name__}"
            " but must be a pd.DataFrame"
        )

    ref_index_name = ref_table.index.name
    if data_table.index.name != ref_index_name:
        raise ValueError(
            "the index name for reaction data table was not"
            f" {ref_index_name}: {data_table.index.name}"
        )

    ids = data_table.index
    if ids.duplicated().any():
        raise ValueError(
            "the index for reaction data table contained duplicate values"
        )
    if not ids.isin(ref_table.index).all():
        raise ValueError(
            "the index for reaction data table contained values"
            " not found in the reactions table"
        )