napistu 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
napistu/sbml_dfs_core.py CHANGED
@@ -1852,550 +1852,199 @@ def sbml_dfs_from_edgelist(
1852
1852
  keep_reactions_data: bool | str = False,
1853
1853
  ) -> SBML_dfs:
1854
1854
  """
1855
- Create SBML_dfs from Edgelist
1855
+ Create SBML_dfs from interaction edgelist.
1856
1856
 
1857
- Combine a set of interactions into an sbml.SBML_dfs mechanistic model
1858
-
1859
- Parameters:
1860
- interaction_edgelist (pd.DataFrame): A table containing interactions:
1861
- - upstream_name (str): matching "s_name" from "species_df"
1862
- - downstream_name (str): matching "s_name" from "species_df"
1863
- - upstream_compartment (str): compartment of "upstream_name"
1864
- with names matching "c_name" from "compartments_df"
1865
- - downstream_compartment (str): compartment of "downstream_name"
1866
- with names matching "c_name" from "compartments_df"
1867
- - r_name (str): a name for the interaction
1868
- - sbo_term (str): sbo term defining the type of
1869
- molecular interaction (see MINI_SBO_FROM_NAME)
1870
- - r_Identifiers (identifiers.Identifiers): identifiers
1871
- supporting the interaction (e.g., pubmed ids)
1872
- - r_isreversible (bool): Is this reaction reversible?
1873
- If True, the reaction is reversible
1874
- By default, the interactions of TRRUST networks are irreversible, and reversible for STRING networks
1875
- species_df (pd.DataFrame): A table defining unique molecular
1876
- species participating in "interaction_edgelist":
1877
- - s_name (str): name of molecular species
1878
- - s_Identifiers (identifiers.Identifiers): identifiers
1879
- defining the species
1880
- compartments_df (pd.DataFrame): A table defining compartments
1881
- where interactions are occurring "interaction_edgelist":
1882
- - c_name (str): name of compartment
1883
- - c_Identifiers (identifiers.Identifiers):
1884
- identifiers defining the compartment (see
1885
- bigg.annotate_recon() for a set of names > go categories)
1886
- interaction_source (source.Source): A source object
1887
- which will tie model entities to the interaction source
1888
- upstream_stoichiometry (int): stoichiometry of
1889
- upstream species in reaction
1890
- downstream_stoichiometry (int): stoichiometry of
1891
- downstream species in reaction
1892
- downstream_sbo_name (str): sbo term defining the
1893
- type of molecular interaction for the downstream reactand
1894
- (see MINI_SBO_FROM_NAME)
1895
- keep_species_data (bool | str): Should species data
1896
- be kept in the model? If True, all species data will be kept
1897
- and saved as "species_data" in the SBML_dfs. The label will be 'source'
1898
- If False, no species data will be kept.
1899
- If a string: label for the species data to be kept.
1900
- keep_reactions_data (bool | str): Should reaction data be kept in the model?
1901
- If True, all reaction data will be kept and saved
1902
- as "reactions_data" in the SBML_dfs. The label will be 'source'.
1903
- If False, no reaction data will be kept.
1904
- If a string: label for the reaction data to be kept.
1905
-
1906
- Returns:
1907
- sbml.SBML_dfs
1857
+ Combines a set of molecular interactions into a mechanistic SBML_dfs model
1858
+ by processing interaction data, species information, and compartment definitions.
1908
1859
 
1860
+ Parameters
1861
+ ----------
1862
+ interaction_edgelist : pd.DataFrame
1863
+ Table containing molecular interactions with columns:
1864
+ - upstream_name : str, matches "s_name" from species_df
1865
+ - downstream_name : str, matches "s_name" from species_df
1866
+ - upstream_compartment : str, matches "c_name" from compartments_df
1867
+ - downstream_compartment : str, matches "c_name" from compartments_df
1868
+ - r_name : str, name for the interaction
1869
+ - sbo_term : str, SBO term defining interaction type
1870
+ - r_Identifiers : identifiers.Identifiers, supporting identifiers
1871
+ - r_isreversible : bool, whether reaction is reversible
1872
+ species_df : pd.DataFrame
1873
+ Table defining molecular species with columns:
1874
+ - s_name : str, name of molecular species
1875
+ - s_Identifiers : identifiers.Identifiers, species identifiers
1876
+ compartments_df : pd.DataFrame
1877
+ Table defining compartments with columns:
1878
+ - c_name : str, name of compartment
1879
+ - c_Identifiers : identifiers.Identifiers, compartment identifiers
1880
+ interaction_source : source.Source
1881
+ Source object linking model entities to interaction source
1882
+ upstream_stoichiometry : int, default 0
1883
+ Stoichiometry of upstream species in reactions
1884
+ downstream_stoichiometry : int, default 1
1885
+ Stoichiometry of downstream species in reactions
1886
+ downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
1887
+ SBO term for downstream reactant type
1888
+ keep_species_data : bool or str, default False
1889
+ Whether to preserve extra species columns. If True, saves as 'source' label.
1890
+ If string, uses as custom label. If False, discards extra data.
1891
+ keep_reactions_data : bool or str, default False
1892
+ Whether to preserve extra reaction columns. If True, saves as 'source' label.
1893
+ If string, uses as custom label. If False, discards extra data.
1894
+
1895
+ Returns
1896
+ -------
1897
+ SBML_dfs
1898
+ Validated SBML data structure containing compartments, species,
1899
+ compartmentalized species, reactions, and reaction species tables.
1909
1900
  """
1901
+ # 1. Validate inputs
1902
+ _edgelist_validate_inputs(interaction_edgelist, species_df, compartments_df)
1910
1903
 
1911
- # check input dfs for required variables
1912
- _sbml_dfs_from_edgelist_validate_inputs(
1913
- interaction_edgelist, species_df, compartments_df
1904
+ # 2. Identify which extra columns to preserve
1905
+ extra_columns = _edgelist_identify_extra_columns(
1906
+ interaction_edgelist, species_df, keep_reactions_data, keep_species_data
1914
1907
  )
1915
1908
 
1916
- # Identify extra columns in the input data.
1917
- # if keep_reactions_data is True, this will be added
1918
- # as `reaction_data`
1919
- interaction_edgelist_required_vars = {
1920
- "upstream_name",
1921
- "downstream_name",
1922
- "upstream_compartment",
1923
- "downstream_compartment",
1924
- SBML_DFS.R_NAME,
1925
- SBML_DFS.SBO_TERM,
1926
- SBML_DFS.R_IDENTIFIERS,
1927
- SBML_DFS.R_ISREVERSIBLE,
1928
- }
1929
- if keep_reactions_data is not False:
1930
- extra_reactions_columns = [
1931
- c
1932
- for c in interaction_edgelist.columns
1933
- if c not in interaction_edgelist_required_vars
1934
- ]
1935
- else:
1936
- extra_reactions_columns = []
1937
- # Extra species columns
1938
- if keep_species_data is not False:
1939
- extra_species_columns = [
1940
- c
1941
- for c in species_df.columns
1942
- if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
1943
- ]
1944
- else:
1945
- extra_species_columns = []
1946
-
1947
- # format compartments
1948
- compartments_df[SBML_DFS.C_SOURCE] = interaction_source
1949
- compartments_df[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
1950
- range(compartments_df.shape[0]), SBML_DFS.C_ID
1909
+ # 3. Process compartments and species tables
1910
+ processed_compartments = _edgelist_process_compartments(
1911
+ compartments_df, interaction_source
1951
1912
  )
1952
- compartments_df = compartments_df.set_index(SBML_DFS.C_ID)[
1953
- [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
1954
- ]
1955
-
1956
- # format species
1957
- species_df[SBML_DFS.S_SOURCE] = interaction_source
1958
- species_df[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
1959
- range(species_df.shape[0]), SBML_DFS.S_ID
1913
+ processed_species, species_data = _edgelist_process_species(
1914
+ species_df, interaction_source, extra_columns["species"]
1960
1915
  )
1961
1916
 
1962
- required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
1963
- species_df = species_df.set_index(SBML_DFS.S_ID)[
1964
- required_cols + extra_species_columns
1965
- ]
1966
- # Keep extra columns to save them as extra data
1967
- species_data = species_df[extra_species_columns]
1968
- # Remove extra columns
1969
- species_df = species_df[required_cols]
1970
-
1971
- # create compartmentalized species
1972
-
1973
- # define all distinct upstream and downstream compartmentalized species
1974
- comp_species = pd.concat(
1975
- [
1976
- interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
1977
- {
1978
- "upstream_name": SBML_DFS.S_NAME,
1979
- "upstream_compartment": SBML_DFS.C_NAME,
1980
- },
1981
- axis=1,
1982
- ),
1983
- interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
1984
- {
1985
- "downstream_name": SBML_DFS.S_NAME,
1986
- "downstream_compartment": SBML_DFS.C_NAME,
1987
- },
1988
- axis=1,
1989
- ),
1990
- ]
1991
- ).drop_duplicates()
1992
-
1993
- # merge to add species and compartments primary keys
1994
- comp_species_w_ids = comp_species.merge(
1995
- species_df[SBML_DFS.S_NAME].reset_index(),
1996
- how="left",
1997
- left_on=SBML_DFS.S_NAME,
1998
- right_on=SBML_DFS.S_NAME,
1999
- ).merge(
2000
- compartments_df[SBML_DFS.C_NAME].reset_index(),
2001
- how="left",
2002
- left_on=SBML_DFS.C_NAME,
2003
- right_on=SBML_DFS.C_NAME,
1917
+ # 4. Create compartmentalized species
1918
+ comp_species = _edgelist_create_compartmentalized_species(
1919
+ interaction_edgelist,
1920
+ processed_species,
1921
+ processed_compartments,
1922
+ interaction_source,
2004
1923
  )
2005
1924
 
2006
- # check whether all species and compartments exist
2007
- _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
2008
-
2009
- # name compounds
2010
- comp_species_w_ids[SBML_DFS.SC_NAME] = [
2011
- f"{s} [{c}]"
2012
- for s, c in zip(
2013
- comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
1925
+ # 5. Create reactions and reaction species
1926
+ reactions, reaction_species, reactions_data = (
1927
+ _edgelist_create_reactions_and_species(
1928
+ interaction_edgelist,
1929
+ comp_species,
1930
+ processed_species,
1931
+ processed_compartments,
1932
+ interaction_source,
1933
+ upstream_stoichiometry,
1934
+ downstream_stoichiometry,
1935
+ downstream_sbo_name,
1936
+ extra_columns["reactions"],
2014
1937
  )
2015
- ]
2016
- # add source object
2017
- comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
2018
- # name index
2019
- comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
2020
- range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
2021
1938
  )
2022
- comp_species_w_ids = comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
2023
- [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
2024
- ]
2025
1939
 
2026
- # create reactions
2027
-
2028
- # create a from cs_species -> to cs_species edgelist
2029
- # interaction_edgelist
2030
- comp_species_w_names = (
2031
- comp_species_w_ids.reset_index()
2032
- .merge(species_df[SBML_DFS.S_NAME].reset_index())
2033
- .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
1940
+ # 6. Assemble final SBML_dfs object
1941
+ sbml_model = _edgelist_assemble_sbml_model(
1942
+ processed_compartments,
1943
+ processed_species,
1944
+ comp_species,
1945
+ reactions,
1946
+ reaction_species,
1947
+ species_data,
1948
+ reactions_data,
1949
+ keep_species_data,
1950
+ keep_reactions_data,
1951
+ extra_columns,
2034
1952
  )
2035
1953
 
2036
- interaction_edgelist_w_cspecies = interaction_edgelist.merge(
2037
- comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
2038
- {
2039
- SBML_DFS.SC_ID: "sc_id_up",
2040
- SBML_DFS.S_NAME: "upstream_name",
2041
- SBML_DFS.C_NAME: "upstream_compartment",
2042
- },
2043
- axis=1,
2044
- ),
2045
- how="left",
2046
- ).merge(
2047
- comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
2048
- {
2049
- SBML_DFS.SC_ID: "sc_id_down",
2050
- SBML_DFS.S_NAME: "downstream_name",
2051
- SBML_DFS.C_NAME: "downstream_compartment",
2052
- },
2053
- axis=1,
2054
- ),
2055
- how="left",
2056
- )[
2057
- REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
2058
- ]
2059
-
2060
- # some extra checks
2061
- if interaction_edgelist.shape[0] != interaction_edgelist_w_cspecies.shape[0]:
2062
- raise ValueError(
2063
- "Merging compartmentalized species to interaction_edgelist"
2064
- " resulted in an increase in the tables from "
2065
- f"{interaction_edgelist.shape[0]} to "
2066
- f"{interaction_edgelist_w_cspecies.shape[0]} indicating"
2067
- " a 1-many join which should have been 1-1"
2068
- )
1954
+ return sbml_model
2069
1955
 
2070
- # create one reaction per interaction
2071
- interaction_edgelist_w_cspecies[SBML_DFS.R_SOURCE] = interaction_source
2072
- interaction_edgelist_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
2073
- range(interaction_edgelist_w_cspecies.shape[0]), SBML_DFS.R_ID
2074
- )
1956
+ return sbml_model
2075
1957
 
2076
- reactions_df_columns = [
2077
- SBML_DFS.R_NAME,
2078
- SBML_DFS.R_IDENTIFIERS,
2079
- SBML_DFS.R_SOURCE,
2080
- SBML_DFS.R_ISREVERSIBLE,
2081
- ]
2082
- reactions_df = interaction_edgelist_w_cspecies.copy().set_index(SBML_DFS.R_ID)[
2083
- reactions_df_columns + extra_reactions_columns
2084
- ]
2085
- # Keep extra columns to save them as extra data
2086
- reactions_data = reactions_df[extra_reactions_columns]
2087
- reactions_df = reactions_df[reactions_df_columns]
2088
1958
 
2089
- # define upstream and downstream comp species as reaction species
2090
- reaction_species_df = pd.concat(
2091
- [
2092
- # upstream interactions are defined by sbo_term and should generally
2093
- # be modifiers/stimulator/inhibitor/interactor
2094
- interaction_edgelist_w_cspecies[["sc_id_up", "sbo_term", "r_id"]]
2095
- .assign(stoichiometry=upstream_stoichiometry)
2096
- .rename({"sc_id_up": "sc_id"}, axis=1),
2097
- # downstream interactions indicate some modification of the state
2098
- # of the species and hence are defined as product
2099
- interaction_edgelist_w_cspecies[["sc_id_down", "r_id"]]
2100
- .assign(
2101
- stoichiometry=downstream_stoichiometry,
2102
- sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
2103
- )
2104
- .rename({"sc_id_down": "sc_id"}, axis=1),
2105
- ]
2106
- )
2107
- reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
2108
- range(reaction_species_df.shape[0]), "rsc_id"
2109
- )
2110
- reaction_species_df = reaction_species_df.set_index("rsc_id")
1959
+ def species_type_types(x):
1960
+ """Assign a high-level molecule type to a molecular species"""
2111
1961
 
2112
- # form sbml_dfs object
2113
- sbml_tbl_dict: MutableMapping[str, pd.DataFrame | dict[str, pd.DataFrame]] = {
2114
- "compartments": compartments_df,
2115
- "species": species_df,
2116
- "compartmentalized_species": comp_species_w_ids,
2117
- "reactions": reactions_df,
2118
- "reaction_species": reaction_species_df,
2119
- }
2120
- if len(extra_reactions_columns) > 0:
2121
- if isinstance(keep_reactions_data, str):
2122
- reactions_data_label = keep_reactions_data
1962
+ if isinstance(x, identifiers.Identifiers):
1963
+ if x.filter(["chebi"]):
1964
+ return "metabolite"
1965
+ elif x.filter(["molodex"]):
1966
+ return "drug"
2123
1967
  else:
2124
- reactions_data_label = "source"
2125
- sbml_tbl_dict["reactions_data"] = {reactions_data_label: reactions_data}
1968
+ return "protein"
1969
+ else:
1970
+ return "unknown"
2126
1971
 
2127
- if len(extra_species_columns) > 0:
2128
- if isinstance(keep_species_data, str):
2129
- species_data_label = keep_species_data
2130
- else:
2131
- species_data_label = "source"
2132
- sbml_tbl_dict["species_data"] = {species_data_label: species_data}
2133
1972
 
2134
- sbml_model = SBML_dfs(sbml_tbl_dict)
2135
- sbml_model.validate()
1973
+ def stub_ids(ids):
1974
+ if len(ids) == 0:
1975
+ return pd.DataFrame(
1976
+ {
1977
+ IDENTIFIERS.ONTOLOGY: [None],
1978
+ IDENTIFIERS.IDENTIFIER: [None],
1979
+ IDENTIFIERS.URL: [None],
1980
+ IDENTIFIERS.BQB: [None],
1981
+ }
1982
+ )
1983
+ else:
1984
+ return pd.DataFrame(ids)
2136
1985
 
2137
- return sbml_model
2138
1986
 
1987
+ def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
1988
+ """
1989
+ Add an sbo_role column to the reaction_species table.
2139
1990
 
2140
- def _sbml_dfs_from_edgelist_validate_inputs(
2141
- interaction_edgelist: pd.DataFrame,
2142
- species_df: pd.DataFrame,
2143
- compartments_df: pd.DataFrame,
2144
- ) -> None:
2145
- """Check that the inputs for creating an SBML_dfs from an edgelist are appropriate."""
1991
+ The sbo_role column is a string column that contains the SBO role of the reaction species.
1992
+ The values in the sbo_role column are taken from the sbo_term column.
2146
1993
 
2147
- # check compartments
2148
- compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
2149
- compartments_df_columns = set(compartments_df.columns.tolist())
2150
- missing_required_fields = compartments_df_expected_vars.difference(
2151
- compartments_df_columns
2152
- )
2153
- if len(missing_required_fields) > 0:
2154
- raise ValueError(
2155
- f"{', '.join(missing_required_fields)} are required variables"
2156
- ' in "compartments_df" but were not present in the input file.'
2157
- )
1994
+ The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
1995
+ """
2158
1996
 
2159
- # check species
2160
- species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
2161
- species_df_columns = set(species_df.columns.tolist())
2162
- missing_required_fields = species_df_expected_vars.difference(species_df_columns)
2163
- if len(missing_required_fields) > 0:
2164
- raise ValueError(
2165
- f"{', '.join(missing_required_fields)} are required"
2166
- ' variables in "species_df" but were not present '
2167
- "in the input file."
2168
- )
1997
+ validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
2169
1998
 
2170
- # check interactions
2171
- interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
2172
- missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
2173
- interaction_edgelist_columns
1999
+ reaction_species = (
2000
+ reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
2001
+ .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
2002
+ .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
2174
2003
  )
2175
- if len(missing_required_fields) > 0:
2176
- raise ValueError(
2177
- f"{', '.join(missing_required_fields)} are required "
2178
- 'variables in "interaction_edgelist" but were not '
2179
- "present in the input file."
2004
+
2005
+ undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
2006
+ SBO_NAME_TO_ROLE.values()
2007
+ )
2008
+ if len(undefined_roles) > 0:
2009
+ logger.warning(
2010
+ f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
2180
2011
  )
2012
+ mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
2013
+ reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
2181
2014
 
2182
- return None
2015
+ return reaction_species
2183
2016
 
2184
2017
 
2185
- def _sbml_dfs_from_edgelist_check_cspecies_merge(
2186
- merged_species: pd.DataFrame, original_species: pd.DataFrame
2187
- ) -> None:
2188
- """Check for a mismatch between the provided species data and species implied by the edgelist."""
2018
+ def find_underspecified_reactions(
2019
+ reaction_species_w_roles: pd.DataFrame,
2020
+ ) -> pd.DataFrame:
2189
2021
 
2190
- # check for 1-many merge
2191
- if merged_species.shape[0] != original_species.shape[0]:
2022
+ # check that both sbo_role and "new" are present
2023
+ if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
2192
2024
  raise ValueError(
2193
- "Merging compartmentalized species to species_df"
2194
- " and compartments_df by names resulted in an "
2195
- f"increase in the tables from {original_species.shape[0]}"
2196
- f" to {merged_species.shape[0]} indicating that names were"
2197
- " not unique"
2025
+ "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
2198
2026
  )
2199
-
2200
- # check for missing species and compartments
2201
- missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
2202
- SBML_DFS.C_NAME
2203
- ].unique()
2204
- if len(missing_compartments) >= 1:
2027
+ if "new" not in reaction_species_w_roles.columns:
2205
2028
  raise ValueError(
2206
- f"{len(missing_compartments)} compartments were present in"
2207
- ' "interaction_edgelist" but not "compartments_df":'
2208
- f" {', '.join(missing_compartments)}"
2029
+ "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2209
2030
  )
2210
-
2211
- missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
2212
- SBML_DFS.S_NAME
2213
- ].unique()
2214
- if len(missing_species) >= 1:
2031
+ # check that new is a boolean column
2032
+ if reaction_species_w_roles["new"].dtype != bool:
2215
2033
  raise ValueError(
2216
- f"{len(missing_species)} species were present in "
2217
- '"interaction_edgelist" but not "species_df":'
2218
- f" {', '.join(missing_species)}"
2034
+ "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2219
2035
  )
2220
2036
 
2221
- return None
2037
+ reactions_with_lost_defining_members = set(
2038
+ reaction_species_w_roles.query("~new")
2039
+ .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
2040
+ .tolist()
2041
+ )
2222
2042
 
2223
-
2224
- def _stub_compartments(
2225
- stubbed_compartment: str = GENERIC_COMPARTMENT,
2226
- ) -> pd.DataFrame:
2227
- """Stub Compartments
2228
-
2229
- Create a compartments table with only a single compartment
2230
-
2231
- Args:
2232
- stubbed_compartment (str): the name of a compartment which should match the
2233
- keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS
2234
-
2235
- Returns:
2236
- compartments_df (pd.DataFrame): compartments dataframe
2237
- """
2238
-
2239
- if stubbed_compartment not in COMPARTMENT_ALIASES.keys():
2240
- raise ValueError(
2241
- f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
2242
- )
2243
-
2244
- if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
2245
- raise ValueError(
2246
- f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
2247
- )
2248
-
2249
- stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
2250
-
2251
- formatted_uri = identifiers.format_uri(
2252
- uri=identifiers.create_uri_url(
2253
- ontology=ONTOLOGIES.GO,
2254
- identifier=stubbed_compartment_id,
2255
- ),
2256
- biological_qualifier_type=BQB.IS,
2257
- )
2258
-
2259
- compartments_df = pd.DataFrame(
2260
- {
2261
- SBML_DFS.C_NAME: [stubbed_compartment],
2262
- SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
2263
- }
2264
- )
2265
- compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID) # type: ignore
2266
- compartments_df.index.name = SBML_DFS.C_ID
2267
-
2268
- return compartments_df
2269
-
2270
-
2271
- def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
2272
- """Validates a table against a reference
2273
-
2274
- This check if the table has the same index, no duplicates in the index
2275
- and that all values in the index are in the reference table.
2276
-
2277
- Args:
2278
- data_table (pd.DataFrame): a table with data that should
2279
- match the reference
2280
- ref_table (pd.DataFrame): a reference table
2281
-
2282
- Raises:
2283
- ValueError: not same index name
2284
- ValueError: index contains duplicates
2285
- ValueError: index not subset of index of reactions table
2286
- """
2287
- ref_index_name = ref_table.index.name
2288
- if data_table.index.name != ref_index_name:
2289
- raise ValueError(
2290
- "the index name for reaction data table was not"
2291
- f" {ref_index_name}: {data_table.index.name}"
2292
- )
2293
- ids = data_table.index
2294
- if any(ids.duplicated()):
2295
- raise ValueError(
2296
- "the index for reaction data table " "contained duplicate values"
2297
- )
2298
- if not all(ids.isin(ref_table.index)):
2299
- raise ValueError(
2300
- "the index for reaction data table contained values"
2301
- " not found in the reactions table"
2302
- )
2303
- if not isinstance(data_table, pd.DataFrame):
2304
- raise TypeError(
2305
- f"The data table was type {type(data_table).__name__}"
2306
- " but must be a pd.DataFrame"
2307
- )
2308
-
2309
-
2310
- def species_type_types(x):
2311
- """Assign a high-level molecule type to a molecular species"""
2312
-
2313
- if isinstance(x, identifiers.Identifiers):
2314
- if x.filter(["chebi"]):
2315
- return "metabolite"
2316
- elif x.filter(["molodex"]):
2317
- return "drug"
2318
- else:
2319
- return "protein"
2320
- else:
2321
- return "unknown"
2322
-
2323
-
2324
- def stub_ids(ids):
2325
- if len(ids) == 0:
2326
- return pd.DataFrame(
2327
- {
2328
- IDENTIFIERS.ONTOLOGY: [None],
2329
- IDENTIFIERS.IDENTIFIER: [None],
2330
- IDENTIFIERS.URL: [None],
2331
- IDENTIFIERS.BQB: [None],
2332
- }
2333
- )
2334
- else:
2335
- return pd.DataFrame(ids)
2336
-
2337
-
2338
- def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
2339
- """
2340
- Add an sbo_role column to the reaction_species table.
2341
-
2342
- The sbo_role column is a string column that contains the SBO role of the reaction species.
2343
- The values in the sbo_role column are taken from the sbo_term column.
2344
-
2345
- The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
2346
- """
2347
-
2348
- validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
2349
-
2350
- reaction_species = (
2351
- reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
2352
- .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
2353
- .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
2354
- )
2355
-
2356
- undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
2357
- SBO_NAME_TO_ROLE.values()
2358
- )
2359
- if len(undefined_roles) > 0:
2360
- logger.warning(
2361
- f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
2362
- )
2363
- mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
2364
- reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
2365
-
2366
- return reaction_species
2367
-
2368
-
2369
- def find_underspecified_reactions(
2370
- reaction_species_w_roles: pd.DataFrame,
2371
- ) -> pd.DataFrame:
2372
-
2373
- # check that both sbo_role and "new" are present
2374
- if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
2375
- raise ValueError(
2376
- "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
2377
- )
2378
- if "new" not in reaction_species_w_roles.columns:
2379
- raise ValueError(
2380
- "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2381
- )
2382
- # check that new is a boolean column
2383
- if reaction_species_w_roles["new"].dtype != bool:
2384
- raise ValueError(
2385
- "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2386
- )
2387
-
2388
- reactions_with_lost_defining_members = set(
2389
- reaction_species_w_roles.query("~new")
2390
- .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
2391
- .tolist()
2392
- )
2393
-
2394
- N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
2395
- if N_reactions_with_lost_defining_members > 0:
2396
- logger.info(
2397
- f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
2398
- )
2043
+ N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
2044
+ if N_reactions_with_lost_defining_members > 0:
2045
+ logger.info(
2046
+ f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
2047
+ )
2399
2048
 
2400
2049
  # find the cases where all "new" values for a given (r_id, sbo_term) are False
2401
2050
  reactions_with_lost_requirements = set(
@@ -2599,3 +2248,555 @@ def _filter_promiscuous_components(
2599
2248
  ].drop(["is_shared_component"], axis=1)
2600
2249
 
2601
2250
  return filtered_bqb_has_parts
2251
+
2252
+
2253
def _edgelist_validate_inputs(
    interaction_edgelist: pd.DataFrame,
    species_df: pd.DataFrame,
    compartments_df: pd.DataFrame,
) -> None:
    """
    Validate input DataFrames have required columns.

    The tables are checked in the order compartments, species, interactions
    and the first table with missing columns triggers the error.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data to validate
    species_df : pd.DataFrame
        Species data to validate
    compartments_df : pd.DataFrame
        Compartments data to validate

    Raises
    ------
    ValueError
        If a required column is missing from any of the three tables.
    """

    # Missing names are sorted before joining: they come from a set, whose
    # iteration order for strings is not stable between interpreter runs.

    # check compartments
    missing_required_fields = {
        SBML_DFS.C_NAME,
        SBML_DFS.C_IDENTIFIERS,
    }.difference(compartments_df.columns)
    if missing_required_fields:
        raise ValueError(
            f"{', '.join(sorted(missing_required_fields))} are required variables"
            ' in "compartments_df" but were not present in the input file.'
        )

    # check species
    missing_required_fields = {
        SBML_DFS.S_NAME,
        SBML_DFS.S_IDENTIFIERS,
    }.difference(species_df.columns)
    if missing_required_fields:
        raise ValueError(
            f"{', '.join(sorted(missing_required_fields))} are required"
            ' variables in "species_df" but were not present '
            "in the input file."
        )

    # check interactions
    missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
        interaction_edgelist.columns
    )
    if missing_required_fields:
        raise ValueError(
            f"{', '.join(sorted(missing_required_fields))} are required "
            'variables in "interaction_edgelist" but were not '
            "present in the input file."
        )
2307
+
2308
+
2309
def _edgelist_identify_extra_columns(
    interaction_edgelist, species_df, keep_reactions_data, keep_species_data
):
    """
    List the non-required columns of the inputs which should be carried along.

    Extra columns are only collected for a table when the matching ``keep_*``
    argument is anything other than the literal ``False``.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data containing potential extra columns
    species_df : pd.DataFrame
        Species data containing potential extra columns
    keep_reactions_data : bool or str
        Whether to keep extra reaction columns
    keep_species_data : bool or str
        Whether to keep extra species columns

    Returns
    -------
    dict
        Dictionary with 'reactions' and 'species' keys containing lists of extra column names
    """
    if keep_reactions_data is False:
        reaction_extras = []
    else:
        # everything that is not part of the required edgelist schema
        reaction_extras = [
            column
            for column in interaction_edgelist.columns
            if column not in INTERACTION_EDGELIST_EXPECTED_VARS
        ]

    if keep_species_data is False:
        species_extras = []
    else:
        # everything besides the species name and identifiers
        species_extras = [
            column
            for column in species_df.columns
            if column not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
        ]

    return {"reactions": reaction_extras, "species": species_extras}
2349
+
2350
+
2351
def _edgelist_process_compartments(compartments_df, interaction_source):
    """
    Add source and ID columns to the compartments table and index it by ID.

    The input table is copied, so the caller's DataFrame is left untouched.

    Parameters
    ----------
    compartments_df : pd.DataFrame
        Raw compartments data
    interaction_source : source.Source
        Source object to assign to compartments

    Returns
    -------
    pd.DataFrame
        Processed compartments with IDs, indexed by compartment ID
    """
    formatted = compartments_df.copy()
    formatted[SBML_DFS.C_SOURCE] = interaction_source

    # one ID per row, generated from the row position
    n_compartments = formatted.shape[0]
    formatted[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
        range(n_compartments), SBML_DFS.C_ID
    )

    output_columns = [
        SBML_DFS.C_NAME,
        SBML_DFS.C_IDENTIFIERS,
        SBML_DFS.C_SOURCE,
    ]
    return formatted.set_index(SBML_DFS.C_ID)[output_columns]
2375
+
2376
+
2377
def _edgelist_process_species(species_df, interaction_source, extra_species_columns):
    """
    Add source and ID columns to the species table and split off extra data.

    The input table is copied, so the caller's DataFrame is left untouched.

    Parameters
    ----------
    species_df : pd.DataFrame
        Raw species data
    interaction_source : source.Source
        Source object to assign to species
    extra_species_columns : list
        Names of extra columns to preserve separately

    Returns
    -------
    tuple of pd.DataFrame
        Processed species DataFrame and species extra data DataFrame
    """
    core_columns = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]

    annotated = species_df.copy()
    annotated[SBML_DFS.S_SOURCE] = interaction_source
    # one ID per row, generated from the row position
    annotated[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
        range(annotated.shape[0]), SBML_DFS.S_ID
    )

    by_species_id = annotated.set_index(SBML_DFS.S_ID)[
        core_columns + extra_species_columns
    ]

    # Both outputs share the species-ID index
    extra_table = by_species_id[extra_species_columns]
    core_table = by_species_id[core_columns]

    return core_table, extra_table
2411
+
2412
+
2413
def _edgelist_create_compartmentalized_species(
    interaction_edgelist, species_df, compartments_df, interaction_source
):
    """
    Build the compartmentalized species table implied by the edgelist.

    Every distinct (species name, compartment name) pair seen at either end of
    an interaction becomes one compartmentalized species.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data containing species-compartment combinations
    species_df : pd.DataFrame
        Processed species data with IDs
    compartments_df : pd.DataFrame
        Processed compartments data with IDs
    interaction_source : source.Source
        Source object to assign to compartmentalized species

    Returns
    -------
    pd.DataFrame
        Compartmentalized species with formatted names and IDs
    """
    # Stack the upstream and downstream endpoints under shared column names
    endpoint_columns = (
        ("upstream_name", "upstream_compartment"),
        ("downstream_name", "downstream_compartment"),
    )
    endpoint_frames = [
        interaction_edgelist[[name_column, compartment_column]].rename(
            columns={
                name_column: SBML_DFS.S_NAME,
                compartment_column: SBML_DFS.C_NAME,
            }
        )
        for name_column, compartment_column in endpoint_columns
    ]
    unique_pairs = pd.concat(endpoint_frames).drop_duplicates()

    # Look up species IDs, then compartment IDs, by name
    pairs_with_species = unique_pairs.merge(
        species_df[SBML_DFS.S_NAME].reset_index(), how="left", on=SBML_DFS.S_NAME
    )
    pairs_with_ids = pairs_with_species.merge(
        compartments_df[SBML_DFS.C_NAME].reset_index(), how="left", on=SBML_DFS.C_NAME
    )

    # Fail early on non-unique names or names absent from the lookup tables
    _sbml_dfs_from_edgelist_check_cspecies_merge(pairs_with_ids, unique_pairs)

    # Name each entry "<species> [<compartment>]"
    pairs_with_ids[SBML_DFS.SC_NAME] = [
        f"{species_name} [{compartment_name}]"
        for species_name, compartment_name in zip(
            pairs_with_ids[SBML_DFS.S_NAME], pairs_with_ids[SBML_DFS.C_NAME]
        )
    ]
    pairs_with_ids[SBML_DFS.SC_SOURCE] = interaction_source
    pairs_with_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
        range(pairs_with_ids.shape[0]), SBML_DFS.SC_ID
    )

    output_columns = [
        SBML_DFS.SC_NAME,
        SBML_DFS.S_ID,
        SBML_DFS.C_ID,
        SBML_DFS.SC_SOURCE,
    ]
    return pairs_with_ids.set_index(SBML_DFS.SC_ID)[output_columns]
2480
+
2481
+
2482
def _edgelist_create_reactions_and_species(
    interaction_edgelist,
    comp_species,
    species_df,
    compartments_df,
    interaction_source,
    upstream_stoichiometry,
    downstream_stoichiometry,
    downstream_sbo_name,
    extra_reactions_columns,
):
    """
    Turn each interaction into one reaction with two reaction species.

    Each edgelist row yields a reaction plus an upstream participant (using the
    row's own ``sbo_term``) and a downstream participant (using the SBO term
    looked up from ``downstream_sbo_name``).

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Original interaction data
    comp_species : pd.DataFrame
        Compartmentalized species with IDs
    species_df : pd.DataFrame
        Processed species data with IDs
    compartments_df : pd.DataFrame
        Processed compartments data with IDs
    interaction_source : source.Source
        Source object for reactions
    upstream_stoichiometry : int
        Stoichiometry for upstream species
    downstream_stoichiometry : int
        Stoichiometry for downstream species
    downstream_sbo_name : str
        SBO term name for downstream species
    extra_reactions_columns : list
        Names of extra columns to preserve

    Returns
    -------
    tuple
        (reactions_df, reaction_species_df, reactions_data)

    Raises
    ------
    ValueError
        If attaching compartmentalized species IDs changes the number of rows.
    """
    # Recover species and compartment names for every compartmentalized species
    named_cspecies = comp_species.reset_index()
    named_cspecies = named_cspecies.merge(species_df[SBML_DFS.S_NAME].reset_index())
    named_cspecies = named_cspecies.merge(
        compartments_df[SBML_DFS.C_NAME].reset_index()
    )

    cspecies_key_columns = [SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]
    upstream_lookup = named_cspecies[cspecies_key_columns].rename(
        columns={
            SBML_DFS.SC_ID: "sc_id_up",
            SBML_DFS.S_NAME: "upstream_name",
            SBML_DFS.C_NAME: "upstream_compartment",
        }
    )
    downstream_lookup = named_cspecies[cspecies_key_columns].rename(
        columns={
            SBML_DFS.SC_ID: "sc_id_down",
            SBML_DFS.S_NAME: "downstream_name",
            SBML_DFS.C_NAME: "downstream_compartment",
        }
    )

    # Both merges join on the columns the two frames share (name + compartment).
    # The merge and column selection stay in one expression so no reference to
    # the intermediate merged frame is kept.
    selected_columns = REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
    interaction_w_cspecies = interaction_edgelist.merge(
        upstream_lookup, how="left"
    ).merge(downstream_lookup, how="left")[selected_columns]

    # A row-count change means the name-based merge was not one-to-one
    n_input_rows = interaction_edgelist.shape[0]
    n_merged_rows = interaction_w_cspecies.shape[0]
    if n_input_rows != n_merged_rows:
        raise ValueError(
            "Merging compartmentalized species resulted in row count change "
            f"from {n_input_rows} to {n_merged_rows}"
        )

    # Reaction IDs are needed by both the reactions and reaction species tables
    interaction_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
        range(n_merged_rows), SBML_DFS.R_ID
    )

    # Reactions table (built on a copy so the source column stays out of the
    # frame used for the reaction species below)
    reactions_columns = [
        SBML_DFS.R_NAME,
        SBML_DFS.R_IDENTIFIERS,
        SBML_DFS.R_SOURCE,
        SBML_DFS.R_ISREVERSIBLE,
    ]
    reactions_input = interaction_w_cspecies.copy()
    reactions_input[SBML_DFS.R_SOURCE] = interaction_source
    reactions_full = reactions_input.set_index(SBML_DFS.R_ID)[
        reactions_columns + extra_reactions_columns
    ]

    # Split the extra data from the core reaction attributes
    reactions_data = reactions_full[extra_reactions_columns]
    reactions_df = reactions_full[reactions_columns]

    # Upstream participants keep the interaction's own SBO term
    upstream_members = (
        interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
        .assign(stoichiometry=upstream_stoichiometry)
        .rename(columns={"sc_id_up": "sc_id"})
    )
    # Downstream participants all share the SBO term named by downstream_sbo_name
    downstream_members = (
        interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
        .assign(
            stoichiometry=downstream_stoichiometry,
            sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
        )
        .rename(columns={"sc_id_down": "sc_id"})
    )
    reaction_species_df = pd.concat([upstream_members, downstream_members])

    reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
        range(reaction_species_df.shape[0]), "rsc_id"
    )
    reaction_species_df = reaction_species_df.set_index("rsc_id")

    return reactions_df, reaction_species_df, reactions_data
2608
+
2609
+
2610
def _edgelist_assemble_sbml_model(
    compartments,
    species,
    comp_species,
    reactions,
    reaction_species,
    species_data,
    reactions_data,
    keep_species_data,
    keep_reactions_data,
    extra_columns,
):
    """
    Bundle the processed tables into an SBML_dfs object and validate it.

    Extra reaction/species data is attached only when the matching entry of
    ``extra_columns`` is non-empty. It is stored under the ``keep_*`` value when
    that is a string and under "source" otherwise.

    Parameters
    ----------
    compartments : pd.DataFrame
        Processed compartments data
    species : pd.DataFrame
        Processed species data
    comp_species : pd.DataFrame
        Compartmentalized species data
    reactions : pd.DataFrame
        Reactions data
    reaction_species : pd.DataFrame
        Reaction species relationships
    species_data : pd.DataFrame
        Extra species data to include
    reactions_data : pd.DataFrame
        Extra reactions data to include
    keep_species_data : bool or str
        Label for species extra data
    keep_reactions_data : bool or str
        Label for reactions extra data
    extra_columns : dict
        Dictionary containing lists of extra column names

    Returns
    -------
    SBML_dfs
        Validated SBML data structure
    """
    sbml_tbl_dict = {
        "compartments": compartments,
        "species": species,
        "compartmentalized_species": comp_species,
        "reactions": reactions,
        "reaction_species": reaction_species,
    }

    # (extra_columns key, output table key, keep flag, extra data table)
    optional_tables = (
        ("reactions", "reactions_data", keep_reactions_data, reactions_data),
        ("species", "species_data", keep_species_data, species_data),
    )
    for entity, table_key, keep_flag, extra_table in optional_tables:
        if len(extra_columns[entity]) == 0:
            continue
        data_label = keep_flag if isinstance(keep_flag, str) else "source"
        sbml_tbl_dict[table_key] = {data_label: extra_table}

    sbml_model = SBML_dfs(sbml_tbl_dict)
    sbml_model.validate()

    return sbml_model
2678
+
2679
+
2680
def _sbml_dfs_from_edgelist_check_cspecies_merge(
    merged_species: pd.DataFrame, original_species: pd.DataFrame
) -> None:
    """Check for a mismatch between the provided species data and species implied by the edgelist.

    Args:
        merged_species (pd.DataFrame): species/compartment pairs after species
            and compartment IDs were merged on by name
        original_species (pd.DataFrame): the same pairs before merging

    Raises:
        ValueError: the merge changed the number of rows
        ValueError: a compartment name has no compartment ID after merging
        ValueError: a species name has no species ID after merging
    """

    # check for 1-many merge
    n_original = original_species.shape[0]
    n_merged = merged_species.shape[0]
    if n_merged != n_original:
        raise ValueError(
            "Merging compartmentalized species to species_df"
            " and compartments_df by names resulted in an "
            f"increase in the tables from {n_original}"
            f" to {n_merged} indicating that names were"
            " not unique"
        )

    # compartments named in the edgelist which did not pick up an ID
    lacks_compartment_id = merged_species[SBML_DFS.C_ID].isna()
    missing_compartments = merged_species.loc[
        lacks_compartment_id, SBML_DFS.C_NAME
    ].unique()
    if len(missing_compartments) > 0:
        raise ValueError(
            f"{len(missing_compartments)} compartments were present in"
            ' "interaction_edgelist" but not "compartments_df":'
            f" {', '.join(missing_compartments)}"
        )

    # species named in the edgelist which did not pick up an ID
    lacks_species_id = merged_species[SBML_DFS.S_ID].isna()
    missing_species = merged_species.loc[lacks_species_id, SBML_DFS.S_NAME].unique()
    if len(missing_species) > 0:
        raise ValueError(
            f"{len(missing_species)} species were present in "
            '"interaction_edgelist" but not "species_df":'
            f" {', '.join(missing_species)}"
        )

    return None
2717
+
2718
+
2719
def _stub_compartments(
    stubbed_compartment: str = GENERIC_COMPARTMENT,
) -> pd.DataFrame:
    """Stub Compartments

    Build a one-row compartments table for a single named compartment,
    annotated with its GO term.

    Args:
        stubbed_compartment (str): the name of a compartment; it must be a key
            of both COMPARTMENT_ALIASES and COMPARTMENTS_GO_TERMS

    Returns:
        compartments_df (pd.DataFrame): compartments dataframe

    Raises:
        ValueError: the compartment is not a key of COMPARTMENT_ALIASES
        ValueError: the compartment is not a key of COMPARTMENTS_GO_TERMS
    """

    if stubbed_compartment not in COMPARTMENT_ALIASES:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
        )

    if stubbed_compartment not in COMPARTMENTS_GO_TERMS:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
        )

    # Describe the compartment by its GO identifier with a BQB "is" qualifier
    go_identifier = COMPARTMENTS_GO_TERMS[stubbed_compartment]
    go_url = identifiers.create_uri_url(
        ontology=ONTOLOGIES.GO,
        identifier=go_identifier,
    )
    formatted_uri = identifiers.format_uri(
        uri=go_url,
        biological_qualifier_type=BQB.IS,
    )

    compartment_columns = {
        SBML_DFS.C_NAME: [stubbed_compartment],
        SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
    }
    compartments_df = pd.DataFrame(compartment_columns)
    compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID)  # type: ignore
    compartments_df.index.name = SBML_DFS.C_ID

    return compartments_df
2764
+
2765
+
2766
def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
    """Validates a table against a reference

    This checks that the table is a DataFrame, has the same index name as the
    reference, has no duplicates in its index, and that all values in its index
    are present in the reference table's index.

    Args:
        data_table (pd.DataFrame): a table with data that should
            match the reference
        ref_table (pd.DataFrame): a reference table

    Raises:
        TypeError: data_table is not a pd.DataFrame
        ValueError: not same index name
        ValueError: index contains duplicates
        ValueError: index not subset of index of the reference table
    """
    # The type check comes first: the checks below read `data_table.index`, so
    # a non-pandas input would otherwise fail with an AttributeError before
    # this TypeError could be raised.
    if not isinstance(data_table, pd.DataFrame):
        raise TypeError(
            f"The data table was type {type(data_table).__name__}"
            " but must be a pd.DataFrame"
        )

    ref_index_name = ref_table.index.name
    if data_table.index.name != ref_index_name:
        raise ValueError(
            "the index name for reaction data table was not"
            f" {ref_index_name}: {data_table.index.name}"
        )

    ids = data_table.index
    if ids.duplicated().any():
        raise ValueError(
            "the index for reaction data table contained duplicate values"
        )
    if not ids.isin(ref_table.index).all():
        raise ValueError(
            "the index for reaction data table contained values"
            " not found in the reactions table"
        )