napistu 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/constants.py +4 -4
- napistu/network/ig_utils.py +35 -0
- napistu/network/precompute.py +2 -1
- napistu/sbml_dfs_core.py +702 -501
- napistu/source.py +1 -1
- {napistu-0.3.4.dist-info → napistu-0.3.5.dist-info}/METADATA +1 -1
- {napistu-0.3.4.dist-info → napistu-0.3.5.dist-info}/RECORD +13 -13
- tests/test_network_ig_utils.py +36 -0
- tests/test_sbml_dfs_core.py +131 -0
- {napistu-0.3.4.dist-info → napistu-0.3.5.dist-info}/WHEEL +0 -0
- {napistu-0.3.4.dist-info → napistu-0.3.5.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.4.dist-info → napistu-0.3.5.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.4.dist-info → napistu-0.3.5.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_core.py
CHANGED
@@ -1852,550 +1852,199 @@ def sbml_dfs_from_edgelist(
|
|
1852
1852
|
keep_reactions_data: bool | str = False,
|
1853
1853
|
) -> SBML_dfs:
|
1854
1854
|
"""
|
1855
|
-
Create SBML_dfs from
|
1855
|
+
Create SBML_dfs from interaction edgelist.
|
1856
1856
|
|
1857
|
-
|
1858
|
-
|
1859
|
-
Parameters:
|
1860
|
-
interaction_edgelist (pd.DataFrame): A table containing interactions:
|
1861
|
-
- upstream_name (str): matching "s_name" from "species_df"
|
1862
|
-
- downstream_name (str): matching "s_name" from "species_df"
|
1863
|
-
- upstream_compartment (str): compartment of "upstream_name"
|
1864
|
-
with names matching "c_name" from "compartments_df"
|
1865
|
-
- downstream_compartment (str): compartment of "downstream_name"
|
1866
|
-
with names matching "c_name" from "compartments_df"
|
1867
|
-
- r_name (str): a name for the interaction
|
1868
|
-
- sbo_term (str): sbo term defining the type of
|
1869
|
-
molecular interaction (see MINI_SBO_FROM_NAME)
|
1870
|
-
- r_Identifiers (identifiers.Identifiers): identifiers
|
1871
|
-
supporting the interaction (e.g., pubmed ids)
|
1872
|
-
- r_isreversible (bool): Is this reaction reversible?
|
1873
|
-
If True, the reaction is reversible
|
1874
|
-
By default, the interactions of TRRUST networks are irreversible, and reversible for STRING networks
|
1875
|
-
species_df (pd.DataFrame): A table defining unique molecular
|
1876
|
-
species participating in "interaction_edgelist":
|
1877
|
-
- s_name (str): name of molecular species
|
1878
|
-
- s_Identifiers (identifiers.Identifiers): identifiers
|
1879
|
-
defining the species
|
1880
|
-
compartments_df (pd.DataFrame): A table defining compartments
|
1881
|
-
where interactions are occurring "interaction_edgelist":
|
1882
|
-
- c_name (str): name of compartment
|
1883
|
-
- c_Identifiers (identifiers.Identifiers):
|
1884
|
-
identifiers defining the compartment (see
|
1885
|
-
bigg.annotate_recon() for a set of names > go categories)
|
1886
|
-
interaction_source (source.Source): A source object
|
1887
|
-
which will tie model entities to the interaction source
|
1888
|
-
upstream_stoichiometry (int): stoichiometry of
|
1889
|
-
upstream species in reaction
|
1890
|
-
downstream_stoichiometry (int): stoichiometry of
|
1891
|
-
downstream species in reaction
|
1892
|
-
downstream_sbo_name (str): sbo term defining the
|
1893
|
-
type of molecular interaction for the downstream reactand
|
1894
|
-
(see MINI_SBO_FROM_NAME)
|
1895
|
-
keep_species_data (bool | str): Should species data
|
1896
|
-
be kept in the model? If True, all species data will be kept
|
1897
|
-
and saved as "species_data" in the SBML_dfs. The label will be 'source'
|
1898
|
-
If False, no species data will be kept.
|
1899
|
-
If a string: label for the species data to be kept.
|
1900
|
-
keep_reactions_data (bool | str): Should reaction data be kept in the model?
|
1901
|
-
If True, all reaction data will be kept and saved
|
1902
|
-
as "reactions_data" in the SBML_dfs. The label will be 'source'.
|
1903
|
-
If False, no reaction data will be kept.
|
1904
|
-
If a string: label for the reaction data to be kept.
|
1905
|
-
|
1906
|
-
Returns:
|
1907
|
-
sbml.SBML_dfs
|
1857
|
+
Combines a set of molecular interactions into a mechanistic SBML_dfs model
|
1858
|
+
by processing interaction data, species information, and compartment definitions.
|
1908
1859
|
|
1860
|
+
Parameters
|
1861
|
+
----------
|
1862
|
+
interaction_edgelist : pd.DataFrame
|
1863
|
+
Table containing molecular interactions with columns:
|
1864
|
+
- upstream_name : str, matches "s_name" from species_df
|
1865
|
+
- downstream_name : str, matches "s_name" from species_df
|
1866
|
+
- upstream_compartment : str, matches "c_name" from compartments_df
|
1867
|
+
- downstream_compartment : str, matches "c_name" from compartments_df
|
1868
|
+
- r_name : str, name for the interaction
|
1869
|
+
- sbo_term : str, SBO term defining interaction type
|
1870
|
+
- r_Identifiers : identifiers.Identifiers, supporting identifiers
|
1871
|
+
- r_isreversible : bool, whether reaction is reversible
|
1872
|
+
species_df : pd.DataFrame
|
1873
|
+
Table defining molecular species with columns:
|
1874
|
+
- s_name : str, name of molecular species
|
1875
|
+
- s_Identifiers : identifiers.Identifiers, species identifiers
|
1876
|
+
compartments_df : pd.DataFrame
|
1877
|
+
Table defining compartments with columns:
|
1878
|
+
- c_name : str, name of compartment
|
1879
|
+
- c_Identifiers : identifiers.Identifiers, compartment identifiers
|
1880
|
+
interaction_source : source.Source
|
1881
|
+
Source object linking model entities to interaction source
|
1882
|
+
upstream_stoichiometry : int, default 0
|
1883
|
+
Stoichiometry of upstream species in reactions
|
1884
|
+
downstream_stoichiometry : int, default 1
|
1885
|
+
Stoichiometry of downstream species in reactions
|
1886
|
+
downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
|
1887
|
+
SBO term for downstream reactant type
|
1888
|
+
keep_species_data : bool or str, default False
|
1889
|
+
Whether to preserve extra species columns. If True, saves as 'source' label.
|
1890
|
+
If string, uses as custom label. If False, discards extra data.
|
1891
|
+
keep_reactions_data : bool or str, default False
|
1892
|
+
Whether to preserve extra reaction columns. If True, saves as 'source' label.
|
1893
|
+
If string, uses as custom label. If False, discards extra data.
|
1894
|
+
|
1895
|
+
Returns
|
1896
|
+
-------
|
1897
|
+
SBML_dfs
|
1898
|
+
Validated SBML data structure containing compartments, species,
|
1899
|
+
compartmentalized species, reactions, and reaction species tables.
|
1909
1900
|
"""
|
1901
|
+
# 1. Validate inputs
|
1902
|
+
_edgelist_validate_inputs(interaction_edgelist, species_df, compartments_df)
|
1910
1903
|
|
1911
|
-
#
|
1912
|
-
|
1913
|
-
interaction_edgelist, species_df,
|
1904
|
+
# 2. Identify which extra columns to preserve
|
1905
|
+
extra_columns = _edgelist_identify_extra_columns(
|
1906
|
+
interaction_edgelist, species_df, keep_reactions_data, keep_species_data
|
1914
1907
|
)
|
1915
1908
|
|
1916
|
-
#
|
1917
|
-
|
1918
|
-
|
1919
|
-
interaction_edgelist_required_vars = {
|
1920
|
-
"upstream_name",
|
1921
|
-
"downstream_name",
|
1922
|
-
"upstream_compartment",
|
1923
|
-
"downstream_compartment",
|
1924
|
-
SBML_DFS.R_NAME,
|
1925
|
-
SBML_DFS.SBO_TERM,
|
1926
|
-
SBML_DFS.R_IDENTIFIERS,
|
1927
|
-
SBML_DFS.R_ISREVERSIBLE,
|
1928
|
-
}
|
1929
|
-
if keep_reactions_data is not False:
|
1930
|
-
extra_reactions_columns = [
|
1931
|
-
c
|
1932
|
-
for c in interaction_edgelist.columns
|
1933
|
-
if c not in interaction_edgelist_required_vars
|
1934
|
-
]
|
1935
|
-
else:
|
1936
|
-
extra_reactions_columns = []
|
1937
|
-
# Extra species columns
|
1938
|
-
if keep_species_data is not False:
|
1939
|
-
extra_species_columns = [
|
1940
|
-
c
|
1941
|
-
for c in species_df.columns
|
1942
|
-
if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
|
1943
|
-
]
|
1944
|
-
else:
|
1945
|
-
extra_species_columns = []
|
1946
|
-
|
1947
|
-
# format compartments
|
1948
|
-
compartments_df[SBML_DFS.C_SOURCE] = interaction_source
|
1949
|
-
compartments_df[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
|
1950
|
-
range(compartments_df.shape[0]), SBML_DFS.C_ID
|
1909
|
+
# 3. Process compartments and species tables
|
1910
|
+
processed_compartments = _edgelist_process_compartments(
|
1911
|
+
compartments_df, interaction_source
|
1951
1912
|
)
|
1952
|
-
|
1953
|
-
|
1954
|
-
]
|
1955
|
-
|
1956
|
-
# format species
|
1957
|
-
species_df[SBML_DFS.S_SOURCE] = interaction_source
|
1958
|
-
species_df[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
|
1959
|
-
range(species_df.shape[0]), SBML_DFS.S_ID
|
1913
|
+
processed_species, species_data = _edgelist_process_species(
|
1914
|
+
species_df, interaction_source, extra_columns["species"]
|
1960
1915
|
)
|
1961
1916
|
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
1965
|
-
|
1966
|
-
|
1967
|
-
|
1968
|
-
# Remove extra columns
|
1969
|
-
species_df = species_df[required_cols]
|
1970
|
-
|
1971
|
-
# create compartmentalized species
|
1972
|
-
|
1973
|
-
# define all distinct upstream and downstream compartmentalized species
|
1974
|
-
comp_species = pd.concat(
|
1975
|
-
[
|
1976
|
-
interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
|
1977
|
-
{
|
1978
|
-
"upstream_name": SBML_DFS.S_NAME,
|
1979
|
-
"upstream_compartment": SBML_DFS.C_NAME,
|
1980
|
-
},
|
1981
|
-
axis=1,
|
1982
|
-
),
|
1983
|
-
interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
|
1984
|
-
{
|
1985
|
-
"downstream_name": SBML_DFS.S_NAME,
|
1986
|
-
"downstream_compartment": SBML_DFS.C_NAME,
|
1987
|
-
},
|
1988
|
-
axis=1,
|
1989
|
-
),
|
1990
|
-
]
|
1991
|
-
).drop_duplicates()
|
1992
|
-
|
1993
|
-
# merge to add species and compartments primary keys
|
1994
|
-
comp_species_w_ids = comp_species.merge(
|
1995
|
-
species_df[SBML_DFS.S_NAME].reset_index(),
|
1996
|
-
how="left",
|
1997
|
-
left_on=SBML_DFS.S_NAME,
|
1998
|
-
right_on=SBML_DFS.S_NAME,
|
1999
|
-
).merge(
|
2000
|
-
compartments_df[SBML_DFS.C_NAME].reset_index(),
|
2001
|
-
how="left",
|
2002
|
-
left_on=SBML_DFS.C_NAME,
|
2003
|
-
right_on=SBML_DFS.C_NAME,
|
1917
|
+
# 4. Create compartmentalized species
|
1918
|
+
comp_species = _edgelist_create_compartmentalized_species(
|
1919
|
+
interaction_edgelist,
|
1920
|
+
processed_species,
|
1921
|
+
processed_compartments,
|
1922
|
+
interaction_source,
|
2004
1923
|
)
|
2005
1924
|
|
2006
|
-
#
|
2007
|
-
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2012
|
-
|
2013
|
-
|
1925
|
+
# 5. Create reactions and reaction species
|
1926
|
+
reactions, reaction_species, reactions_data = (
|
1927
|
+
_edgelist_create_reactions_and_species(
|
1928
|
+
interaction_edgelist,
|
1929
|
+
comp_species,
|
1930
|
+
processed_species,
|
1931
|
+
processed_compartments,
|
1932
|
+
interaction_source,
|
1933
|
+
upstream_stoichiometry,
|
1934
|
+
downstream_stoichiometry,
|
1935
|
+
downstream_sbo_name,
|
1936
|
+
extra_columns["reactions"],
|
2014
1937
|
)
|
2015
|
-
]
|
2016
|
-
# add source object
|
2017
|
-
comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
|
2018
|
-
# name index
|
2019
|
-
comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
|
2020
|
-
range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
|
2021
1938
|
)
|
2022
|
-
comp_species_w_ids = comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
|
2023
|
-
[SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
|
2024
|
-
]
|
2025
1939
|
|
2026
|
-
#
|
2027
|
-
|
2028
|
-
|
2029
|
-
|
2030
|
-
|
2031
|
-
|
2032
|
-
|
2033
|
-
|
1940
|
+
# 6. Assemble final SBML_dfs object
|
1941
|
+
sbml_model = _edgelist_assemble_sbml_model(
|
1942
|
+
processed_compartments,
|
1943
|
+
processed_species,
|
1944
|
+
comp_species,
|
1945
|
+
reactions,
|
1946
|
+
reaction_species,
|
1947
|
+
species_data,
|
1948
|
+
reactions_data,
|
1949
|
+
keep_species_data,
|
1950
|
+
keep_reactions_data,
|
1951
|
+
extra_columns,
|
2034
1952
|
)
|
2035
1953
|
|
2036
|
-
|
2037
|
-
comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
|
2038
|
-
{
|
2039
|
-
SBML_DFS.SC_ID: "sc_id_up",
|
2040
|
-
SBML_DFS.S_NAME: "upstream_name",
|
2041
|
-
SBML_DFS.C_NAME: "upstream_compartment",
|
2042
|
-
},
|
2043
|
-
axis=1,
|
2044
|
-
),
|
2045
|
-
how="left",
|
2046
|
-
).merge(
|
2047
|
-
comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
|
2048
|
-
{
|
2049
|
-
SBML_DFS.SC_ID: "sc_id_down",
|
2050
|
-
SBML_DFS.S_NAME: "downstream_name",
|
2051
|
-
SBML_DFS.C_NAME: "downstream_compartment",
|
2052
|
-
},
|
2053
|
-
axis=1,
|
2054
|
-
),
|
2055
|
-
how="left",
|
2056
|
-
)[
|
2057
|
-
REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
|
2058
|
-
]
|
2059
|
-
|
2060
|
-
# some extra checks
|
2061
|
-
if interaction_edgelist.shape[0] != interaction_edgelist_w_cspecies.shape[0]:
|
2062
|
-
raise ValueError(
|
2063
|
-
"Merging compartmentalized species to interaction_edgelist"
|
2064
|
-
" resulted in an increase in the tables from "
|
2065
|
-
f"{interaction_edgelist.shape[0]} to "
|
2066
|
-
f"{interaction_edgelist_w_cspecies.shape[0]} indicating"
|
2067
|
-
" a 1-many join which should have been 1-1"
|
2068
|
-
)
|
1954
|
+
return sbml_model
|
2069
1955
|
|
2070
|
-
|
2071
|
-
interaction_edgelist_w_cspecies[SBML_DFS.R_SOURCE] = interaction_source
|
2072
|
-
interaction_edgelist_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
|
2073
|
-
range(interaction_edgelist_w_cspecies.shape[0]), SBML_DFS.R_ID
|
2074
|
-
)
|
1956
|
+
return sbml_model
|
2075
1957
|
|
2076
|
-
reactions_df_columns = [
|
2077
|
-
SBML_DFS.R_NAME,
|
2078
|
-
SBML_DFS.R_IDENTIFIERS,
|
2079
|
-
SBML_DFS.R_SOURCE,
|
2080
|
-
SBML_DFS.R_ISREVERSIBLE,
|
2081
|
-
]
|
2082
|
-
reactions_df = interaction_edgelist_w_cspecies.copy().set_index(SBML_DFS.R_ID)[
|
2083
|
-
reactions_df_columns + extra_reactions_columns
|
2084
|
-
]
|
2085
|
-
# Keep extra columns to save them as extra data
|
2086
|
-
reactions_data = reactions_df[extra_reactions_columns]
|
2087
|
-
reactions_df = reactions_df[reactions_df_columns]
|
2088
1958
|
|
2089
|
-
|
2090
|
-
|
2091
|
-
[
|
2092
|
-
# upstream interactions are defined by sbo_term and should generally
|
2093
|
-
# be modifiers/stimulator/inhibitor/interactor
|
2094
|
-
interaction_edgelist_w_cspecies[["sc_id_up", "sbo_term", "r_id"]]
|
2095
|
-
.assign(stoichiometry=upstream_stoichiometry)
|
2096
|
-
.rename({"sc_id_up": "sc_id"}, axis=1),
|
2097
|
-
# downstream interactions indicate some modification of the state
|
2098
|
-
# of the species and hence are defined as product
|
2099
|
-
interaction_edgelist_w_cspecies[["sc_id_down", "r_id"]]
|
2100
|
-
.assign(
|
2101
|
-
stoichiometry=downstream_stoichiometry,
|
2102
|
-
sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
|
2103
|
-
)
|
2104
|
-
.rename({"sc_id_down": "sc_id"}, axis=1),
|
2105
|
-
]
|
2106
|
-
)
|
2107
|
-
reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
|
2108
|
-
range(reaction_species_df.shape[0]), "rsc_id"
|
2109
|
-
)
|
2110
|
-
reaction_species_df = reaction_species_df.set_index("rsc_id")
|
1959
|
+
def species_type_types(x):
|
1960
|
+
"""Assign a high-level molecule type to a molecular species"""
|
2111
1961
|
|
2112
|
-
|
2113
|
-
|
2114
|
-
|
2115
|
-
"
|
2116
|
-
|
2117
|
-
"reactions": reactions_df,
|
2118
|
-
"reaction_species": reaction_species_df,
|
2119
|
-
}
|
2120
|
-
if len(extra_reactions_columns) > 0:
|
2121
|
-
if isinstance(keep_reactions_data, str):
|
2122
|
-
reactions_data_label = keep_reactions_data
|
1962
|
+
if isinstance(x, identifiers.Identifiers):
|
1963
|
+
if x.filter(["chebi"]):
|
1964
|
+
return "metabolite"
|
1965
|
+
elif x.filter(["molodex"]):
|
1966
|
+
return "drug"
|
2123
1967
|
else:
|
2124
|
-
|
2125
|
-
|
1968
|
+
return "protein"
|
1969
|
+
else:
|
1970
|
+
return "unknown"
|
2126
1971
|
|
2127
|
-
if len(extra_species_columns) > 0:
|
2128
|
-
if isinstance(keep_species_data, str):
|
2129
|
-
species_data_label = keep_species_data
|
2130
|
-
else:
|
2131
|
-
species_data_label = "source"
|
2132
|
-
sbml_tbl_dict["species_data"] = {species_data_label: species_data}
|
2133
1972
|
|
2134
|
-
|
2135
|
-
|
1973
|
+
def stub_ids(ids):
|
1974
|
+
if len(ids) == 0:
|
1975
|
+
return pd.DataFrame(
|
1976
|
+
{
|
1977
|
+
IDENTIFIERS.ONTOLOGY: [None],
|
1978
|
+
IDENTIFIERS.IDENTIFIER: [None],
|
1979
|
+
IDENTIFIERS.URL: [None],
|
1980
|
+
IDENTIFIERS.BQB: [None],
|
1981
|
+
}
|
1982
|
+
)
|
1983
|
+
else:
|
1984
|
+
return pd.DataFrame(ids)
|
2136
1985
|
|
2137
|
-
return sbml_model
|
2138
1986
|
|
1987
|
+
def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
|
1988
|
+
"""
|
1989
|
+
Add an sbo_role column to the reaction_species table.
|
2139
1990
|
|
2140
|
-
|
2141
|
-
|
2142
|
-
species_df: pd.DataFrame,
|
2143
|
-
compartments_df: pd.DataFrame,
|
2144
|
-
) -> None:
|
2145
|
-
"""Check that the inputs for creating an SBML_dfs from an edgelist are appropriate."""
|
1991
|
+
The sbo_role column is a string column that contains the SBO role of the reaction species.
|
1992
|
+
The values in the sbo_role column are taken from the sbo_term column.
|
2146
1993
|
|
2147
|
-
|
2148
|
-
|
2149
|
-
compartments_df_columns = set(compartments_df.columns.tolist())
|
2150
|
-
missing_required_fields = compartments_df_expected_vars.difference(
|
2151
|
-
compartments_df_columns
|
2152
|
-
)
|
2153
|
-
if len(missing_required_fields) > 0:
|
2154
|
-
raise ValueError(
|
2155
|
-
f"{', '.join(missing_required_fields)} are required variables"
|
2156
|
-
' in "compartments_df" but were not present in the input file.'
|
2157
|
-
)
|
1994
|
+
The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
|
1995
|
+
"""
|
2158
1996
|
|
2159
|
-
|
2160
|
-
species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
|
2161
|
-
species_df_columns = set(species_df.columns.tolist())
|
2162
|
-
missing_required_fields = species_df_expected_vars.difference(species_df_columns)
|
2163
|
-
if len(missing_required_fields) > 0:
|
2164
|
-
raise ValueError(
|
2165
|
-
f"{', '.join(missing_required_fields)} are required"
|
2166
|
-
' variables in "species_df" but were not present '
|
2167
|
-
"in the input file."
|
2168
|
-
)
|
1997
|
+
validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
|
2169
1998
|
|
2170
|
-
|
2171
|
-
|
2172
|
-
|
2173
|
-
|
1999
|
+
reaction_species = (
|
2000
|
+
reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
|
2001
|
+
.replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
|
2002
|
+
.replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
|
2174
2003
|
)
|
2175
|
-
|
2176
|
-
|
2177
|
-
|
2178
|
-
|
2179
|
-
|
2004
|
+
|
2005
|
+
undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
|
2006
|
+
SBO_NAME_TO_ROLE.values()
|
2007
|
+
)
|
2008
|
+
if len(undefined_roles) > 0:
|
2009
|
+
logger.warning(
|
2010
|
+
f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
|
2180
2011
|
)
|
2012
|
+
mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
|
2013
|
+
reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
|
2181
2014
|
|
2182
|
-
return
|
2015
|
+
return reaction_species
|
2183
2016
|
|
2184
2017
|
|
2185
|
-
def
|
2186
|
-
|
2187
|
-
) ->
|
2188
|
-
"""Check for a mismatch between the provided species data and species implied by the edgelist."""
|
2018
|
+
def find_underspecified_reactions(
|
2019
|
+
reaction_species_w_roles: pd.DataFrame,
|
2020
|
+
) -> pd.DataFrame:
|
2189
2021
|
|
2190
|
-
# check
|
2191
|
-
if
|
2022
|
+
# check that both sbo_role and "new" are present
|
2023
|
+
if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
|
2192
2024
|
raise ValueError(
|
2193
|
-
"
|
2194
|
-
" and compartments_df by names resulted in an "
|
2195
|
-
f"increase in the tables from {original_species.shape[0]}"
|
2196
|
-
f" to {merged_species.shape[0]} indicating that names were"
|
2197
|
-
" not unique"
|
2025
|
+
"The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
|
2198
2026
|
)
|
2199
|
-
|
2200
|
-
# check for missing species and compartments
|
2201
|
-
missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
|
2202
|
-
SBML_DFS.C_NAME
|
2203
|
-
].unique()
|
2204
|
-
if len(missing_compartments) >= 1:
|
2027
|
+
if "new" not in reaction_species_w_roles.columns:
|
2205
2028
|
raise ValueError(
|
2206
|
-
|
2207
|
-
' "interaction_edgelist" but not "compartments_df":'
|
2208
|
-
f" {', '.join(missing_compartments)}"
|
2029
|
+
"The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2209
2030
|
)
|
2210
|
-
|
2211
|
-
|
2212
|
-
SBML_DFS.S_NAME
|
2213
|
-
].unique()
|
2214
|
-
if len(missing_species) >= 1:
|
2031
|
+
# check that new is a boolean column
|
2032
|
+
if reaction_species_w_roles["new"].dtype != bool:
|
2215
2033
|
raise ValueError(
|
2216
|
-
|
2217
|
-
'"interaction_edgelist" but not "species_df":'
|
2218
|
-
f" {', '.join(missing_species)}"
|
2034
|
+
"The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2219
2035
|
)
|
2220
2036
|
|
2221
|
-
|
2037
|
+
reactions_with_lost_defining_members = set(
|
2038
|
+
reaction_species_w_roles.query("~new")
|
2039
|
+
.query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
|
2040
|
+
.tolist()
|
2041
|
+
)
|
2222
2042
|
|
2223
|
-
|
2224
|
-
|
2225
|
-
|
2226
|
-
|
2227
|
-
|
2228
|
-
|
2229
|
-
Create a compartments table with only a single compartment
|
2230
|
-
|
2231
|
-
Args:
|
2232
|
-
stubbed_compartment (str): the name of a compartment which should match the
|
2233
|
-
keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS
|
2234
|
-
|
2235
|
-
Returns:
|
2236
|
-
compartments_df (pd.DataFrame): compartments dataframe
|
2237
|
-
"""
|
2238
|
-
|
2239
|
-
if stubbed_compartment not in COMPARTMENT_ALIASES.keys():
|
2240
|
-
raise ValueError(
|
2241
|
-
f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
|
2242
|
-
)
|
2243
|
-
|
2244
|
-
if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
|
2245
|
-
raise ValueError(
|
2246
|
-
f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
|
2247
|
-
)
|
2248
|
-
|
2249
|
-
stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
|
2250
|
-
|
2251
|
-
formatted_uri = identifiers.format_uri(
|
2252
|
-
uri=identifiers.create_uri_url(
|
2253
|
-
ontology=ONTOLOGIES.GO,
|
2254
|
-
identifier=stubbed_compartment_id,
|
2255
|
-
),
|
2256
|
-
biological_qualifier_type=BQB.IS,
|
2257
|
-
)
|
2258
|
-
|
2259
|
-
compartments_df = pd.DataFrame(
|
2260
|
-
{
|
2261
|
-
SBML_DFS.C_NAME: [stubbed_compartment],
|
2262
|
-
SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
|
2263
|
-
}
|
2264
|
-
)
|
2265
|
-
compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID) # type: ignore
|
2266
|
-
compartments_df.index.name = SBML_DFS.C_ID
|
2267
|
-
|
2268
|
-
return compartments_df
|
2269
|
-
|
2270
|
-
|
2271
|
-
def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
|
2272
|
-
"""Validates a table against a reference
|
2273
|
-
|
2274
|
-
This check if the table has the same index, no duplicates in the index
|
2275
|
-
and that all values in the index are in the reference table.
|
2276
|
-
|
2277
|
-
Args:
|
2278
|
-
data_table (pd.DataFrame): a table with data that should
|
2279
|
-
match the reference
|
2280
|
-
ref_table (pd.DataFrame): a reference table
|
2281
|
-
|
2282
|
-
Raises:
|
2283
|
-
ValueError: not same index name
|
2284
|
-
ValueError: index contains duplicates
|
2285
|
-
ValueError: index not subset of index of reactions table
|
2286
|
-
"""
|
2287
|
-
ref_index_name = ref_table.index.name
|
2288
|
-
if data_table.index.name != ref_index_name:
|
2289
|
-
raise ValueError(
|
2290
|
-
"the index name for reaction data table was not"
|
2291
|
-
f" {ref_index_name}: {data_table.index.name}"
|
2292
|
-
)
|
2293
|
-
ids = data_table.index
|
2294
|
-
if any(ids.duplicated()):
|
2295
|
-
raise ValueError(
|
2296
|
-
"the index for reaction data table " "contained duplicate values"
|
2297
|
-
)
|
2298
|
-
if not all(ids.isin(ref_table.index)):
|
2299
|
-
raise ValueError(
|
2300
|
-
"the index for reaction data table contained values"
|
2301
|
-
" not found in the reactions table"
|
2302
|
-
)
|
2303
|
-
if not isinstance(data_table, pd.DataFrame):
|
2304
|
-
raise TypeError(
|
2305
|
-
f"The data table was type {type(data_table).__name__}"
|
2306
|
-
" but must be a pd.DataFrame"
|
2307
|
-
)
|
2308
|
-
|
2309
|
-
|
2310
|
-
def species_type_types(x):
|
2311
|
-
"""Assign a high-level molecule type to a molecular species"""
|
2312
|
-
|
2313
|
-
if isinstance(x, identifiers.Identifiers):
|
2314
|
-
if x.filter(["chebi"]):
|
2315
|
-
return "metabolite"
|
2316
|
-
elif x.filter(["molodex"]):
|
2317
|
-
return "drug"
|
2318
|
-
else:
|
2319
|
-
return "protein"
|
2320
|
-
else:
|
2321
|
-
return "unknown"
|
2322
|
-
|
2323
|
-
|
2324
|
-
def stub_ids(ids):
|
2325
|
-
if len(ids) == 0:
|
2326
|
-
return pd.DataFrame(
|
2327
|
-
{
|
2328
|
-
IDENTIFIERS.ONTOLOGY: [None],
|
2329
|
-
IDENTIFIERS.IDENTIFIER: [None],
|
2330
|
-
IDENTIFIERS.URL: [None],
|
2331
|
-
IDENTIFIERS.BQB: [None],
|
2332
|
-
}
|
2333
|
-
)
|
2334
|
-
else:
|
2335
|
-
return pd.DataFrame(ids)
|
2336
|
-
|
2337
|
-
|
2338
|
-
def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
|
2339
|
-
"""
|
2340
|
-
Add an sbo_role column to the reaction_species table.
|
2341
|
-
|
2342
|
-
The sbo_role column is a string column that contains the SBO role of the reaction species.
|
2343
|
-
The values in the sbo_role column are taken from the sbo_term column.
|
2344
|
-
|
2345
|
-
The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
|
2346
|
-
"""
|
2347
|
-
|
2348
|
-
validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
|
2349
|
-
|
2350
|
-
reaction_species = (
|
2351
|
-
reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
|
2352
|
-
.replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
|
2353
|
-
.replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
|
2354
|
-
)
|
2355
|
-
|
2356
|
-
undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
|
2357
|
-
SBO_NAME_TO_ROLE.values()
|
2358
|
-
)
|
2359
|
-
if len(undefined_roles) > 0:
|
2360
|
-
logger.warning(
|
2361
|
-
f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
|
2362
|
-
)
|
2363
|
-
mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
|
2364
|
-
reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
|
2365
|
-
|
2366
|
-
return reaction_species
|
2367
|
-
|
2368
|
-
|
2369
|
-
def find_underspecified_reactions(
|
2370
|
-
reaction_species_w_roles: pd.DataFrame,
|
2371
|
-
) -> pd.DataFrame:
|
2372
|
-
|
2373
|
-
# check that both sbo_role and "new" are present
|
2374
|
-
if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
|
2375
|
-
raise ValueError(
|
2376
|
-
"The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
|
2377
|
-
)
|
2378
|
-
if "new" not in reaction_species_w_roles.columns:
|
2379
|
-
raise ValueError(
|
2380
|
-
"The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2381
|
-
)
|
2382
|
-
# check that new is a boolean column
|
2383
|
-
if reaction_species_w_roles["new"].dtype != bool:
|
2384
|
-
raise ValueError(
|
2385
|
-
"The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2386
|
-
)
|
2387
|
-
|
2388
|
-
reactions_with_lost_defining_members = set(
|
2389
|
-
reaction_species_w_roles.query("~new")
|
2390
|
-
.query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
|
2391
|
-
.tolist()
|
2392
|
-
)
|
2393
|
-
|
2394
|
-
N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
|
2395
|
-
if N_reactions_with_lost_defining_members > 0:
|
2396
|
-
logger.info(
|
2397
|
-
f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
|
2398
|
-
)
|
2043
|
+
N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
|
2044
|
+
if N_reactions_with_lost_defining_members > 0:
|
2045
|
+
logger.info(
|
2046
|
+
f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
|
2047
|
+
)
|
2399
2048
|
|
2400
2049
|
# find the cases where all "new" values for a given (r_id, sbo_term) are False
|
2401
2050
|
reactions_with_lost_requirements = set(
|
@@ -2599,3 +2248,555 @@ def _filter_promiscuous_components(
|
|
2599
2248
|
].drop(["is_shared_component"], axis=1)
|
2600
2249
|
|
2601
2250
|
return filtered_bqb_has_parts
|
2251
|
+
|
2252
|
+
|
2253
|
+
def _edgelist_validate_inputs(
|
2254
|
+
interaction_edgelist: pd.DataFrame,
|
2255
|
+
species_df: pd.DataFrame,
|
2256
|
+
compartments_df: pd.DataFrame,
|
2257
|
+
) -> None:
|
2258
|
+
"""
|
2259
|
+
Validate input DataFrames have required columns.
|
2260
|
+
|
2261
|
+
Parameters
|
2262
|
+
----------
|
2263
|
+
interaction_edgelist : pd.DataFrame
|
2264
|
+
Interaction data to validate
|
2265
|
+
species_df : pd.DataFrame
|
2266
|
+
Species data to validate
|
2267
|
+
compartments_df : pd.DataFrame
|
2268
|
+
Compartments data to validate
|
2269
|
+
"""
|
2270
|
+
|
2271
|
+
# check compartments
|
2272
|
+
compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
|
2273
|
+
compartments_df_columns = set(compartments_df.columns.tolist())
|
2274
|
+
missing_required_fields = compartments_df_expected_vars.difference(
|
2275
|
+
compartments_df_columns
|
2276
|
+
)
|
2277
|
+
if len(missing_required_fields) > 0:
|
2278
|
+
raise ValueError(
|
2279
|
+
f"{', '.join(missing_required_fields)} are required variables"
|
2280
|
+
' in "compartments_df" but were not present in the input file.'
|
2281
|
+
)
|
2282
|
+
|
2283
|
+
# check species
|
2284
|
+
species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
|
2285
|
+
species_df_columns = set(species_df.columns.tolist())
|
2286
|
+
missing_required_fields = species_df_expected_vars.difference(species_df_columns)
|
2287
|
+
if len(missing_required_fields) > 0:
|
2288
|
+
raise ValueError(
|
2289
|
+
f"{', '.join(missing_required_fields)} are required"
|
2290
|
+
' variables in "species_df" but were not present '
|
2291
|
+
"in the input file."
|
2292
|
+
)
|
2293
|
+
|
2294
|
+
# check interactions
|
2295
|
+
interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
|
2296
|
+
missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
|
2297
|
+
interaction_edgelist_columns
|
2298
|
+
)
|
2299
|
+
if len(missing_required_fields) > 0:
|
2300
|
+
raise ValueError(
|
2301
|
+
f"{', '.join(missing_required_fields)} are required "
|
2302
|
+
'variables in "interaction_edgelist" but were not '
|
2303
|
+
"present in the input file."
|
2304
|
+
)
|
2305
|
+
|
2306
|
+
return None
|
2307
|
+
|
2308
|
+
|
2309
|
+
def _edgelist_identify_extra_columns(
    interaction_edgelist, species_df, keep_reactions_data, keep_species_data
):
    """
    Find the non-required columns of the inputs that should be carried along.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction table; only its column names are inspected
    species_df : pd.DataFrame
        Species table; only its column names are inspected
    keep_reactions_data : bool or str
        Any value other than the literal ``False`` turns on collection of
        the extra interaction columns
    keep_species_data : bool or str
        Any value other than the literal ``False`` turns on collection of
        the extra species columns

    Returns
    -------
    dict
        ``{"reactions": [...], "species": [...]}`` where each list holds the
        extra column names in their original column order. A list is empty
        when the matching ``keep_*`` flag is ``False``.
    """
    extras = {"reactions": [], "species": []}

    # Identity comparison on purpose: a string label must count as "keep".
    if keep_reactions_data is not False:
        extras["reactions"] = [
            col
            for col in interaction_edgelist.columns
            if col not in INTERACTION_EDGELIST_EXPECTED_VARS
        ]

    if keep_species_data is not False:
        core_species_columns = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
        extras["species"] = [
            col for col in species_df.columns if col not in core_species_columns
        ]

    return extras
|
2349
|
+
|
2350
|
+
|
2351
|
+
def _edgelist_process_compartments(compartments_df, interaction_source):
    """
    Attach a source and generated IDs to the compartments table.

    Parameters
    ----------
    compartments_df : pd.DataFrame
        Raw compartments data; it is copied, not modified
    interaction_source : source.Source
        Source object written to every row of the source column

    Returns
    -------
    pd.DataFrame
        Table indexed by the generated compartment ID and restricted to the
        name, identifiers and source columns
    """
    output_columns = [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]

    # Work on a copy so the caller's table is left untouched.
    formatted = compartments_df.copy()
    formatted[SBML_DFS.C_SOURCE] = interaction_source

    # One ID per row, generated from the row position.
    row_positions = range(formatted.shape[0])
    formatted[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
        row_positions, SBML_DFS.C_ID
    )

    indexed = formatted.set_index(SBML_DFS.C_ID)
    return indexed[output_columns]
|
2375
|
+
|
2376
|
+
|
2377
|
+
def _edgelist_process_species(species_df, interaction_source, extra_species_columns):
    """
    Attach a source and generated IDs to the species table and split off extras.

    Parameters
    ----------
    species_df : pd.DataFrame
        Raw species data; it is copied, not modified
    interaction_source : source.Source
        Source object written to every row of the source column
    extra_species_columns : list
        Names of additional columns to return as a separate table

    Returns
    -------
    tuple of pd.DataFrame
        ``(processed_species, species_data)``: the core species table (name,
        identifiers, source) and the extra-columns table, both indexed by
        the generated species ID
    """
    core_columns = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]

    # Work on a copy so the caller's table is left untouched.
    annotated = species_df.copy()
    annotated[SBML_DFS.S_SOURCE] = interaction_source

    # One ID per row, generated from the row position.
    row_positions = range(annotated.shape[0])
    annotated[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
        row_positions, SBML_DFS.S_ID
    )

    by_species_id = annotated.set_index(SBML_DFS.S_ID)[
        core_columns + extra_species_columns
    ]

    # Both outputs share the same species-ID index.
    return by_species_id[core_columns], by_species_id[extra_species_columns]
|
2411
|
+
|
2412
|
+
|
2413
|
+
def _edgelist_create_compartmentalized_species(
    interaction_edgelist, species_df, compartments_df, interaction_source
):
    """
    Build the compartmentalized species table implied by the interactions.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data; the upstream/downstream name and compartment
        columns are read
    species_df : pd.DataFrame
        Processed species data indexed by species ID
    compartments_df : pd.DataFrame
        Processed compartments data indexed by compartment ID
    interaction_source : source.Source
        Source object written to every compartmentalized species row

    Returns
    -------
    pd.DataFrame
        One row per distinct (species, compartment) pair, indexed by a
        generated compartmentalized species ID, with the formatted name,
        species ID, compartment ID and source

    Raises
    ------
    ValueError
        Raised by ``_sbml_dfs_from_edgelist_check_cspecies_merge`` when the
        name-based merges change the row count or leave a species or
        compartment without an ID
    """
    # Stack both ends of each interaction under common column names.
    upstream_pairs = interaction_edgelist[
        ["upstream_name", "upstream_compartment"]
    ].rename(
        columns={
            "upstream_name": SBML_DFS.S_NAME,
            "upstream_compartment": SBML_DFS.C_NAME,
        }
    )
    downstream_pairs = interaction_edgelist[
        ["downstream_name", "downstream_compartment"]
    ].rename(
        columns={
            "downstream_name": SBML_DFS.S_NAME,
            "downstream_compartment": SBML_DFS.C_NAME,
        }
    )
    distinct_pairs = pd.concat([upstream_pairs, downstream_pairs]).drop_duplicates()

    # Name -> ID lookups; reset_index exposes the ID index as a column.
    species_lookup = species_df[SBML_DFS.S_NAME].reset_index()
    compartment_lookup = compartments_df[SBML_DFS.C_NAME].reset_index()

    pairs_with_ids = distinct_pairs.merge(
        species_lookup, how="left", on=SBML_DFS.S_NAME
    ).merge(compartment_lookup, how="left", on=SBML_DFS.C_NAME)

    # Left merges keep unmatched rows, so gaps appear as missing IDs here.
    _sbml_dfs_from_edgelist_check_cspecies_merge(pairs_with_ids, distinct_pairs)

    species_names = pairs_with_ids[SBML_DFS.S_NAME]
    compartment_names = pairs_with_ids[SBML_DFS.C_NAME]
    pairs_with_ids[SBML_DFS.SC_NAME] = [
        f"{s} [{c}]" for s, c in zip(species_names, compartment_names)
    ]
    pairs_with_ids[SBML_DFS.SC_SOURCE] = interaction_source
    pairs_with_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
        range(pairs_with_ids.shape[0]), SBML_DFS.SC_ID
    )

    output_columns = [
        SBML_DFS.SC_NAME,
        SBML_DFS.S_ID,
        SBML_DFS.C_ID,
        SBML_DFS.SC_SOURCE,
    ]
    return pairs_with_ids.set_index(SBML_DFS.SC_ID)[output_columns]
|
2480
|
+
|
2481
|
+
|
2482
|
+
def _edgelist_create_reactions_and_species(
    interaction_edgelist,
    comp_species,
    species_df,
    compartments_df,
    interaction_source,
    upstream_stoichiometry,
    downstream_stoichiometry,
    downstream_sbo_name,
    extra_reactions_columns,
):
    """
    Turn each interaction into a reaction plus its two reaction species.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Original interaction data
    comp_species : pd.DataFrame
        Compartmentalized species indexed by their ID
    species_df : pd.DataFrame
        Processed species data indexed by species ID
    compartments_df : pd.DataFrame
        Processed compartments data indexed by compartment ID
    interaction_source : source.Source
        Source object written to every reaction row
    upstream_stoichiometry : int
        Stoichiometry assigned to every upstream participant
    downstream_stoichiometry : int
        Stoichiometry assigned to every downstream participant
    downstream_sbo_name : str
        Key into ``MINI_SBO_FROM_NAME`` giving the downstream SBO term
    extra_reactions_columns : list
        Names of additional interaction columns to return separately

    Returns
    -------
    tuple
        ``(reactions_df, reaction_species_df, reactions_data)``. The
        reactions table and the extra-data table are indexed by the
        generated reaction ID; the reaction species table is indexed by a
        generated ``rsc_id`` and lists upstream rows before downstream rows.

    Raises
    ------
    ValueError
        If attaching compartmentalized species IDs changes the number of
        interaction rows
    """
    # (sc_id, species name, compartment name) lookup, shared by both merges.
    cspecies_lookup = (
        comp_species.reset_index()
        .merge(species_df[SBML_DFS.S_NAME].reset_index())
        .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
    )[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]]

    upstream_lookup = cspecies_lookup.rename(
        columns={
            SBML_DFS.SC_ID: "sc_id_up",
            SBML_DFS.S_NAME: "upstream_name",
            SBML_DFS.C_NAME: "upstream_compartment",
        }
    )
    downstream_lookup = cspecies_lookup.rename(
        columns={
            SBML_DFS.SC_ID: "sc_id_down",
            SBML_DFS.S_NAME: "downstream_name",
            SBML_DFS.C_NAME: "downstream_compartment",
        }
    )

    # No explicit keys: each merge joins on the columns the frames share.
    kept_columns = REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
    interaction_w_cspecies = interaction_edgelist.merge(
        upstream_lookup, how="left"
    ).merge(downstream_lookup, how="left")[kept_columns]

    # A changed row count means a lookup matched more than one row.
    if interaction_edgelist.shape[0] != interaction_w_cspecies.shape[0]:
        raise ValueError(
            f"Merging compartmentalized species resulted in row count change "
            f"from {interaction_edgelist.shape[0]} to {interaction_w_cspecies.shape[0]}"
        )

    # Reaction IDs go on first; both output tables below depend on them.
    interaction_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
        range(interaction_w_cspecies.shape[0]), SBML_DFS.R_ID
    )

    # Reactions table: core columns plus extras, then split apart.
    core_reaction_columns = [
        SBML_DFS.R_NAME,
        SBML_DFS.R_IDENTIFIERS,
        SBML_DFS.R_SOURCE,
        SBML_DFS.R_ISREVERSIBLE,
    ]
    reactions_with_source = interaction_w_cspecies.copy()
    reactions_with_source[SBML_DFS.R_SOURCE] = interaction_source
    reactions_full = reactions_with_source.set_index(SBML_DFS.R_ID)[
        core_reaction_columns + extra_reactions_columns
    ]
    reactions_data = reactions_full[extra_reactions_columns]
    reactions_df = reactions_full[core_reaction_columns]

    # Upstream participants keep the SBO term given in the edgelist.
    upstream_members = (
        interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
        .assign(stoichiometry=upstream_stoichiometry)
        .rename(columns={"sc_id_up": "sc_id"})
    )
    # Downstream participants all get the same SBO term, looked up by name.
    downstream_members = (
        interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
        .assign(
            stoichiometry=downstream_stoichiometry,
            sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
        )
        .rename(columns={"sc_id_down": "sc_id"})
    )

    reaction_species_df = pd.concat([upstream_members, downstream_members])
    reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
        range(reaction_species_df.shape[0]), "rsc_id"
    )
    reaction_species_df = reaction_species_df.set_index("rsc_id")

    return reactions_df, reaction_species_df, reactions_data
|
2608
|
+
|
2609
|
+
|
2610
|
+
def _edgelist_assemble_sbml_model(
    compartments,
    species,
    comp_species,
    reactions,
    reaction_species,
    species_data,
    reactions_data,
    keep_species_data,
    keep_reactions_data,
    extra_columns,
):
    """
    Bundle the processed tables into an SBML_dfs object and validate it.

    Parameters
    ----------
    compartments : pd.DataFrame
        Processed compartments data
    species : pd.DataFrame
        Processed species data
    comp_species : pd.DataFrame
        Compartmentalized species data
    reactions : pd.DataFrame
        Reactions data
    reaction_species : pd.DataFrame
        Reaction species relationships
    species_data : pd.DataFrame
        Extra species data, attached only if extra species columns exist
    reactions_data : pd.DataFrame
        Extra reactions data, attached only if extra reaction columns exist
    keep_species_data : bool or str
        If a string, used as the label of the species extra data;
        otherwise the label "source" is used
    keep_reactions_data : bool or str
        If a string, used as the label of the reactions extra data;
        otherwise the label "source" is used
    extra_columns : dict
        Dictionary with "reactions" and "species" lists of extra column names

    Returns
    -------
    SBML_dfs
        The assembled model, returned after its ``validate()`` call
    """
    tables = {
        "compartments": compartments,
        "species": species,
        "compartmentalized_species": comp_species,
        "reactions": reactions,
        "reaction_species": reaction_species,
    }

    # Extra data is attached only when extra columns were actually found.
    if len(extra_columns["reactions"]) > 0:
        if isinstance(keep_reactions_data, str):
            reactions_label = keep_reactions_data
        else:
            reactions_label = "source"
        tables["reactions_data"] = {reactions_label: reactions_data}

    if len(extra_columns["species"]) > 0:
        if isinstance(keep_species_data, str):
            species_label = keep_species_data
        else:
            species_label = "source"
        tables["species_data"] = {species_label: species_data}

    model = SBML_dfs(tables)
    model.validate()

    return model
|
2678
|
+
|
2679
|
+
|
2680
|
+
def _sbml_dfs_from_edgelist_check_cspecies_merge(
    merged_species: pd.DataFrame, original_species: pd.DataFrame
) -> None:
    """Verify that the edgelist's species and compartments matched the provided tables.

    Args:
        merged_species (pd.DataFrame): compartmentalized species after the
            species and compartment IDs were merged on by name
        original_species (pd.DataFrame): the same table before the merges

    Raises:
        ValueError: the merges changed the number of rows
        ValueError: one or more compartments have no compartment ID
        ValueError: one or more species have no species ID
    """

    n_before = original_species.shape[0]
    n_after = merged_species.shape[0]

    # A changed row count points to a one-to-many match on names.
    if n_after != n_before:
        raise ValueError(
            "Merging compartmentalized species to species_df"
            " and compartments_df by names resulted in an "
            f"increase in the tables from {n_before}"
            f" to {n_after} indicating that names were"
            " not unique"
        )

    # A missing ID after the merge means the name had no match.
    no_compartment_id = merged_species[SBML_DFS.C_ID].isna()
    unmatched_compartments = merged_species.loc[
        no_compartment_id, SBML_DFS.C_NAME
    ].unique()
    if len(unmatched_compartments) > 0:
        raise ValueError(
            f"{len(unmatched_compartments)} compartments were present in"
            ' "interaction_edgelist" but not "compartments_df":'
            f" {', '.join(unmatched_compartments)}"
        )

    no_species_id = merged_species[SBML_DFS.S_ID].isna()
    unmatched_species = merged_species.loc[no_species_id, SBML_DFS.S_NAME].unique()
    if len(unmatched_species) > 0:
        raise ValueError(
            f"{len(unmatched_species)} species were present in "
            '"interaction_edgelist" but not "species_df":'
            f" {', '.join(unmatched_species)}"
        )

    return None
|
2717
|
+
|
2718
|
+
|
2719
|
+
def _stub_compartments(
    stubbed_compartment: str = GENERIC_COMPARTMENT,
) -> pd.DataFrame:
    """Stub Compartments

    Build a one-row compartments table for a single named compartment.

    Args:
        stubbed_compartment (str): the compartment name; it must be a key of
            both COMPARTMENT_ALIASES and COMPARTMENTS_GO_TERMS

    Returns:
        compartments_df (pd.DataFrame): a one-row table holding the
            compartment name and an Identifiers object for its GO term,
            indexed by a generated compartment ID

    Raises:
        ValueError: the name is not a key of COMPARTMENT_ALIASES
        ValueError: the name is not a key of COMPARTMENTS_GO_TERMS
    """

    # NOTE: this check reads COMPARTMENT_ALIASES although the message below
    # names constants.COMPARTMENTS.
    if stubbed_compartment not in COMPARTMENT_ALIASES:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
        )

    if stubbed_compartment not in COMPARTMENTS_GO_TERMS:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
        )

    go_term = COMPARTMENTS_GO_TERMS[stubbed_compartment]
    go_url = identifiers.create_uri_url(
        ontology=ONTOLOGIES.GO,
        identifier=go_term,
    )
    formatted_uri = identifiers.format_uri(
        uri=go_url,
        biological_qualifier_type=BQB.IS,
    )

    single_row = {
        SBML_DFS.C_NAME: [stubbed_compartment],
        SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
    }
    compartments_df = pd.DataFrame(single_row)

    # The single row gets the ID generated for position 0.
    compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID)  # type: ignore
    compartments_df.index.name = SBML_DFS.C_ID

    return compartments_df
|
2764
|
+
|
2765
|
+
|
2766
|
+
def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
    """Validates a table against a reference

    Checks that the data table is a pd.DataFrame, that its index has the
    same name as the reference table's index, that the index has no
    duplicates, and that every index value occurs in the reference index.

    Args:
        data_table (pd.DataFrame): a table with data that should
            match the reference
        ref_table (pd.DataFrame): a reference table

    Raises:
        TypeError: data_table is not a pd.DataFrame
        ValueError: not same index name
        ValueError: index contains duplicates
        ValueError: index not subset of index of the reference table
    """
    # Check the type first: the checks below read `data_table.index`, so a
    # non-DataFrame input would otherwise fail with an unrelated error
    # before this message could be raised.
    if not isinstance(data_table, pd.DataFrame):
        raise TypeError(
            f"The data table was type {type(data_table).__name__}"
            " but must be a pd.DataFrame"
        )

    ref_index_name = ref_table.index.name
    if data_table.index.name != ref_index_name:
        raise ValueError(
            "the index name for reaction data table was not"
            f" {ref_index_name}: {data_table.index.name}"
        )

    ids = data_table.index
    # Vectorised reductions instead of looping over the boolean arrays.
    if ids.duplicated().any():
        raise ValueError(
            "the index for reaction data table " "contained duplicate values"
        )
    if not ids.isin(ref_table.index).all():
        raise ValueError(
            "the index for reaction data table contained values"
            " not found in the reactions table"
        )
|