napistu-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/sbml_dfs_core.py
ADDED
@@ -0,0 +1,2216 @@
from __future__ import annotations

import logging
import re
from typing import Any
from typing import Iterable
from typing import Mapping
from typing import MutableMapping
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from napistu import identifiers
from napistu import sbml_dfs_utils
from napistu import source
from napistu import utils
from napistu.constants import SBML_DFS
from napistu.constants import SBML_DFS_SCHEMA
from napistu.constants import IDENTIFIERS
from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
from napistu.constants import CPR_STANDARD_OUTPUTS
from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
from napistu.constants import BQB_PRIORITIES
from napistu.constants import ONTOLOGY_PRIORITIES
from napistu.constants import BQB
from napistu.constants import BQB_DEFINING_ATTRS
from napistu.constants import COMPARTMENTS
from napistu.constants import COMPARTMENT_ALIASES
from napistu.constants import COMPARTMENTS_GO_TERMS
from napistu.constants import MINI_SBO_FROM_NAME
from napistu.constants import MINI_SBO_TO_NAME
from napistu.constants import ONTOLOGIES
from napistu.constants import SBO_NAME_TO_ROLE
from napistu.constants import SBOTERM_NAMES
from napistu.constants import CHARACTERISTIC_COMPLEX_ONTOLOGIES
from napistu.ingestion import sbml
from fs import open_fs

logger = logging.getLogger(__name__)


class SBML_dfs:
    """
    Systems Biology Markup Language Model Data Frames.

    Attributes
    ----------
    compartments: pd.DataFrame
        sub-cellular compartments in the model
    species: pd.DataFrame
        molecular species in the model
    species_data: Dict[str, pd.DataFrame]
        additional data for species; each DataFrame is indexed by species_id
    reactions: pd.DataFrame
        reactions in the model
    reactions_data: Dict[str, pd.DataFrame]
        additional data for reactions; each DataFrame is indexed by reaction_id
    reaction_species: pd.DataFrame
        one entry per species participating in a reaction
    schema: dict
        dictionary representing the structure of the other attributes and the meaning of their variables

    Methods
    -------
    get_table(entity_type, required_attributes)
        Get a table from the SBML_dfs object and optionally validate that it contains a set of required attributes
    search_by_ids(ids, entity_type, identifiers_df, ontologies)
        Pull out identifiers and entities matching a set of query ids which optionally match a set of ontologies
    search_by_name(name, entity_type, partial_match)
        Pull out a set of entities by name or partial string match [default]
    get_cspecies_features()
        Returns additional attributes of compartmentalized species
    get_species_features()
        Returns additional attributes of species
    get_identifiers(id_type)
        Returns a DataFrame containing identifiers from the id_type table
    get_uri_urls(entity_type, entity_ids = None)
        Returns a Series containing reference urls for each entity
    validate()
        Validate that the sbml_dfs follows the schema and identify clear pathologies
    validate_and_resolve()
        Validate the sbml_dfs and attempt to automatically resolve common issues
    """

    compartments: pd.DataFrame
    species: pd.DataFrame
    species_data: dict[str, pd.DataFrame]
    reactions: pd.DataFrame
    reactions_data: dict[str, pd.DataFrame]
    reaction_species: pd.DataFrame
    schema: dict
    _required_entities: set[str]
    _optional_entities: set[str]

    def __init__(
        self,
        sbml_model: (
            sbml.SBML | MutableMapping[str, pd.DataFrame | dict[str, pd.DataFrame]]
        ),
        validate: bool = True,
        resolve: bool = True,
    ) -> None:
        """
        Creates a pathway

        Parameters
        ----------
        sbml_model : cpr.SBML or a dict containing tables following the sbml_dfs schema
            An SBML model produced by cpr.SBML().
        validate (bool): if True then call self.validate() to identify formatting issues
        resolve (bool): if True then try to automatically resolve common problems

        Returns
        -------
        None.
        """

        self.schema = SBML_DFS_SCHEMA.SCHEMA
        self._required_entities = SBML_DFS_SCHEMA.REQUIRED_ENTITIES
        self._optional_entities = SBML_DFS_SCHEMA.OPTIONAL_ENTITIES

        # Initialize the dynamic attributes for type checking
        if TYPE_CHECKING:
            self.compartments = pd.DataFrame()
            self.species = pd.DataFrame()
            self.compartmentalized_species = pd.DataFrame()
            self.reactions = pd.DataFrame()
            self.reaction_species = pd.DataFrame()

        # create a model from dictionary entries
        if isinstance(sbml_model, dict):
            for ent in SBML_DFS_SCHEMA.REQUIRED_ENTITIES:
                setattr(self, ent, sbml_model[ent])
            for ent in SBML_DFS_SCHEMA.OPTIONAL_ENTITIES:
                if ent in sbml_model:
                    setattr(self, ent, sbml_model[ent])
        else:
            self = sbml.sbml_df_from_sbml(self, sbml_model)

        for ent in SBML_DFS_SCHEMA.OPTIONAL_ENTITIES:
            # Initialize optional entities if not set
            if not hasattr(self, ent):
                setattr(self, ent, {})

        if validate:
            if resolve:
                self.validate_and_resolve()
            else:
                self.validate()
        else:
            if resolve:
                logger.warning(
                    '"validate" = False so "resolve" will be ignored (even though it was True)'
                )
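    # A minimal construction sketch (illustrative, not part of the released
    # module; the sbml.SBML constructor argument is an assumption):
    #
    #     from napistu.ingestion import sbml
    #     sbml_model = sbml.SBML("pathway.sbml")  # hypothetical input file
    #     model = SBML_dfs(sbml_model)  # validates and auto-resolves by default
    #     raw_model = SBML_dfs(sbml_model, validate=False, resolve=False)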
    def get_table(
        self, entity_type: str, required_attributes: None | set[str] = None
    ) -> pd.DataFrame:
        """
        Get Table

        Get a table from the SBML_dfs object and optionally validate that it contains a set of required attributes.
        """

        schema = self.schema

        if entity_type not in schema.keys():
            raise ValueError(
                f"{entity_type} does not match a table in the SBML_dfs object. The tables "
                f"which are present are {', '.join(schema.keys())}"
            )

        if required_attributes is not None:
            assert isinstance(required_attributes, set)

            # determine whether required_attributes are appropriate
            VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
            invalid_required_attributes = required_attributes.difference(
                VALID_REQUIRED_ATTRIBUTES
            )

            if len(invalid_required_attributes) > 0:
                raise ValueError(
                    f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
                    f"Required attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
                )

            # determine if required_attributes are satisfied
            invalid_attrs = [
                s for s in required_attributes if s not in schema[entity_type].keys()
            ]
            if len(invalid_attrs) > 0:
                raise ValueError(
                    f"The following required attributes are not present for the {entity_type} table: "
                    f"{', '.join(invalid_attrs)}."
                )

        return getattr(self, entity_type)

    def search_by_ids(
        self,
        ids: list[str],
        entity_type: str,
        identifiers_df: pd.DataFrame,
        ontologies: None | set[str] = None,
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        # validate inputs
        entity_table = self.get_table(entity_type, required_attributes={"id"})
        entity_pk = self.schema[entity_type]["pk"]

        utils.match_pd_vars(
            identifiers_df,
            req_vars={
                entity_pk,
                IDENTIFIERS.ONTOLOGY,
                IDENTIFIERS.IDENTIFIER,
                IDENTIFIERS.URL,
                IDENTIFIERS.BQB,
            },
            allow_series=False,
        ).assert_present()

        if ontologies is not None:
            assert isinstance(ontologies, set)
            ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
            invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
            if len(invalid_ontologies) > 0:
                raise ValueError(
                    f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
                    f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
                )

            # filter to just the identifiers matching the ontologies of interest
            identifiers_df = identifiers_df.query("ontology in @ontologies")

        matching_identifiers = identifiers_df.loc[
            identifiers_df["identifier"].isin(ids)
        ]
        entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]

        return entity_subset, matching_identifiers

    def search_by_name(
        self, name: str, entity_type: str, partial_match: bool = True
    ) -> pd.DataFrame:
        entity_table = self.get_table(entity_type, required_attributes={"label"})
        label_attr = self.schema[entity_type]["label"]

        if partial_match:
            matches = entity_table.loc[
                entity_table[label_attr].str.contains(name, case=False)
            ]
        else:
            matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
        return matches
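    # A search sketch, given an SBML_dfs instance `model` (the accession and
    # ontology name below are assumptions, not values shipped with the package):
    #
    #     species_identifiers = model.get_identifiers("species")
    #     entities, matched_ids = model.search_by_ids(
    #         ["P04637"], "species", species_identifiers, ontologies={"uniprot"}
    #     )
    #     name_hits = model.search_by_name("kinase", "species", partial_match=True)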
    def get_species_features(self) -> pd.DataFrame:
        species = self.species
        augmented_species = species.assign(
            **{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
        )

        return augmented_species

    def get_cspecies_features(self) -> pd.DataFrame:
        cspecies_n_connections = (
            self.reaction_species["sc_id"].value_counts().rename("sc_degree")
        )

        cspecies_n_children = (
            self.reaction_species.loc[
                self.reaction_species[SBML_DFS.STOICHIOMETRY] <= 0, "sc_id"
            ]
            .value_counts()
            .rename("sc_children")
        )

        cspecies_n_parents = (
            self.reaction_species.loc[
                self.reaction_species[SBML_DFS.STOICHIOMETRY] > 0, "sc_id"
            ]
            .value_counts()
            .rename("sc_parents")
        )

        species_features = self.get_species_features()["species_type"]

        return (
            self.compartmentalized_species.join(cspecies_n_connections)
            .join(cspecies_n_children)
            .join(cspecies_n_parents)
            .fillna(0)
            .astype(
                {"sc_degree": "int32", "sc_children": "int32", "sc_parents": "int32"}
            )
            .merge(species_features, left_on="s_id", right_index=True)
            .drop(columns=["sc_name", "s_id", "c_id"])
        )

    def get_identifiers(self, id_type) -> pd.DataFrame:
        selected_table = self.get_table(id_type, {"id"})
        schema = self.schema

        identifiers_dict = dict()
        for sysid in selected_table.index:
            id_entry = selected_table[schema[id_type]["id"]][sysid]

            if isinstance(id_entry, identifiers.Identifiers):
                identifiers_dict[sysid] = pd.DataFrame(id_entry.ids)
            elif np.isnan(id_entry):
                continue
            else:
                raise ValueError(
                    f"id_entry was a {type(id_entry)} and must either be"
                    " an identifiers.Identifiers object or NaN"
                )
        identifiers_tbl = pd.concat(identifiers_dict)

        identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
        identifiers_tbl = identifiers_tbl.reset_index()

        named_identifiers = identifiers_tbl.merge(
            selected_table.drop(schema[id_type]["id"], axis=1),
            left_on=schema[id_type]["pk"],
            right_index=True,
        )

        return named_identifiers
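    # A sketch of the long format returned by get_identifiers(): one row per
    # (entity, identifier) pair (column names follow the IDENTIFIERS constants;
    # the preview call is illustrative):
    #
    #     species_ids = model.get_identifiers("species")
    #     species_ids[["s_id", "ontology", "identifier", "url", "bqb"]].head()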
    def get_uri_urls(
        self,
        entity_type: str,
        entity_ids: Iterable[str] | None = None,
        required_ontology: str | None = None,
    ) -> pd.Series:
        schema = self.schema

        # valid entities and their identifier variables
        valid_entity_types = [
            SBML_DFS.COMPARTMENTS,
            SBML_DFS.SPECIES,
            SBML_DFS.REACTIONS,
        ]

        if entity_type not in valid_entity_types:
            raise ValueError(
                f"{entity_type} is an invalid entity_type; valid types "
                f"are {', '.join(valid_entity_types)}"
            )

        entity_table = getattr(self, entity_type)

        if entity_ids is not None:
            # ensure that entity_ids are unique and then convert back to list
            # to support pandas indexing
            entity_ids = list(set(entity_ids))

            # filter to a subset of identifiers if one is provided
            entity_table = entity_table.loc[entity_ids]

        # create a dataframe of all identifiers for the selected entities
        all_ids = pd.concat(
            [
                sbml_dfs_utils._stub_ids(
                    entity_table[schema[entity_type]["id"]][i].ids
                ).assign(id=entity_table.index[i])
                for i in range(0, entity_table.shape[0])
            ]
        ).rename(columns={"id": schema[entity_type]["pk"]})

        # set priorities for ontologies and bqb terms

        if required_ontology is None:
            all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
                ONTOLOGY_PRIORITIES, how="left"
            )
        else:
            ontology_priorities = pd.DataFrame(
                [{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
            )
            # if only a single ontology is sought then just return matching entries
            all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
                ontology_priorities, how="inner"
            )

        uri_urls = (
            all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
            .groupby(schema[entity_type]["pk"])
            .first()[IDENTIFIERS.URL]
        )
        return uri_urls
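    # A sketch of pulling one reference URL per entity (the ontology name is an
    # assumption):
    #
    #     all_species_urls = model.get_uri_urls("species")
    #     uniprot_urls = model.get_uri_urls("species", required_ontology="uniprot")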
    def get_network_summary(self) -> Mapping[str, Any]:
        """Return diagnostic statistics about the network

        Returns:
            Mapping[str, Any]: A dictionary of diagnostic statistics with entries:
                n_species_types [int]: Number of species types
                dict_n_species_per_type [dict[str, int]]: Number of
                    species per species type
                n_species [int]: Number of species
                n_cspecies [int]: Number of compartmentalized species
                n_reaction_species [int]: Number of reaction species
                n_reactions [int]: Number of reactions
                n_compartments [int]: Number of compartments
                dict_n_species_per_compartment [dict[str, int]]:
                    Number of species per compartment
                stats_species_per_reactions [dict[str, float]]:
                    Statistics on the number of reactants per reaction
                top10_species_per_reactions [list[dict[str, Any]]]:
                    Top 10 reactions with the highest number of reactants
                stats_degree [dict[str, float]]: Statistics on the degree
                    of a species (number of reactions it is involved in)
                top10_degree [list[dict[str, Any]]]:
                    Top 10 species with the highest degree
                stats_identifiers_per_species [dict[str, float]]:
                    Statistics on the number of identifiers per species
                top10_identifiers_per_species [list[dict[str, Any]]]:
                    Top 10 species with the highest number of identifiers
        """
        stats: MutableMapping[str, Any] = {}
        species_features = self.get_species_features()
        stats["n_species_types"] = species_features["species_type"].nunique()
        stats["dict_n_species_per_type"] = (
            species_features.groupby(by="species_type").size().to_dict()
        )
        stats["n_species"] = self.species.shape[0]
        stats["n_cspecies"] = self.compartmentalized_species.shape[0]
        stats["n_reaction_species"] = self.reaction_species.shape[0]
        stats["n_reactions"] = self.reactions.shape[0]
        stats["n_compartments"] = self.compartments.shape[0]
        stats["dict_n_species_per_compartment"] = (
            self.compartmentalized_species.groupby(SBML_DFS.C_ID)
            .size()
            .rename("n_species")  # type:ignore
            .to_frame()
            .join(self.compartments[[SBML_DFS.C_NAME]])
            .reset_index(drop=False)
            .to_dict(orient="records")
        )
        per_reaction_stats = self.reaction_species.groupby(SBML_DFS.R_ID).size()
        stats["stats_species_per_reactions"] = per_reaction_stats.describe().to_dict()
        stats["top10_species_per_reactions"] = (
            per_reaction_stats.sort_values(ascending=False)  # type:ignore
            .head(10)
            .rename("n_species")
            .to_frame()
            .join(self.reactions[[SBML_DFS.R_NAME]])
            .reset_index(drop=False)
            .to_dict(orient="records")
        )

        cspecies_features = self.get_cspecies_features()
        stats["stats_degree"] = cspecies_features["sc_degree"].describe().to_dict()
        stats["top10_degree"] = (
            cspecies_features.sort_values("sc_degree", ascending=False)
            .head(10)[["sc_degree", "sc_children", "sc_parents", "species_type"]]
            .merge(
                self.compartmentalized_species[[SBML_DFS.S_ID, SBML_DFS.C_ID]],
                on=SBML_DFS.SC_ID,
            )
            .merge(self.compartments[[SBML_DFS.C_NAME]], on=SBML_DFS.C_ID)
            .merge(self.species[[SBML_DFS.S_NAME]], on=SBML_DFS.S_ID)
            .reset_index(drop=False)
            .to_dict(orient="records")
        )
        s_identifiers = sbml_dfs_utils.unnest_identifiers(
            self.species, SBML_DFS.S_IDENTIFIERS
        )
        identifiers_stats = s_identifiers.groupby("s_id").size()
        stats["stats_identifiers_per_species"] = identifiers_stats.describe().to_dict()
        stats["top10_identifiers_per_species"] = (
            identifiers_stats.sort_values(ascending=False)
            .head(10)
            .rename("n_identifiers")
            .to_frame()
            .join(species_features[[SBML_DFS.S_NAME, "species_type"]])
            .reset_index(drop=False)
            .to_dict(orient="records")
        )

        return stats

    def add_species_data(self, label: str, data: pd.DataFrame):
        """Adds additional species_data with validation

        Args:
            label (str): the label for the new data
            data (pd.DataFrame): the data

        Raises:
            ValueError: if the data is not valid, i.e. does not match with `species`
        """
        self._validate_species_data(data)
        if label in self.species_data:
            raise ValueError(
                f"{label} already exists in species_data. Drop it first."
            )
        self.species_data[label] = data

    def add_reactions_data(self, label: str, data: pd.DataFrame):
        """Adds additional reactions_data with validation

        Args:
            label (str): the label for the new data
            data (pd.DataFrame): the data

        Raises:
            ValueError: if the data is not valid, i.e. does not match with `reactions`
        """
        self._validate_reactions_data(data)
        if label in self.reactions_data:
            raise ValueError(
                f"{label} already exists in reactions_data. Drop it first."
            )
        self.reactions_data[label] = data
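    # A sketch of attaching an extra data table; the index must match the
    # parent table's primary key and the label must be new (values illustrative):
    #
    #     expression = pd.DataFrame(
    #         {"log2_expr": [1.2, -0.4]}, index=model.species.index[:2]
    #     )
    #     expression.index.name = "s_id"
    #     model.add_species_data("expression", expression)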
    def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
        """
        Starting with a set of compartmentalized species, determine which reactions should be removed
        based on their removal. Then remove these reactions, compartmentalized species, and species.
        """

        # find reactions which should be totally removed since they are losing critical species
        removed_reactions = find_underspecified_reactions(self, sc_ids)
        self.remove_reactions(removed_reactions)

        self._remove_compartmentalized_species(sc_ids)

        # remove species (and their associated species data) if all their cspecies have been lost
        self._remove_unused_species()

    def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
        """Removes reactions from the model

        Args:
            r_ids (List[str]): the reactions to remove
            remove_species (bool, optional): whether to remove species that are no longer
                part of any reactions. Defaults to False.
        """
        # remove corresponding reactions_species
        self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
        # remove reactions
        self.reactions = self.reactions.drop(index=list(r_ids))
        # remove reactions_data
        if hasattr(self, "reactions_data"):
            for k, data in self.reactions_data.items():
                self.reactions_data[k] = data.drop(index=list(r_ids))
        # remove species if requested
        if remove_species:
            self._remove_unused_cspecies()
            self._remove_unused_species()

    def validate(self):
        """Validates the object for obvious errors"""

        if not hasattr(self, "schema"):
            raise ValueError("No schema found")

        required_tables = self._required_entities
        schema_tables = set(self.schema.keys())

        extra_tables = schema_tables.difference(required_tables)
        if len(extra_tables) != 0:
            logger.debug(
                f"{len(extra_tables)} unexpected tables found: "
                f"{', '.join(extra_tables)}"
            )

        missing_tables = required_tables.difference(schema_tables)
        if len(missing_tables) != 0:
            raise ValueError(
                f"Missing {len(missing_tables)} required tables: "
                f"{', '.join(missing_tables)}"
            )

        # check individual tables

        for table in required_tables:
            table_schema = self.schema[table]
            table_data = getattr(self, table)

            if not isinstance(table_data, pd.DataFrame):
                raise ValueError(
                    f"{table} must be a pd.DataFrame, but was a {type(table_data)}"
                )

            # check index
            expected_index_name = table_schema["pk"]
            if table_data.index.name != expected_index_name:
                raise ValueError(
                    f"the index name for {table} was not the pk: "
                    f"{expected_index_name}"
                )

            # check that all entries in the index are unique
            if len(set(table_data.index.tolist())) != table_data.shape[0]:
                duplicated_pks = table_data.index.value_counts()
                duplicated_pks = duplicated_pks[duplicated_pks > 1]

                example_duplicates = duplicated_pks.index[
                    0 : min(duplicated_pks.shape[0], 5)
                ]
                raise ValueError(
                    f"{duplicated_pks.shape[0]} primary keys were "
                    f"duplicated including {', '.join(example_duplicates)}"
                )

            # check variables
            expected_vars = set(table_schema["vars"])
            table_vars = set(list(table_data.columns))

            extra_vars = table_vars.difference(expected_vars)
            if len(extra_vars) != 0:
                logger.debug(
                    f"{len(extra_vars)} extra variables were found"
                    f" for {table}: {', '.join(extra_vars)}"
                )

            missing_vars = expected_vars.difference(table_vars)
            if len(missing_vars) != 0:
                raise ValueError(
                    f"Missing {len(missing_vars)} required variables"
                    f" for {table}: {', '.join(missing_vars)}"
                )

            # check that the table is not empty
            if table_data.shape[0] == 0:
                raise ValueError(f"{table} contained no entries")

        # check whether pks and fks agree

        pk_df = pd.DataFrame(
            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
        )

        fk_df = (
            pd.DataFrame(
                [
                    {"fk_table": k, "fk": v["fk"]}
                    for k, v in self.schema.items()
                    if "fk" in v.keys()
                ]
            )
            .set_index("fk_table")["fk"]
            .apply(pd.Series)
            .reset_index()
            .melt(id_vars="fk_table")
            .drop(["variable"], axis=1)
            .rename(columns={"value": "key"})
        )

        pk_fk_correspondences = pk_df.merge(fk_df)

        for i in range(0, pk_fk_correspondences.shape[0]):
            pk_table_keys = set(
                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
            )
            if None in pk_table_keys:
                raise ValueError(
                    f"{pk_fk_correspondences['pk_table'][i]} had "
                    "missing values in its index"
                )

            fk_table_keys = set(
                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
                    :, pk_fk_correspondences["key"][i]
                ]
            )
            if None in fk_table_keys:
                raise ValueError(
                    f"{pk_fk_correspondences['fk_table'][i]} included "
                    f"missing {pk_fk_correspondences['key'][i]} values"
                )

            # all foreign keys need to match a primary key

            extra_fks = fk_table_keys.difference(pk_table_keys)
            if len(extra_fks) != 0:
                raise ValueError(
                    f"{len(extra_fks)} distinct "
                    f"{pk_fk_correspondences['key'][i]} values were"
                    f" found in {pk_fk_correspondences['fk_table'][i]} "
                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
                    " All foreign keys must have a matching primary key.\n\n"
                    f"Extra keys are: {', '.join(extra_fks)}"
                )

        # check optional data tables:
        for k, v in self.species_data.items():
            try:
                self._validate_species_data(v)
            except ValueError as e:
                raise ValueError(f"species data {k} was invalid.") from e

        for k, v in self.reactions_data.items():
            try:
                self._validate_reactions_data(v)
            except ValueError as e:
                raise ValueError(f"reactions data {k} was invalid.") from e

        # validate reaction_species sbo_terms and stoichiometry
        self._validate_reaction_species()

    def validate_and_resolve(self):
        """Call validate and try to iteratively resolve common validation errors"""

        current_exception = None
        validated = False

        while not validated:
            try:
                self.validate()
                validated = True
            except Exception as e:
                e_str = str(e)
                if e_str == current_exception:
                    logger.warning(
                        "Automated resolution of an Exception was attempted but failed"
                    )
                    raise e

                # try to resolve
                self._attempt_resolve(e)

    def _remove_unused_cspecies(self):
        """Removes compartmentalized species that are no
        longer part of any reactions"""
        sc_ids = self._get_unused_cspecies()
        self._remove_compartmentalized_species(sc_ids)

    def _get_unused_cspecies(self) -> set[str]:
        """Returns a set of compartmentalized species
        that are not part of any reactions"""
        sc_ids = set(self.compartmentalized_species.index) - set(
            self.reaction_species[SBML_DFS.SC_ID]
        )
        return sc_ids  # type: ignore

    def _remove_unused_species(self):
        """Removes species that are no longer part of any
        compartmentalized species"""
        s_ids = self._get_unused_species()
        self._remove_species(s_ids)

    def _get_unused_species(self) -> set[str]:
        """Returns a set of species that are not part of any reactions"""
        s_ids = set(self.species.index) - set(
            self.compartmentalized_species[SBML_DFS.S_ID]
        )
        return s_ids  # type: ignore

    def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
        """Removes compartmentalized species from the model

        This should not be used directly by the user, as it can lead to
        invalid reactions when removing species without logic to decide
        if the reaction needs to be removed as well.

        Args:
            sc_ids (Iterable[str]): the compartmentalized species to remove
        """
        # Remove compartmentalized species
        self.compartmentalized_species = self.compartmentalized_species.drop(
            index=list(sc_ids)
        )
        # remove corresponding reactions_species
        self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")

    def _remove_species(self, s_ids: Iterable[str]):
        """Removes species from the model

        This should not be used directly by the user, as it can lead to
        invalid reactions when removing species without logic to decide
        if the reaction needs to be removed as well.

        This removes the species and corresponding compartmentalized species and
        reactions_species.

        Args:
            s_ids (Iterable[str]): the species to remove
        """
        sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
        self._remove_compartmentalized_species(sc_ids)
        # Remove species
        self.species = self.species.drop(index=list(s_ids))
        # remove data
        for k, data in self.species_data.items():
            self.species_data[k] = data.drop(index=list(s_ids))

    def _validate_species_data(self, species_data_table: pd.DataFrame):
        """Validates a species data attribute

        Args:
            species_data_table (pd.DataFrame): a species data table

        Raises:
            ValueError: s_id is not the index name
            ValueError: s_id index contains duplicates
            ValueError: s_id is not in the species table
        """
        _validate_matching_data(species_data_table, self.species)

    def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
        """Validates a reactions data attribute

        Args:
            reactions_data_table (pd.DataFrame): a reactions data table

        Raises:
            ValueError: r_id is not the index name
            ValueError: r_id index contains duplicates
            ValueError: r_id is not in the reactions table
        """
        _validate_matching_data(reactions_data_table, self.reactions)

    def _validate_reaction_species(self):
        assert all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull())

        # test for null SBO terms
        n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
        if n_null_sbo_terms != 0:
            raise ValueError(
                f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
            )

        # find invalid SBO terms
        sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
        invalid_sbo_term_counts = sbo_counts[
            ~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
        ]

        if invalid_sbo_term_counts.shape[0] != 0:
            invalid_sbo_counts_str = ", ".join(
                [f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
            )
            raise ValueError(
                f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
                f"defined {invalid_sbo_counts_str}"
            )

    def _attempt_resolve(self, e):
        str_e = str(e)
        if str_e == "compartmentalized_species included missing c_id values":
            logger.warning(str_e)
            logger.warning(
                "Attempting to resolve with infer_uncompartmentalized_species_location()"
            )
            self = infer_uncompartmentalized_species_location(self)
        elif re.search("sbo_terms were not defined", str_e):
            logger.warning(str_e)
            logger.warning("Attempting to resolve with infer_sbo_terms()")
            self = infer_sbo_terms(self)
        else:
            logger.warning(
                "An error occurred which could not be automatically resolved"
            )
            raise e
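# A sketch of the validate/resolve loop: validate() raises on the first schema
# problem, while validate_and_resolve() retries after _attempt_resolve() applies
# a known fix (inferring missing compartments or SBO terms); a repeated
# identical error is re-raised rather than looping forever.
#
#     try:
#         model.validate()
#     except ValueError:
#         model.validate_and_resolve()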
def species_status(s_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
    """
    Species Status

    Return all of the reactions a species participates in.

    Parameters:
    s_id: str
        A species ID
    sbml_dfs: SBML_dfs

    Returns:
        pd.DataFrame, one row per reaction
    """

    matching_species = sbml_dfs.species.loc[s_id]

    if not isinstance(matching_species, pd.Series):
        raise ValueError(f"{s_id} did not match a single species")

    # find all rxns the species participates in

    matching_compartmentalized_species = sbml_dfs.compartmentalized_species[
        sbml_dfs.compartmentalized_species.s_id.isin([s_id])
    ]

    rxns_participating = sbml_dfs.reaction_species[
        sbml_dfs.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
    ]

    # find all participants in these rxns

    full_rxns_participating = sbml_dfs.reaction_species[
        sbml_dfs.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
    ].merge(
        sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
    )

    reaction_descriptions = pd.concat(
        [
            reaction_summary(x, sbml_dfs)
            for x in set(full_rxns_participating[SBML_DFS.R_ID].tolist())
        ]
    )

    status = (
        full_rxns_participating.loc[
            full_rxns_participating[SBML_DFS.SC_ID].isin(
                matching_compartmentalized_species.index.values.tolist()
            ),
            [SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
        ]
        .merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
        .reset_index(drop=True)
        .drop(SBML_DFS.R_ID, axis=1)
    )

    return status


def reaction_summary(r_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
    """
    Reaction Summary

    Return a reaction's name and a human-readable formula.

    Parameters:
    r_id: str
        A reaction ID
    sbml_dfs: SBML_dfs

    Returns:
        one-row pd.DataFrame
    """

    logger.warning(
        "reaction_summary is deprecated and will be removed in a future version of rcpr; "
        "please use reaction_summaries() instead"
    )

    matching_reaction = sbml_dfs.reactions.loc[r_id]

    if not isinstance(matching_reaction, pd.Series):
        raise ValueError(f"{r_id} did not match a single reaction")

    matching_reaction = sbml_dfs.reactions.loc[r_id]

    matching_reaction_species = sbml_dfs.reaction_species[
        sbml_dfs.reaction_species.r_id.isin([r_id])
    ].merge(
        sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
    )

    # collapse all reaction species to a formula string

    if len(matching_reaction_species[SBML_DFS.C_ID].unique()) == 1:
        augmented_matching_reaction_species = matching_reaction_species.merge(
            sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True
        ).merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
        str_formula = (
            construct_formula_string(
                augmented_matching_reaction_species, sbml_dfs.reactions, SBML_DFS.S_NAME
            )
            + " ["
            + augmented_matching_reaction_species[SBML_DFS.C_NAME][0]
            + "]"
        )
    else:
        str_formula = construct_formula_string(
            matching_reaction_species, sbml_dfs.reactions, SBML_DFS.SC_NAME
        )

    output = pd.DataFrame(
        {
            SBML_DFS.R_NAME: matching_reaction[SBML_DFS.R_NAME],
            "r_formula_str": str_formula,
        },
        index=[r_id],
    )

    output.index.name = SBML_DFS.R_ID

    return output


def reaction_summaries(sbml_dfs: SBML_dfs, r_ids=None) -> pd.Series:
    """
    Reaction Summaries

    Return human-readable formulas for reactions.

    Parameters:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational mechanistic model
    r_ids: [str], str or None
        Reaction IDs or None for all reactions

    Returns:
    ----------
    formula_strs: pd.Series
    """

    if isinstance(r_ids, str):
        r_ids = [r_ids]

    if r_ids is None:
        matching_reactions = sbml_dfs.reactions
    else:
        matching_reactions = sbml_dfs.reactions.loc[r_ids]

    matching_reaction_species = sbml_dfs.reaction_species[
        sbml_dfs.reaction_species.r_id.isin(matching_reactions.index)
    ].merge(
        sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
    )

    # split into within-compartment and cross-compartment reactions
    r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
        SBML_DFS.C_ID
    ].nunique()

    # identify reactions which work across compartments
    r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
    # these species must be labelled with the sc_name to specify where a species exists
    if r_id_cross_compartment.shape[0] > 0:
        rxn_eqtn_cross_compartment = (
            matching_reaction_species[
                matching_reaction_species[SBML_DFS.R_ID].isin(
                    r_id_cross_compartment.index
                )
            ]
            .sort_values([SBML_DFS.SC_NAME])
            .groupby(SBML_DFS.R_ID)
            .apply(
                lambda x: construct_formula_string(
                    x, sbml_dfs.reactions, SBML_DFS.SC_NAME
                )
            )
            .rename("r_formula_str")
        )
    else:
        rxn_eqtn_cross_compartment = None

    # identify reactions which occur within a single compartment; for these the reaction
    # can be labelled with the compartment and individual species can receive a more readable s_name
    r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
    if r_id_within_compartment.shape[0] > 0:
        # add s_name
        augmented_matching_reaction_species = (
            matching_reaction_species[
                matching_reaction_species[SBML_DFS.R_ID].isin(
                    r_id_within_compartment.index
                )
            ]
            .merge(sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True)
            .merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
            .sort_values([SBML_DFS.S_NAME])
        )
        # create formulas based on s_names of components
        rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
            [SBML_DFS.R_ID, SBML_DFS.C_NAME]
        ).apply(
            lambda x: construct_formula_string(x, sbml_dfs.reactions, SBML_DFS.S_NAME)
        )
        # add compartment for each reaction
        rxn_eqtn_within_compartment = pd.Series(
            [
                y + ": " + x
                for x, y in zip(
                    rxn_eqtn_within_compartment,
                    rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.C_NAME),
                )
            ],
            index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
        ).rename("r_formula_str")
    else:
        rxn_eqtn_within_compartment = None

    formula_strs = pd.concat([rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment])

    return formula_strs
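# A usage sketch for reaction_summaries(); the reaction id and the rendered
# formula below are illustrative, not real model contents:
#
#     formulas = reaction_summaries(model, r_ids=["R00000001"])
#     # e.g. "cytosol: ATP + D-glucose -> ADP + D-glucose 6-phosphate"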
def construct_formula_string(
    reaction_species_df: pd.DataFrame,
    reactions_df: pd.DataFrame,
    name_var: str,
) -> str:
    """
    Construct Formula String

    Convert a table of reaction species into a formula string

    Parameters:
    ----------
    reaction_species_df: pd.DataFrame
        Table containing a reaction's species
    reactions_df: pd.DataFrame
        sbml_dfs.reactions
    name_var: str
        Name used to label species

    Returns:
    ----------
    formula_str: str
        String representation of a reaction's substrates, products and
        modifiers
    """

    reaction_species_df["label"] = [
        add_stoi_to_species_name(x, y)
        for x, y in zip(
            reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
        )
    ]

    rxn_reversible = bool(
        reactions_df.loc[reaction_species_df[SBML_DFS.R_ID][0], SBML_DFS.R_ISREVERSIBLE]
    )  # convert from a np.bool_ to bool if needed
    assert isinstance(rxn_reversible, bool)

    if rxn_reversible:
        arrow_type = " <-> "
    else:
        arrow_type = " -> "

    substrates = " + ".join(
        reaction_species_df["label"][
            reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
        ].tolist()
    )
    products = " + ".join(
        reaction_species_df["label"][
            reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
        ].tolist()
    )
    modifiers = " + ".join(
        reaction_species_df["label"][
            reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
        ].tolist()
    )
    if modifiers != "":
        modifiers = f" ---- modifiers: {modifiers}]"

    return f"{substrates}{arrow_type}{products}{modifiers}"
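# A sketch of how the formula is assembled: species with negative stoichiometry
# become substrates, positive become products, zero become trailing modifiers,
# and r_isreversible selects "<->" versus "->" (the input DataFrame name is
# illustrative):
#
#     formula = construct_formula_string(
#         one_reactions_species_df, model.reactions, "sc_name"
#     )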
def add_stoi_to_species_name(stoi: float | int, name: str) -> str:
    """
    Add Stoi To Species Name

    Add # of molecules to a species name

    Parameters:
    ----------
    stoi: float or int
        Number of molecules
    name: str
        Name of species

    Returns:
    ----------
    name: str
        Name containing number of species
    """

    if stoi in [-1, 0, 1]:
        return name
    else:
        return str(abs(stoi)) + " " + name
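# Behaviour sketch: stoichiometries of -1, 0 and 1 leave the name unchanged;
# other coefficients are prepended as absolute counts:
#
#     add_stoi_to_species_name(-2.0, "H2O")  # "2.0 H2O"
#     add_stoi_to_species_name(1, "ATP")     # "ATP"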
def filter_to_characteristic_species_ids(
    species_ids: pd.DataFrame,
    max_complex_size: int = 4,
    max_promiscuity: int = 20,
    defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
) -> pd.DataFrame:
    """
    Filter to Characteristic Species IDs

    Remove identifiers corresponding to one component within a large protein
    complex and non-characteristic annotations such as pubmed references and
    homologues.

    Parameters
    ----------
    species_ids: pd.DataFrame
        A table of identifiers produced by sbml_dfs.get_identifiers("species")
    max_complex_size: int
        The largest size of a complex for which BQB_HAS_PART terms will be retained.
        In most cases, complexes are handled with specific formation and
        dissolution reactions, but these identifiers will be pulled in when
        searching by identifiers or searching the identifiers associated with a
        species against an external resource such as Open Targets.
    max_promiscuity: int
        Maximum number of species where a single molecule can act as a
        BQB_HAS_PART component associated with a single identifier (and common ontology).
    defining_biological_qualifiers (list[str]):
        BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
        permissive settings would include homologs and different forms of the same gene.

    Returns:
    --------
    species_ids: pd.DataFrame
        Input species filtered to characteristic identifiers
    """

    if not isinstance(species_ids, pd.DataFrame):
        raise TypeError(
            f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
        )

    if not isinstance(max_complex_size, int):
        raise TypeError(
            f"max_complex_size was a {type(max_complex_size)} but must be an int"
        )

    if not isinstance(max_promiscuity, int):
        raise TypeError(
            f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
        )

    if not isinstance(defining_biological_qualifiers, list):
        raise TypeError(
            f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
        )

    # primary annotations of a species
    bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")

    # add components within modestly sized protein complexes
    # look at HAS_PART IDs
    bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
    # filter to genes
    bqb_has_parts_species = bqb_has_parts_species[
        bqb_has_parts_species[IDENTIFIERS.ONTOLOGY].isin(
            CHARACTERISTIC_COMPLEX_ONTOLOGIES
        )
    ]

    # number of species in a complex
    n_species_components = bqb_has_parts_species.value_counts(
        [IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
    )
    big_complex_sids = set(
        n_species_components[
            n_species_components > max_complex_size
        ].index.get_level_values(SBML_DFS.S_ID)
    )

    # number of complexes a species is part of
    n_complexes_involvedin = bqb_has_parts_species.value_counts(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
    )
    promiscuous_component_identifiers_index = n_complexes_involvedin[
        n_complexes_involvedin > max_promiscuity
    ].index
    promiscuous_component_identifiers = pd.Series(
        data=[True] * len(promiscuous_component_identifiers_index),
        index=promiscuous_component_identifiers_index,
        name="is_shared_component",
    )

    if len(promiscuous_component_identifiers) == 0:
        # no complexes to filter
        return species_ids

    filtered_bqb_has_parts = bqb_has_parts_species.merge(
        promiscuous_component_identifiers,
        left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
        right_index=True,
        how="left",
    )

    filtered_bqb_has_parts["is_shared_component"] = filtered_bqb_has_parts[
        "is_shared_component"
    ].fillna(False)
    # drop identifiers shared as components across many species
    filtered_bqb_has_parts = filtered_bqb_has_parts[
        ~filtered_bqb_has_parts["is_shared_component"]
    ].drop(["is_shared_component"], axis=1)
    # drop species parts if there are many components
    filtered_bqb_has_parts = filtered_bqb_has_parts[
        ~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
    ]

    # combine primary identifiers and rare components
    characteristic_species_ids = pd.concat(
        [
            bqb_is_species,
            filtered_bqb_has_parts,
        ]
    )

    return characteristic_species_ids
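# A filtering sketch; the thresholds shown are the function's defaults:
#
#     species_ids = model.get_identifiers("species")
#     characteristic_ids = filter_to_characteristic_species_ids(
#         species_ids, max_complex_size=4, max_promiscuity=20
#     )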
def infer_uncompartmentalized_species_location(sbml_dfs: SBML_dfs) -> SBML_dfs:
    """
    Infer Uncompartmentalized Species Location

    If the compartment of a subset of compartmentalized species
    was not specified, infer an appropriate compartment from
    other members of reactions they participate in.

    Parameters:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational pathway model

    Returns:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational pathway model (with filled-in species compartments)

    """

    # find a default compartment to fall back on if all compartmental information is missing
    default_compartment = (
        sbml_dfs.compartmentalized_species.value_counts(SBML_DFS.C_ID)
        .rename("N")
        .reset_index()
        .sort_values("N", ascending=False)[SBML_DFS.C_ID]
        .iloc[0]
    )
    if not isinstance(default_compartment, str):
        raise ValueError(
            "No default compartment could be found - compartment "
            "information may not be present"
        )

    # infer the compartments of species missing compartments
    missing_compartment_scids = sbml_dfs.compartmentalized_species[
        sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
    ].index.tolist()
    if len(missing_compartment_scids) == 0:
        logger.info(
            "All compartmentalized species have compartments, "
            "returning input sbml_dfs"
        )
        return sbml_dfs

    participating_reactions = (
        sbml_dfs.reaction_species[
            sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
        ][SBML_DFS.R_ID]
        .unique()
        .tolist()
    )
    reaction_participants = sbml_dfs.reaction_species[
        sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
    ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
    reaction_participants = reaction_participants.merge(
        sbml_dfs.compartmentalized_species[SBML_DFS.C_ID],
        left_on=SBML_DFS.SC_ID,
        right_index=True,
    )

    # define where a reaction is most likely to occur based on the
    # compartmentalization of its participants
    primary_reaction_compartment = (
        reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
        .rename("N")
        .reset_index()
        .sort_values("N", ascending=False)
        .groupby(SBML_DFS.R_ID)
        .first()[SBML_DFS.C_ID]
        .reset_index()
    )

    inferred_compartmentalization = (
        sbml_dfs.reaction_species[
            sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
        ]
        .merge(primary_reaction_compartment)
        .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
        .rename("N")
        .reset_index()
        .sort_values("N", ascending=False)
        .groupby(SBML_DFS.SC_ID)
        .first()
        .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
    )
    logger.info(
        f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
    )

    # fall back to the default compartment for species whose compartment
    # could not be inferred from their co-participants
    species_with_unknown_compartmentalization = set(
        missing_compartment_scids
    ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
    if len(species_with_unknown_compartmentalization) != 0:
        logger.warning(
            f"{len(species_with_unknown_compartmentalization)} "
            "species compartmentalization could not be inferred"
            " from other reaction participants. Their compartmentalization "
            f"will be set to the default of {default_compartment}"
        )

        inferred_compartmentalization = pd.concat(
            [
                inferred_compartmentalization,
                pd.DataFrame(
                    {SBML_DFS.SC_ID: list(species_with_unknown_compartmentalization)}
                ).assign(c_id=default_compartment),
            ]
        )

    if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
        raise ValueError(
            f"{inferred_compartmentalization.shape[0]} compartmentalizations were "
            f"inferred but {len(missing_compartment_scids)} are required"
        )

    updated_compartmentalized_species = pd.concat(
        [
            sbml_dfs.compartmentalized_species[
                ~sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
            ],
            sbml_dfs.compartmentalized_species[
                sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
            ]
            .drop(SBML_DFS.C_ID, axis=1)
            .merge(
                inferred_compartmentalization, left_index=True, right_on=SBML_DFS.SC_ID
            )
            .set_index(SBML_DFS.SC_ID),
        ]
    )

    if (
        updated_compartmentalized_species.shape[0]
        != sbml_dfs.compartmentalized_species.shape[0]
    ):
        raise ValueError(
            f"Trying to overwrite {sbml_dfs.compartmentalized_species.shape[0]}"
            " compartmentalized species with "
            f"{updated_compartmentalized_species.shape[0]}"
        )

    if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
        raise ValueError("Some species compartments are still missing")

    sbml_dfs.compartmentalized_species = updated_compartmentalized_species

    return sbml_dfs

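# Usage sketch (illustrative, not part of the original module): fill in
# missing compartments on a loaded model. Assumes `sbml_dfs` is an SBML_dfs
# whose compartmentalized_species table contains some null c_id values.
#
#     sbml_dfs = infer_uncompartmentalized_species_location(sbml_dfs)
#     assert not sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull().any()
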
def infer_sbo_terms(sbml_dfs: SBML_dfs) -> SBML_dfs:
    """
    Infer SBO Terms

    Define SBO terms based on stoichiometry for reaction_species with missing terms.

    Parameters:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational pathway model

    Returns:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational pathway model (with missing/invalid reaction species sbo_terms resolved)

    """

    valid_sbo_terms = sbml_dfs.reaction_species[
        sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
    ]

    # work on a copy so the .loc assignments below do not write into a view
    invalid_sbo_terms = sbml_dfs.reaction_species[
        ~sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
    ].copy()

    assert all(sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].notnull())
    if invalid_sbo_terms.shape[0] == 0:
        logger.info("All sbo_terms were valid; returning input sbml_dfs")
        return sbml_dfs

    logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")

    # add missing/invalid terms based on stoichiometry
    invalid_sbo_terms.loc[
        invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
    ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]

    invalid_sbo_terms.loc[
        invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
    ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]

    invalid_sbo_terms.loc[
        invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
    ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]

    updated_reaction_species = pd.concat(
        [valid_sbo_terms, invalid_sbo_terms]
    ).sort_index()

    assert sbml_dfs.reaction_species.shape[0] == updated_reaction_species.shape[0]
    sbml_dfs.reaction_species = updated_reaction_species

    return sbml_dfs

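# Illustration of the stoichiometry fallback above (a sketch restating the
# rule in the code): species with negative stoichiometry become reactants,
# positive become products, and zero become stimulators, so afterwards every
# sbo_term is a key of MINI_SBO_TO_NAME.
#
#     sbml_dfs = infer_sbo_terms(sbml_dfs)
#     assert sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME).all()
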
def name_compartmentalized_species(sbml_dfs):
    """
    Name Compartmentalized Species

    Rename compartmentalized species if they have the same
    name as their species.

    Parameters
    ----------
    sbml_dfs : SBML_dfs
        A model formed by aggregating pathways

    Returns:
    ----------
    sbml_dfs
    """

    augmented_cspecies = sbml_dfs.compartmentalized_species.merge(
        sbml_dfs.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
    ).merge(
        sbml_dfs.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
    )
    augmented_cspecies[SBML_DFS.SC_NAME] = [
        f"{s} [{c}]" if sc == s else sc
        for sc, c, s in zip(
            augmented_cspecies[SBML_DFS.SC_NAME],
            augmented_cspecies[SBML_DFS.C_NAME],
            augmented_cspecies[SBML_DFS.S_NAME],
        )
    ]

    sbml_dfs.compartmentalized_species = augmented_cspecies.loc[
        :, sbml_dfs.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
    ]

    return sbml_dfs

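# Example of the renaming rule (hypothetical values): a compartmentalized
# species whose sc_name equals its species name, e.g. "ATP" located in the
# "cytosol" compartment, becomes "ATP [cytosol]"; sc_names that already
# differ from the species name are left untouched.
#
#     sbml_dfs = name_compartmentalized_species(sbml_dfs)
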
def export_sbml_dfs(
    model_prefix: str,
    sbml_dfs: SBML_dfs,
    outdir: str,
    overwrite: bool = False,
    dogmatic: bool = True,
) -> None:
    """
    Export SBML_dfs

    Export summaries of species identifiers and each table underlying
    an SBML_dfs pathway model.

    Params
    ------
    model_prefix: str
        Label to prepend to all exported files
    sbml_dfs: sbml.SBML_dfs
        A pathway model
    outdir: str
        Path to an existing directory where results should be saved
    overwrite: bool
        Should the directory be overwritten if it already exists?
    dogmatic: bool
        If True then treat genes, transcripts, and proteins as separate species. If False
        then treat them interchangeably.

    Returns
    -------
    None

    """

    if not isinstance(model_prefix, str):
        raise TypeError(f"model_prefix was a {type(model_prefix)} and must be a str")
    if not isinstance(sbml_dfs, SBML_dfs):
        raise TypeError(
            f"sbml_dfs was a {type(sbml_dfs)} and must be an sbml.SBML_dfs"
        )
    # select valid BQB attributes based on dogmatic flag
    defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)

    # pre-summarize ontologies
    species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
    # drop some BQB_HAS_PART annotations
    species_identifiers = filter_to_characteristic_species_ids(
        species_identifiers,
        defining_biological_qualifiers=defining_biological_qualifiers,
    )

    try:
        utils.initialize_dir(outdir, overwrite=overwrite)
    except FileExistsError:
        logger.warning(
            f"Directory {outdir} already exists and overwrite is False. "
            "Files will be added to the existing directory."
        )
    with open_fs(outdir, writeable=True) as fs:
        species_identifiers_path = (
            model_prefix + CPR_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
        )
        with fs.openbin(species_identifiers_path, "w") as f:
            species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
                f, sep="\t", index=False
            )

        # export jsons
        species_path = model_prefix + CPR_STANDARD_OUTPUTS.SPECIES
        reactions_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTIONS
        reaction_species_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTION_SPECIES
        compartments_path = model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTS
        compartmentalized_species_path = (
            model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
        )
        with fs.openbin(species_path, "w") as f:
            sbml_dfs.species[[SBML_DFS.S_NAME]].to_json(f)

        with fs.openbin(reactions_path, "w") as f:
            sbml_dfs.reactions[[SBML_DFS.R_NAME]].to_json(f)

        with fs.openbin(reaction_species_path, "w") as f:
            sbml_dfs.reaction_species.to_json(f)

        with fs.openbin(compartments_path, "w") as f:
            sbml_dfs.compartments[[SBML_DFS.C_NAME]].to_json(f)

        with fs.openbin(compartmentalized_species_path, "w") as f:
            sbml_dfs.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
                f
            )

    return None

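# Usage sketch (prefix and directory names are illustrative): write the
# tab-separated species identifier summary plus per-table JSON exports.
#
#     export_sbml_dfs("reactome", sbml_dfs, "exports", overwrite=True)
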
def sbml_dfs_from_edgelist(
    interaction_edgelist: pd.DataFrame,
    species_df: pd.DataFrame,
    compartments_df: pd.DataFrame,
    interaction_source: source.Source,
    upstream_stoichiometry: int = 0,
    downstream_stoichiometry: int = 1,
    downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
    keep_species_data: bool | str = False,
    keep_reactions_data: bool | str = False,
) -> SBML_dfs:
    """
    Create SBML_dfs from Edgelist

    Combine a set of interactions into an sbml.SBML_dfs mechanistic model.

    Parameters:
    interaction_edgelist (pd.DataFrame): A table containing interactions:
        - upstream_name (str): matching "s_name" from "species_df"
        - downstream_name (str): matching "s_name" from "species_df"
        - upstream_compartment (str): compartment of "upstream_name"
            with names matching "c_name" from "compartments_df"
        - downstream_compartment (str): compartment of "downstream_name"
            with names matching "c_name" from "compartments_df"
        - r_name (str): a name for the interaction
        - sbo_term (str): sbo term defining the type of
            molecular interaction (see MINI_SBO_FROM_NAME)
        - r_Identifiers (identifiers.Identifiers): identifiers
            supporting the interaction (e.g., pubmed ids)
        - r_isreversible (bool): Is this reaction reversible?
            By convention, TRRUST interactions are irreversible
            while STRING interactions are reversible.
    species_df (pd.DataFrame): A table defining unique molecular
        species participating in "interaction_edgelist":
        - s_name (str): name of molecular species
        - s_Identifiers (identifiers.Identifiers): identifiers
            defining the species
    compartments_df (pd.DataFrame): A table defining the compartments
        where the interactions in "interaction_edgelist" occur:
        - c_name (str): name of compartment
        - c_Identifiers (identifiers.Identifiers):
            identifiers defining the compartment (see
            bigg.annotate_recon() for a set of names > go categories)
    interaction_source (source.Source): A source object
        which will tie model entities to the interaction source
    upstream_stoichiometry (int): stoichiometry of
        upstream species in reaction
    downstream_stoichiometry (int): stoichiometry of
        downstream species in reaction
    downstream_sbo_name (str): sbo term defining the
        type of molecular interaction for the downstream reactant
        (see MINI_SBO_FROM_NAME)
    keep_species_data (bool | str): Should species data
        be kept in the model? If True, all species data will be kept
        and saved as "species_data" in the SBML_dfs under the label 'source'.
        If False, no species data will be kept.
        If a string: label for the species data to be kept.
    keep_reactions_data (bool | str): Should reaction data be kept in the model?
        If True, all reaction data will be kept and saved
        as "reactions_data" in the SBML_dfs under the label 'source'.
        If False, no reaction data will be kept.
        If a string: label for the reaction data to be kept.

    Returns:
    sbml.SBML_dfs

    """

    # check input dfs for required variables
    _sbml_dfs_from_edgelist_validate_inputs(
        interaction_edgelist, species_df, compartments_df
    )

    # identify extra columns in the input data; if keep_reactions_data is
    # not False these will be retained as `reactions_data`
    interaction_edgelist_required_vars = {
        "upstream_name",
        "downstream_name",
        "upstream_compartment",
        "downstream_compartment",
        SBML_DFS.R_NAME,
        SBML_DFS.SBO_TERM,
        SBML_DFS.R_IDENTIFIERS,
        SBML_DFS.R_ISREVERSIBLE,
    }
    if keep_reactions_data is not False:
        extra_reactions_columns = [
            c
            for c in interaction_edgelist.columns
            if c not in interaction_edgelist_required_vars
        ]
    else:
        extra_reactions_columns = []
    # extra species columns
    if keep_species_data is not False:
        extra_species_columns = [
            c
            for c in species_df.columns
            if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
        ]
    else:
        extra_species_columns = []

    # format compartments
    compartments_df[SBML_DFS.C_SOURCE] = interaction_source
    compartments_df[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
        range(compartments_df.shape[0]), SBML_DFS.C_ID
    )
    compartments_df = compartments_df.set_index(SBML_DFS.C_ID)[
        [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
    ]

    # format species
    species_df[SBML_DFS.S_SOURCE] = interaction_source
    species_df[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
        range(species_df.shape[0]), SBML_DFS.S_ID
    )

    required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
    species_df = species_df.set_index(SBML_DFS.S_ID)[
        required_cols + extra_species_columns
    ]
    # keep extra columns to save them as extra data
    species_data = species_df[extra_species_columns]
    # remove extra columns
    species_df = species_df[required_cols]

    # create compartmentalized species

    # define all distinct upstream and downstream compartmentalized species
    comp_species = pd.concat(
        [
            interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
                {
                    "upstream_name": SBML_DFS.S_NAME,
                    "upstream_compartment": SBML_DFS.C_NAME,
                },
                axis=1,
            ),
            interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
                {
                    "downstream_name": SBML_DFS.S_NAME,
                    "downstream_compartment": SBML_DFS.C_NAME,
                },
                axis=1,
            ),
        ]
    ).drop_duplicates()

    # merge to add species and compartments primary keys
    comp_species_w_ids = comp_species.merge(
        species_df[SBML_DFS.S_NAME].reset_index(),
        how="left",
        left_on=SBML_DFS.S_NAME,
        right_on=SBML_DFS.S_NAME,
    ).merge(
        compartments_df[SBML_DFS.C_NAME].reset_index(),
        how="left",
        left_on=SBML_DFS.C_NAME,
        right_on=SBML_DFS.C_NAME,
    )

    # check whether all species and compartments exist
    _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)

    # name compartmentalized species
    comp_species_w_ids[SBML_DFS.SC_NAME] = [
        f"{s} [{c}]"
        for s, c in zip(
            comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
        )
    ]
    # add source object
    comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
    # assign primary keys
    comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
        range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
    )
    comp_species_w_ids = comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
        [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
    ]

    # create reactions

    # create a cs_species -> cs_species edgelist from interaction_edgelist
    comp_species_w_names = (
        comp_species_w_ids.reset_index()
        .merge(species_df[SBML_DFS.S_NAME].reset_index())
        .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
    )

    interaction_edgelist_w_cspecies = interaction_edgelist.merge(
        comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
            {
                SBML_DFS.SC_ID: "sc_id_up",
                SBML_DFS.S_NAME: "upstream_name",
                SBML_DFS.C_NAME: "upstream_compartment",
            },
            axis=1,
        ),
        how="left",
    ).merge(
        comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
            {
                SBML_DFS.SC_ID: "sc_id_down",
                SBML_DFS.S_NAME: "downstream_name",
                SBML_DFS.C_NAME: "downstream_compartment",
            },
            axis=1,
        ),
        how="left",
    )[
        REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
    ]

    # some extra checks
    if interaction_edgelist.shape[0] != interaction_edgelist_w_cspecies.shape[0]:
        raise ValueError(
            "Merging compartmentalized species to interaction_edgelist"
            " resulted in an increase in the tables from "
            f"{interaction_edgelist.shape[0]} to "
            f"{interaction_edgelist_w_cspecies.shape[0]} indicating"
            " a 1-many join which should have been 1-1"
        )

    # create one reaction per interaction
    interaction_edgelist_w_cspecies[SBML_DFS.R_SOURCE] = interaction_source
    interaction_edgelist_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
        range(interaction_edgelist_w_cspecies.shape[0]), SBML_DFS.R_ID
    )

    reactions_df_columns = [
        SBML_DFS.R_NAME,
        SBML_DFS.R_IDENTIFIERS,
        SBML_DFS.R_SOURCE,
        SBML_DFS.R_ISREVERSIBLE,
    ]
    reactions_df = interaction_edgelist_w_cspecies.copy().set_index(SBML_DFS.R_ID)[
        reactions_df_columns + extra_reactions_columns
    ]
    # keep extra columns to save them as extra data
    reactions_data = reactions_df[extra_reactions_columns]
    reactions_df = reactions_df[reactions_df_columns]

    # define upstream and downstream comp species as reaction species
    reaction_species_df = pd.concat(
        [
            # upstream interactions are defined by sbo_term and should generally
            # be modifiers/stimulator/inhibitor/interactor
            interaction_edgelist_w_cspecies[["sc_id_up", "sbo_term", "r_id"]]
            .assign(stoichiometry=upstream_stoichiometry)
            .rename({"sc_id_up": "sc_id"}, axis=1),
            # downstream interactions indicate some modification of the state
            # of the species and hence are defined as product
            interaction_edgelist_w_cspecies[["sc_id_down", "r_id"]]
            .assign(
                stoichiometry=downstream_stoichiometry,
                sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
            )
            .rename({"sc_id_down": "sc_id"}, axis=1),
        ]
    )
    reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
        range(reaction_species_df.shape[0]), "rsc_id"
    )
    reaction_species_df = reaction_species_df.set_index("rsc_id")

    # form sbml_dfs object
    sbml_tbl_dict: MutableMapping[str, pd.DataFrame | dict[str, pd.DataFrame]] = {
        "compartments": compartments_df,
        "species": species_df,
        "compartmentalized_species": comp_species_w_ids,
        "reactions": reactions_df,
        "reaction_species": reaction_species_df,
    }
    if len(extra_reactions_columns) > 0:
        if isinstance(keep_reactions_data, str):
            reactions_data_label = keep_reactions_data
        else:
            reactions_data_label = "source"
        sbml_tbl_dict["reactions_data"] = {reactions_data_label: reactions_data}

    if len(extra_species_columns) > 0:
        if isinstance(keep_species_data, str):
            species_data_label = keep_species_data
        else:
            species_data_label = "source"
        sbml_tbl_dict["species_data"] = {species_data_label: species_data}

    sbml_model = SBML_dfs(sbml_tbl_dict)
    sbml_model.validate()

    return sbml_model

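# Minimal sketch of building a model from a single interaction. All values
# are illustrative; species_df and compartments_df must carry the columns
# validated by the helpers below, and the empty Identifiers object is a
# stand-in for real supporting identifiers.
#
#     interaction_edgelist = pd.DataFrame(
#         {
#             "upstream_name": ["TP53"],
#             "downstream_name": ["MDM2"],
#             "upstream_compartment": ["nucleoplasm"],
#             "downstream_compartment": ["nucleoplasm"],
#             "r_name": ["TP53 activates MDM2"],
#             "sbo_term": [MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]],
#             "r_Identifiers": [identifiers.Identifiers([])],
#             "r_isreversible": [False],
#         }
#     )
#     sbml_dfs = sbml_dfs_from_edgelist(
#         interaction_edgelist, species_df, compartments_df, interaction_source
#     )
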
def find_underspecified_reactions(
    sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
) -> set[str]:
    """
    Find Underspecified Reactions

    Identify reactions which should be removed if a set of molecular species are removed
    from the system.

    Params:
    sbml_dfs (SBML_dfs):
        A pathway representation
    sc_ids (list[str])
        A list of compartmentalized species ids (sc_ids) which will be removed.

    Returns:
    underspecified_reactions (set[str]):
        A set of reactions which should be removed because they will not occur once
        "sc_ids" are removed.

    """

    updated_reaction_species = sbml_dfs.reaction_species.copy()
    updated_reaction_species["new"] = ~updated_reaction_species[SBML_DFS.SC_ID].isin(
        sc_ids
    )

    updated_reaction_species = (
        updated_reaction_species.assign(
            sbo_role=updated_reaction_species[SBML_DFS.SBO_TERM]
        )
        .replace({"sbo_role": MINI_SBO_TO_NAME})
        .replace({"sbo_role": SBO_NAME_TO_ROLE})
    )

    reactions_with_lost_defining_members = set(
        updated_reaction_species.query("~new")
        .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
        .tolist()
    )

    N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
    if N_reactions_with_lost_defining_members > 0:
        logger.info(
            f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
        )

    # for each reaction what are the required sbo_terms?
    reactions_with_requirements = (
        updated_reaction_species.query("sbo_role == 'REQUIRED'")[
            ["r_id", "sbo_term", "new"]
        ]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # which required members are still present after removing some entries
    reactions_with_lost_requirements = set(
        reactions_with_requirements.query("~new")
        .merge(
            reactions_with_requirements.query("new").rename(
                {"new": "still_present"}, axis=1
            ),
            how="left",
        )
        .fillna(False)[SBML_DFS.R_ID]
        .tolist()
    )

    N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
    if N_reactions_with_lost_requirements > 0:
        logger.info(
            f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
        )

    underspecified_reactions = reactions_with_lost_defining_members.union(
        reactions_with_lost_requirements
    )

    return underspecified_reactions

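# Usage sketch: when pruning compartmentalized species from a model, first
# collect the reactions that can no longer occur (those losing a defining
# member, or all members of a required role) and remove them alongside the
# species.
#
#     underspecified = find_underspecified_reactions(sbml_dfs, sc_ids)
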
def _sbml_dfs_from_edgelist_validate_inputs(
    interaction_edgelist: pd.DataFrame,
    species_df: pd.DataFrame,
    compartments_df: pd.DataFrame,
) -> None:
    """Check that the inputs for creating an SBML_dfs from an edgelist are appropriate."""

    # check compartments
    compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
    compartments_df_columns = set(compartments_df.columns.tolist())
    missing_required_fields = compartments_df_expected_vars.difference(
        compartments_df_columns
    )
    if len(missing_required_fields) > 0:
        raise ValueError(
            f"{', '.join(missing_required_fields)} are required variables"
            ' in "compartments_df" but were not present in the input file.'
        )

    # check species
    species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
    species_df_columns = set(species_df.columns.tolist())
    missing_required_fields = species_df_expected_vars.difference(species_df_columns)
    if len(missing_required_fields) > 0:
        raise ValueError(
            f"{', '.join(missing_required_fields)} are required"
            ' variables in "species_df" but were not present '
            "in the input file."
        )

    # check interactions
    interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
    missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
        interaction_edgelist_columns
    )
    if len(missing_required_fields) > 0:
        raise ValueError(
            f"{', '.join(missing_required_fields)} are required "
            'variables in "interaction_edgelist" but were not '
            "present in the input file."
        )

    return None

def _sbml_dfs_from_edgelist_check_cspecies_merge(
    merged_species: pd.DataFrame, original_species: pd.DataFrame
) -> None:
    """Check for a mismatch between the provided species data and species implied by the edgelist."""

    # check for 1-many merge
    if merged_species.shape[0] != original_species.shape[0]:
        raise ValueError(
            "Merging compartmentalized species to species_df"
            " and compartments_df by names resulted in an "
            f"increase in the tables from {original_species.shape[0]}"
            f" to {merged_species.shape[0]} indicating that names were"
            " not unique"
        )

    # check for missing species and compartments
    missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
        SBML_DFS.C_NAME
    ].unique()
    if len(missing_compartments) >= 1:
        raise ValueError(
            f"{len(missing_compartments)} compartments were present in"
            ' "interaction_edgelist" but not "compartments_df":'
            f" {', '.join(missing_compartments)}"
        )

    missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
        SBML_DFS.S_NAME
    ].unique()
    if len(missing_species) >= 1:
        raise ValueError(
            f"{len(missing_species)} species were present in "
            '"interaction_edgelist" but not "species_df":'
            f" {', '.join(missing_species)}"
        )

    return None

def _stub_compartments(
    stubbed_compartment: str = "CELLULAR_COMPONENT",
) -> pd.DataFrame:
    """Stub Compartments

    Create a compartments table with only a single compartment.

    Args:
        stubbed_compartment (str): the name of a compartment which should match the
            keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS

    Returns:
        compartments_df (pd.DataFrame): compartments dataframe
    """

    if stubbed_compartment not in COMPARTMENTS:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
        )

    if stubbed_compartment not in COMPARTMENTS_GO_TERMS:
        raise ValueError(
            f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
        )

    stubbed_compartment_name = COMPARTMENTS[stubbed_compartment]
    stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]

    formatted_uri = identifiers.format_uri(
        uri=identifiers.create_uri_url(
            ontology=ONTOLOGIES.GO,
            identifier=stubbed_compartment_id,
        ),
        biological_qualifier_type=BQB.IS,
    )

    compartments_df = pd.DataFrame(
        {
            SBML_DFS.C_NAME: [stubbed_compartment_name],
            SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
        }
    )
    compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID)  # type: ignore
    compartments_df.index.name = SBML_DFS.C_ID

    return compartments_df

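# Sketch: the single-row table returned here can serve as the compartments_df
# input to sbml_dfs_from_edgelist when a data source carries no compartment
# information (the default stubs the generic GO cellular component term).
#
#     compartments_df = _stub_compartments()
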
def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
    """Validates a table against a reference

    This checks that the table has the same index name as the reference,
    no duplicates in the index, and that all index values are present in
    the reference table.

    Args:
        data_table (pd.DataFrame): a table with data that should
            match the reference
        ref_table (pd.DataFrame): a reference table

    Raises:
        TypeError: data_table is not a pd.DataFrame
        ValueError: not same index name
        ValueError: index contains duplicates
        ValueError: index not subset of index of reference table
    """
    # validate the type before touching the index
    if not isinstance(data_table, pd.DataFrame):
        raise TypeError(
            f"The data table was type {type(data_table).__name__}"
            " but must be a pd.DataFrame"
        )
    ref_index_name = ref_table.index.name
    if data_table.index.name != ref_index_name:
        raise ValueError(
            "the index name for the data table was not"
            f" {ref_index_name}: {data_table.index.name}"
        )
    ids = data_table.index
    if any(ids.duplicated()):
        raise ValueError("the index for the data table contained duplicate values")
    if not all(ids.isin(ref_table.index)):
        raise ValueError(
            "the index for the data table contained values"
            " not found in the reference table"
        )

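# Usage sketch (hypothetical table name): a reactions_data table attached to
# a model must index a subset of the reactions table; anything else raises.
#
#     _validate_matching_data(my_reactions_data, sbml_dfs.reactions)
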
def species_type_types(x):
    """Assign a high-level molecule type to a molecular species"""

    if isinstance(x, identifiers.Identifiers):
        if x.filter(["chebi"]):
            return "metabolite"
        elif x.filter(["molodex"]):
            return "drug"
        else:
            return "protein"
    else:
        return "unknown"

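# Illustration: mapping each species' Identifiers object to a coarse type;
# non-Identifiers values (e.g., NaN) fall through to "unknown".
#
#     species_types = sbml_dfs.species[SBML_DFS.S_IDENTIFIERS].map(species_type_types)
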
def stub_ids(ids):
    """Stub an empty list of identifiers with a single-row null DataFrame."""
    if len(ids) == 0:
        return pd.DataFrame(
            {
                IDENTIFIERS.ONTOLOGY: [None],
                IDENTIFIERS.IDENTIFIER: [None],
                IDENTIFIERS.URL: [None],
                IDENTIFIERS.BQB: [None],
            }
        )
    else:
        return pd.DataFrame(ids)
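# Sketch: stubbing keeps downstream concatenation uniform; an empty input
# still yields the IDENTIFIERS columns, just filled with null values.
#
#     stub_ids([])                  # one all-None row with the IDENTIFIERS columns
#     stub_ids(identifier_records)  # otherwise a passthrough to pd.DataFrame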