napistu 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +18 -18
- napistu/consensus.py +3 -2
- napistu/constants.py +5 -5
- napistu/context/filtering.py +2 -1
- napistu/identifiers.py +3 -6
- napistu/ingestion/bigg.py +6 -6
- napistu/ingestion/string.py +2 -1
- napistu/ingestion/yeast.py +2 -1
- napistu/matching/interactions.py +4 -4
- napistu/modify/uncompartmentalize.py +1 -1
- napistu/network/ig_utils.py +35 -0
- napistu/network/net_create.py +1 -1
- napistu/network/paths.py +1 -1
- napistu/network/precompute.py +2 -1
- napistu/ontologies/dogma.py +2 -1
- napistu/sbml_dfs_core.py +1330 -2016
- napistu/sbml_dfs_utils.py +1082 -143
- napistu/source.py +1 -1
- {napistu-0.3.4.dist-info → napistu-0.3.6.dist-info}/METADATA +2 -2
- {napistu-0.3.4.dist-info → napistu-0.3.6.dist-info}/RECORD +32 -32
- tests/conftest.py +43 -0
- tests/test_consensus.py +88 -0
- tests/test_context_filtering.py +2 -2
- tests/test_network_ig_utils.py +36 -0
- tests/test_ontologies_genodexito.py +3 -0
- tests/test_ontologies_mygene.py +3 -0
- tests/test_sbml_dfs_core.py +221 -191
- tests/test_sbml_dfs_utils.py +194 -36
- {napistu-0.3.4.dist-info → napistu-0.3.6.dist-info}/WHEEL +0 -0
- {napistu-0.3.4.dist-info → napistu-0.3.6.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.4.dist-info → napistu-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.4.dist-info → napistu-0.3.6.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_core.py
CHANGED
@@ -7,8 +7,12 @@ from typing import Iterable
|
|
7
7
|
from typing import Mapping
|
8
8
|
from typing import MutableMapping
|
9
9
|
from typing import TYPE_CHECKING
|
10
|
+
from typing import Optional
|
11
|
+
from typing import Union
|
10
12
|
|
13
|
+
from fs import open_fs
|
11
14
|
import pandas as pd
|
15
|
+
|
12
16
|
from napistu import identifiers
|
13
17
|
from napistu import sbml_dfs_utils
|
14
18
|
from napistu import source
|
@@ -17,25 +21,14 @@ from napistu.ingestion import sbml
|
|
17
21
|
from napistu.constants import SBML_DFS
|
18
22
|
from napistu.constants import SBML_DFS_SCHEMA
|
19
23
|
from napistu.constants import IDENTIFIERS
|
20
|
-
from napistu.constants import
|
21
|
-
from napistu.constants import CPR_STANDARD_OUTPUTS
|
22
|
-
from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
|
24
|
+
from napistu.constants import NAPISTU_STANDARD_OUTPUTS
|
23
25
|
from napistu.constants import BQB_PRIORITIES
|
24
26
|
from napistu.constants import ONTOLOGY_PRIORITIES
|
25
|
-
from napistu.constants import BQB
|
26
|
-
from napistu.constants import BQB_DEFINING_ATTRS
|
27
27
|
from napistu.constants import MINI_SBO_FROM_NAME
|
28
28
|
from napistu.constants import MINI_SBO_TO_NAME
|
29
|
-
from napistu.constants import ONTOLOGIES
|
30
|
-
from napistu.constants import SBO_NAME_TO_ROLE
|
31
29
|
from napistu.constants import SBOTERM_NAMES
|
32
|
-
from napistu.constants import SBO_ROLES_DEFS
|
33
30
|
from napistu.constants import ENTITIES_W_DATA
|
34
31
|
from napistu.constants import ENTITIES_TO_ENTITY_DATA
|
35
|
-
from napistu.ingestion.constants import GENERIC_COMPARTMENT
|
36
|
-
from napistu.ingestion.constants import COMPARTMENT_ALIASES
|
37
|
-
from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
|
38
|
-
from fs import open_fs
|
39
32
|
|
40
33
|
logger = logging.getLogger(__name__)
|
41
34
|
|
@@ -65,26 +58,76 @@ class SBML_dfs:
|
|
65
58
|
schema : dict
|
66
59
|
Dictionary representing the structure of the other attributes and meaning of their variables
|
67
60
|
|
68
|
-
Methods
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
61
|
+
Public Methods (alphabetical)
|
62
|
+
----------------------------
|
63
|
+
add_reactions_data(label, data)
|
64
|
+
Add a new reactions data table to the model with validation.
|
65
|
+
add_species_data(label, data)
|
66
|
+
Add a new species data table to the model with validation.
|
67
|
+
export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
|
68
|
+
Export the SBML_dfs model and its tables to files in a specified directory.
|
69
|
+
get_characteristic_species_ids(dogmatic=True)
|
70
|
+
Return characteristic systematic identifiers for molecular species, optionally using a strict or loose definition.
|
76
71
|
get_cspecies_features()
|
77
|
-
|
78
|
-
get_species_features()
|
79
|
-
Get additional attributes of species
|
72
|
+
Compute and return additional features for compartmentalized species, such as degree and type.
|
80
73
|
get_identifiers(id_type)
|
81
|
-
|
82
|
-
|
83
|
-
|
74
|
+
Retrieve a table of identifiers for a specified entity type (e.g., species or reactions).
|
75
|
+
get_network_summary()
|
76
|
+
Return a dictionary of diagnostic statistics summarizing the network structure.
|
77
|
+
get_species_features()
|
78
|
+
Compute and return additional features for species, such as species type.
|
79
|
+
get_table(entity_type, required_attributes=None)
|
80
|
+
Retrieve a table for a given entity type, optionally validating required attributes.
|
81
|
+
get_uri_urls(entity_type, entity_ids=None, required_ontology=None)
|
82
|
+
Return reference URLs for specified entities, optionally filtered by ontology.
|
83
|
+
infer_sbo_terms()
|
84
|
+
Infer and fill in missing SBO terms for reaction species based on stoichiometry.
|
85
|
+
infer_uncompartmentalized_species_location()
|
86
|
+
Infer and assign compartments for compartmentalized species with missing compartment information.
|
87
|
+
name_compartmentalized_species()
|
88
|
+
Rename compartmentalized species to include compartment information if needed.
|
89
|
+
reaction_formulas(r_ids=None)
|
90
|
+
Generate human-readable reaction formulas for specified reactions.
|
91
|
+
reaction_summaries(r_ids=None)
|
92
|
+
Return a summary DataFrame for specified reactions, including names and formulas.
|
93
|
+
remove_compartmentalized_species(sc_ids)
|
94
|
+
Remove specified compartmentalized species and associated reactions from the model.
|
95
|
+
remove_reactions(r_ids, remove_species=False)
|
96
|
+
Remove specified reactions and optionally remove unused species.
|
97
|
+
remove_reactions_data(label)
|
98
|
+
Remove a reactions data table by label.
|
99
|
+
remove_species_data(label)
|
100
|
+
Remove a species data table by label.
|
101
|
+
search_by_ids(ids, entity_type, identifiers_df, ontologies=None)
|
102
|
+
Find entities and identifiers matching a set of query IDs.
|
103
|
+
search_by_name(name, entity_type, partial_match=True)
|
104
|
+
Find entities by exact or partial name match.
|
105
|
+
select_species_data(species_data_table)
|
106
|
+
Select a species data table from the SBML_dfs object by name.
|
107
|
+
species_status(s_id)
|
108
|
+
Return all reactions a species participates in, with stoichiometry and formula information.
|
84
109
|
validate()
|
85
|
-
Validate the SBML_dfs structure and relationships
|
110
|
+
Validate the SBML_dfs structure and relationships.
|
86
111
|
validate_and_resolve()
|
87
|
-
Validate and attempt to automatically fix common issues
|
112
|
+
Validate and attempt to automatically fix common issues.
|
113
|
+
|
114
|
+
Private/Hidden Methods (alphabetical, appear after public methods)
|
115
|
+
-----------------------------------------------------------------
|
116
|
+
_attempt_resolve(e)
|
117
|
+
_check_pk_fk_correspondence()
|
118
|
+
_find_underspecified_reactions_by_scids(sc_ids)
|
119
|
+
_get_unused_cspecies()
|
120
|
+
_get_unused_species()
|
121
|
+
_remove_compartmentalized_species(sc_ids)
|
122
|
+
_remove_entity_data(entity_type, label)
|
123
|
+
_remove_species(s_ids)
|
124
|
+
_remove_unused_cspecies()
|
125
|
+
_remove_unused_species()
|
126
|
+
_validate_r_ids(r_ids)
|
127
|
+
_validate_reaction_species()
|
128
|
+
_validate_reactions_data(reactions_data_table)
|
129
|
+
_validate_species_data(species_data_table)
|
130
|
+
_validate_table(table_name)
|
88
131
|
"""
|
89
132
|
|
90
133
|
compartments: pd.DataFrame
|
@@ -162,193 +205,176 @@ class SBML_dfs:
|
|
162
205
|
'"validate" = False so "resolve" will be ignored (eventhough it was True)'
|
163
206
|
)
|
164
207
|
|
165
|
-
|
166
|
-
|
167
|
-
|
208
|
+
# =============================================================================
|
209
|
+
# PUBLIC METHODS (ALPHABETICAL ORDER)
|
210
|
+
# =============================================================================
|
211
|
+
|
212
|
+
def add_reactions_data(self, label: str, data: pd.DataFrame):
|
168
213
|
"""
|
169
|
-
|
214
|
+
Add additional reaction data with validation.
|
170
215
|
|
171
216
|
Parameters
|
172
217
|
----------
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
Must be passed as a set, e.g. {'id'}, not a string.
|
178
|
-
|
179
|
-
Returns
|
180
|
-
-------
|
181
|
-
pd.DataFrame
|
182
|
-
The requested table
|
218
|
+
label : str
|
219
|
+
Label for the new data
|
220
|
+
data : pd.DataFrame
|
221
|
+
Data to add, must be indexed by reaction_id
|
183
222
|
|
184
223
|
Raises
|
185
224
|
------
|
186
225
|
ValueError
|
187
|
-
If
|
188
|
-
TypeError
|
189
|
-
If required_attributes is not a set
|
226
|
+
If the data is invalid or label already exists
|
190
227
|
"""
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
if entity_type not in schema.keys():
|
228
|
+
self._validate_reactions_data(data)
|
229
|
+
if label in self.reactions_data:
|
195
230
|
raise ValueError(
|
196
|
-
f"{
|
197
|
-
f"which are present are {', '.join(schema.keys())}"
|
198
|
-
)
|
199
|
-
|
200
|
-
if required_attributes is not None:
|
201
|
-
if not isinstance(required_attributes, set):
|
202
|
-
raise TypeError(
|
203
|
-
f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
|
204
|
-
"Did you pass a string instead of a set?"
|
205
|
-
)
|
206
|
-
|
207
|
-
# determine whether required_attributes are appropriate
|
208
|
-
VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
|
209
|
-
invalid_required_attributes = required_attributes.difference(
|
210
|
-
VALID_REQUIRED_ATTRIBUTES
|
231
|
+
f"{label} already exists in reactions_data. " "Drop it first."
|
211
232
|
)
|
233
|
+
self.reactions_data[label] = data
|
212
234
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
|
217
|
-
)
|
235
|
+
def add_species_data(self, label: str, data: pd.DataFrame):
|
236
|
+
"""
|
237
|
+
Add additional species data with validation.
|
218
238
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
f"The following required attributes are not present for the {entity_type} table: "
|
226
|
-
f"{', '.join(invalid_attrs)}."
|
227
|
-
)
|
239
|
+
Parameters
|
240
|
+
----------
|
241
|
+
label : str
|
242
|
+
Label for the new data
|
243
|
+
data : pd.DataFrame
|
244
|
+
Data to add, must be indexed by species_id
|
228
245
|
|
229
|
-
|
246
|
+
Raises
|
247
|
+
------
|
248
|
+
ValueError
|
249
|
+
If the data is invalid or label already exists
|
250
|
+
"""
|
251
|
+
self._validate_species_data(data)
|
252
|
+
if label in self.species_data:
|
253
|
+
raise ValueError(
|
254
|
+
f"{label} already exists in species_data. " "Drop it first."
|
255
|
+
)
|
256
|
+
self.species_data[label] = data
|
230
257
|
|
231
|
-
def
|
258
|
+
def export_sbml_dfs(
|
232
259
|
self,
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
) ->
|
260
|
+
model_prefix: str,
|
261
|
+
outdir: str,
|
262
|
+
overwrite: bool = False,
|
263
|
+
dogmatic: bool = True,
|
264
|
+
) -> None:
|
238
265
|
"""
|
239
|
-
|
266
|
+
Export SBML_dfs
|
240
267
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
268
|
+
Export summaries of species identifiers and each table underlying
|
269
|
+
an SBML_dfs pathway model
|
270
|
+
|
271
|
+
Params
|
272
|
+
------
|
273
|
+
model_prefix: str
|
274
|
+
Label to prepend to all exported files
|
275
|
+
outdir: str
|
276
|
+
Path to an existing directory where results should be saved
|
277
|
+
overwrite: bool
|
278
|
+
Should the directory be overwritten if it already exists?
|
279
|
+
dogmatic: bool
|
280
|
+
If True then treat genes, transcript, and proteins as separate species. If False
|
281
|
+
then treat them interchangeably.
|
251
282
|
|
252
283
|
Returns
|
253
284
|
-------
|
254
|
-
|
255
|
-
- Matching entities
|
256
|
-
- Matching identifiers
|
257
|
-
|
258
|
-
Raises
|
259
|
-
------
|
260
|
-
ValueError
|
261
|
-
If entity_type is invalid or ontologies are invalid
|
262
|
-
TypeError
|
263
|
-
If ontologies is not a set
|
285
|
+
None
|
264
286
|
"""
|
265
|
-
|
266
|
-
|
267
|
-
|
287
|
+
if not isinstance(model_prefix, str):
|
288
|
+
raise TypeError(
|
289
|
+
f"model_prefix was a {type(model_prefix)} " "and must be a str"
|
290
|
+
)
|
291
|
+
if not isinstance(self, SBML_dfs):
|
292
|
+
raise TypeError(
|
293
|
+
f"sbml_dfs was a {type(self)} and must" " be an sbml.SBML_dfs"
|
294
|
+
)
|
268
295
|
|
269
|
-
|
270
|
-
|
271
|
-
req_vars={
|
272
|
-
entity_pk,
|
273
|
-
IDENTIFIERS.ONTOLOGY,
|
274
|
-
IDENTIFIERS.IDENTIFIER,
|
275
|
-
IDENTIFIERS.URL,
|
276
|
-
IDENTIFIERS.BQB,
|
277
|
-
},
|
278
|
-
allow_series=False,
|
279
|
-
).assert_present()
|
296
|
+
# filter to identifiers which make sense when mapping from ids -> species
|
297
|
+
species_identifiers = self.get_characteristic_species_ids(dogmatic=dogmatic)
|
280
298
|
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
299
|
+
try:
|
300
|
+
utils.initialize_dir(outdir, overwrite=overwrite)
|
301
|
+
except FileExistsError:
|
302
|
+
logger.warning(
|
303
|
+
f"Directory {outdir} already exists and overwrite is False. "
|
304
|
+
"Files will be added to the existing directory."
|
305
|
+
)
|
306
|
+
with open_fs(outdir, writeable=True) as fs:
|
307
|
+
species_identifiers_path = (
|
308
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
|
309
|
+
)
|
310
|
+
with fs.openbin(species_identifiers_path, "w") as f:
|
311
|
+
species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
|
312
|
+
f, sep="\t", index=False
|
293
313
|
)
|
294
314
|
|
295
|
-
#
|
296
|
-
|
315
|
+
# export jsons
|
316
|
+
species_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES
|
317
|
+
reactions_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTIONS
|
318
|
+
reation_species_path = (
|
319
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTION_SPECIES
|
320
|
+
)
|
321
|
+
compartments_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTS
|
322
|
+
compartmentalized_species_path = (
|
323
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
|
324
|
+
)
|
325
|
+
with fs.openbin(species_path, "w") as f:
|
326
|
+
self.species[[SBML_DFS.S_NAME]].to_json(f)
|
297
327
|
|
298
|
-
|
299
|
-
|
300
|
-
]
|
301
|
-
entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
|
328
|
+
with fs.openbin(reactions_path, "w") as f:
|
329
|
+
self.reactions[[SBML_DFS.R_NAME]].to_json(f)
|
302
330
|
|
303
|
-
|
331
|
+
with fs.openbin(reation_species_path, "w") as f:
|
332
|
+
self.reaction_species.to_json(f)
|
304
333
|
|
305
|
-
|
306
|
-
|
307
|
-
|
334
|
+
with fs.openbin(compartments_path, "w") as f:
|
335
|
+
self.compartments[[SBML_DFS.C_NAME]].to_json(f)
|
336
|
+
|
337
|
+
with fs.openbin(compartmentalized_species_path, "w") as f:
|
338
|
+
self.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
|
339
|
+
f
|
340
|
+
)
|
341
|
+
|
342
|
+
return None
|
343
|
+
|
344
|
+
def get_characteristic_species_ids(self, dogmatic: bool = True) -> pd.DataFrame:
|
308
345
|
"""
|
309
|
-
|
346
|
+
Get Characteristic Species IDs
|
347
|
+
|
348
|
+
List the systematic identifiers which are characteristic of molecular species, e.g., excluding subcomponents, and optionally, treating proteins, transcripts, and genes equiavlently.
|
310
349
|
|
311
350
|
Parameters
|
312
351
|
----------
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
partial_match : bool, optional
|
318
|
-
Whether to allow partial string matches, by default True
|
352
|
+
sbml_dfs : sbml_dfs_core.SBML_dfs
|
353
|
+
The SBML_dfs object.
|
354
|
+
dogmatic : bool, default=True
|
355
|
+
Whether to use the dogmatic flag to determine which BQB attributes are valid.
|
319
356
|
|
320
357
|
Returns
|
321
358
|
-------
|
322
359
|
pd.DataFrame
|
323
|
-
|
360
|
+
A DataFrame containing the systematic identifiers which are characteristic of molecular species.
|
324
361
|
"""
|
325
|
-
entity_table = self.get_table(entity_type, required_attributes={"label"})
|
326
|
-
label_attr = self.schema[entity_type]["label"]
|
327
362
|
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
else:
|
333
|
-
matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
|
334
|
-
return matches
|
363
|
+
# select valid BQB attributes based on dogmatic flag
|
364
|
+
defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(
|
365
|
+
dogmatic
|
366
|
+
)
|
335
367
|
|
336
|
-
|
337
|
-
|
338
|
-
Get additional attributes of species.
|
368
|
+
# pre-summarize ontologies
|
369
|
+
species_identifiers = self.get_identifiers(SBML_DFS.SPECIES)
|
339
370
|
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
- species_type: Classification of the species (e.g., metabolite, protein)
|
345
|
-
"""
|
346
|
-
species = self.species
|
347
|
-
augmented_species = species.assign(
|
348
|
-
**{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
|
371
|
+
# drop some BQB_HAS_PART annotations
|
372
|
+
species_identifiers = sbml_dfs_utils.filter_to_characteristic_species_ids(
|
373
|
+
species_identifiers,
|
374
|
+
defining_biological_qualifiers=defining_biological_qualifiers,
|
349
375
|
)
|
350
376
|
|
351
|
-
return
|
377
|
+
return species_identifiers
|
352
378
|
|
353
379
|
def get_cspecies_features(self) -> pd.DataFrame:
|
354
380
|
"""
|
@@ -445,113 +471,28 @@ class SBML_dfs:
|
|
445
471
|
|
446
472
|
return named_identifiers
|
447
473
|
|
448
|
-
def
|
449
|
-
self,
|
450
|
-
entity_type: str,
|
451
|
-
entity_ids: Iterable[str] | None = None,
|
452
|
-
required_ontology: str | None = None,
|
453
|
-
) -> pd.Series:
|
474
|
+
def get_network_summary(self) -> Mapping[str, Any]:
|
454
475
|
"""
|
455
|
-
Get
|
456
|
-
|
457
|
-
Parameters
|
458
|
-
----------
|
459
|
-
entity_type : str
|
460
|
-
Type of entity to get URLs for (e.g., 'species', 'reactions')
|
461
|
-
entity_ids : Optional[Iterable[str]], optional
|
462
|
-
Specific entities to get URLs for, by default None (all entities)
|
463
|
-
required_ontology : Optional[str], optional
|
464
|
-
Specific ontology to get URLs from, by default None
|
476
|
+
Get diagnostic statistics about the network.
|
465
477
|
|
466
478
|
Returns
|
467
479
|
-------
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
if entity_type not in valid_entity_types:
|
486
|
-
raise ValueError(
|
487
|
-
f"{entity_type} is an invalid entity_type; valid types "
|
488
|
-
f"are {', '.join(valid_entity_types)}"
|
489
|
-
)
|
490
|
-
|
491
|
-
entity_table = getattr(self, entity_type)
|
492
|
-
|
493
|
-
if entity_ids is not None:
|
494
|
-
# ensure that entity_ids are unique and then convert back to list
|
495
|
-
# to support pandas indexing
|
496
|
-
entity_ids = list(set(entity_ids))
|
497
|
-
|
498
|
-
# filter to a subset of identifiers if one is provided
|
499
|
-
entity_table = entity_table.loc[entity_ids]
|
500
|
-
|
501
|
-
# create a dataframe of all identifiers for the select entities
|
502
|
-
all_ids = pd.concat(
|
503
|
-
[
|
504
|
-
sbml_dfs_utils._stub_ids(
|
505
|
-
entity_table[schema[entity_type]["id"]].iloc[i].ids
|
506
|
-
).assign(id=entity_table.index[i])
|
507
|
-
for i in range(0, entity_table.shape[0])
|
508
|
-
]
|
509
|
-
).rename(columns={"id": schema[entity_type]["pk"]})
|
510
|
-
|
511
|
-
# set priorities for ontologies and bqb terms
|
512
|
-
|
513
|
-
if required_ontology is None:
|
514
|
-
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
515
|
-
ONTOLOGY_PRIORITIES, how="left"
|
516
|
-
)
|
517
|
-
else:
|
518
|
-
ontology_priorities = pd.DataFrame(
|
519
|
-
[{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
|
520
|
-
)
|
521
|
-
# if only a single ontology is sought then just return matching entries
|
522
|
-
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
523
|
-
ontology_priorities, how="inner"
|
524
|
-
)
|
525
|
-
|
526
|
-
uri_urls = (
|
527
|
-
all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
|
528
|
-
.groupby(schema[entity_type]["pk"])
|
529
|
-
.first()[IDENTIFIERS.URL]
|
530
|
-
)
|
531
|
-
return uri_urls
|
532
|
-
|
533
|
-
def get_network_summary(self) -> Mapping[str, Any]:
|
534
|
-
"""
|
535
|
-
Get diagnostic statistics about the network.
|
536
|
-
|
537
|
-
Returns
|
538
|
-
-------
|
539
|
-
Mapping[str, Any]
|
540
|
-
Dictionary of diagnostic statistics including:
|
541
|
-
- n_species_types: Number of species types
|
542
|
-
- dict_n_species_per_type: Number of species per type
|
543
|
-
- n_species: Number of species
|
544
|
-
- n_cspecies: Number of compartmentalized species
|
545
|
-
- n_reaction_species: Number of reaction species
|
546
|
-
- n_reactions: Number of reactions
|
547
|
-
- n_compartments: Number of compartments
|
548
|
-
- dict_n_species_per_compartment: Number of species per compartment
|
549
|
-
- stats_species_per_reaction: Statistics on reactands per reaction
|
550
|
-
- top10_species_per_reaction: Top 10 reactions by number of reactands
|
551
|
-
- stats_degree: Statistics on species connectivity
|
552
|
-
- top10_degree: Top 10 species by connectivity
|
553
|
-
- stats_identifiers_per_species: Statistics on identifiers per species
|
554
|
-
- top10_identifiers_per_species: Top 10 species by number of identifiers
|
480
|
+
Mapping[str, Any]
|
481
|
+
Dictionary of diagnostic statistics including:
|
482
|
+
- n_species_types: Number of species types
|
483
|
+
- dict_n_species_per_type: Number of species per type
|
484
|
+
- n_species: Number of species
|
485
|
+
- n_cspecies: Number of compartmentalized species
|
486
|
+
- n_reaction_species: Number of reaction species
|
487
|
+
- n_reactions: Number of reactions
|
488
|
+
- n_compartments: Number of compartments
|
489
|
+
- dict_n_species_per_compartment: Number of species per compartment
|
490
|
+
- stats_species_per_reaction: Statistics on reactands per reaction
|
491
|
+
- top10_species_per_reaction: Top 10 reactions by number of reactands
|
492
|
+
- stats_degree: Statistics on species connectivity
|
493
|
+
- top10_degree: Top 10 species by connectivity
|
494
|
+
- stats_identifiers_per_species: Statistics on identifiers per species
|
495
|
+
- top10_identifiers_per_species: Top 10 species by number of identifiers
|
555
496
|
"""
|
556
497
|
stats: MutableMapping[str, Any] = {}
|
557
498
|
species_features = self.get_species_features()
|
@@ -616,1986 +557,1359 @@ class SBML_dfs:
|
|
616
557
|
|
617
558
|
return stats
|
618
559
|
|
619
|
-
def
|
560
|
+
def get_species_features(self) -> pd.DataFrame:
|
620
561
|
"""
|
621
|
-
|
622
|
-
|
623
|
-
Parameters
|
624
|
-
----------
|
625
|
-
label : str
|
626
|
-
Label for the new data
|
627
|
-
data : pd.DataFrame
|
628
|
-
Data to add, must be indexed by species_id
|
562
|
+
Get additional attributes of species.
|
629
563
|
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
564
|
+
Returns
|
565
|
+
-------
|
566
|
+
pd.DataFrame
|
567
|
+
Species with additional features including:
|
568
|
+
- species_type: Classification of the species (e.g., metabolite, protein)
|
634
569
|
"""
|
635
|
-
self.
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
570
|
+
species = self.species
|
571
|
+
augmented_species = species.assign(
|
572
|
+
**{
|
573
|
+
"species_type": lambda d: d["s_Identifiers"].apply(
|
574
|
+
sbml_dfs_utils.species_type_types
|
575
|
+
)
|
576
|
+
}
|
577
|
+
)
|
641
578
|
|
642
|
-
|
643
|
-
"""
|
644
|
-
Remove species data by label.
|
645
|
-
"""
|
646
|
-
self._remove_entity_data(SBML_DFS.SPECIES, label)
|
579
|
+
return augmented_species
|
647
580
|
|
648
|
-
def
|
581
|
+
def get_table(
|
582
|
+
self, entity_type: str, required_attributes: None | set[str] = None
|
583
|
+
) -> pd.DataFrame:
|
649
584
|
"""
|
650
|
-
|
585
|
+
Get a table from the SBML_dfs object with optional attribute validation.
|
651
586
|
|
652
587
|
Parameters
|
653
588
|
----------
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
589
|
+
entity_type : str
|
590
|
+
The type of entity table to retrieve (e.g., 'species', 'reactions')
|
591
|
+
required_attributes : Optional[Set[str]], optional
|
592
|
+
Set of attributes that must be present in the table, by default None.
|
593
|
+
Must be passed as a set, e.g. {'id'}, not a string.
|
594
|
+
|
595
|
+
Returns
|
596
|
+
-------
|
597
|
+
pd.DataFrame
|
598
|
+
The requested table
|
658
599
|
|
659
600
|
Raises
|
660
601
|
------
|
661
602
|
ValueError
|
662
|
-
If
|
603
|
+
If entity_type is invalid or required attributes are missing
|
604
|
+
TypeError
|
605
|
+
If required_attributes is not a set
|
663
606
|
"""
|
664
|
-
self._validate_reactions_data(data)
|
665
|
-
if label in self.reactions_data:
|
666
|
-
raise ValueError(
|
667
|
-
f"{label} already exists in reactions_data. Drop it first."
|
668
|
-
)
|
669
|
-
self.reactions_data[label] = data
|
670
607
|
|
671
|
-
|
672
|
-
"""
|
673
|
-
Remove reactions data by label.
|
674
|
-
"""
|
675
|
-
self._remove_entity_data(SBML_DFS.REACTIONS, label)
|
608
|
+
schema = self.schema
|
676
609
|
|
677
|
-
|
678
|
-
|
679
|
-
|
610
|
+
if entity_type not in schema.keys():
|
611
|
+
raise ValueError(
|
612
|
+
f"{entity_type} does not match a table in the SBML_dfs object. The tables "
|
613
|
+
f"which are present are {', '.join(schema.keys())}"
|
614
|
+
)
|
680
615
|
|
681
|
-
|
682
|
-
|
683
|
-
|
616
|
+
if required_attributes is not None:
|
617
|
+
if not isinstance(required_attributes, set):
|
618
|
+
raise TypeError(
|
619
|
+
f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
|
620
|
+
"Did you pass a string instead of a set?"
|
621
|
+
)
|
684
622
|
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
623
|
+
# determine whether required_attributes are appropriate
|
624
|
+
VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
|
625
|
+
invalid_required_attributes = required_attributes.difference(
|
626
|
+
VALID_REQUIRED_ATTRIBUTES
|
627
|
+
)
|
690
628
|
|
691
|
-
|
692
|
-
|
693
|
-
|
629
|
+
if len(invalid_required_attributes) > 0:
|
630
|
+
raise ValueError(
|
631
|
+
f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
|
632
|
+
f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
|
633
|
+
)
|
694
634
|
|
695
|
-
|
635
|
+
# determine if required_attributes are satisified
|
636
|
+
invalid_attrs = [
|
637
|
+
s for s in required_attributes if s not in schema[entity_type].keys()
|
638
|
+
]
|
639
|
+
if len(invalid_attrs) > 0:
|
640
|
+
raise ValueError(
|
641
|
+
f"The following required attributes are not present for the {entity_type} table: "
|
642
|
+
f"{', '.join(invalid_attrs)}."
|
643
|
+
)
|
696
644
|
|
697
|
-
|
698
|
-
self._remove_unused_species()
|
645
|
+
return getattr(self, entity_type)
|
699
646
|
|
700
|
-
def
|
647
|
+
def get_uri_urls(
|
648
|
+
self,
|
649
|
+
entity_type: str,
|
650
|
+
entity_ids: Iterable[str] | None = None,
|
651
|
+
required_ontology: str | None = None,
|
652
|
+
) -> pd.Series:
|
701
653
|
"""
|
702
|
-
|
654
|
+
Get reference URLs for specified entities.
|
703
655
|
|
704
656
|
Parameters
|
705
657
|
----------
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
# remove corresponding reactions_species
|
713
|
-
self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
|
714
|
-
# remove reactions
|
715
|
-
self.reactions = self.reactions.drop(index=list(r_ids))
|
716
|
-
# remove reactions_data
|
717
|
-
if hasattr(self, "reactions_data"):
|
718
|
-
for k, data in self.reactions_data.items():
|
719
|
-
self.reactions_data[k] = data.drop(index=list(r_ids))
|
720
|
-
# remove species if requested
|
721
|
-
if remove_species:
|
722
|
-
self._remove_unused_cspecies()
|
723
|
-
self._remove_unused_species()
|
724
|
-
|
725
|
-
def validate(self):
|
726
|
-
"""
|
727
|
-
Validate the SBML_dfs structure and relationships.
|
658
|
+
entity_type : str
|
659
|
+
Type of entity to get URLs for (e.g., 'species', 'reactions')
|
660
|
+
entity_ids : Optional[Iterable[str]], optional
|
661
|
+
Specific entities to get URLs for, by default None (all entities)
|
662
|
+
required_ontology : Optional[str], optional
|
663
|
+
Specific ontology to get URLs from, by default None
|
728
664
|
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
- Primary key uniqueness
|
734
|
-
- Foreign key relationships
|
735
|
-
- Optional data table validity
|
736
|
-
- Reaction species validity
|
665
|
+
Returns
|
666
|
+
-------
|
667
|
+
pd.Series
|
668
|
+
Series mapping entity IDs to their reference URLs
|
737
669
|
|
738
670
|
Raises
|
739
671
|
------
|
740
672
|
ValueError
|
741
|
-
If
|
673
|
+
If entity_type is invalid
|
742
674
|
"""
|
675
|
+
schema = self.schema
|
743
676
|
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
677
|
+
# valid entities and their identifier variables
|
678
|
+
valid_entity_types = [
|
679
|
+
SBML_DFS.COMPARTMENTS,
|
680
|
+
SBML_DFS.SPECIES,
|
681
|
+
SBML_DFS.REACTIONS,
|
682
|
+
]
|
749
683
|
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
f"{
|
754
|
-
f"{', '.join(extra_tables)}"
|
684
|
+
if entity_type not in valid_entity_types:
|
685
|
+
raise ValueError(
|
686
|
+
f"{entity_type} is an invalid entity_type; valid types "
|
687
|
+
f"are {', '.join(valid_entity_types)}"
|
755
688
|
)
|
756
689
|
|
757
|
-
|
758
|
-
if len(missing_tables) != 0:
|
759
|
-
raise ValueError(
|
760
|
-
f"Missing {len(missing_tables)} required tables: "
|
761
|
-
f"{', '.join(missing_tables)}"
|
762
|
-
)
|
763
|
-
|
764
|
-
# check individual tables
|
765
|
-
for table in required_tables:
|
766
|
-
self._validate_table(table)
|
690
|
+
entity_table = getattr(self, entity_type)
|
767
691
|
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
692
|
+
if entity_ids is not None:
|
693
|
+
# ensure that entity_ids are unique and then convert back to list
|
694
|
+
# to support pandas indexing
|
695
|
+
entity_ids = list(set(entity_ids))
|
772
696
|
|
773
|
-
|
774
|
-
|
775
|
-
[
|
776
|
-
{"fk_table": k, "fk": v["fk"]}
|
777
|
-
for k, v in self.schema.items()
|
778
|
-
if "fk" in v.keys()
|
779
|
-
]
|
780
|
-
)
|
781
|
-
.set_index("fk_table")["fk"]
|
782
|
-
.apply(pd.Series)
|
783
|
-
.reset_index()
|
784
|
-
.melt(id_vars="fk_table")
|
785
|
-
.drop(["variable"], axis=1)
|
786
|
-
.rename(columns={"value": "key"})
|
787
|
-
)
|
697
|
+
# filter to a subset of identifiers if one is provided
|
698
|
+
entity_table = entity_table.loc[entity_ids]
|
788
699
|
|
789
|
-
|
700
|
+
# create a dataframe of all identifiers for the select entities
|
701
|
+
all_ids = pd.concat(
|
702
|
+
[
|
703
|
+
sbml_dfs_utils._id_dict_to_df(
|
704
|
+
entity_table[schema[entity_type]["id"]].iloc[i].ids
|
705
|
+
).assign(id=entity_table.index[i])
|
706
|
+
for i in range(0, entity_table.shape[0])
|
707
|
+
]
|
708
|
+
).rename(columns={"id": schema[entity_type]["pk"]})
|
790
709
|
|
791
|
-
for
|
792
|
-
pk_table_keys = set(
|
793
|
-
getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
|
794
|
-
)
|
795
|
-
if None in pk_table_keys:
|
796
|
-
raise ValueError(
|
797
|
-
f"{pk_fk_correspondences['pk_table'][i]} had "
|
798
|
-
"missing values in its index"
|
799
|
-
)
|
710
|
+
# set priorities for ontologies and bqb terms
|
800
711
|
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
]
|
712
|
+
if required_ontology is None:
|
713
|
+
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
714
|
+
ONTOLOGY_PRIORITIES, how="left"
|
805
715
|
)
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
f"missing {pk_fk_correspondences['key'][i]} values"
|
810
|
-
)
|
811
|
-
|
812
|
-
# all foreign keys need to match a primary key
|
813
|
-
extra_fks = fk_table_keys.difference(pk_table_keys)
|
814
|
-
if len(extra_fks) != 0:
|
815
|
-
raise ValueError(
|
816
|
-
f"{len(extra_fks)} distinct "
|
817
|
-
f"{pk_fk_correspondences['key'][i]} values were"
|
818
|
-
f" found in {pk_fk_correspondences['fk_table'][i]} "
|
819
|
-
f"but missing from {pk_fk_correspondences['pk_table'][i]}."
|
820
|
-
" All foreign keys must have a matching primary key.\n\n"
|
821
|
-
f"Extra key are: {', '.join(extra_fks)}"
|
822
|
-
)
|
823
|
-
|
824
|
-
# check optional data tables:
|
825
|
-
for k, v in self.species_data.items():
|
826
|
-
try:
|
827
|
-
self._validate_species_data(v)
|
828
|
-
except ValueError as e:
|
829
|
-
raise ValueError(f"species data {k} was invalid.") from e
|
830
|
-
|
831
|
-
for k, v in self.reactions_data.items():
|
832
|
-
try:
|
833
|
-
self._validate_reactions_data(v)
|
834
|
-
except ValueError as e:
|
835
|
-
raise ValueError(f"reactions data {k} was invalid.") from e
|
836
|
-
|
837
|
-
# validate reaction_species sbo_terms and stoi
|
838
|
-
self._validate_reaction_species()
|
839
|
-
|
840
|
-
def validate_and_resolve(self):
|
841
|
-
"""
|
842
|
-
Validate and attempt to automatically fix common issues.
|
843
|
-
|
844
|
-
This method iteratively:
|
845
|
-
1. Attempts validation
|
846
|
-
2. If validation fails, tries to resolve the issue
|
847
|
-
3. Repeats until validation passes or issue cannot be resolved
|
848
|
-
|
849
|
-
Raises
|
850
|
-
------
|
851
|
-
ValueError
|
852
|
-
If validation fails and cannot be automatically resolved
|
853
|
-
"""
|
854
|
-
|
855
|
-
current_exception = None
|
856
|
-
validated = False
|
857
|
-
|
858
|
-
while not validated:
|
859
|
-
try:
|
860
|
-
self.validate()
|
861
|
-
validated = True
|
862
|
-
except Exception as e:
|
863
|
-
e_str = str(e)
|
864
|
-
if e_str == current_exception:
|
865
|
-
logger.warning(
|
866
|
-
"Automated resolution of an Exception was attempted but failed"
|
867
|
-
)
|
868
|
-
raise e
|
869
|
-
|
870
|
-
# try to resolve
|
871
|
-
self._attempt_resolve(e)
|
872
|
-
|
873
|
-
def select_species_data(self, species_data_table: str) -> pd.DataFrame:
|
874
|
-
"""
|
875
|
-
Select a species data table from the SBML_dfs object.
|
876
|
-
|
877
|
-
Parameters
|
878
|
-
----------
|
879
|
-
species_data_table : str
|
880
|
-
Name of the species data table to select
|
881
|
-
|
882
|
-
Returns
|
883
|
-
-------
|
884
|
-
pd.DataFrame
|
885
|
-
The selected species data table
|
886
|
-
|
887
|
-
Raises
|
888
|
-
------
|
889
|
-
ValueError
|
890
|
-
If species_data_table is not found
|
891
|
-
"""
|
892
|
-
# Check if species_data_table exists in sbml_dfs.species_data
|
893
|
-
if species_data_table not in self.species_data:
|
894
|
-
raise ValueError(
|
895
|
-
f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
|
896
|
-
f"Available tables: {self.species_data.keys()}"
|
716
|
+
else:
|
717
|
+
ontology_priorities = pd.DataFrame(
|
718
|
+
[{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
|
897
719
|
)
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
def _validate_table(self, table: str) -> None:
|
903
|
-
"""
|
904
|
-
Validate a table in this SBML_dfs object against its schema.
|
905
|
-
|
906
|
-
This is an internal method that validates a table that is part of this SBML_dfs
|
907
|
-
object against the schema stored in self.schema.
|
908
|
-
|
909
|
-
Parameters
|
910
|
-
----------
|
911
|
-
table : str
|
912
|
-
Name of the table to validate
|
913
|
-
|
914
|
-
Raises
|
915
|
-
------
|
916
|
-
ValueError
|
917
|
-
If the table does not conform to its schema
|
918
|
-
"""
|
919
|
-
table_schema = self.schema[table]
|
920
|
-
table_data = getattr(self, table)
|
921
|
-
_perform_sbml_dfs_table_validation(table_data, table_schema, table)
|
922
|
-
|
923
|
-
def _remove_entity_data(self, entity_type: str, label: str) -> None:
|
924
|
-
"""
|
925
|
-
Remove data from species_data or reactions_data by table name and label.
|
926
|
-
|
927
|
-
Parameters
|
928
|
-
----------
|
929
|
-
entity_type : str
|
930
|
-
Name of the table to remove data from ('species' or 'reactions')
|
931
|
-
label : str
|
932
|
-
Label of the data to remove
|
933
|
-
|
934
|
-
Notes
|
935
|
-
-----
|
936
|
-
If the label does not exist, a warning will be logged that includes the existing labels.
|
937
|
-
"""
|
938
|
-
if entity_type not in ENTITIES_W_DATA:
|
939
|
-
raise ValueError("table_name must be either 'species' or 'reactions'")
|
940
|
-
|
941
|
-
data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
|
942
|
-
if label not in data_dict:
|
943
|
-
existing_labels = list(data_dict.keys())
|
944
|
-
logger.warning(
|
945
|
-
f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
|
946
|
-
f"Existing labels: {existing_labels}"
|
720
|
+
# if only a single ontology is sought then just return matching entries
|
721
|
+
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
722
|
+
ontology_priorities, how="inner"
|
947
723
|
)
|
948
|
-
return
|
949
|
-
|
950
|
-
del data_dict[label]
|
951
|
-
|
952
|
-
def _remove_unused_cspecies(self):
|
953
|
-
"""Removes compartmentalized species that are no
|
954
|
-
longer part of any reactions"""
|
955
|
-
sc_ids = self._get_unused_cspecies()
|
956
|
-
self._remove_compartmentalized_species(sc_ids)
|
957
|
-
|
958
|
-
def _get_unused_cspecies(self) -> set[str]:
|
959
|
-
"""Returns a set of compartmentalized species
|
960
|
-
that are not part of any reactions"""
|
961
|
-
sc_ids = set(self.compartmentalized_species.index) - set(
|
962
|
-
self.reaction_species[SBML_DFS.SC_ID]
|
963
|
-
)
|
964
|
-
return sc_ids # type: ignore
|
965
|
-
|
966
|
-
def _remove_unused_species(self):
|
967
|
-
"""Removes species that are no longer part of any
|
968
|
-
compartmentalized species"""
|
969
|
-
s_ids = self._get_unused_species()
|
970
|
-
self._remove_species(s_ids)
|
971
|
-
|
972
|
-
def _get_unused_species(self) -> set[str]:
|
973
|
-
"""Returns a list of species that are not part of any reactions"""
|
974
|
-
s_ids = set(self.species.index) - set(
|
975
|
-
self.compartmentalized_species[SBML_DFS.S_ID]
|
976
|
-
)
|
977
|
-
return s_ids # type: ignore
|
978
|
-
|
979
|
-
def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
980
|
-
"""Removes compartmentalized species from the model
|
981
|
-
|
982
|
-
This should not be directly used by the user, as it can lead to
|
983
|
-
invalid reactions when removing species without a logic to decide
|
984
|
-
if the reaction needs to be removed as well.
|
985
724
|
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
self.compartmentalized_species = self.compartmentalized_species.drop(
|
991
|
-
index=list(sc_ids)
|
725
|
+
uri_urls = (
|
726
|
+
all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
|
727
|
+
.groupby(schema[entity_type]["pk"])
|
728
|
+
.first()[IDENTIFIERS.URL]
|
992
729
|
)
|
993
|
-
|
994
|
-
self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
|
995
|
-
|
996
|
-
def _remove_species(self, s_ids: Iterable[str]):
|
997
|
-
"""Removes species from the model
|
998
|
-
|
999
|
-
This should not be directly used by the user, as it can lead to
|
1000
|
-
invalid reactions when removing species without a logic to decide
|
1001
|
-
if the reaction needs to be removed as well.
|
1002
|
-
|
1003
|
-
This removes the species and corresponding compartmentalized species and
|
1004
|
-
reactions_species.
|
1005
|
-
|
1006
|
-
Args:
|
1007
|
-
s_ids (Iterable[str]): the species to remove
|
1008
|
-
"""
|
1009
|
-
sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
|
1010
|
-
self._remove_compartmentalized_species(sc_ids)
|
1011
|
-
# Remove species
|
1012
|
-
self.species = self.species.drop(index=list(s_ids))
|
1013
|
-
# remove data
|
1014
|
-
for k, data in self.species_data.items():
|
1015
|
-
self.species_data[k] = data.drop(index=list(s_ids))
|
1016
|
-
|
1017
|
-
def _validate_species_data(self, species_data_table: pd.DataFrame):
|
1018
|
-
"""Validates species data attribute
|
1019
|
-
|
1020
|
-
Args:
|
1021
|
-
species_data_table (pd.DataFrame): a species data table
|
1022
|
-
|
1023
|
-
Raises:
|
1024
|
-
ValueError: s_id not index name
|
1025
|
-
ValueError: s_id index contains duplicates
|
1026
|
-
ValueError: s_id not in species table
|
1027
|
-
"""
|
1028
|
-
_validate_matching_data(species_data_table, self.species)
|
1029
|
-
|
1030
|
-
def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
|
1031
|
-
"""Validates reactions data attribute
|
1032
|
-
|
1033
|
-
Args:
|
1034
|
-
reactions_data_table (pd.DataFrame): a reactions data table
|
1035
|
-
|
1036
|
-
Raises:
|
1037
|
-
ValueError: r_id not index name
|
1038
|
-
ValueError: r_id index contains duplicates
|
1039
|
-
ValueError: r_id not in reactions table
|
1040
|
-
"""
|
1041
|
-
_validate_matching_data(reactions_data_table, self.reactions)
|
1042
|
-
|
1043
|
-
def _validate_reaction_species(self):
|
1044
|
-
if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
|
1045
|
-
raise ValueError(
|
1046
|
-
"All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
|
1047
|
-
)
|
1048
|
-
|
1049
|
-
# test for null SBO terms
|
1050
|
-
n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
|
1051
|
-
if n_null_sbo_terms != 0:
|
1052
|
-
raise ValueError(
|
1053
|
-
f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
|
1054
|
-
)
|
1055
|
-
|
1056
|
-
# find invalid SBO terms
|
1057
|
-
sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
|
1058
|
-
invalid_sbo_term_counts = sbo_counts[
|
1059
|
-
~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
|
1060
|
-
]
|
1061
|
-
|
1062
|
-
if invalid_sbo_term_counts.shape[0] != 0:
|
1063
|
-
invalid_sbo_counts_str = ", ".join(
|
1064
|
-
[f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
|
1065
|
-
)
|
1066
|
-
raise ValueError(
|
1067
|
-
f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
|
1068
|
-
f"defined {invalid_sbo_counts_str}"
|
1069
|
-
)
|
1070
|
-
|
1071
|
-
def _attempt_resolve(self, e):
|
1072
|
-
str_e = str(e)
|
1073
|
-
if str_e == "compartmentalized_species included missing c_id values":
|
1074
|
-
logger.warning(str_e)
|
1075
|
-
logger.warning(
|
1076
|
-
"Attempting to resolve with infer_uncompartmentalized_species_location()"
|
1077
|
-
)
|
1078
|
-
self = infer_uncompartmentalized_species_location(self)
|
1079
|
-
elif re.search("sbo_terms were not defined", str_e):
|
1080
|
-
logger.warning(str_e)
|
1081
|
-
logger.warning("Attempting to resolve with infer_sbo_terms()")
|
1082
|
-
self = infer_sbo_terms(self)
|
1083
|
-
else:
|
1084
|
-
logger.warning(
|
1085
|
-
"An error occurred which could not be automatically resolved"
|
1086
|
-
)
|
1087
|
-
raise e
|
1088
|
-
|
1089
|
-
|
1090
|
-
def species_status(s_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
|
1091
|
-
"""
|
1092
|
-
Species Status
|
1093
|
-
|
1094
|
-
Return all of the reaction's a species particpates in.
|
1095
|
-
|
1096
|
-
Parameters:
|
1097
|
-
s_id: str
|
1098
|
-
A species ID
|
1099
|
-
sbml_dfs: SBML_dfs
|
1100
|
-
|
1101
|
-
Returns:
|
1102
|
-
pd.DataFrame, one row reaction
|
1103
|
-
"""
|
1104
|
-
|
1105
|
-
matching_species = sbml_dfs.species.loc[s_id]
|
1106
|
-
|
1107
|
-
if not isinstance(matching_species, pd.Series):
|
1108
|
-
raise ValueError(f"{s_id} did not match a single species")
|
1109
|
-
|
1110
|
-
# find all rxns species particpate in
|
1111
|
-
|
1112
|
-
matching_compartmentalized_species = sbml_dfs.compartmentalized_species[
|
1113
|
-
sbml_dfs.compartmentalized_species.s_id.isin([s_id])
|
1114
|
-
]
|
1115
|
-
|
1116
|
-
rxns_participating = sbml_dfs.reaction_species[
|
1117
|
-
sbml_dfs.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
|
1118
|
-
]
|
1119
|
-
|
1120
|
-
# find all participants in these rxns
|
1121
|
-
|
1122
|
-
full_rxns_participating = sbml_dfs.reaction_species[
|
1123
|
-
sbml_dfs.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
|
1124
|
-
].merge(
|
1125
|
-
sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1126
|
-
)
|
1127
|
-
|
1128
|
-
reaction_descriptions = pd.concat(
|
1129
|
-
[
|
1130
|
-
reaction_summary(x, sbml_dfs)
|
1131
|
-
for x in set(full_rxns_participating[SBML_DFS.R_ID].tolist())
|
1132
|
-
]
|
1133
|
-
)
|
1134
|
-
|
1135
|
-
status = (
|
1136
|
-
full_rxns_participating.loc[
|
1137
|
-
full_rxns_participating[SBML_DFS.SC_ID].isin(
|
1138
|
-
matching_compartmentalized_species.index.values.tolist()
|
1139
|
-
),
|
1140
|
-
[SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
|
1141
|
-
]
|
1142
|
-
.merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
|
1143
|
-
.reset_index(drop=True)
|
1144
|
-
.drop(SBML_DFS.R_ID, axis=1)
|
1145
|
-
)
|
1146
|
-
|
1147
|
-
return status
|
1148
|
-
|
1149
|
-
|
1150
|
-
def reaction_summary(r_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
|
1151
|
-
"""
|
1152
|
-
Reaction Summary
|
1153
|
-
|
1154
|
-
Return a reaction's name and a human-readable formula.
|
1155
|
-
|
1156
|
-
Parameters:
|
1157
|
-
r_id: str
|
1158
|
-
A reaction ID
|
1159
|
-
sbml_dfs: SBML_dfs
|
1160
|
-
|
1161
|
-
Returns:
|
1162
|
-
one row pd.DataFrame
|
1163
|
-
"""
|
1164
|
-
|
1165
|
-
logger.warning(
|
1166
|
-
"reaction_summary is deprecated and will be removed in a future version of rcpr; "
|
1167
|
-
"please use reaction_summaries() instead"
|
1168
|
-
)
|
1169
|
-
|
1170
|
-
matching_reaction = sbml_dfs.reactions.loc[r_id]
|
730
|
+
return uri_urls
|
1171
731
|
|
1172
|
-
|
1173
|
-
|
732
|
+
def infer_sbo_terms(self):
|
733
|
+
"""
|
734
|
+
Infer SBO Terms
|
1174
735
|
|
1175
|
-
|
736
|
+
Define SBO terms based on stoichiometry for reaction_species with missing terms.
|
737
|
+
Modifies the SBML_dfs object in-place.
|
1176
738
|
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
739
|
+
Returns
|
740
|
+
-------
|
741
|
+
None (modifies SBML_dfs object in-place)
|
742
|
+
"""
|
743
|
+
valid_sbo_terms = self.reaction_species[
|
744
|
+
self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
|
745
|
+
]
|
1182
746
|
|
1183
|
-
|
747
|
+
invalid_sbo_terms = self.reaction_species[
|
748
|
+
~self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
|
749
|
+
]
|
1184
750
|
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
1189
|
-
|
1190
|
-
construct_formula_string(
|
1191
|
-
augmented_matching_reaction_species, sbml_dfs.reactions, SBML_DFS.S_NAME
|
1192
|
-
)
|
1193
|
-
+ " ["
|
1194
|
-
+ augmented_matching_reaction_species[SBML_DFS.C_NAME].iloc[0]
|
1195
|
-
+ "]"
|
1196
|
-
)
|
1197
|
-
else:
|
1198
|
-
str_formula = construct_formula_string(
|
1199
|
-
matching_reaction_species, sbml_dfs.reactions, SBML_DFS.SC_NAME
|
1200
|
-
)
|
751
|
+
if not all(self.reaction_species[SBML_DFS.SBO_TERM].notnull()):
|
752
|
+
raise ValueError("All reaction_species[SBML_DFS.SBO_TERM] must be not null")
|
753
|
+
if invalid_sbo_terms.shape[0] == 0:
|
754
|
+
logger.info("All sbo_terms were valid; nothing to update.")
|
755
|
+
return
|
1201
756
|
|
1202
|
-
|
1203
|
-
{
|
1204
|
-
SBML_DFS.R_NAME: matching_reaction[SBML_DFS.R_NAME],
|
1205
|
-
"r_formula_str": str_formula,
|
1206
|
-
},
|
1207
|
-
index=[r_id],
|
1208
|
-
)
|
757
|
+
logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
|
1209
758
|
|
1210
|
-
|
759
|
+
# add missing/invalid terms based on stoichiometry
|
760
|
+
invalid_sbo_terms.loc[
|
761
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
|
762
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
|
1211
763
|
|
1212
|
-
|
764
|
+
invalid_sbo_terms.loc[
|
765
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
|
766
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
|
1213
767
|
|
768
|
+
invalid_sbo_terms.loc[
|
769
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
|
770
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
|
1214
771
|
|
1215
|
-
|
1216
|
-
|
1217
|
-
|
772
|
+
updated_reaction_species = pd.concat(
|
773
|
+
[valid_sbo_terms, invalid_sbo_terms]
|
774
|
+
).sort_index()
|
1218
775
|
|
1219
|
-
|
776
|
+
if self.reaction_species.shape[0] != updated_reaction_species.shape[0]:
|
777
|
+
raise ValueError(
|
778
|
+
f"Trying to overwrite {self.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
|
779
|
+
)
|
780
|
+
self.reaction_species = updated_reaction_species
|
781
|
+
return
|
1220
782
|
|
1221
|
-
|
1222
|
-
|
1223
|
-
|
1224
|
-
A relational mechanistic model
|
1225
|
-
r_ids: [str], str or None
|
1226
|
-
Reaction IDs or None for all reactions
|
783
|
+
def infer_uncompartmentalized_species_location(self):
|
784
|
+
"""
|
785
|
+
Infer Uncompartmentalized Species Location
|
1227
786
|
|
1228
|
-
|
1229
|
-
|
1230
|
-
|
1231
|
-
"""
|
787
|
+
If the compartment of a subset of compartmentalized species
|
788
|
+
was not specified, infer an appropriate compartment from
|
789
|
+
other members of reactions they participate in.
|
1232
790
|
|
1233
|
-
|
1234
|
-
r_ids = [r_ids]
|
791
|
+
This method modifies the SBML_dfs object in-place.
|
1235
792
|
|
1236
|
-
|
1237
|
-
|
1238
|
-
|
1239
|
-
|
793
|
+
Returns
|
794
|
+
-------
|
795
|
+
None (modifies SBML_dfs object in-place)
|
796
|
+
"""
|
797
|
+
default_compartment = (
|
798
|
+
self.compartmentalized_species.value_counts(SBML_DFS.C_ID)
|
799
|
+
.rename("N")
|
800
|
+
.reset_index()
|
801
|
+
.sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
|
802
|
+
)
|
803
|
+
if not isinstance(default_compartment, str):
|
804
|
+
raise ValueError(
|
805
|
+
"No default compartment could be found - compartment "
|
806
|
+
"information may not be present"
|
807
|
+
)
|
1240
808
|
|
1241
|
-
|
1242
|
-
|
1243
|
-
|
1244
|
-
|
1245
|
-
|
809
|
+
# infer the compartments of species missing compartments
|
810
|
+
missing_compartment_scids = self.compartmentalized_species[
|
811
|
+
self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
812
|
+
].index.tolist()
|
813
|
+
if len(missing_compartment_scids) == 0:
|
814
|
+
logger.info(
|
815
|
+
"All compartmentalized species have compartments, "
|
816
|
+
"returning input SBML_dfs"
|
817
|
+
)
|
818
|
+
return self
|
819
|
+
|
820
|
+
participating_reactions = (
|
821
|
+
self.reaction_species[
|
822
|
+
self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
823
|
+
][SBML_DFS.R_ID]
|
824
|
+
.unique()
|
825
|
+
.tolist()
|
826
|
+
)
|
827
|
+
reaction_participants = self.reaction_species[
|
828
|
+
self.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
|
829
|
+
].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
|
830
|
+
reaction_participants = reaction_participants.merge(
|
831
|
+
self.compartmentalized_species[SBML_DFS.C_ID],
|
832
|
+
left_on=SBML_DFS.SC_ID,
|
833
|
+
right_index=True,
|
834
|
+
)
|
1246
835
|
|
1247
|
-
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1252
|
-
|
1253
|
-
r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
|
1254
|
-
# there species must be labelled with the sc_name to specify where a species exists
|
1255
|
-
if r_id_cross_compartment.shape[0] > 0:
|
1256
|
-
rxn_eqtn_cross_compartment = (
|
1257
|
-
matching_reaction_species[
|
1258
|
-
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1259
|
-
r_id_cross_compartment.index
|
1260
|
-
)
|
1261
|
-
]
|
1262
|
-
.sort_values([SBML_DFS.SC_NAME])
|
836
|
+
# find a default compartment to fall back on if all compartmental information is missing
|
837
|
+
primary_reaction_compartment = (
|
838
|
+
reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
|
839
|
+
.rename("N")
|
840
|
+
.reset_index()
|
841
|
+
.sort_values("N", ascending=False)
|
1263
842
|
.groupby(SBML_DFS.R_ID)
|
1264
|
-
.
|
1265
|
-
|
1266
|
-
x, sbml_dfs.reactions, SBML_DFS.SC_NAME
|
1267
|
-
)
|
1268
|
-
)
|
1269
|
-
.rename("r_formula_str")
|
843
|
+
.first()[SBML_DFS.C_ID]
|
844
|
+
.reset_index()
|
1270
845
|
)
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1275
|
-
# can be labelled with the compartment and individual species can receive a more readable s_name
|
1276
|
-
r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
|
1277
|
-
if r_id_within_compartment.shape[0] > 0:
|
1278
|
-
# add s_name
|
1279
|
-
augmented_matching_reaction_species = (
|
1280
|
-
matching_reaction_species[
|
1281
|
-
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1282
|
-
r_id_within_compartment.index
|
1283
|
-
)
|
846
|
+
|
847
|
+
inferred_compartmentalization = (
|
848
|
+
self.reaction_species[
|
849
|
+
self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
1284
850
|
]
|
1285
|
-
.merge(
|
1286
|
-
.
|
1287
|
-
.
|
851
|
+
.merge(primary_reaction_compartment)
|
852
|
+
.value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
|
853
|
+
.rename("N")
|
854
|
+
.reset_index()
|
855
|
+
.sort_values("N", ascending=False)
|
856
|
+
.groupby(SBML_DFS.SC_ID)
|
857
|
+
.first()
|
858
|
+
.reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
|
1288
859
|
)
|
1289
|
-
|
1290
|
-
|
1291
|
-
[SBML_DFS.R_ID, SBML_DFS.C_NAME]
|
1292
|
-
).apply(
|
1293
|
-
lambda x: construct_formula_string(x, sbml_dfs.reactions, SBML_DFS.S_NAME)
|
860
|
+
logger.info(
|
861
|
+
f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
|
1294
862
|
)
|
1295
|
-
# add compartment for each reaction
|
1296
|
-
rxn_eqtn_within_compartment = pd.Series(
|
1297
|
-
[
|
1298
|
-
y + ": " + x
|
1299
|
-
for x, y in zip(
|
1300
|
-
rxn_eqtn_within_compartment,
|
1301
|
-
rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.C_NAME),
|
1302
|
-
)
|
1303
|
-
],
|
1304
|
-
index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
|
1305
|
-
).rename("r_formula_str")
|
1306
|
-
else:
|
1307
|
-
rxn_eqtn_within_compartment = None
|
1308
|
-
|
1309
|
-
formula_strs = pd.concat([rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment])
|
1310
|
-
|
1311
|
-
return formula_strs
|
1312
|
-
|
1313
|
-
|
1314
|
-
def construct_formula_string(
|
1315
|
-
reaction_species_df: pd.DataFrame,
|
1316
|
-
reactions_df: pd.DataFrame,
|
1317
|
-
name_var: str,
|
1318
|
-
) -> str:
|
1319
|
-
"""
|
1320
|
-
Construct Formula String
|
1321
|
-
|
1322
|
-
Convert a table of reaction species into a formula string
|
1323
863
|
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
|
1328
|
-
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
1335
|
-
formula_str: str
|
1336
|
-
String representation of a reactions substrates, products and
|
1337
|
-
modifiers
|
864
|
+
# define where a reaction is most likely to occur based on the compartmentalization of its participants
|
865
|
+
species_with_unknown_compartmentalization = set(
|
866
|
+
missing_compartment_scids
|
867
|
+
).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
|
868
|
+
if len(species_with_unknown_compartmentalization) != 0:
|
869
|
+
logger.warning(
|
870
|
+
f"{len(species_with_unknown_compartmentalization)} "
|
871
|
+
"species compartmentalization could not be inferred"
|
872
|
+
" from other reaction participants. Their compartmentalization "
|
873
|
+
f"will be set to the default of {default_compartment}"
|
874
|
+
)
|
1338
875
|
|
1339
|
-
|
876
|
+
inferred_compartmentalization = pd.concat(
|
877
|
+
[
|
878
|
+
inferred_compartmentalization,
|
879
|
+
pd.DataFrame(
|
880
|
+
{
|
881
|
+
SBML_DFS.SC_ID: list(
|
882
|
+
species_with_unknown_compartmentalization
|
883
|
+
)
|
884
|
+
}
|
885
|
+
).assign(c_id=default_compartment),
|
886
|
+
]
|
887
|
+
)
|
1340
888
|
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1345
|
-
)
|
1346
|
-
]
|
889
|
+
if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
|
890
|
+
raise ValueError(
|
891
|
+
f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
|
892
|
+
)
|
1347
893
|
|
1348
|
-
|
1349
|
-
|
1350
|
-
|
1351
|
-
|
1352
|
-
|
1353
|
-
|
1354
|
-
|
1355
|
-
|
894
|
+
updated_compartmentalized_species = pd.concat(
|
895
|
+
[
|
896
|
+
self.compartmentalized_species[
|
897
|
+
~self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
898
|
+
],
|
899
|
+
self.compartmentalized_species[
|
900
|
+
self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
901
|
+
]
|
902
|
+
.drop(SBML_DFS.C_ID, axis=1)
|
903
|
+
.merge(
|
904
|
+
inferred_compartmentalization,
|
905
|
+
left_index=True,
|
906
|
+
right_on=SBML_DFS.SC_ID,
|
907
|
+
)
|
908
|
+
.set_index(SBML_DFS.SC_ID),
|
909
|
+
]
|
1356
910
|
)
|
1357
911
|
|
1358
|
-
|
1359
|
-
|
1360
|
-
|
1361
|
-
|
1362
|
-
|
1363
|
-
|
1364
|
-
|
1365
|
-
|
1366
|
-
|
1367
|
-
)
|
1368
|
-
products = " + ".join(
|
1369
|
-
reaction_species_df["label"][
|
1370
|
-
reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
|
1371
|
-
].tolist()
|
1372
|
-
)
|
1373
|
-
modifiers = " + ".join(
|
1374
|
-
reaction_species_df["label"][
|
1375
|
-
reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
|
1376
|
-
].tolist()
|
1377
|
-
)
|
1378
|
-
if modifiers != "":
|
1379
|
-
modifiers = f" ---- modifiers: {modifiers}]"
|
1380
|
-
|
1381
|
-
return f"{substrates}{arrow_type}{products}{modifiers}"
|
1382
|
-
|
912
|
+
if (
|
913
|
+
updated_compartmentalized_species.shape[0]
|
914
|
+
!= self.compartmentalized_species.shape[0]
|
915
|
+
):
|
916
|
+
raise ValueError(
|
917
|
+
f"Trying to overwrite {self.compartmentalized_species.shape[0]}"
|
918
|
+
" compartmentalized species with "
|
919
|
+
f"{updated_compartmentalized_species.shape[0]}"
|
920
|
+
)
|
1383
921
|
|
1384
|
-
|
1385
|
-
|
1386
|
-
Add Stoi To Species Name
|
922
|
+
if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
|
923
|
+
raise ValueError("Some species compartments are still missing")
|
1387
924
|
|
1388
|
-
|
925
|
+
self.compartmentalized_species = updated_compartmentalized_species
|
926
|
+
return
|
1389
927
|
|
1390
|
-
|
1391
|
-
|
1392
|
-
|
1393
|
-
Number of molecules
|
1394
|
-
name: str
|
1395
|
-
Name of species
|
928
|
+
def name_compartmentalized_species(self):
|
929
|
+
"""
|
930
|
+
Name Compartmentalized Species
|
1396
931
|
|
1397
|
-
|
1398
|
-
|
1399
|
-
name: str
|
1400
|
-
Name containing number of species
|
932
|
+
Rename compartmentalized species if they have the same
|
933
|
+
name as their species. Modifies the SBML_dfs object in-place.
|
1401
934
|
|
1402
|
-
|
935
|
+
Returns
|
936
|
+
-------
|
937
|
+
None (modifies SBML_dfs object in-place)
|
938
|
+
"""
|
939
|
+
augmented_cspecies = self.compartmentalized_species.merge(
|
940
|
+
self.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
|
941
|
+
).merge(
|
942
|
+
self.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
|
943
|
+
)
|
944
|
+
augmented_cspecies[SBML_DFS.SC_NAME] = [
|
945
|
+
f"{s} [{c}]" if sc == s else sc
|
946
|
+
for sc, c, s in zip(
|
947
|
+
augmented_cspecies[SBML_DFS.SC_NAME],
|
948
|
+
augmented_cspecies[SBML_DFS.C_NAME],
|
949
|
+
augmented_cspecies[SBML_DFS.S_NAME],
|
950
|
+
)
|
951
|
+
]
|
1403
952
|
|
1404
|
-
|
1405
|
-
|
1406
|
-
|
1407
|
-
return
|
953
|
+
self.compartmentalized_species = augmented_cspecies.loc[
|
954
|
+
:, self.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
|
955
|
+
]
|
956
|
+
return
|
1408
957
|
|
958
|
+
def reaction_formulas(
|
959
|
+
self, r_ids: Optional[Union[str, list[str]]] = None
|
960
|
+
) -> pd.Series:
|
961
|
+
"""
|
962
|
+
Reaction Summary
|
1409
963
|
|
1410
|
-
|
1411
|
-
species_ids: pd.DataFrame,
|
1412
|
-
max_complex_size: int = 4,
|
1413
|
-
max_promiscuity: int = 20,
|
1414
|
-
defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
|
1415
|
-
) -> pd.DataFrame:
|
1416
|
-
"""
|
1417
|
-
Filter to Characteristic Species IDs
|
964
|
+
Return human-readable formulas for reactions.
|
1418
965
|
|
1419
|
-
|
1420
|
-
|
1421
|
-
|
966
|
+
Parameters:
|
967
|
+
----------
|
968
|
+
r_ids: [str], str or None
|
969
|
+
Reaction IDs or None for all reactions
|
1422
970
|
|
1423
|
-
|
971
|
+
Returns
|
1424
972
|
----------
|
1425
|
-
|
1426
|
-
|
1427
|
-
max_complex_size: int
|
1428
|
-
The largest size of a complex, where BQB_HAS_PART terms will be retained.
|
1429
|
-
In most cases, complexes are handled with specific formation and
|
1430
|
-
dissolutation reactions,but these identifiers will be pulled in when
|
1431
|
-
searching by identifiers or searching the identifiers associated with a
|
1432
|
-
species against an external resource such as Open Targets.
|
1433
|
-
max_promiscuity: int
|
1434
|
-
Maximum number of species where a single molecule can act as a
|
1435
|
-
BQB_HAS_PART component associated with a single identifier (and common ontology).
|
1436
|
-
defining_biological_qualifiers (list[str]):
|
1437
|
-
BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
|
1438
|
-
permissive settings would include homologs, different forms of the same gene.
|
1439
|
-
|
1440
|
-
Returns:
|
1441
|
-
--------
|
1442
|
-
species_id: pd.DataFrame
|
1443
|
-
Input species filtered to characteristic identifiers
|
973
|
+
formula_strs: pd.Series
|
974
|
+
"""
|
1444
975
|
|
1445
|
-
|
976
|
+
validated_rids = self._validate_r_ids(r_ids)
|
1446
977
|
|
1447
|
-
|
1448
|
-
|
1449
|
-
|
978
|
+
matching_reaction_species = self.reaction_species[
|
979
|
+
self.reaction_species.r_id.isin(validated_rids)
|
980
|
+
].merge(
|
981
|
+
self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1450
982
|
)
|
1451
983
|
|
1452
|
-
|
1453
|
-
|
1454
|
-
|
1455
|
-
)
|
984
|
+
# split into within compartment and cross-compartment reactions
|
985
|
+
r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
|
986
|
+
SBML_DFS.C_ID
|
987
|
+
].nunique()
|
1456
988
|
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
|
989
|
+
# identify reactions which work across compartments
|
990
|
+
r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
|
991
|
+
# there species must be labelled with the sc_name to specify where a species exists
|
992
|
+
if r_id_cross_compartment.shape[0] > 0:
|
993
|
+
rxn_eqtn_cross_compartment = (
|
994
|
+
matching_reaction_species[
|
995
|
+
matching_reaction_species[SBML_DFS.R_ID].isin(
|
996
|
+
r_id_cross_compartment.index
|
997
|
+
)
|
998
|
+
]
|
999
|
+
.sort_values([SBML_DFS.SC_NAME])
|
1000
|
+
.groupby(SBML_DFS.R_ID)
|
1001
|
+
.apply(
|
1002
|
+
lambda x: sbml_dfs_utils.construct_formula_string(
|
1003
|
+
x, self.reactions, SBML_DFS.SC_NAME
|
1004
|
+
)
|
1005
|
+
)
|
1006
|
+
.rename("r_formula_str")
|
1007
|
+
)
|
1008
|
+
else:
|
1009
|
+
rxn_eqtn_cross_compartment = None
|
1010
|
+
|
1011
|
+
# identify reactions which occur within a single compartment; for these the reaction
|
1012
|
+
# can be labelled with the compartment and individual species can receive a more readable s_name
|
1013
|
+
r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
|
1014
|
+
if r_id_within_compartment.shape[0] > 0:
|
1015
|
+
# add s_name
|
1016
|
+
augmented_matching_reaction_species = (
|
1017
|
+
matching_reaction_species[
|
1018
|
+
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1019
|
+
r_id_within_compartment.index
|
1020
|
+
)
|
1021
|
+
]
|
1022
|
+
.merge(self.compartments, left_on=SBML_DFS.C_ID, right_index=True)
|
1023
|
+
.merge(self.species, left_on=SBML_DFS.S_ID, right_index=True)
|
1024
|
+
.sort_values([SBML_DFS.S_NAME])
|
1025
|
+
)
|
1026
|
+
# create formulas based on s_names of components
|
1027
|
+
rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
|
1028
|
+
[SBML_DFS.R_ID, SBML_DFS.C_NAME]
|
1029
|
+
).apply(
|
1030
|
+
lambda x: sbml_dfs_utils.construct_formula_string(
|
1031
|
+
x, self.reactions, SBML_DFS.S_NAME
|
1032
|
+
)
|
1033
|
+
)
|
1034
|
+
# add compartment for each reaction
|
1035
|
+
rxn_eqtn_within_compartment = pd.Series(
|
1036
|
+
[
|
1037
|
+
y + ": " + x
|
1038
|
+
for x, y in zip(
|
1039
|
+
rxn_eqtn_within_compartment,
|
1040
|
+
rxn_eqtn_within_compartment.index.get_level_values(
|
1041
|
+
SBML_DFS.C_NAME
|
1042
|
+
),
|
1043
|
+
)
|
1044
|
+
],
|
1045
|
+
index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
|
1046
|
+
).rename("r_formula_str")
|
1047
|
+
else:
|
1048
|
+
rxn_eqtn_within_compartment = None
|
1461
1049
|
|
1462
|
-
|
1463
|
-
|
1464
|
-
f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
|
1050
|
+
formula_strs = pd.concat(
|
1051
|
+
[rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment]
|
1465
1052
|
)
|
1466
1053
|
|
1467
|
-
|
1468
|
-
bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
|
1469
|
-
|
1470
|
-
# add components within modestly sized protein complexes
|
1471
|
-
# look at HAS_PART IDs
|
1472
|
-
bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
|
1054
|
+
return formula_strs
|
1473
1055
|
|
1474
|
-
|
1475
|
-
|
1476
|
-
|
1477
|
-
|
1478
|
-
|
1479
|
-
n_species_components[
|
1480
|
-
n_species_components > max_complex_size
|
1481
|
-
].index.get_level_values(SBML_DFS.S_ID)
|
1482
|
-
)
|
1056
|
+
def reaction_summaries(
|
1057
|
+
self, r_ids: Optional[Union[str, list[str]]] = None
|
1058
|
+
) -> pd.DataFrame:
|
1059
|
+
"""
|
1060
|
+
Reaction Summary
|
1483
1061
|
|
1484
|
-
|
1485
|
-
bqb_has_parts_species, max_promiscuity
|
1486
|
-
)
|
1062
|
+
Return a summary of reactions.
|
1487
1063
|
|
1488
|
-
|
1489
|
-
|
1490
|
-
|
1491
|
-
|
1064
|
+
Parameters:
|
1065
|
+
----------
|
1066
|
+
r_ids: [str], str or None
|
1067
|
+
Reaction IDs or None for all reactions
|
1492
1068
|
|
1493
|
-
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1069
|
+
Returns
|
1070
|
+
----------
|
1071
|
+
reaction_summaries_df: pd.DataFrame
|
1072
|
+
A table with r_id as an index and columns:
|
1073
|
+
- r_name: str, name of the reaction
|
1074
|
+
- r_formula_str: str, human-readable formula of the reaction
|
1075
|
+
"""
|
1500
1076
|
|
1501
|
-
|
1077
|
+
validated_rids = self._validate_r_ids(r_ids)
|
1502
1078
|
|
1079
|
+
participating_r_names = self.reactions.loc[validated_rids, SBML_DFS.R_NAME]
|
1080
|
+
participating_r_formulas = self.reaction_formulas(r_ids=validated_rids)
|
1081
|
+
reaction_summareis_df = pd.concat(
|
1082
|
+
[participating_r_names, participating_r_formulas], axis=1
|
1083
|
+
)
|
1503
1084
|
|
1504
|
-
|
1505
|
-
"""
|
1506
|
-
Infer Uncompartmentalized Species Location
|
1085
|
+
return reaction_summareis_df
|
1507
1086
|
|
1508
|
-
|
1509
|
-
|
1510
|
-
|
1087
|
+
def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
1088
|
+
"""
|
1089
|
+
Remove compartmentalized species and associated reactions.
|
1511
1090
|
|
1512
|
-
|
1513
|
-
|
1514
|
-
|
1515
|
-
A relational pathway model
|
1091
|
+
Starting with a set of compartmentalized species, determine which reactions
|
1092
|
+
should be removed based on their removal. Then remove these reactions,
|
1093
|
+
compartmentalized species, and species.
|
1516
1094
|
|
1517
|
-
|
1518
|
-
|
1519
|
-
|
1520
|
-
|
1095
|
+
Parameters
|
1096
|
+
----------
|
1097
|
+
sc_ids : Iterable[str]
|
1098
|
+
IDs of compartmentalized species to remove
|
1099
|
+
"""
|
1521
1100
|
|
1522
|
-
|
1101
|
+
# find reactions which should be totally removed since they are losing critical species
|
1102
|
+
removed_reactions = self._find_underspecified_reactions_by_scids(sc_ids)
|
1103
|
+
self.remove_reactions(removed_reactions)
|
1523
1104
|
|
1524
|
-
|
1525
|
-
sbml_dfs.compartmentalized_species.value_counts(SBML_DFS.C_ID)
|
1526
|
-
.rename("N")
|
1527
|
-
.reset_index()
|
1528
|
-
.sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
|
1529
|
-
)
|
1530
|
-
if not isinstance(default_compartment, str):
|
1531
|
-
raise ValueError(
|
1532
|
-
"No default compartment could be found - compartment "
|
1533
|
-
"information may not be present"
|
1534
|
-
)
|
1105
|
+
self._remove_compartmentalized_species(sc_ids)
|
1535
1106
|
|
1536
|
-
|
1107
|
+
# remove species (and their associated species data if all their cspecies have been lost)
|
1108
|
+
self._remove_unused_species()
|
1537
1109
|
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
if len(missing_compartment_scids) == 0:
|
1542
|
-
logger.info(
|
1543
|
-
"All compartmentalized species have compartments, "
|
1544
|
-
"returning input sbml_dfs"
|
1545
|
-
)
|
1546
|
-
return sbml_dfs
|
1547
|
-
|
1548
|
-
participating_reactions = (
|
1549
|
-
sbml_dfs.reaction_species[
|
1550
|
-
sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
1551
|
-
][SBML_DFS.R_ID]
|
1552
|
-
.unique()
|
1553
|
-
.tolist()
|
1554
|
-
)
|
1555
|
-
reaction_participants = sbml_dfs.reaction_species[
|
1556
|
-
sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
|
1557
|
-
].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
|
1558
|
-
reaction_participants = reaction_participants.merge(
|
1559
|
-
sbml_dfs.compartmentalized_species[SBML_DFS.C_ID],
|
1560
|
-
left_on=SBML_DFS.SC_ID,
|
1561
|
-
right_index=True,
|
1562
|
-
)
|
1110
|
+
def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
|
1111
|
+
"""
|
1112
|
+
Remove reactions from the model.
|
1563
1113
|
|
1564
|
-
|
1114
|
+
Parameters
|
1115
|
+
----------
|
1116
|
+
r_ids : Iterable[str]
|
1117
|
+
IDs of reactions to remove
|
1118
|
+
remove_species : bool, optional
|
1119
|
+
Whether to remove species that are no longer part of any reactions,
|
1120
|
+
by default False
|
1121
|
+
"""
|
1122
|
+
# remove the corresponding reaction_species rows
|
1123
|
+
self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
|
1124
|
+
# remove reactions
|
1125
|
+
self.reactions = self.reactions.drop(index=list(r_ids))
|
1126
|
+
# remove reactions_data
|
1127
|
+
if hasattr(self, "reactions_data"):
|
1128
|
+
for k, data in self.reactions_data.items():
|
1129
|
+
self.reactions_data[k] = data.drop(index=list(r_ids))
|
1130
|
+
# remove species if requested
|
1131
|
+
if remove_species:
|
1132
|
+
self._remove_unused_cspecies()
|
1133
|
+
self._remove_unused_species()
|
1565
1134
|
|
1566
|
-
|
1567
|
-
|
1568
|
-
.
|
1569
|
-
|
1570
|
-
.
|
1571
|
-
.groupby(SBML_DFS.R_ID)
|
1572
|
-
.first()[SBML_DFS.C_ID]
|
1573
|
-
.reset_index()
|
1574
|
-
)
|
1135
|
+
def remove_reactions_data(self, label: str):
|
1136
|
+
"""
|
1137
|
+
Remove reactions data by label.
|
1138
|
+
"""
|
1139
|
+
self._remove_entity_data(SBML_DFS.REACTIONS, label)
|
1575
1140
|
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
1579
|
-
|
1580
|
-
.
|
1581
|
-
.value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
|
1582
|
-
.rename("N")
|
1583
|
-
.reset_index()
|
1584
|
-
.sort_values("N", ascending=False)
|
1585
|
-
.groupby(SBML_DFS.SC_ID)
|
1586
|
-
.first()
|
1587
|
-
.reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
|
1588
|
-
)
|
1589
|
-
logger.info(
|
1590
|
-
f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
|
1591
|
-
)
|
1141
|
+
def remove_species_data(self, label: str):
|
1142
|
+
"""
|
1143
|
+
Remove species data by label.
|
1144
|
+
"""
|
1145
|
+
self._remove_entity_data(SBML_DFS.SPECIES, label)
|
1592
1146
|
|
1593
|
-
|
1594
|
-
|
1595
|
-
|
1596
|
-
|
1597
|
-
|
1598
|
-
|
1599
|
-
|
1600
|
-
|
1601
|
-
|
1602
|
-
f"will be set to the default of {default_compartment}"
|
1603
|
-
)
|
1147
|
+
def search_by_ids(
|
1148
|
+
self,
|
1149
|
+
ids: list[str],
|
1150
|
+
entity_type: str,
|
1151
|
+
identifiers_df: pd.DataFrame,
|
1152
|
+
ontologies: None | set[str] = None,
|
1153
|
+
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
1154
|
+
"""
|
1155
|
+
Find entities and identifiers matching a set of query IDs.
|
1604
1156
|
|
1605
|
-
|
1606
|
-
|
1607
|
-
|
1608
|
-
|
1609
|
-
|
1610
|
-
|
1611
|
-
|
1612
|
-
|
1157
|
+
Parameters
|
1158
|
+
----------
|
1159
|
+
ids : List[str]
|
1160
|
+
List of identifiers to search for
|
1161
|
+
entity_type : str
|
1162
|
+
Type of entity to search (e.g., 'species', 'reactions')
|
1163
|
+
identifiers_df : pd.DataFrame
|
1164
|
+
DataFrame containing identifier mappings
|
1165
|
+
ontologies : Optional[Set[str]], optional
|
1166
|
+
Set of ontologies to filter by, by default None
|
1613
1167
|
|
1614
|
-
|
1615
|
-
|
1616
|
-
|
1617
|
-
|
1168
|
+
Returns
|
1169
|
+
-------
|
1170
|
+
Tuple[pd.DataFrame, pd.DataFrame]
|
1171
|
+
- Matching entities
|
1172
|
+
- Matching identifiers
|
1618
1173
|
|
1619
|
-
|
1620
|
-
|
1621
|
-
|
1622
|
-
|
1623
|
-
|
1624
|
-
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
1628
|
-
|
1629
|
-
inferred_compartmentalization, left_index=True, right_on=SBML_DFS.SC_ID
|
1630
|
-
)
|
1631
|
-
.set_index(SBML_DFS.SC_ID),
|
1632
|
-
]
|
1633
|
-
)
|
1174
|
+
Raises
|
1175
|
+
------
|
1176
|
+
ValueError
|
1177
|
+
If entity_type is invalid or ontologies are invalid
|
1178
|
+
TypeError
|
1179
|
+
If ontologies is not a set
|
1180
|
+
"""
|
1181
|
+
# validate inputs
|
1182
|
+
entity_table = self.get_table(entity_type, required_attributes={"id"})
|
1183
|
+
entity_pk = self.schema[entity_type]["pk"]
|
1634
1184
|
|
1635
|
-
|
1636
|
-
|
1637
|
-
|
1638
|
-
|
1639
|
-
|
1640
|
-
|
1641
|
-
|
1642
|
-
|
1643
|
-
|
1185
|
+
utils.match_pd_vars(
|
1186
|
+
identifiers_df,
|
1187
|
+
req_vars={
|
1188
|
+
entity_pk,
|
1189
|
+
IDENTIFIERS.ONTOLOGY,
|
1190
|
+
IDENTIFIERS.IDENTIFIER,
|
1191
|
+
IDENTIFIERS.URL,
|
1192
|
+
IDENTIFIERS.BQB,
|
1193
|
+
},
|
1194
|
+
allow_series=False,
|
1195
|
+
).assert_present()
|
1644
1196
|
|
1645
|
-
|
1646
|
-
|
1197
|
+
if ontologies is not None:
|
1198
|
+
if not isinstance(ontologies, set):
|
1199
|
+
# for clarity this should not be reachable based on type hints
|
1200
|
+
raise TypeError(
|
1201
|
+
f"ontologies must be a set, but got {type(ontologies).__name__}"
|
1202
|
+
)
|
1203
|
+
ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
|
1204
|
+
invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
|
1205
|
+
if len(invalid_ontologies) > 0:
|
1206
|
+
raise ValueError(
|
1207
|
+
f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
|
1208
|
+
f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
|
1209
|
+
)
|
1647
1210
|
|
1648
|
-
|
1211
|
+
# filter to just the identifiers matching the ontologies of interest
|
1212
|
+
identifiers_df = identifiers_df.query("ontology in @ontologies")
|
1649
1213
|
|
1650
|
-
|
1214
|
+
matching_identifiers = identifiers_df.loc[
|
1215
|
+
identifiers_df["identifier"].isin(ids)
|
1216
|
+
]
|
1217
|
+
entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
|
1651
1218
|
|
1219
|
+
return entity_subset, matching_identifiers
|
1652
1220
|
|
1653
|
-
def
|
1654
|
-
|
1655
|
-
|
1221
|
+
def search_by_name(
|
1222
|
+
self, name: str, entity_type: str, partial_match: bool = True
|
1223
|
+
) -> pd.DataFrame:
|
1224
|
+
"""
|
1225
|
+
Find entities by exact or partial name match.
|
1656
1226
|
|
1657
|
-
|
1227
|
+
Parameters
|
1228
|
+
----------
|
1229
|
+
name : str
|
1230
|
+
Name to search for
|
1231
|
+
entity_type : str
|
1232
|
+
Type of entity to search (e.g., 'species', 'reactions')
|
1233
|
+
partial_match : bool, optional
|
1234
|
+
Whether to allow partial string matches, by default True
|
1658
1235
|
|
1659
|
-
|
1660
|
-
|
1661
|
-
|
1662
|
-
|
1236
|
+
Returns
|
1237
|
+
-------
|
1238
|
+
pd.DataFrame
|
1239
|
+
Matching entities
|
1240
|
+
"""
|
1241
|
+
entity_table = self.get_table(entity_type, required_attributes={"label"})
|
1242
|
+
label_attr = self.schema[entity_type]["label"]
|
1663
1243
|
|
1664
|
-
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1244
|
+
if partial_match:
|
1245
|
+
matches = entity_table.loc[
|
1246
|
+
entity_table[label_attr].str.contains(name, case=False)
|
1247
|
+
]
|
1248
|
+
else:
|
1249
|
+
matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
|
1250
|
+
return matches
|
1668
1251
|
|
1669
|
-
|
1252
|
+
def select_species_data(self, species_data_table: str) -> pd.DataFrame:
|
1253
|
+
"""
|
1254
|
+
Select a species data table from the SBML_dfs object.
|
1670
1255
|
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1256
|
+
Parameters
|
1257
|
+
----------
|
1258
|
+
species_data_table : str
|
1259
|
+
Name of the species data table to select
|
1674
1260
|
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1261
|
+
Returns
|
1262
|
+
-------
|
1263
|
+
pd.DataFrame
|
1264
|
+
The selected species data table
|
1678
1265
|
|
1679
|
-
|
1680
|
-
|
1681
|
-
|
1682
|
-
|
1683
|
-
|
1684
|
-
|
1685
|
-
|
1266
|
+
Raises
|
1267
|
+
------
|
1268
|
+
ValueError
|
1269
|
+
If species_data_table is not found
|
1270
|
+
"""
|
1271
|
+
# Check if species_data_table exists in sbml_dfs.species_data
|
1272
|
+
if species_data_table not in self.species_data:
|
1273
|
+
raise ValueError(
|
1274
|
+
f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
|
1275
|
+
f"Available tables: {self.species_data.keys()}"
|
1276
|
+
)
|
1686
1277
|
|
1687
|
-
|
1278
|
+
# Get the species data
|
1279
|
+
return self.species_data[species_data_table]
|
1688
1280
|
|
1689
|
-
|
1690
|
-
|
1691
|
-
|
1692
|
-
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
|
1281
|
+
def species_status(self, s_id: str) -> pd.DataFrame:
|
1282
|
+
"""
|
1283
|
+
Species Status
|
1693
1284
|
|
1694
|
-
|
1695
|
-
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
|
1696
|
-
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
|
1285
|
+
Return all of the reactions a species participates in.
|
1697
1286
|
|
1698
|
-
|
1699
|
-
|
1700
|
-
|
1287
|
+
Parameters:
|
1288
|
+
s_id: str
|
1289
|
+
A species ID
|
1701
1290
|
|
1702
|
-
|
1703
|
-
|
1704
|
-
|
1291
|
+
Returns:
|
1292
|
+
pd.DataFrame, one row per reaction the species participates in
|
1293
|
+
with columns:
|
1294
|
+
- sc_name: str, name of the compartmentalized species participating in the reaction
|
1295
|
+
- stoichiometry: float, stoichiometry of the species in the reaction
|
1296
|
+
- r_name: str, name of the reaction
|
1297
|
+
- r_formula_str: str, human-readable formula of the reaction
|
1298
|
+
"""
|
1705
1299
|
|
1706
|
-
|
1707
|
-
|
1708
|
-
f"Trying to overwrite {sbml_dfs.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
|
1709
|
-
)
|
1710
|
-
sbml_dfs.reaction_species = updated_reaction_species
|
1300
|
+
if s_id not in self.species.index:
|
1301
|
+
raise ValueError(f"{s_id} not found in species table")
|
1711
1302
|
|
1712
|
-
|
1303
|
+
matching_species = self.species.loc[s_id]
|
1713
1304
|
|
1305
|
+
if not isinstance(matching_species, pd.Series):
|
1306
|
+
raise ValueError(f"{s_id} did not match a single species")
|
1714
1307
|
|
1715
|
-
|
1716
|
-
|
1717
|
-
|
1308
|
+
# find all rxns species participate in
|
1309
|
+
matching_compartmentalized_species = self.compartmentalized_species[
|
1310
|
+
self.compartmentalized_species.s_id.isin([s_id])
|
1311
|
+
]
|
1718
1312
|
|
1719
|
-
|
1720
|
-
|
1313
|
+
rxns_participating = self.reaction_species[
|
1314
|
+
self.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
|
1315
|
+
]
|
1721
1316
|
|
1722
|
-
|
1723
|
-
|
1724
|
-
|
1725
|
-
|
1317
|
+
# find all participants in these rxns
|
1318
|
+
full_rxns_participating = self.reaction_species[
|
1319
|
+
self.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
|
1320
|
+
].merge(
|
1321
|
+
self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1322
|
+
)
|
1726
1323
|
|
1727
|
-
|
1728
|
-
|
1729
|
-
sbml_dfs
|
1730
|
-
"""
|
1324
|
+
participating_rids = full_rxns_participating[SBML_DFS.R_ID].unique()
|
1325
|
+
reaction_descriptions = self.reaction_summaries(r_ids=participating_rids)
|
1731
1326
|
|
1732
|
-
|
1733
|
-
|
1734
|
-
|
1735
|
-
|
1736
|
-
|
1737
|
-
|
1738
|
-
|
1739
|
-
|
1740
|
-
|
1741
|
-
|
1742
|
-
augmented_cspecies[SBML_DFS.S_NAME],
|
1327
|
+
status = (
|
1328
|
+
full_rxns_participating.loc[
|
1329
|
+
full_rxns_participating[SBML_DFS.SC_ID].isin(
|
1330
|
+
matching_compartmentalized_species.index.values.tolist()
|
1331
|
+
),
|
1332
|
+
[SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
|
1333
|
+
]
|
1334
|
+
.merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
|
1335
|
+
.reset_index(drop=True)
|
1336
|
+
.drop(SBML_DFS.R_ID, axis=1)
|
1743
1337
|
)
|
1744
|
-
]
|
1745
1338
|
|
1746
|
-
|
1747
|
-
:, sbml_dfs.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
|
1748
|
-
]
|
1749
|
-
|
1750
|
-
return sbml_dfs
|
1339
|
+
return status
|
1751
1340
|
|
1341
|
+
def validate(self):
|
1342
|
+
"""
|
1343
|
+
Validate the SBML_dfs structure and relationships.
|
1752
1344
|
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1758
|
-
|
1759
|
-
|
1760
|
-
|
1761
|
-
Export SBML_dfs
|
1762
|
-
|
1763
|
-
Export summaries of species identifiers and each table underlying
|
1764
|
-
an SBML_dfs pathway model
|
1765
|
-
|
1766
|
-
Params
|
1767
|
-
------
|
1768
|
-
model_prefix: str
|
1769
|
-
Label to prepend to all exported files
|
1770
|
-
sbml_dfs: sbml.SBML_dfs
|
1771
|
-
A pathway model
|
1772
|
-
outdir: str
|
1773
|
-
Path to an existing directory where results should be saved
|
1774
|
-
overwrite: bool
|
1775
|
-
Should the directory be overwritten if it already exists?
|
1776
|
-
dogmatic: bool
|
1777
|
-
If True then treat genes, transcript, and proteins as separate species. If False
|
1778
|
-
then treat them interchangeably.
|
1345
|
+
Checks:
|
1346
|
+
- Schema existence
|
1347
|
+
- Required tables presence
|
1348
|
+
- Individual table structure
|
1349
|
+
- Primary key uniqueness
|
1350
|
+
- Foreign key relationships
|
1351
|
+
- Optional data table validity
|
1352
|
+
- Reaction species validity
|
1779
1353
|
|
1780
|
-
|
1781
|
-
|
1782
|
-
|
1354
|
+
Raises
|
1355
|
+
------
|
1356
|
+
ValueError
|
1357
|
+
If any validation check fails
|
1358
|
+
"""
|
1783
1359
|
|
1784
|
-
|
1360
|
+
if not hasattr(self, "schema"):
|
1361
|
+
raise ValueError("No schema found")
|
1785
1362
|
|
1786
|
-
|
1787
|
-
|
1788
|
-
if not isinstance(sbml_dfs, SBML_dfs):
|
1789
|
-
raise TypeError(
|
1790
|
-
f"sbml_dfs was a {type(sbml_dfs)} and must" " be an sbml.SBML_dfs"
|
1791
|
-
)
|
1363
|
+
required_tables = self._required_entities
|
1364
|
+
schema_tables = set(self.schema.keys())
|
1792
1365
|
|
1793
|
-
|
1794
|
-
|
1795
|
-
|
1796
|
-
|
1797
|
-
|
1366
|
+
extra_tables = schema_tables.difference(required_tables)
|
1367
|
+
if len(extra_tables) != 0:
|
1368
|
+
logger.debug(
|
1369
|
+
f"{len(extra_tables)} unexpected tables found: "
|
1370
|
+
f"{', '.join(extra_tables)}"
|
1371
|
+
)
|
1798
1372
|
|
1799
|
-
|
1800
|
-
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
"Files will be added to the existing directory."
|
1805
|
-
)
|
1806
|
-
with open_fs(outdir, writeable=True) as fs:
|
1807
|
-
species_identifiers_path = (
|
1808
|
-
model_prefix + CPR_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
|
1809
|
-
)
|
1810
|
-
with fs.openbin(species_identifiers_path, "w") as f:
|
1811
|
-
species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
|
1812
|
-
f, sep="\t", index=False
|
1373
|
+
missing_tables = required_tables.difference(schema_tables)
|
1374
|
+
if len(missing_tables) != 0:
|
1375
|
+
raise ValueError(
|
1376
|
+
f"Missing {len(missing_tables)} required tables: "
|
1377
|
+
f"{', '.join(missing_tables)}"
|
1813
1378
|
)
|
1814
1379
|
|
1815
|
-
#
|
1816
|
-
|
1817
|
-
|
1818
|
-
reation_species_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTION_SPECIES
|
1819
|
-
compartments_path = model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTS
|
1820
|
-
compartmentalized_species_path = (
|
1821
|
-
model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
|
1822
|
-
)
|
1823
|
-
with fs.openbin(species_path, "w") as f:
|
1824
|
-
sbml_dfs.species[[SBML_DFS.S_NAME]].to_json(f)
|
1380
|
+
# check individual tables
|
1381
|
+
for table in required_tables:
|
1382
|
+
self._validate_table(table)
|
1825
1383
|
|
1826
|
-
|
1827
|
-
|
1384
|
+
# check whether pks and fks agree
|
1385
|
+
self._check_pk_fk_correspondence()
|
1828
1386
|
|
1829
|
-
|
1830
|
-
|
1387
|
+
# check optional data tables:
|
1388
|
+
for k, v in self.species_data.items():
|
1389
|
+
try:
|
1390
|
+
self._validate_species_data(v)
|
1391
|
+
except ValueError as e:
|
1392
|
+
raise ValueError(f"species data {k} was invalid.") from e
|
1831
1393
|
|
1832
|
-
|
1833
|
-
|
1394
|
+
for k, v in self.reactions_data.items():
|
1395
|
+
try:
|
1396
|
+
self._validate_reactions_data(v)
|
1397
|
+
except ValueError as e:
|
1398
|
+
raise ValueError(f"reactions data {k} was invalid.") from e
|
1834
1399
|
|
1835
|
-
|
1836
|
-
|
1837
|
-
f
|
1838
|
-
)
|
1400
|
+
# validate reaction_species sbo_terms and stoichiometry
|
1401
|
+
self._validate_reaction_species()
|
1839
1402
|
|
1840
|
-
|
1403
|
+
def validate_and_resolve(self):
|
1404
|
+
"""
|
1405
|
+
Validate and attempt to automatically fix common issues.
|
1841
1406
|
|
1407
|
+
This method iteratively:
|
1408
|
+
1. Attempts validation
|
1409
|
+
2. If validation fails, tries to resolve the issue
|
1410
|
+
3. Repeats until validation passes or issue cannot be resolved
|
1842
1411
|
|
1843
|
-
|
1844
|
-
|
1845
|
-
|
1846
|
-
|
1847
|
-
|
1848
|
-
upstream_stoichiometry: int = 0,
|
1849
|
-
downstream_stoichiometry: int = 1,
|
1850
|
-
downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
|
1851
|
-
keep_species_data: bool | str = False,
|
1852
|
-
keep_reactions_data: bool | str = False,
|
1853
|
-
) -> SBML_dfs:
|
1854
|
-
"""
|
1855
|
-
Create SBML_dfs from Edgelist
|
1856
|
-
|
1857
|
-
Combine a set of interactions into an sbml.SBML_dfs mechanistic model
|
1858
|
-
|
1859
|
-
Parameters:
|
1860
|
-
interaction_edgelist (pd.DataFrame): A table containing interactions:
|
1861
|
-
- upstream_name (str): matching "s_name" from "species_df"
|
1862
|
-
- downstream_name (str): matching "s_name" from "species_df"
|
1863
|
-
- upstream_compartment (str): compartment of "upstream_name"
|
1864
|
-
with names matching "c_name" from "compartments_df"
|
1865
|
-
- downstream_compartment (str): compartment of "downstream_name"
|
1866
|
-
with names matching "c_name" from "compartments_df"
|
1867
|
-
- r_name (str): a name for the interaction
|
1868
|
-
- sbo_term (str): sbo term defining the type of
|
1869
|
-
molecular interaction (see MINI_SBO_FROM_NAME)
|
1870
|
-
- r_Identifiers (identifiers.Identifiers): identifiers
|
1871
|
-
supporting the interaction (e.g., pubmed ids)
|
1872
|
-
- r_isreversible (bool): Is this reaction reversible?
|
1873
|
-
If True, the reaction is reversible
|
1874
|
-
By default, the interactions of TRRUST networks are irreversible, and reversible for STRING networks
|
1875
|
-
species_df (pd.DataFrame): A table defining unique molecular
|
1876
|
-
species participating in "interaction_edgelist":
|
1877
|
-
- s_name (str): name of molecular species
|
1878
|
-
- s_Identifiers (identifiers.Identifiers): identifiers
|
1879
|
-
defining the species
|
1880
|
-
compartments_df (pd.DataFrame): A table defining compartments
|
1881
|
-
where interactions are occurring "interaction_edgelist":
|
1882
|
-
- c_name (str): name of compartment
|
1883
|
-
- c_Identifiers (identifiers.Identifiers):
|
1884
|
-
identifiers defining the compartment (see
|
1885
|
-
bigg.annotate_recon() for a set of names > go categories)
|
1886
|
-
interaction_source (source.Source): A source object
|
1887
|
-
which will tie model entities to the interaction source
|
1888
|
-
upstream_stoichiometry (int): stoichiometry of
|
1889
|
-
upstream species in reaction
|
1890
|
-
downstream_stoichiometry (int): stoichiometry of
|
1891
|
-
downstream species in reaction
|
1892
|
-
downstream_sbo_name (str): sbo term defining the
|
1893
|
-
type of molecular interaction for the downstream reactand
|
1894
|
-
(see MINI_SBO_FROM_NAME)
|
1895
|
-
keep_species_data (bool | str): Should species data
|
1896
|
-
be kept in the model? If True, all species data will be kept
|
1897
|
-
and saved as "species_data" in the SBML_dfs. The label will be 'source'
|
1898
|
-
If False, no species data will be kept.
|
1899
|
-
If a string: label for the species data to be kept.
|
1900
|
-
keep_reactions_data (bool | str): Should reaction data be kept in the model?
|
1901
|
-
If True, all reaction data will be kept and saved
|
1902
|
-
as "reactions_data" in the SBML_dfs. The label will be 'source'.
|
1903
|
-
If False, no reaction data will be kept.
|
1904
|
-
If a string: label for the reaction data to be kept.
|
1905
|
-
|
1906
|
-
Returns:
|
1907
|
-
sbml.SBML_dfs
|
1412
|
+
Raises
|
1413
|
+
------
|
1414
|
+
ValueError
|
1415
|
+
If validation fails and cannot be automatically resolved
|
1416
|
+
"""
|
1908
1417
|
|
1909
|
-
|
1418
|
+
current_exception = None
|
1419
|
+
validated = False
|
1910
1420
|
|
1911
|
-
|
1912
|
-
|
1913
|
-
|
1914
|
-
|
1421
|
+
while not validated:
|
1422
|
+
try:
|
1423
|
+
self.validate()
|
1424
|
+
validated = True
|
1425
|
+
except Exception as e:
|
1426
|
+
e_str = str(e)
|
1427
|
+
if e_str == current_exception:
|
1428
|
+
logger.warning(
|
1429
|
+
"Automated resolution of an Exception was attempted but failed"
|
1430
|
+
)
|
1431
|
+
raise e
|
1915
1432
|
|
1916
|
-
|
1917
|
-
|
1918
|
-
# as `reaction_data`
|
1919
|
-
interaction_edgelist_required_vars = {
|
1920
|
-
"upstream_name",
|
1921
|
-
"downstream_name",
|
1922
|
-
"upstream_compartment",
|
1923
|
-
"downstream_compartment",
|
1924
|
-
SBML_DFS.R_NAME,
|
1925
|
-
SBML_DFS.SBO_TERM,
|
1926
|
-
SBML_DFS.R_IDENTIFIERS,
|
1927
|
-
SBML_DFS.R_ISREVERSIBLE,
|
1928
|
-
}
|
1929
|
-
if keep_reactions_data is not False:
|
1930
|
-
extra_reactions_columns = [
|
1931
|
-
c
|
1932
|
-
for c in interaction_edgelist.columns
|
1933
|
-
if c not in interaction_edgelist_required_vars
|
1934
|
-
]
|
1935
|
-
else:
|
1936
|
-
extra_reactions_columns = []
|
1937
|
-
# Extra species columns
|
1938
|
-
if keep_species_data is not False:
|
1939
|
-
extra_species_columns = [
|
1940
|
-
c
|
1941
|
-
for c in species_df.columns
|
1942
|
-
if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
|
1943
|
-
]
|
1944
|
-
else:
|
1945
|
-
extra_species_columns = []
|
1433
|
+
# try to resolve
|
1434
|
+
self._attempt_resolve(e)
|
1946
1435
|
|
1947
|
-
#
|
1948
|
-
|
1949
|
-
|
1950
|
-
range(compartments_df.shape[0]), SBML_DFS.C_ID
|
1951
|
-
)
|
1952
|
-
compartments_df = compartments_df.set_index(SBML_DFS.C_ID)[
|
1953
|
-
[SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
|
1954
|
-
]
|
1955
|
-
|
1956
|
-
# format species
|
1957
|
-
species_df[SBML_DFS.S_SOURCE] = interaction_source
|
1958
|
-
species_df[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
|
1959
|
-
range(species_df.shape[0]), SBML_DFS.S_ID
|
1960
|
-
)
|
1436
|
+
# =============================================================================
|
1437
|
+
# PRIVATE METHODS (ALPHABETICAL ORDER)
|
1438
|
+
# =============================================================================
|
1961
1439
|
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
1965
|
-
|
1966
|
-
|
1967
|
-
|
1968
|
-
|
1969
|
-
|
1970
|
-
|
1971
|
-
|
1972
|
-
|
1973
|
-
|
1974
|
-
|
1975
|
-
|
1976
|
-
|
1977
|
-
|
1978
|
-
|
1979
|
-
"upstream_compartment": SBML_DFS.C_NAME,
|
1980
|
-
},
|
1981
|
-
axis=1,
|
1982
|
-
),
|
1983
|
-
interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
|
1984
|
-
{
|
1985
|
-
"downstream_name": SBML_DFS.S_NAME,
|
1986
|
-
"downstream_compartment": SBML_DFS.C_NAME,
|
1987
|
-
},
|
1988
|
-
axis=1,
|
1989
|
-
),
|
1990
|
-
]
|
1991
|
-
).drop_duplicates()
|
1992
|
-
|
1993
|
-
# merge to add species and compartments primary keys
|
1994
|
-
comp_species_w_ids = comp_species.merge(
|
1995
|
-
species_df[SBML_DFS.S_NAME].reset_index(),
|
1996
|
-
how="left",
|
1997
|
-
left_on=SBML_DFS.S_NAME,
|
1998
|
-
right_on=SBML_DFS.S_NAME,
|
1999
|
-
).merge(
|
2000
|
-
compartments_df[SBML_DFS.C_NAME].reset_index(),
|
2001
|
-
how="left",
|
2002
|
-
left_on=SBML_DFS.C_NAME,
|
2003
|
-
right_on=SBML_DFS.C_NAME,
|
2004
|
-
)
|
1440
|
+
def _attempt_resolve(self, e):
|
1441
|
+
str_e = str(e)
|
1442
|
+
if str_e == "compartmentalized_species included missing c_id values":
|
1443
|
+
logger.warning(str_e)
|
1444
|
+
logger.warning(
|
1445
|
+
"Attempting to resolve with infer_uncompartmentalized_species_location()"
|
1446
|
+
)
|
1447
|
+
self.infer_uncompartmentalized_species_location()
|
1448
|
+
elif re.search("sbo_terms were not defined", str_e):
|
1449
|
+
logger.warning(str_e)
|
1450
|
+
logger.warning("Attempting to resolve with infer_sbo_terms()")
|
1451
|
+
self.infer_sbo_terms()
|
1452
|
+
else:
|
1453
|
+
logger.warning(
|
1454
|
+
"An error occurred which could not be automatically resolved"
|
1455
|
+
)
|
1456
|
+
raise e
|
2005
1457
|
|
2006
|
-
|
2007
|
-
|
1458
|
+
def _check_pk_fk_correspondence(self):
|
1459
|
+
"""
|
1460
|
+
Check whether primary keys and foreign keys agree for all tables in the schema.
|
1461
|
+
Raises ValueError if any correspondence fails.
|
1462
|
+
"""
|
2008
1463
|
|
2009
|
-
|
2010
|
-
|
2011
|
-
f"{s} [{c}]"
|
2012
|
-
for s, c in zip(
|
2013
|
-
comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
|
1464
|
+
pk_df = pd.DataFrame(
|
1465
|
+
[{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
|
2014
1466
|
)
|
2015
|
-
]
|
2016
|
-
# add source object
|
2017
|
-
comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
|
2018
|
-
# name index
|
2019
|
-
comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
|
2020
|
-
range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
|
2021
|
-
)
|
2022
|
-
comp_species_w_ids = comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
|
2023
|
-
[SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
|
2024
|
-
]
|
2025
|
-
|
2026
|
-
# create reactions
|
2027
|
-
|
2028
|
-
# create a from cs_species -> to cs_species edgelist
|
2029
|
-
# interaction_edgelist
|
2030
|
-
comp_species_w_names = (
|
2031
|
-
comp_species_w_ids.reset_index()
|
2032
|
-
.merge(species_df[SBML_DFS.S_NAME].reset_index())
|
2033
|
-
.merge(compartments_df[SBML_DFS.C_NAME].reset_index())
|
2034
|
-
)
|
2035
1467
|
|
2036
|
-
|
2037
|
-
|
2038
|
-
|
2039
|
-
|
2040
|
-
|
2041
|
-
|
2042
|
-
|
2043
|
-
|
2044
|
-
|
2045
|
-
|
2046
|
-
|
2047
|
-
|
2048
|
-
|
2049
|
-
|
2050
|
-
SBML_DFS.S_NAME: "downstream_name",
|
2051
|
-
SBML_DFS.C_NAME: "downstream_compartment",
|
2052
|
-
},
|
2053
|
-
axis=1,
|
2054
|
-
),
|
2055
|
-
how="left",
|
2056
|
-
)[
|
2057
|
-
REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
|
2058
|
-
]
|
2059
|
-
|
2060
|
-
# some extra checks
|
2061
|
-
if interaction_edgelist.shape[0] != interaction_edgelist_w_cspecies.shape[0]:
|
2062
|
-
raise ValueError(
|
2063
|
-
"Merging compartmentalized species to interaction_edgelist"
|
2064
|
-
" resulted in an increase in the tables from "
|
2065
|
-
f"{interaction_edgelist.shape[0]} to "
|
2066
|
-
f"{interaction_edgelist_w_cspecies.shape[0]} indicating"
|
2067
|
-
" a 1-many join which should have been 1-1"
|
1468
|
+
fk_df = (
|
1469
|
+
pd.DataFrame(
|
1470
|
+
[
|
1471
|
+
{"fk_table": k, "fk": v["fk"]}
|
1472
|
+
for k, v in self.schema.items()
|
1473
|
+
if "fk" in v.keys()
|
1474
|
+
]
|
1475
|
+
)
|
1476
|
+
.set_index("fk_table")["fk"]
|
1477
|
+
.apply(pd.Series)
|
1478
|
+
.reset_index()
|
1479
|
+
.melt(id_vars="fk_table")
|
1480
|
+
.drop(["variable"], axis=1)
|
1481
|
+
.rename(columns={"value": "key"})
|
2068
1482
|
)
|
2069
1483
|
|
2070
|
-
|
2071
|
-
interaction_edgelist_w_cspecies[SBML_DFS.R_SOURCE] = interaction_source
|
2072
|
-
interaction_edgelist_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
|
2073
|
-
range(interaction_edgelist_w_cspecies.shape[0]), SBML_DFS.R_ID
|
2074
|
-
)
|
1484
|
+
pk_fk_correspondences = pk_df.merge(fk_df)
|
2075
1485
|
|
2076
|
-
|
2077
|
-
|
2078
|
-
|
2079
|
-
SBML_DFS.R_SOURCE,
|
2080
|
-
SBML_DFS.R_ISREVERSIBLE,
|
2081
|
-
]
|
2082
|
-
reactions_df = interaction_edgelist_w_cspecies.copy().set_index(SBML_DFS.R_ID)[
|
2083
|
-
reactions_df_columns + extra_reactions_columns
|
2084
|
-
]
|
2085
|
-
# Keep extra columns to save them as extra data
|
2086
|
-
reactions_data = reactions_df[extra_reactions_columns]
|
2087
|
-
reactions_df = reactions_df[reactions_df_columns]
|
2088
|
-
|
2089
|
-
# define upstream and downstream comp species as reaction species
|
2090
|
-
reaction_species_df = pd.concat(
|
2091
|
-
[
|
2092
|
-
# upstream interactions are defined by sbo_term and should generally
|
2093
|
-
# be modifiers/stimulator/inhibitor/interactor
|
2094
|
-
interaction_edgelist_w_cspecies[["sc_id_up", "sbo_term", "r_id"]]
|
2095
|
-
.assign(stoichiometry=upstream_stoichiometry)
|
2096
|
-
.rename({"sc_id_up": "sc_id"}, axis=1),
|
2097
|
-
# downstream interactions indicate some modification of the state
|
2098
|
-
# of the species and hence are defined as product
|
2099
|
-
interaction_edgelist_w_cspecies[["sc_id_down", "r_id"]]
|
2100
|
-
.assign(
|
2101
|
-
stoichiometry=downstream_stoichiometry,
|
2102
|
-
sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
|
1486
|
+
for i in range(0, pk_fk_correspondences.shape[0]):
|
1487
|
+
pk_table_keys = set(
|
1488
|
+
getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
|
2103
1489
|
)
|
2104
|
-
|
2105
|
-
|
2106
|
-
|
2107
|
-
|
2108
|
-
|
2109
|
-
)
|
2110
|
-
reaction_species_df = reaction_species_df.set_index("rsc_id")
|
2111
|
-
|
2112
|
-
# form sbml_dfs object
|
2113
|
-
sbml_tbl_dict: MutableMapping[str, pd.DataFrame | dict[str, pd.DataFrame]] = {
|
2114
|
-
"compartments": compartments_df,
|
2115
|
-
"species": species_df,
|
2116
|
-
"compartmentalized_species": comp_species_w_ids,
|
2117
|
-
"reactions": reactions_df,
|
2118
|
-
"reaction_species": reaction_species_df,
|
2119
|
-
}
|
2120
|
-
if len(extra_reactions_columns) > 0:
|
2121
|
-
if isinstance(keep_reactions_data, str):
|
2122
|
-
reactions_data_label = keep_reactions_data
|
2123
|
-
else:
|
2124
|
-
reactions_data_label = "source"
|
2125
|
-
sbml_tbl_dict["reactions_data"] = {reactions_data_label: reactions_data}
|
1490
|
+
if None in pk_table_keys:
|
1491
|
+
raise ValueError(
|
1492
|
+
f"{pk_fk_correspondences['pk_table'][i]} had "
|
1493
|
+
"missing values in its index"
|
1494
|
+
)
|
2126
1495
|
|
2127
|
-
|
2128
|
-
|
2129
|
-
|
2130
|
-
|
2131
|
-
|
2132
|
-
|
1496
|
+
fk_table_keys = set(
|
1497
|
+
getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
|
1498
|
+
:, pk_fk_correspondences["key"][i]
|
1499
|
+
]
|
1500
|
+
)
|
1501
|
+
if None in fk_table_keys:
|
1502
|
+
raise ValueError(
|
1503
|
+
f"{pk_fk_correspondences['fk_table'][i]} included "
|
1504
|
+
f"missing {pk_fk_correspondences['key'][i]} values"
|
1505
|
+
)
|
1506
|
+
|
1507
|
+
# all foreign keys need to match a primary key
|
1508
|
+
extra_fks = fk_table_keys.difference(pk_table_keys)
|
1509
|
+
if len(extra_fks) != 0:
|
1510
|
+
raise ValueError(
|
1511
|
+
f"{len(extra_fks)} distinct "
|
1512
|
+
f"{pk_fk_correspondences['key'][i]} values were"
|
1513
|
+
f" found in {pk_fk_correspondences['fk_table'][i]} "
|
1514
|
+
f"but missing from {pk_fk_correspondences['pk_table'][i]}."
|
1515
|
+
" All foreign keys must have a matching primary key.\n\n"
|
1516
|
+
f"Extra key are: {', '.join(extra_fks)}"
|
1517
|
+
)
|
2133
1518
|
|
2134
|
-
|
2135
|
-
|
1519
|
+
def _find_underspecified_reactions_by_scids(
|
1520
|
+
self, sc_ids: Iterable[str]
|
1521
|
+
) -> set[str]:
|
1522
|
+
"""
|
1523
|
+
Find Underspecified reactions
|
2136
1524
|
|
2137
|
-
|
1525
|
+
Identify reactions which should be removed if a set of molecular species are removed
|
1526
|
+
from the system.
|
2138
1527
|
|
1528
|
+
Parameters
|
1529
|
+
----------
|
1530
|
+
sc_ids : Iterable[str]
|
1531
|
+
A list of compartmentalized species ids (sc_ids) which will be removed.
|
2139
1532
|
|
2140
|
-
|
2141
|
-
|
2142
|
-
|
2143
|
-
|
2144
|
-
|
2145
|
-
|
2146
|
-
|
2147
|
-
|
2148
|
-
|
2149
|
-
|
2150
|
-
|
2151
|
-
|
2152
|
-
|
2153
|
-
if len(missing_required_fields) > 0:
|
2154
|
-
raise ValueError(
|
2155
|
-
f"{', '.join(missing_required_fields)} are required variables"
|
2156
|
-
' in "compartments_df" but were not present in the input file.'
|
1533
|
+
Returns
|
1534
|
+
-------
|
1535
|
+
underspecified_reactions : set[str]
|
1536
|
+
A set of reactions which should be removed because they will not occur once
|
1537
|
+
"sc_ids" are removed.
|
1538
|
+
"""
|
1539
|
+
updated_reaction_species = self.reaction_species.copy()
|
1540
|
+
updated_reaction_species["new"] = ~updated_reaction_species[
|
1541
|
+
SBML_DFS.SC_ID
|
1542
|
+
].isin(sc_ids)
|
1543
|
+
updated_reaction_species = sbml_dfs_utils.add_sbo_role(updated_reaction_species)
|
1544
|
+
underspecified_reactions = sbml_dfs_utils.find_underspecified_reactions(
|
1545
|
+
updated_reaction_species
|
2157
1546
|
)
|
1547
|
+
return underspecified_reactions
|
2158
1548
|
|
2159
|
-
|
2160
|
-
|
2161
|
-
|
2162
|
-
|
2163
|
-
|
2164
|
-
raise ValueError(
|
2165
|
-
f"{', '.join(missing_required_fields)} are required"
|
2166
|
-
' variables in "species_df" but were not present '
|
2167
|
-
"in the input file."
|
1549
|
+
def _get_unused_cspecies(self) -> set[str]:
|
1550
|
+
"""Returns a set of compartmentalized species
|
1551
|
+
that are not part of any reactions"""
|
1552
|
+
sc_ids = set(self.compartmentalized_species.index) - set(
|
1553
|
+
self.reaction_species[SBML_DFS.SC_ID]
|
2168
1554
|
)
|
1555
|
+
return sc_ids # type: ignore
|
2169
1556
|
|
2170
|
-
|
2171
|
-
|
2172
|
-
|
2173
|
-
|
2174
|
-
)
|
2175
|
-
if len(missing_required_fields) > 0:
|
2176
|
-
raise ValueError(
|
2177
|
-
f"{', '.join(missing_required_fields)} are required "
|
2178
|
-
'variables in "interaction_edgelist" but were not '
|
2179
|
-
"present in the input file."
|
1557
|
+
def _get_unused_species(self) -> set[str]:
|
1558
|
+
"""Returns a set of species that are not part of any compartmentalized species"""
|
1559
|
+
s_ids = set(self.species.index) - set(
|
1560
|
+
self.compartmentalized_species[SBML_DFS.S_ID]
|
2180
1561
|
)
|
1562
|
+
return s_ids # type: ignore
|
2181
1563
|
|
2182
|
-
|
2183
|
-
|
1564
|
+
def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
1565
|
+
"""Removes compartmentalized species from the model
|
2184
1566
|
|
2185
|
-
|
2186
|
-
|
2187
|
-
|
2188
|
-
"""Check for a mismatch between the provided species data and species implied by the edgelist."""
|
1567
|
+
This should not be directly used by the user, as it can lead to
|
1568
|
+
invalid reactions when removing species without a logic to decide
|
1569
|
+
if the reaction needs to be removed as well.
|
2189
1570
|
|
2190
|
-
|
2191
|
-
|
2192
|
-
|
2193
|
-
|
2194
|
-
|
2195
|
-
|
2196
|
-
f" to {merged_species.shape[0]} indicating that names were"
|
2197
|
-
" not unique"
|
1571
|
+
Args:
|
1572
|
+
sc_ids (Iterable[str]): the compartmentalized species to remove
|
1573
|
+
"""
|
1574
|
+
# Remove compartmentalized species
|
1575
|
+
self.compartmentalized_species = self.compartmentalized_species.drop(
|
1576
|
+
index=list(sc_ids)
|
2198
1577
|
)
|
1578
|
+
# remove corresponding reaction_species rows
|
1579
|
+
self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
|
2199
1580
|
|
2200
|
-
|
2201
|
-
|
2202
|
-
|
2203
|
-
].unique()
|
2204
|
-
if len(missing_compartments) >= 1:
|
2205
|
-
raise ValueError(
|
2206
|
-
f"{len(missing_compartments)} compartments were present in"
|
2207
|
-
' "interaction_edgelist" but not "compartments_df":'
|
2208
|
-
f" {', '.join(missing_compartments)}"
|
2209
|
-
)
|
1581
|
+
def _remove_entity_data(self, entity_type: str, label: str) -> None:
|
1582
|
+
"""
|
1583
|
+
Remove data from species_data or reactions_data by table name and label.
|
2210
1584
|
|
2211
|
-
|
2212
|
-
|
2213
|
-
|
2214
|
-
|
2215
|
-
|
2216
|
-
|
2217
|
-
'"interaction_edgelist" but not "species_df":'
|
2218
|
-
f" {', '.join(missing_species)}"
|
2219
|
-
)
|
1585
|
+
Parameters
|
1586
|
+
----------
|
1587
|
+
entity_type : str
|
1588
|
+
Name of the table to remove data from ('species' or 'reactions')
|
1589
|
+
label : str
|
1590
|
+
Label of the data to remove
|
2220
1591
|
|
2221
|
-
|
1592
|
+
Notes
|
1593
|
+
-----
|
1594
|
+
If the label does not exist, a warning will be logged that includes the existing labels.
|
1595
|
+
"""
|
1596
|
+
if entity_type not in ENTITIES_W_DATA:
|
1597
|
+
raise ValueError("table_name must be either 'species' or 'reactions'")
|
2222
1598
|
|
1599
|
+
data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
|
1600
|
+
if label not in data_dict:
|
1601
|
+
existing_labels = list(data_dict.keys())
|
1602
|
+
logger.warning(
|
1603
|
+
f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
|
1604
|
+
f"Existing labels: {existing_labels}"
|
1605
|
+
)
|
1606
|
+
return
|
2223
1607
|
|
2224
|
-
|
2225
|
-
stubbed_compartment: str = GENERIC_COMPARTMENT,
|
2226
|
-
) -> pd.DataFrame:
|
2227
|
-
"""Stub Compartments
|
1608
|
+
del data_dict[label]
|
2228
1609
|
|
2229
|
-
|
1610
|
+
def _remove_species(self, s_ids: Iterable[str]):
|
1611
|
+
"""Removes species from the model
|
2230
1612
|
|
2231
|
-
|
2232
|
-
|
2233
|
-
|
1613
|
+
This should not be directly used by the user, as it can lead to
|
1614
|
+
invalid reactions when removing species without a logic to decide
|
1615
|
+
if the reaction needs to be removed as well.
|
2234
1616
|
|
2235
|
-
|
2236
|
-
|
2237
|
-
"""
|
1617
|
+
This removes the species and corresponding compartmentalized species and
|
1618
|
+
reaction_species.
|
2238
1619
|
|
2239
|
-
|
2240
|
-
|
2241
|
-
|
2242
|
-
)
|
1620
|
+
Args:
|
1621
|
+
s_ids (Iterable[str]): the species to remove
|
1622
|
+
"""
|
1623
|
+
sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
|
1624
|
+
self._remove_compartmentalized_species(sc_ids)
|
1625
|
+
# Remove species
|
1626
|
+
self.species = self.species.drop(index=list(s_ids))
|
1627
|
+
# remove data
|
1628
|
+
for k, data in self.species_data.items():
|
1629
|
+
self.species_data[k] = data.drop(index=list(s_ids))
|
2243
1630
|
|
2244
|
-
|
2245
|
-
|
2246
|
-
|
2247
|
-
)
|
1631
|
+
def _remove_unused_cspecies(self):
|
1632
|
+
"""Removes compartmentalized species that are no
|
1633
|
+
longer part of any reactions"""
|
1634
|
+
sc_ids = self._get_unused_cspecies()
|
1635
|
+
self._remove_compartmentalized_species(sc_ids)
|
2248
1636
|
|
2249
|
-
|
1637
|
+
def _remove_unused_species(self):
|
1638
|
+
"""Removes species that are no longer part of any
|
1639
|
+
compartmentalized species"""
|
1640
|
+
s_ids = self._get_unused_species()
|
1641
|
+
self._remove_species(s_ids)
|
2250
1642
|
|
2251
|
-
|
2252
|
-
uri=identifiers.create_uri_url(
|
2253
|
-
ontology=ONTOLOGIES.GO,
|
2254
|
-
identifier=stubbed_compartment_id,
|
2255
|
-
),
|
2256
|
-
biological_qualifier_type=BQB.IS,
|
2257
|
-
)
|
1643
|
+
def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
|
2258
1644
|
|
2259
|
-
|
2260
|
-
|
2261
|
-
SBML_DFS.C_NAME: [stubbed_compartment],
|
2262
|
-
SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
|
2263
|
-
}
|
2264
|
-
)
|
2265
|
-
compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID) # type: ignore
|
2266
|
-
compartments_df.index.name = SBML_DFS.C_ID
|
1645
|
+
if isinstance(r_ids, str):
|
1646
|
+
r_ids = [r_ids]
|
2267
1647
|
|
2268
|
-
|
1648
|
+
if r_ids is None:
|
1649
|
+
return self.reactions.index.tolist()
|
1650
|
+
else:
|
1651
|
+
if not all(r_id in self.reactions.index for r_id in r_ids):
|
1652
|
+
raise ValueError(f"Reaction IDs {r_ids} not found in reactions table")
|
2269
1653
|
|
1654
|
+
return r_ids
|
2270
1655
|
|
2271
|
-
def
|
2272
|
-
|
1656
|
+
def _validate_reaction_species(self):
|
1657
|
+
if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
|
1658
|
+
raise ValueError(
|
1659
|
+
"All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
|
1660
|
+
)
|
2273
1661
|
|
2274
|
-
|
2275
|
-
|
1662
|
+
# test for null SBO terms
|
1663
|
+
n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
|
1664
|
+
if n_null_sbo_terms != 0:
|
1665
|
+
raise ValueError(
|
1666
|
+
f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
|
1667
|
+
)
|
2276
1668
|
|
2277
|
-
|
2278
|
-
|
2279
|
-
|
2280
|
-
|
1669
|
+
# find invalid SBO terms
|
1670
|
+
sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
|
1671
|
+
invalid_sbo_term_counts = sbo_counts[
|
1672
|
+
~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
|
1673
|
+
]
|
2281
1674
|
|
2282
|
-
|
2283
|
-
|
2284
|
-
|
2285
|
-
|
2286
|
-
|
2287
|
-
|
2288
|
-
|
2289
|
-
|
2290
|
-
"the index name for reaction data table was not"
|
2291
|
-
f" {ref_index_name}: {data_table.index.name}"
|
2292
|
-
)
|
2293
|
-
ids = data_table.index
|
2294
|
-
if any(ids.duplicated()):
|
2295
|
-
raise ValueError(
|
2296
|
-
"the index for reaction data table " "contained duplicate values"
|
2297
|
-
)
|
2298
|
-
if not all(ids.isin(ref_table.index)):
|
2299
|
-
raise ValueError(
|
2300
|
-
"the index for reaction data table contained values"
|
2301
|
-
" not found in the reactions table"
|
2302
|
-
)
|
2303
|
-
if not isinstance(data_table, pd.DataFrame):
|
2304
|
-
raise TypeError(
|
2305
|
-
f"The data table was type {type(data_table).__name__}"
|
2306
|
-
" but must be a pd.DataFrame"
|
2307
|
-
)
|
1675
|
+
if invalid_sbo_term_counts.shape[0] != 0:
|
1676
|
+
invalid_sbo_counts_str = ", ".join(
|
1677
|
+
[f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
|
1678
|
+
)
|
1679
|
+
raise ValueError(
|
1680
|
+
f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
|
1681
|
+
f"defined {invalid_sbo_counts_str}"
|
1682
|
+
)
|
2308
1683
|
|
1684
|
+
def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
|
1685
|
+
"""Validates reactions data attribute
|
2309
1686
|
|
2310
|
-
|
2311
|
-
|
1687
|
+
Args:
|
1688
|
+
reactions_data_table (pd.DataFrame): a reactions data table
|
2312
1689
|
|
2313
|
-
|
2314
|
-
|
2315
|
-
|
2316
|
-
|
2317
|
-
|
2318
|
-
|
2319
|
-
return "protein"
|
2320
|
-
else:
|
2321
|
-
return "unknown"
|
2322
|
-
|
2323
|
-
|
2324
|
-
def stub_ids(ids):
|
2325
|
-
if len(ids) == 0:
|
2326
|
-
return pd.DataFrame(
|
2327
|
-
{
|
2328
|
-
IDENTIFIERS.ONTOLOGY: [None],
|
2329
|
-
IDENTIFIERS.IDENTIFIER: [None],
|
2330
|
-
IDENTIFIERS.URL: [None],
|
2331
|
-
IDENTIFIERS.BQB: [None],
|
2332
|
-
}
|
2333
|
-
)
|
2334
|
-
else:
|
2335
|
-
return pd.DataFrame(ids)
|
1690
|
+
Raises:
|
1691
|
+
ValueError: r_id not index name
|
1692
|
+
ValueError: r_id index contains duplicates
|
1693
|
+
ValueError: r_id not in reactions table
|
1694
|
+
"""
|
1695
|
+
sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
|
2336
1696
|
|
1697
|
+
def _validate_species_data(self, species_data_table: pd.DataFrame):
|
1698
|
+
"""Validates species data attribute
|
2337
1699
|
|
2338
|
-
|
2339
|
-
|
2340
|
-
Add an sbo_role column to the reaction_species table.
|
1700
|
+
Args:
|
1701
|
+
species_data_table (pd.DataFrame): a species data table
|
2341
1702
|
|
2342
|
-
|
2343
|
-
|
1703
|
+
Raises:
|
1704
|
+
ValueError: s_id not index name
|
1705
|
+
ValueError: s_id index contains duplicates
|
1706
|
+
ValueError: s_id not in species table
|
1707
|
+
"""
|
1708
|
+
sbml_dfs_utils._validate_matching_data(species_data_table, self.species)
|
2344
1709
|
|
2345
|
-
|
2346
|
-
|
1710
|
+
def _validate_table(self, table_name: str) -> None:
|
1711
|
+
"""
|
1712
|
+
Validate a table in this SBML_dfs object against its schema.
|
2347
1713
|
|
2348
|
-
|
1714
|
+
This is an internal method that validates a table that is part of this SBML_dfs
|
1715
|
+
object against the schema stored in self.schema.
|
2349
1716
|
|
2350
|
-
|
2351
|
-
|
2352
|
-
|
2353
|
-
|
2354
|
-
)
|
1717
|
+
Parameters
|
1718
|
+
----------
|
1719
|
+
table_name : str
|
1720
|
+
Name of the table to validate
|
2355
1721
|
|
2356
|
-
|
2357
|
-
|
2358
|
-
|
2359
|
-
|
2360
|
-
|
2361
|
-
|
2362
|
-
)
|
2363
|
-
mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
|
2364
|
-
reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
|
1722
|
+
Raises
|
1723
|
+
------
|
1724
|
+
ValueError
|
1725
|
+
If the table does not conform to its schema
|
1726
|
+
"""
|
1727
|
+
table_data = getattr(self, table_name)
|
2365
1728
|
|
2366
|
-
|
1729
|
+
sbml_dfs_utils.validate_sbml_dfs_table(table_data, table_name)
|
2367
1730
|
|
2368
1731
|
|
2369
|
-
def
|
2370
|
-
|
2371
|
-
|
1732
|
+
def sbml_dfs_from_edgelist(
|
1733
|
+
interaction_edgelist: pd.DataFrame,
|
1734
|
+
species_df: pd.DataFrame,
|
1735
|
+
compartments_df: pd.DataFrame,
|
1736
|
+
interaction_source: source.Source,
|
1737
|
+
upstream_stoichiometry: int = 0,
|
1738
|
+
downstream_stoichiometry: int = 1,
|
1739
|
+
downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
|
1740
|
+
keep_species_data: bool | str = False,
|
1741
|
+
keep_reactions_data: bool | str = False,
|
1742
|
+
) -> SBML_dfs:
|
1743
|
+
"""
|
1744
|
+
Create SBML_dfs from interaction edgelist.
|
2372
1745
|
|
2373
|
-
|
2374
|
-
|
2375
|
-
raise ValueError(
|
2376
|
-
"The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
|
2377
|
-
)
|
2378
|
-
if "new" not in reaction_species_w_roles.columns:
|
2379
|
-
raise ValueError(
|
2380
|
-
"The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2381
|
-
)
|
2382
|
-
# check that new is a boolean column
|
2383
|
-
if reaction_species_w_roles["new"].dtype != bool:
|
2384
|
-
raise ValueError(
|
2385
|
-
"The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2386
|
-
)
|
1746
|
+
Combines a set of molecular interactions into a mechanistic SBML_dfs model
|
1747
|
+
by processing interaction data, species information, and compartment definitions.
|
2387
1748
|
|
2388
|
-
|
2389
|
-
|
2390
|
-
|
2391
|
-
|
1749
|
+
Parameters
|
1750
|
+
----------
|
1751
|
+
interaction_edgelist : pd.DataFrame
|
1752
|
+
Table containing molecular interactions with columns:
|
1753
|
+
- upstream_name : str, matches "s_name" from species_df
|
1754
|
+
- downstream_name : str, matches "s_name" from species_df
|
1755
|
+
- upstream_compartment : str, matches "c_name" from compartments_df
|
1756
|
+
- downstream_compartment : str, matches "c_name" from compartments_df
|
1757
|
+
- r_name : str, name for the interaction
|
1758
|
+
- sbo_term : str, SBO term defining interaction type
|
1759
|
+
- r_Identifiers : identifiers.Identifiers, supporting identifiers
|
1760
|
+
- r_isreversible : bool, whether reaction is reversible
|
1761
|
+
species_df : pd.DataFrame
|
1762
|
+
Table defining molecular species with columns:
|
1763
|
+
- s_name : str, name of molecular species
|
1764
|
+
- s_Identifiers : identifiers.Identifiers, species identifiers
|
1765
|
+
compartments_df : pd.DataFrame
|
1766
|
+
Table defining compartments with columns:
|
1767
|
+
- c_name : str, name of compartment
|
1768
|
+
- c_Identifiers : identifiers.Identifiers, compartment identifiers
|
1769
|
+
interaction_source : source.Source
|
1770
|
+
Source object linking model entities to interaction source
|
1771
|
+
upstream_stoichiometry : int, default 0
|
1772
|
+
Stoichiometry of upstream species in reactions
|
1773
|
+
downstream_stoichiometry : int, default 1
|
1774
|
+
Stoichiometry of downstream species in reactions
|
1775
|
+
downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
|
1776
|
+
SBO term for downstream reactant type
|
1777
|
+
keep_species_data : bool or str, default False
|
1778
|
+
Whether to preserve extra species columns. If True, saves as 'source' label.
|
1779
|
+
If string, uses as custom label. If False, discards extra data.
|
1780
|
+
keep_reactions_data : bool or str, default False
|
1781
|
+
Whether to preserve extra reaction columns. If True, saves as 'source' label.
|
1782
|
+
If string, uses as custom label. If False, discards extra data.
|
1783
|
+
|
1784
|
+
Returns
|
1785
|
+
-------
|
1786
|
+
SBML_dfs
|
1787
|
+
Validated SBML data structure containing compartments, species,
|
1788
|
+
compartmentalized species, reactions, and reaction species tables.
|
1789
|
+
"""
|
1790
|
+
# 1. Validate inputs
|
1791
|
+
sbml_dfs_utils._edgelist_validate_inputs(
|
1792
|
+
interaction_edgelist, species_df, compartments_df
|
2392
1793
|
)
|
2393
1794
|
|
2394
|
-
|
2395
|
-
|
2396
|
-
|
2397
|
-
f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
|
2398
|
-
)
|
2399
|
-
|
2400
|
-
# find the cases where all "new" values for a given (r_id, sbo_term) are False
|
2401
|
-
reactions_with_lost_requirements = set(
|
2402
|
-
reaction_species_w_roles
|
2403
|
-
# drop already filtered reactions
|
2404
|
-
.query("r_id not in @reactions_with_lost_defining_members")
|
2405
|
-
.query("sbo_role == 'REQUIRED'")
|
2406
|
-
# which entries which have some required attribute have all False values for that attribute
|
2407
|
-
.groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
|
2408
|
-
.agg({"new": "any"})
|
2409
|
-
.query("new == False")
|
2410
|
-
.index.get_level_values(SBML_DFS.R_ID)
|
1795
|
+
# 2. Identify which extra columns to preserve
|
1796
|
+
extra_columns = sbml_dfs_utils._edgelist_identify_extra_columns(
|
1797
|
+
interaction_edgelist, species_df, keep_reactions_data, keep_species_data
|
2411
1798
|
)
|
2412
1799
|
|
2413
|
-
|
2414
|
-
|
2415
|
-
|
2416
|
-
|
2417
|
-
|
2418
|
-
|
2419
|
-
underspecified_reactions = reactions_with_lost_defining_members.union(
|
2420
|
-
reactions_with_lost_requirements
|
1800
|
+
# 3. Process compartments and species tables
|
1801
|
+
processed_compartments = sbml_dfs_utils._edgelist_process_compartments(
|
1802
|
+
compartments_df, interaction_source
|
1803
|
+
)
|
1804
|
+
processed_species, species_data = sbml_dfs_utils._edgelist_process_species(
|
1805
|
+
species_df, interaction_source, extra_columns["species"]
|
2421
1806
|
)
|
2422
1807
|
|
2423
|
-
|
2424
|
-
|
2425
|
-
|
2426
|
-
|
2427
|
-
|
2428
|
-
|
2429
|
-
|
2430
|
-
Find Underspecified reactions
|
2431
|
-
|
2432
|
-
Identity reactions which should be removed if a set of molecular species are removed
|
2433
|
-
from the system.
|
2434
|
-
|
2435
|
-
Params:
|
2436
|
-
sbml_dfs (SBML_dfs):
|
2437
|
-
A pathway representation
|
2438
|
-
sc_ids (list[str])
|
2439
|
-
A list of compartmentalized species ids (sc_ids) which will be removed.
|
2440
|
-
|
2441
|
-
Returns:
|
2442
|
-
underspecified_reactions (set[str]):
|
2443
|
-
A list of reactions which should be removed because they will not occur once
|
2444
|
-
\"sc_ids\" are removed.
|
2445
|
-
|
2446
|
-
"""
|
1808
|
+
# 4. Create compartmentalized species
|
1809
|
+
comp_species = sbml_dfs_utils._edgelist_create_compartmentalized_species(
|
1810
|
+
interaction_edgelist,
|
1811
|
+
processed_species,
|
1812
|
+
processed_compartments,
|
1813
|
+
interaction_source,
|
1814
|
+
)
|
2447
1815
|
|
2448
|
-
|
2449
|
-
|
2450
|
-
|
1816
|
+
# 5. Create reactions and reaction species
|
1817
|
+
reactions, reaction_species, reactions_data = (
|
1818
|
+
sbml_dfs_utils._edgelist_create_reactions_and_species(
|
1819
|
+
interaction_edgelist,
|
1820
|
+
comp_species,
|
1821
|
+
processed_species,
|
1822
|
+
processed_compartments,
|
1823
|
+
interaction_source,
|
1824
|
+
upstream_stoichiometry,
|
1825
|
+
downstream_stoichiometry,
|
1826
|
+
downstream_sbo_name,
|
1827
|
+
extra_columns["reactions"],
|
1828
|
+
)
|
2451
1829
|
)
|
2452
1830
|
|
2453
|
-
|
2454
|
-
|
1831
|
+
# 6. Assemble final SBML_dfs object
|
1832
|
+
sbml_dfs = _edgelist_assemble_sbml_model(
|
1833
|
+
processed_compartments,
|
1834
|
+
processed_species,
|
1835
|
+
comp_species,
|
1836
|
+
reactions,
|
1837
|
+
reaction_species,
|
1838
|
+
species_data,
|
1839
|
+
reactions_data,
|
1840
|
+
keep_species_data,
|
1841
|
+
keep_reactions_data,
|
1842
|
+
extra_columns,
|
1843
|
+
)
|
2455
1844
|
|
2456
|
-
return
|
1845
|
+
return sbml_dfs
|
2457
1846
|
|
2458
1847
|
|
2459
|
-
def
|
1848
|
+
def _edgelist_assemble_sbml_model(
|
1849
|
+
compartments: pd.DataFrame,
|
1850
|
+
species: pd.DataFrame,
|
1851
|
+
comp_species: pd.DataFrame,
|
1852
|
+
reactions: pd.DataFrame,
|
1853
|
+
reaction_species: pd.DataFrame,
|
1854
|
+
species_data,
|
1855
|
+
reactions_data,
|
1856
|
+
keep_species_data,
|
1857
|
+
keep_reactions_data,
|
1858
|
+
extra_columns: dict[str, list[str]],
|
1859
|
+
) -> SBML_dfs:
|
2460
1860
|
"""
|
2461
|
-
|
2462
|
-
|
2463
|
-
This function validates a table against the schema defined in SBML_DFS_SCHEMA,
|
2464
|
-
without requiring an SBML_dfs object. Useful for validating tables before
|
2465
|
-
creating an SBML_dfs object.
|
1861
|
+
Assemble the final SBML_dfs object.
|
2466
1862
|
|
2467
1863
|
Parameters
|
2468
1864
|
----------
|
2469
|
-
|
2470
|
-
|
2471
|
-
|
2472
|
-
|
2473
|
-
|
2474
|
-
|
2475
|
-
|
2476
|
-
|
2477
|
-
|
2478
|
-
|
2479
|
-
|
2480
|
-
|
2481
|
-
|
2482
|
-
|
2483
|
-
|
2484
|
-
|
2485
|
-
|
2486
|
-
|
2487
|
-
|
2488
|
-
|
2489
|
-
|
2490
|
-
|
2491
|
-
|
2492
|
-
|
2493
|
-
|
2494
|
-
"""
|
2495
|
-
Core validation logic for SBML_dfs tables.
|
2496
|
-
|
2497
|
-
This function performs the actual validation checks for any table against its schema,
|
2498
|
-
regardless of whether it's part of an SBML_dfs object or standalone.
|
2499
|
-
|
2500
|
-
Parameters
|
2501
|
-
----------
|
2502
|
-
table_data : pd.DataFrame
|
2503
|
-
The table data to validate
|
2504
|
-
table_schema : dict
|
2505
|
-
Schema definition for the table
|
2506
|
-
table_name : str
|
2507
|
-
Name of the table (for error messages)
|
2508
|
-
|
2509
|
-
Raises
|
2510
|
-
------
|
2511
|
-
ValueError
|
2512
|
-
If the table does not conform to its schema:
|
2513
|
-
- Not a DataFrame
|
2514
|
-
- Wrong index name
|
2515
|
-
- Duplicate primary keys
|
2516
|
-
- Missing required variables
|
2517
|
-
- Empty table
|
1865
|
+
compartments : pd.DataFrame
|
1866
|
+
Processed compartments data
|
1867
|
+
species : pd.DataFrame
|
1868
|
+
Processed species data
|
1869
|
+
comp_species : pd.DataFrame
|
1870
|
+
Compartmentalized species data
|
1871
|
+
reactions : pd.DataFrame
|
1872
|
+
Reactions data
|
1873
|
+
reaction_species : pd.DataFrame
|
1874
|
+
Reaction species relationships
|
1875
|
+
species_data : pd.DataFrame
|
1876
|
+
Extra species data to include
|
1877
|
+
reactions_data : pd.DataFrame
|
1878
|
+
Extra reactions data to include
|
1879
|
+
keep_species_data : bool or str
|
1880
|
+
Label for species extra data
|
1881
|
+
keep_reactions_data : bool or str
|
1882
|
+
Label for reactions extra data
|
1883
|
+
extra_columns : dict
|
1884
|
+
Dictionary containing lists of extra column names
|
1885
|
+
|
1886
|
+
Returns
|
1887
|
+
-------
|
1888
|
+
SBML_dfs
|
1889
|
+
Validated SBML data structure
|
2518
1890
|
"""
|
2519
|
-
|
2520
|
-
|
2521
|
-
|
2522
|
-
|
2523
|
-
|
2524
|
-
|
2525
|
-
|
2526
|
-
if table_data.index.name != expected_index_name:
|
2527
|
-
raise ValueError(
|
2528
|
-
f"the index name for {table_name} was not the pk: {expected_index_name}"
|
2529
|
-
)
|
2530
|
-
|
2531
|
-
# check that all entries in the index are unique
|
2532
|
-
if len(set(table_data.index.tolist())) != table_data.shape[0]:
|
2533
|
-
duplicated_pks = table_data.index.value_counts()
|
2534
|
-
duplicated_pks = duplicated_pks[duplicated_pks > 1]
|
2535
|
-
|
2536
|
-
example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
|
2537
|
-
raise ValueError(
|
2538
|
-
f"{duplicated_pks.shape[0]} primary keys were duplicated "
|
2539
|
-
f"including {', '.join(example_duplicates)}"
|
2540
|
-
)
|
2541
|
-
|
2542
|
-
# check variables
|
2543
|
-
expected_vars = set(table_schema["vars"])
|
2544
|
-
table_vars = set(list(table_data.columns))
|
1891
|
+
sbml_tbl_dict = {
|
1892
|
+
"compartments": compartments,
|
1893
|
+
"species": species,
|
1894
|
+
"compartmentalized_species": comp_species,
|
1895
|
+
"reactions": reactions,
|
1896
|
+
"reaction_species": reaction_species,
|
1897
|
+
}
|
2545
1898
|
|
2546
|
-
|
2547
|
-
if len(
|
2548
|
-
|
2549
|
-
|
2550
|
-
f"{', '.join(extra_vars)}"
|
1899
|
+
# Add extra data if requested
|
1900
|
+
if len(extra_columns["reactions"]) > 0:
|
1901
|
+
data_label = (
|
1902
|
+
keep_reactions_data if isinstance(keep_reactions_data, str) else "source"
|
2551
1903
|
)
|
1904
|
+
sbml_tbl_dict["reactions_data"] = {data_label: reactions_data}
|
2552
1905
|
|
2553
|
-
|
2554
|
-
|
2555
|
-
|
2556
|
-
f"Missing {len(missing_vars)} required variables for {table_name}: "
|
2557
|
-
f"{', '.join(missing_vars)}"
|
1906
|
+
if len(extra_columns["species"]) > 0:
|
1907
|
+
data_label = (
|
1908
|
+
keep_species_data if isinstance(keep_species_data, str) else "source"
|
2558
1909
|
)
|
1910
|
+
sbml_tbl_dict["species_data"] = {data_label: species_data}
|
2559
1911
|
|
2560
|
-
|
2561
|
-
|
2562
|
-
raise ValueError(f"{table_name} contained no entries")
|
2563
|
-
|
2564
|
-
|
2565
|
-
def _filter_promiscuous_components(
|
2566
|
-
bqb_has_parts_species: pd.DataFrame, max_promiscuity: int
|
2567
|
-
) -> pd.DataFrame:
|
2568
|
-
|
2569
|
-
# number of complexes a species is part of
|
2570
|
-
n_complexes_involvedin = bqb_has_parts_species.value_counts(
|
2571
|
-
[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
|
2572
|
-
)
|
2573
|
-
promiscuous_component_identifiers_index = n_complexes_involvedin[
|
2574
|
-
n_complexes_involvedin > max_promiscuity
|
2575
|
-
].index
|
2576
|
-
promiscuous_component_identifiers = pd.Series(
|
2577
|
-
data=[True] * len(promiscuous_component_identifiers_index),
|
2578
|
-
index=promiscuous_component_identifiers_index,
|
2579
|
-
name="is_shared_component",
|
2580
|
-
dtype=bool,
|
2581
|
-
)
|
2582
|
-
|
2583
|
-
if len(promiscuous_component_identifiers) == 0:
|
2584
|
-
return bqb_has_parts_species
|
2585
|
-
|
2586
|
-
filtered_bqb_has_parts = bqb_has_parts_species.merge(
|
2587
|
-
promiscuous_component_identifiers,
|
2588
|
-
left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
|
2589
|
-
right_index=True,
|
2590
|
-
how="left",
|
2591
|
-
)
|
2592
|
-
|
2593
|
-
filtered_bqb_has_parts["is_shared_component"] = (
|
2594
|
-
filtered_bqb_has_parts["is_shared_component"].astype("boolean").fillna(False)
|
2595
|
-
)
|
2596
|
-
# drop identifiers shared as components across many species
|
2597
|
-
filtered_bqb_has_parts = filtered_bqb_has_parts[
|
2598
|
-
~filtered_bqb_has_parts["is_shared_component"]
|
2599
|
-
].drop(["is_shared_component"], axis=1)
|
1912
|
+
sbml_model = SBML_dfs(sbml_tbl_dict)
|
1913
|
+
sbml_model.validate()
|
2600
1914
|
|
2601
|
-
return
|
1915
|
+
return sbml_model
|