napistu 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +38 -27
- napistu/consensus.py +22 -27
- napistu/constants.py +91 -65
- napistu/context/filtering.py +2 -1
- napistu/identifiers.py +3 -6
- napistu/indices.py +3 -1
- napistu/ingestion/bigg.py +6 -6
- napistu/ingestion/sbml.py +298 -295
- napistu/ingestion/string.py +16 -19
- napistu/ingestion/trrust.py +22 -27
- napistu/ingestion/yeast.py +2 -1
- napistu/matching/interactions.py +4 -4
- napistu/matching/species.py +1 -1
- napistu/modify/uncompartmentalize.py +1 -1
- napistu/network/net_create.py +1 -1
- napistu/network/paths.py +1 -1
- napistu/ontologies/dogma.py +2 -1
- napistu/ontologies/genodexito.py +5 -1
- napistu/ontologies/renaming.py +4 -0
- napistu/sbml_dfs_core.py +1343 -2167
- napistu/sbml_dfs_utils.py +1086 -143
- napistu/utils.py +52 -41
- {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/METADATA +2 -2
- {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/RECORD +40 -40
- tests/conftest.py +113 -13
- tests/test_consensus.py +161 -4
- tests/test_context_filtering.py +2 -2
- tests/test_gaps.py +26 -15
- tests/test_network_net_create.py +1 -1
- tests/test_network_precompute.py +1 -1
- tests/test_ontologies_genodexito.py +3 -0
- tests/test_ontologies_mygene.py +3 -0
- tests/test_ontologies_renaming.py +28 -24
- tests/test_sbml_dfs_core.py +260 -211
- tests/test_sbml_dfs_utils.py +194 -36
- tests/test_utils.py +19 -0
- {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/WHEEL +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_core.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import copy
|
3
4
|
import logging
|
4
5
|
import re
|
5
6
|
from typing import Any
|
@@ -7,8 +8,12 @@ from typing import Iterable
|
|
7
8
|
from typing import Mapping
|
8
9
|
from typing import MutableMapping
|
9
10
|
from typing import TYPE_CHECKING
|
11
|
+
from typing import Optional
|
12
|
+
from typing import Union
|
10
13
|
|
14
|
+
from fs import open_fs
|
11
15
|
import pandas as pd
|
16
|
+
|
12
17
|
from napistu import identifiers
|
13
18
|
from napistu import sbml_dfs_utils
|
14
19
|
from napistu import source
|
@@ -17,25 +22,14 @@ from napistu.ingestion import sbml
|
|
17
22
|
from napistu.constants import SBML_DFS
|
18
23
|
from napistu.constants import SBML_DFS_SCHEMA
|
19
24
|
from napistu.constants import IDENTIFIERS
|
20
|
-
from napistu.constants import
|
21
|
-
from napistu.constants import CPR_STANDARD_OUTPUTS
|
22
|
-
from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
|
25
|
+
from napistu.constants import NAPISTU_STANDARD_OUTPUTS
|
23
26
|
from napistu.constants import BQB_PRIORITIES
|
24
27
|
from napistu.constants import ONTOLOGY_PRIORITIES
|
25
|
-
from napistu.constants import BQB
|
26
|
-
from napistu.constants import BQB_DEFINING_ATTRS
|
27
28
|
from napistu.constants import MINI_SBO_FROM_NAME
|
28
29
|
from napistu.constants import MINI_SBO_TO_NAME
|
29
|
-
from napistu.constants import ONTOLOGIES
|
30
|
-
from napistu.constants import SBO_NAME_TO_ROLE
|
31
30
|
from napistu.constants import SBOTERM_NAMES
|
32
|
-
from napistu.constants import SBO_ROLES_DEFS
|
33
31
|
from napistu.constants import ENTITIES_W_DATA
|
34
32
|
from napistu.constants import ENTITIES_TO_ENTITY_DATA
|
35
|
-
from napistu.ingestion.constants import GENERIC_COMPARTMENT
|
36
|
-
from napistu.ingestion.constants import COMPARTMENT_ALIASES
|
37
|
-
from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
|
38
|
-
from fs import open_fs
|
39
33
|
|
40
34
|
logger = logging.getLogger(__name__)
|
41
35
|
|
@@ -65,26 +59,80 @@ class SBML_dfs:
|
|
65
59
|
schema : dict
|
66
60
|
Dictionary representing the structure of the other attributes and meaning of their variables
|
67
61
|
|
68
|
-
Methods
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
62
|
+
Public Methods (alphabetical)
|
63
|
+
----------------------------
|
64
|
+
add_reactions_data(label, data)
|
65
|
+
Add a new reactions data table to the model with validation.
|
66
|
+
add_species_data(label, data)
|
67
|
+
Add a new species data table to the model with validation.
|
68
|
+
copy()
|
69
|
+
Return a deep copy of the SBML_dfs object.
|
70
|
+
export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
|
71
|
+
Export the SBML_dfs model and its tables to files in a specified directory.
|
72
|
+
get_characteristic_species_ids(dogmatic=True)
|
73
|
+
Return characteristic systematic identifiers for molecular species, optionally using a strict or loose definition.
|
76
74
|
get_cspecies_features()
|
77
|
-
|
78
|
-
get_species_features()
|
79
|
-
Get additional attributes of species
|
75
|
+
Compute and return additional features for compartmentalized species, such as degree and type.
|
80
76
|
get_identifiers(id_type)
|
81
|
-
|
82
|
-
|
83
|
-
|
77
|
+
Retrieve a table of identifiers for a specified entity type (e.g., species or reactions).
|
78
|
+
get_network_summary()
|
79
|
+
Return a dictionary of diagnostic statistics summarizing the network structure.
|
80
|
+
get_species_features()
|
81
|
+
Compute and return additional features for species, such as species type.
|
82
|
+
get_table(entity_type, required_attributes=None)
|
83
|
+
Retrieve a table for a given entity type, optionally validating required attributes.
|
84
|
+
get_uri_urls(entity_type, entity_ids=None, required_ontology=None)
|
85
|
+
Return reference URLs for specified entities, optionally filtered by ontology.
|
86
|
+
infer_sbo_terms()
|
87
|
+
Infer and fill in missing SBO terms for reaction species based on stoichiometry.
|
88
|
+
infer_uncompartmentalized_species_location()
|
89
|
+
Infer and assign compartments for compartmentalized species with missing compartment information.
|
90
|
+
name_compartmentalized_species()
|
91
|
+
Rename compartmentalized species to include compartment information if needed.
|
92
|
+
reaction_formulas(r_ids=None)
|
93
|
+
Generate human-readable reaction formulas for specified reactions.
|
94
|
+
reaction_summaries(r_ids=None)
|
95
|
+
Return a summary DataFrame for specified reactions, including names and formulas.
|
96
|
+
remove_compartmentalized_species(sc_ids)
|
97
|
+
Remove specified compartmentalized species and associated reactions from the model.
|
98
|
+
remove_reactions(r_ids, remove_species=False)
|
99
|
+
Remove specified reactions and optionally remove unused species.
|
100
|
+
remove_reactions_data(label)
|
101
|
+
Remove a reactions data table by label.
|
102
|
+
remove_species_data(label)
|
103
|
+
Remove a species data table by label.
|
104
|
+
search_by_ids(ids, entity_type, identifiers_df, ontologies=None)
|
105
|
+
Find entities and identifiers matching a set of query IDs.
|
106
|
+
search_by_name(name, entity_type, partial_match=True)
|
107
|
+
Find entities by exact or partial name match.
|
108
|
+
select_species_data(species_data_table)
|
109
|
+
Select a species data table from the SBML_dfs object by name.
|
110
|
+
species_status(s_id)
|
111
|
+
Return all reactions a species participates in, with stoichiometry and formula information.
|
84
112
|
validate()
|
85
|
-
Validate the SBML_dfs structure and relationships
|
113
|
+
Validate the SBML_dfs structure and relationships.
|
86
114
|
validate_and_resolve()
|
87
|
-
Validate and attempt to automatically fix common issues
|
115
|
+
Validate and attempt to automatically fix common issues.
|
116
|
+
|
117
|
+
Private/Hidden Methods (alphabetical, appear after public methods)
|
118
|
+
-----------------------------------------------------------------
|
119
|
+
_attempt_resolve(e)
|
120
|
+
_find_underspecified_reactions_by_scids(sc_ids)
|
121
|
+
_get_unused_cspecies()
|
122
|
+
_get_unused_species()
|
123
|
+
_remove_compartmentalized_species(sc_ids)
|
124
|
+
_remove_entity_data(entity_type, label)
|
125
|
+
_remove_species(s_ids)
|
126
|
+
_remove_unused_cspecies()
|
127
|
+
_remove_unused_species()
|
128
|
+
_validate_identifiers()
|
129
|
+
_validate_pk_fk_correspondence()
|
130
|
+
_validate_r_ids(r_ids)
|
131
|
+
_validate_reaction_species()
|
132
|
+
_validate_reactions_data(reactions_data_table)
|
133
|
+
_validate_sources()
|
134
|
+
_validate_species_data(species_data_table)
|
135
|
+
_validate_table(table_name)
|
88
136
|
"""
|
89
137
|
|
90
138
|
compartments: pd.DataFrame
|
@@ -162,193 +210,187 @@ class SBML_dfs:
|
|
162
210
|
'"validate" = False so "resolve" will be ignored (eventhough it was True)'
|
163
211
|
)
|
164
212
|
|
165
|
-
|
166
|
-
|
167
|
-
|
213
|
+
# =============================================================================
|
214
|
+
# PUBLIC METHODS (ALPHABETICAL ORDER)
|
215
|
+
# =============================================================================
|
216
|
+
|
217
|
+
def add_reactions_data(self, label: str, data: pd.DataFrame):
|
168
218
|
"""
|
169
|
-
|
219
|
+
Add additional reaction data with validation.
|
170
220
|
|
171
221
|
Parameters
|
172
222
|
----------
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
Must be passed as a set, e.g. {'id'}, not a string.
|
178
|
-
|
179
|
-
Returns
|
180
|
-
-------
|
181
|
-
pd.DataFrame
|
182
|
-
The requested table
|
223
|
+
label : str
|
224
|
+
Label for the new data
|
225
|
+
data : pd.DataFrame
|
226
|
+
Data to add, must be indexed by reaction_id
|
183
227
|
|
184
228
|
Raises
|
185
229
|
------
|
186
230
|
ValueError
|
187
|
-
If
|
188
|
-
TypeError
|
189
|
-
If required_attributes is not a set
|
231
|
+
If the data is invalid or label already exists
|
190
232
|
"""
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
if entity_type not in schema.keys():
|
233
|
+
self._validate_reactions_data(data)
|
234
|
+
if label in self.reactions_data:
|
195
235
|
raise ValueError(
|
196
|
-
f"{
|
197
|
-
f"which are present are {', '.join(schema.keys())}"
|
236
|
+
f"{label} already exists in reactions_data. " "Drop it first."
|
198
237
|
)
|
238
|
+
self.reactions_data[label] = data
|
199
239
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
|
204
|
-
"Did you pass a string instead of a set?"
|
205
|
-
)
|
240
|
+
def add_species_data(self, label: str, data: pd.DataFrame):
|
241
|
+
"""
|
242
|
+
Add additional species data with validation.
|
206
243
|
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
244
|
+
Parameters
|
245
|
+
----------
|
246
|
+
label : str
|
247
|
+
Label for the new data
|
248
|
+
data : pd.DataFrame
|
249
|
+
Data to add, must be indexed by species_id
|
212
250
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
251
|
+
Raises
|
252
|
+
------
|
253
|
+
ValueError
|
254
|
+
If the data is invalid or label already exists
|
255
|
+
"""
|
256
|
+
self._validate_species_data(data)
|
257
|
+
if label in self.species_data:
|
258
|
+
raise ValueError(
|
259
|
+
f"{label} already exists in species_data. " "Drop it first."
|
260
|
+
)
|
261
|
+
self.species_data[label] = data
|
218
262
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
]
|
223
|
-
if len(invalid_attrs) > 0:
|
224
|
-
raise ValueError(
|
225
|
-
f"The following required attributes are not present for the {entity_type} table: "
|
226
|
-
f"{', '.join(invalid_attrs)}."
|
227
|
-
)
|
263
|
+
def copy(self):
|
264
|
+
"""
|
265
|
+
Return a deep copy of the SBML_dfs object.
|
228
266
|
|
229
|
-
|
267
|
+
Returns
|
268
|
+
-------
|
269
|
+
SBML_dfs
|
270
|
+
A deep copy of the current SBML_dfs object.
|
271
|
+
"""
|
272
|
+
return copy.deepcopy(self)
|
230
273
|
|
231
|
-
def
|
274
|
+
def export_sbml_dfs(
|
232
275
|
self,
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
) ->
|
276
|
+
model_prefix: str,
|
277
|
+
outdir: str,
|
278
|
+
overwrite: bool = False,
|
279
|
+
dogmatic: bool = True,
|
280
|
+
) -> None:
|
238
281
|
"""
|
239
|
-
|
282
|
+
Export SBML_dfs
|
240
283
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
284
|
+
Export summaries of species identifiers and each table underlying
|
285
|
+
an SBML_dfs pathway model
|
286
|
+
|
287
|
+
Params
|
288
|
+
------
|
289
|
+
model_prefix: str
|
290
|
+
Label to prepend to all exported files
|
291
|
+
outdir: str
|
292
|
+
Path to an existing directory where results should be saved
|
293
|
+
overwrite: bool
|
294
|
+
Should the directory be overwritten if it already exists?
|
295
|
+
dogmatic: bool
|
296
|
+
If True then treat genes, transcript, and proteins as separate species. If False
|
297
|
+
then treat them interchangeably.
|
251
298
|
|
252
299
|
Returns
|
253
300
|
-------
|
254
|
-
|
255
|
-
- Matching entities
|
256
|
-
- Matching identifiers
|
257
|
-
|
258
|
-
Raises
|
259
|
-
------
|
260
|
-
ValueError
|
261
|
-
If entity_type is invalid or ontologies are invalid
|
262
|
-
TypeError
|
263
|
-
If ontologies is not a set
|
301
|
+
None
|
264
302
|
"""
|
265
|
-
|
266
|
-
|
267
|
-
|
303
|
+
if not isinstance(model_prefix, str):
|
304
|
+
raise TypeError(
|
305
|
+
f"model_prefix was a {type(model_prefix)} " "and must be a str"
|
306
|
+
)
|
307
|
+
if not isinstance(self, SBML_dfs):
|
308
|
+
raise TypeError(
|
309
|
+
f"sbml_dfs was a {type(self)} and must" " be an sbml.SBML_dfs"
|
310
|
+
)
|
268
311
|
|
269
|
-
|
270
|
-
|
271
|
-
req_vars={
|
272
|
-
entity_pk,
|
273
|
-
IDENTIFIERS.ONTOLOGY,
|
274
|
-
IDENTIFIERS.IDENTIFIER,
|
275
|
-
IDENTIFIERS.URL,
|
276
|
-
IDENTIFIERS.BQB,
|
277
|
-
},
|
278
|
-
allow_series=False,
|
279
|
-
).assert_present()
|
312
|
+
# filter to identifiers which make sense when mapping from ids -> species
|
313
|
+
species_identifiers = self.get_characteristic_species_ids(dogmatic=dogmatic)
|
280
314
|
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
315
|
+
try:
|
316
|
+
utils.initialize_dir(outdir, overwrite=overwrite)
|
317
|
+
except FileExistsError:
|
318
|
+
logger.warning(
|
319
|
+
f"Directory {outdir} already exists and overwrite is False. "
|
320
|
+
"Files will be added to the existing directory."
|
321
|
+
)
|
322
|
+
with open_fs(outdir, writeable=True) as fs:
|
323
|
+
species_identifiers_path = (
|
324
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
|
325
|
+
)
|
326
|
+
with fs.openbin(species_identifiers_path, "w") as f:
|
327
|
+
species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
|
328
|
+
f, sep="\t", index=False
|
293
329
|
)
|
294
330
|
|
295
|
-
#
|
296
|
-
|
331
|
+
# export jsons
|
332
|
+
species_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES
|
333
|
+
reactions_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTIONS
|
334
|
+
reation_species_path = (
|
335
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTION_SPECIES
|
336
|
+
)
|
337
|
+
compartments_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTS
|
338
|
+
compartmentalized_species_path = (
|
339
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
|
340
|
+
)
|
341
|
+
with fs.openbin(species_path, "w") as f:
|
342
|
+
self.species[[SBML_DFS.S_NAME]].to_json(f)
|
297
343
|
|
298
|
-
|
299
|
-
|
300
|
-
]
|
301
|
-
entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
|
344
|
+
with fs.openbin(reactions_path, "w") as f:
|
345
|
+
self.reactions[[SBML_DFS.R_NAME]].to_json(f)
|
302
346
|
|
303
|
-
|
347
|
+
with fs.openbin(reation_species_path, "w") as f:
|
348
|
+
self.reaction_species.to_json(f)
|
304
349
|
|
305
|
-
|
306
|
-
|
307
|
-
|
350
|
+
with fs.openbin(compartments_path, "w") as f:
|
351
|
+
self.compartments[[SBML_DFS.C_NAME]].to_json(f)
|
352
|
+
|
353
|
+
with fs.openbin(compartmentalized_species_path, "w") as f:
|
354
|
+
self.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
|
355
|
+
f
|
356
|
+
)
|
357
|
+
|
358
|
+
return None
|
359
|
+
|
360
|
+
def get_characteristic_species_ids(self, dogmatic: bool = True) -> pd.DataFrame:
|
308
361
|
"""
|
309
|
-
|
362
|
+
Get Characteristic Species IDs
|
363
|
+
|
364
|
+
List the systematic identifiers which are characteristic of molecular species, e.g., excluding subcomponents, and optionally, treating proteins, transcripts, and genes equivalently.
|
310
365
|
|
311
366
|
Parameters
|
312
367
|
----------
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
partial_match : bool, optional
|
318
|
-
Whether to allow partial string matches, by default True
|
368
|
+
sbml_dfs : sbml_dfs_core.SBML_dfs
|
369
|
+
The SBML_dfs object.
|
370
|
+
dogmatic : bool, default=True
|
371
|
+
Whether to use the dogmatic flag to determine which BQB attributes are valid.
|
319
372
|
|
320
373
|
Returns
|
321
374
|
-------
|
322
375
|
pd.DataFrame
|
323
|
-
|
376
|
+
A DataFrame containing the systematic identifiers which are characteristic of molecular species.
|
324
377
|
"""
|
325
|
-
entity_table = self.get_table(entity_type, required_attributes={"label"})
|
326
|
-
label_attr = self.schema[entity_type]["label"]
|
327
378
|
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
else:
|
333
|
-
matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
|
334
|
-
return matches
|
379
|
+
# select valid BQB attributes based on dogmatic flag
|
380
|
+
defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(
|
381
|
+
dogmatic
|
382
|
+
)
|
335
383
|
|
336
|
-
|
337
|
-
|
338
|
-
Get additional attributes of species.
|
384
|
+
# pre-summarize ontologies
|
385
|
+
species_identifiers = self.get_identifiers(SBML_DFS.SPECIES)
|
339
386
|
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
- species_type: Classification of the species (e.g., metabolite, protein)
|
345
|
-
"""
|
346
|
-
species = self.species
|
347
|
-
augmented_species = species.assign(
|
348
|
-
**{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
|
387
|
+
# drop some BQB_HAS_PART annotations
|
388
|
+
species_identifiers = sbml_dfs_utils.filter_to_characteristic_species_ids(
|
389
|
+
species_identifiers,
|
390
|
+
defining_biological_qualifiers=defining_biological_qualifiers,
|
349
391
|
)
|
350
392
|
|
351
|
-
return
|
393
|
+
return species_identifiers
|
352
394
|
|
353
395
|
def get_cspecies_features(self) -> pd.DataFrame:
|
354
396
|
"""
|
@@ -414,7 +456,7 @@ class SBML_dfs:
|
|
414
456
|
If id_type is invalid or identifiers are malformed
|
415
457
|
"""
|
416
458
|
selected_table = self.get_table(id_type, {"id"})
|
417
|
-
schema =
|
459
|
+
schema = SBML_DFS_SCHEMA.SCHEMA
|
418
460
|
|
419
461
|
identifiers_dict = dict()
|
420
462
|
for sysid in selected_table.index:
|
@@ -432,6 +474,7 @@ class SBML_dfs:
|
|
432
474
|
if not identifiers_dict:
|
433
475
|
# Return empty DataFrame with expected columns if nothing found
|
434
476
|
return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
|
477
|
+
|
435
478
|
identifiers_tbl = pd.concat(identifiers_dict)
|
436
479
|
|
437
480
|
identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
|
@@ -445,113 +488,28 @@ class SBML_dfs:
|
|
445
488
|
|
446
489
|
return named_identifiers
|
447
490
|
|
448
|
-
def
|
449
|
-
self,
|
450
|
-
entity_type: str,
|
451
|
-
entity_ids: Iterable[str] | None = None,
|
452
|
-
required_ontology: str | None = None,
|
453
|
-
) -> pd.Series:
|
491
|
+
def get_network_summary(self) -> Mapping[str, Any]:
|
454
492
|
"""
|
455
|
-
Get
|
456
|
-
|
457
|
-
Parameters
|
458
|
-
----------
|
459
|
-
entity_type : str
|
460
|
-
Type of entity to get URLs for (e.g., 'species', 'reactions')
|
461
|
-
entity_ids : Optional[Iterable[str]], optional
|
462
|
-
Specific entities to get URLs for, by default None (all entities)
|
463
|
-
required_ontology : Optional[str], optional
|
464
|
-
Specific ontology to get URLs from, by default None
|
493
|
+
Get diagnostic statistics about the network.
|
465
494
|
|
466
495
|
Returns
|
467
496
|
-------
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
if entity_type not in valid_entity_types:
|
486
|
-
raise ValueError(
|
487
|
-
f"{entity_type} is an invalid entity_type; valid types "
|
488
|
-
f"are {', '.join(valid_entity_types)}"
|
489
|
-
)
|
490
|
-
|
491
|
-
entity_table = getattr(self, entity_type)
|
492
|
-
|
493
|
-
if entity_ids is not None:
|
494
|
-
# ensure that entity_ids are unique and then convert back to list
|
495
|
-
# to support pandas indexing
|
496
|
-
entity_ids = list(set(entity_ids))
|
497
|
-
|
498
|
-
# filter to a subset of identifiers if one is provided
|
499
|
-
entity_table = entity_table.loc[entity_ids]
|
500
|
-
|
501
|
-
# create a dataframe of all identifiers for the select entities
|
502
|
-
all_ids = pd.concat(
|
503
|
-
[
|
504
|
-
sbml_dfs_utils._stub_ids(
|
505
|
-
entity_table[schema[entity_type]["id"]].iloc[i].ids
|
506
|
-
).assign(id=entity_table.index[i])
|
507
|
-
for i in range(0, entity_table.shape[0])
|
508
|
-
]
|
509
|
-
).rename(columns={"id": schema[entity_type]["pk"]})
|
510
|
-
|
511
|
-
# set priorities for ontologies and bqb terms
|
512
|
-
|
513
|
-
if required_ontology is None:
|
514
|
-
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
515
|
-
ONTOLOGY_PRIORITIES, how="left"
|
516
|
-
)
|
517
|
-
else:
|
518
|
-
ontology_priorities = pd.DataFrame(
|
519
|
-
[{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
|
520
|
-
)
|
521
|
-
# if only a single ontology is sought then just return matching entries
|
522
|
-
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
523
|
-
ontology_priorities, how="inner"
|
524
|
-
)
|
525
|
-
|
526
|
-
uri_urls = (
|
527
|
-
all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
|
528
|
-
.groupby(schema[entity_type]["pk"])
|
529
|
-
.first()[IDENTIFIERS.URL]
|
530
|
-
)
|
531
|
-
return uri_urls
|
532
|
-
|
533
|
-
def get_network_summary(self) -> Mapping[str, Any]:
|
534
|
-
"""
|
535
|
-
Get diagnostic statistics about the network.
|
536
|
-
|
537
|
-
Returns
|
538
|
-
-------
|
539
|
-
Mapping[str, Any]
|
540
|
-
Dictionary of diagnostic statistics including:
|
541
|
-
- n_species_types: Number of species types
|
542
|
-
- dict_n_species_per_type: Number of species per type
|
543
|
-
- n_species: Number of species
|
544
|
-
- n_cspecies: Number of compartmentalized species
|
545
|
-
- n_reaction_species: Number of reaction species
|
546
|
-
- n_reactions: Number of reactions
|
547
|
-
- n_compartments: Number of compartments
|
548
|
-
- dict_n_species_per_compartment: Number of species per compartment
|
549
|
-
- stats_species_per_reaction: Statistics on reactands per reaction
|
550
|
-
- top10_species_per_reaction: Top 10 reactions by number of reactands
|
551
|
-
- stats_degree: Statistics on species connectivity
|
552
|
-
- top10_degree: Top 10 species by connectivity
|
553
|
-
- stats_identifiers_per_species: Statistics on identifiers per species
|
554
|
-
- top10_identifiers_per_species: Top 10 species by number of identifiers
|
497
|
+
Mapping[str, Any]
|
498
|
+
Dictionary of diagnostic statistics including:
|
499
|
+
- n_species_types: Number of species types
|
500
|
+
- dict_n_species_per_type: Number of species per type
|
501
|
+
- n_species: Number of species
|
502
|
+
- n_cspecies: Number of compartmentalized species
|
503
|
+
- n_reaction_species: Number of reaction species
|
504
|
+
- n_reactions: Number of reactions
|
505
|
+
- n_compartments: Number of compartments
|
506
|
+
- dict_n_species_per_compartment: Number of species per compartment
|
507
|
+
- stats_species_per_reaction: Statistics on reactands per reaction
|
508
|
+
- top10_species_per_reaction: Top 10 reactions by number of reactands
|
509
|
+
- stats_degree: Statistics on species connectivity
|
510
|
+
- top10_degree: Top 10 species by connectivity
|
511
|
+
- stats_identifiers_per_species: Statistics on identifiers per species
|
512
|
+
- top10_identifiers_per_species: Top 10 species by number of identifiers
|
555
513
|
"""
|
556
514
|
stats: MutableMapping[str, Any] = {}
|
557
515
|
species_features = self.get_species_features()
|
@@ -616,2009 +574,1352 @@ class SBML_dfs:
|
|
616
574
|
|
617
575
|
return stats
|
618
576
|
|
619
|
-
def
|
577
|
+
def get_species_features(self) -> pd.DataFrame:
|
620
578
|
"""
|
621
|
-
|
622
|
-
|
623
|
-
Parameters
|
624
|
-
----------
|
625
|
-
label : str
|
626
|
-
Label for the new data
|
627
|
-
data : pd.DataFrame
|
628
|
-
Data to add, must be indexed by species_id
|
579
|
+
Get additional attributes of species.
|
629
580
|
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
581
|
+
Returns
|
582
|
+
-------
|
583
|
+
pd.DataFrame
|
584
|
+
Species with additional features including:
|
585
|
+
- species_type: Classification of the species (e.g., metabolite, protein)
|
634
586
|
"""
|
635
|
-
self.
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
587
|
+
species = self.species
|
588
|
+
augmented_species = species.assign(
|
589
|
+
**{
|
590
|
+
"species_type": lambda d: d["s_Identifiers"].apply(
|
591
|
+
sbml_dfs_utils.species_type_types
|
592
|
+
)
|
593
|
+
}
|
594
|
+
)
|
641
595
|
|
642
|
-
|
643
|
-
"""
|
644
|
-
Remove species data by label.
|
645
|
-
"""
|
646
|
-
self._remove_entity_data(SBML_DFS.SPECIES, label)
|
596
|
+
return augmented_species
|
647
597
|
|
648
|
-
def
|
598
|
+
def get_table(
|
599
|
+
self, entity_type: str, required_attributes: None | set[str] = None
|
600
|
+
) -> pd.DataFrame:
|
649
601
|
"""
|
650
|
-
|
602
|
+
Get a table from the SBML_dfs object with optional attribute validation.
|
651
603
|
|
652
604
|
Parameters
|
653
605
|
----------
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
606
|
+
entity_type : str
|
607
|
+
The type of entity table to retrieve (e.g., 'species', 'reactions')
|
608
|
+
required_attributes : Optional[Set[str]], optional
|
609
|
+
Set of attributes that must be present in the table, by default None.
|
610
|
+
Must be passed as a set, e.g. {'id'}, not a string.
|
611
|
+
|
612
|
+
Returns
|
613
|
+
-------
|
614
|
+
pd.DataFrame
|
615
|
+
The requested table
|
658
616
|
|
659
617
|
Raises
|
660
618
|
------
|
661
619
|
ValueError
|
662
|
-
If
|
620
|
+
If entity_type is invalid or required attributes are missing
|
621
|
+
TypeError
|
622
|
+
If required_attributes is not a set
|
663
623
|
"""
|
664
|
-
self._validate_reactions_data(data)
|
665
|
-
if label in self.reactions_data:
|
666
|
-
raise ValueError(
|
667
|
-
f"{label} already exists in reactions_data. Drop it first."
|
668
|
-
)
|
669
|
-
self.reactions_data[label] = data
|
670
624
|
|
671
|
-
|
672
|
-
"""
|
673
|
-
Remove reactions data by label.
|
674
|
-
"""
|
675
|
-
self._remove_entity_data(SBML_DFS.REACTIONS, label)
|
625
|
+
schema = self.schema
|
676
626
|
|
677
|
-
|
678
|
-
|
679
|
-
|
627
|
+
if entity_type not in schema.keys():
|
628
|
+
raise ValueError(
|
629
|
+
f"{entity_type} does not match a table in the SBML_dfs object. The tables "
|
630
|
+
f"which are present are {', '.join(schema.keys())}"
|
631
|
+
)
|
680
632
|
|
681
|
-
|
682
|
-
|
683
|
-
|
633
|
+
if required_attributes is not None:
|
634
|
+
if not isinstance(required_attributes, set):
|
635
|
+
raise TypeError(
|
636
|
+
f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
|
637
|
+
"Did you pass a string instead of a set?"
|
638
|
+
)
|
684
639
|
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
640
|
+
# determine whether required_attributes are appropriate
|
641
|
+
VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
|
642
|
+
invalid_required_attributes = required_attributes.difference(
|
643
|
+
VALID_REQUIRED_ATTRIBUTES
|
644
|
+
)
|
690
645
|
|
691
|
-
|
692
|
-
|
693
|
-
|
646
|
+
if len(invalid_required_attributes) > 0:
|
647
|
+
raise ValueError(
|
648
|
+
f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
|
649
|
+
f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
|
650
|
+
)
|
694
651
|
|
695
|
-
|
652
|
+
# determine if required_attributes are satisfied
|
653
|
+
invalid_attrs = [
|
654
|
+
s for s in required_attributes if s not in schema[entity_type].keys()
|
655
|
+
]
|
656
|
+
if len(invalid_attrs) > 0:
|
657
|
+
raise ValueError(
|
658
|
+
f"The following required attributes are not present for the {entity_type} table: "
|
659
|
+
f"{', '.join(invalid_attrs)}."
|
660
|
+
)
|
696
661
|
|
697
|
-
|
698
|
-
self._remove_unused_species()
|
662
|
+
return getattr(self, entity_type)
|
699
663
|
|
700
|
-
def
|
664
|
+
def get_uri_urls(
|
665
|
+
self,
|
666
|
+
entity_type: str,
|
667
|
+
entity_ids: Iterable[str] | None = None,
|
668
|
+
required_ontology: str | None = None,
|
669
|
+
) -> pd.Series:
|
701
670
|
"""
|
702
|
-
|
671
|
+
Get reference URLs for specified entities.
|
703
672
|
|
704
673
|
Parameters
|
705
674
|
----------
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
# remove corresponding reactions_species
|
713
|
-
self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
|
714
|
-
# remove reactions
|
715
|
-
self.reactions = self.reactions.drop(index=list(r_ids))
|
716
|
-
# remove reactions_data
|
717
|
-
if hasattr(self, "reactions_data"):
|
718
|
-
for k, data in self.reactions_data.items():
|
719
|
-
self.reactions_data[k] = data.drop(index=list(r_ids))
|
720
|
-
# remove species if requested
|
721
|
-
if remove_species:
|
722
|
-
self._remove_unused_cspecies()
|
723
|
-
self._remove_unused_species()
|
724
|
-
|
725
|
-
def validate(self):
|
726
|
-
"""
|
727
|
-
Validate the SBML_dfs structure and relationships.
|
675
|
+
entity_type : str
|
676
|
+
Type of entity to get URLs for (e.g., 'species', 'reactions')
|
677
|
+
entity_ids : Optional[Iterable[str]], optional
|
678
|
+
Specific entities to get URLs for, by default None (all entities)
|
679
|
+
required_ontology : Optional[str], optional
|
680
|
+
Specific ontology to get URLs from, by default None
|
728
681
|
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
- Primary key uniqueness
|
734
|
-
- Foreign key relationships
|
735
|
-
- Optional data table validity
|
736
|
-
- Reaction species validity
|
682
|
+
Returns
|
683
|
+
-------
|
684
|
+
pd.Series
|
685
|
+
Series mapping entity IDs to their reference URLs
|
737
686
|
|
738
687
|
Raises
|
739
688
|
------
|
740
689
|
ValueError
|
741
|
-
If
|
690
|
+
If entity_type is invalid
|
742
691
|
"""
|
692
|
+
schema = self.schema
|
743
693
|
|
744
|
-
|
745
|
-
|
694
|
+
# valid entities and their identifier variables
|
695
|
+
valid_entity_types = [
|
696
|
+
SBML_DFS.COMPARTMENTS,
|
697
|
+
SBML_DFS.SPECIES,
|
698
|
+
SBML_DFS.REACTIONS,
|
699
|
+
]
|
746
700
|
|
747
|
-
|
748
|
-
|
701
|
+
if entity_type not in valid_entity_types:
|
702
|
+
raise ValueError(
|
703
|
+
f"{entity_type} is an invalid entity_type; valid types "
|
704
|
+
f"are {', '.join(valid_entity_types)}"
|
705
|
+
)
|
749
706
|
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
707
|
+
entity_table = getattr(self, entity_type)
|
708
|
+
|
709
|
+
if entity_ids is not None:
|
710
|
+
# ensure that entity_ids are unique and then convert back to list
|
711
|
+
# to support pandas indexing
|
712
|
+
entity_ids = list(set(entity_ids))
|
713
|
+
|
714
|
+
# filter to a subset of identifiers if one is provided
|
715
|
+
entity_table = entity_table.loc[entity_ids]
|
716
|
+
|
717
|
+
# create a dataframe of all identifiers for the select entities
|
718
|
+
all_ids = pd.concat(
|
719
|
+
[
|
720
|
+
sbml_dfs_utils._id_dict_to_df(
|
721
|
+
entity_table[schema[entity_type]["id"]].iloc[i].ids
|
722
|
+
).assign(id=entity_table.index[i])
|
723
|
+
for i in range(0, entity_table.shape[0])
|
724
|
+
]
|
725
|
+
).rename(columns={"id": schema[entity_type]["pk"]})
|
726
|
+
|
727
|
+
# set priorities for ontologies and bqb terms
|
728
|
+
|
729
|
+
if required_ontology is None:
|
730
|
+
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
731
|
+
ONTOLOGY_PRIORITIES, how="left"
|
732
|
+
)
|
733
|
+
else:
|
734
|
+
ontology_priorities = pd.DataFrame(
|
735
|
+
[{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
|
736
|
+
)
|
737
|
+
# if only a single ontology is sought then just return matching entries
|
738
|
+
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
739
|
+
ontology_priorities, how="inner"
|
755
740
|
)
|
756
741
|
|
757
|
-
|
758
|
-
|
742
|
+
uri_urls = (
|
743
|
+
all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
|
744
|
+
.groupby(schema[entity_type]["pk"])
|
745
|
+
.first()[IDENTIFIERS.URL]
|
746
|
+
)
|
747
|
+
return uri_urls
|
748
|
+
|
749
|
+
def infer_sbo_terms(self):
|
750
|
+
"""
|
751
|
+
Infer SBO Terms
|
752
|
+
|
753
|
+
Define SBO terms based on stoichiometry for reaction_species with missing terms.
|
754
|
+
Modifies the SBML_dfs object in-place.
|
755
|
+
|
756
|
+
Returns
|
757
|
+
-------
|
758
|
+
None (modifies SBML_dfs object in-place)
|
759
|
+
"""
|
760
|
+
valid_sbo_terms = self.reaction_species[
|
761
|
+
self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
|
762
|
+
]
|
763
|
+
|
764
|
+
invalid_sbo_terms = self.reaction_species[
|
765
|
+
~self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
|
766
|
+
]
|
767
|
+
|
768
|
+
if not all(self.reaction_species[SBML_DFS.SBO_TERM].notnull()):
|
769
|
+
raise ValueError("All reaction_species[SBML_DFS.SBO_TERM] must be not null")
|
770
|
+
if invalid_sbo_terms.shape[0] == 0:
|
771
|
+
logger.info("All sbo_terms were valid; nothing to update.")
|
772
|
+
return
|
773
|
+
|
774
|
+
logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
|
775
|
+
|
776
|
+
# add missing/invalid terms based on stoichiometry
|
777
|
+
invalid_sbo_terms.loc[
|
778
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
|
779
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
|
780
|
+
|
781
|
+
invalid_sbo_terms.loc[
|
782
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
|
783
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
|
784
|
+
|
785
|
+
invalid_sbo_terms.loc[
|
786
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
|
787
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
|
788
|
+
|
789
|
+
updated_reaction_species = pd.concat(
|
790
|
+
[valid_sbo_terms, invalid_sbo_terms]
|
791
|
+
).sort_index()
|
792
|
+
|
793
|
+
if self.reaction_species.shape[0] != updated_reaction_species.shape[0]:
|
759
794
|
raise ValueError(
|
760
|
-
f"
|
761
|
-
f"{', '.join(missing_tables)}"
|
795
|
+
f"Trying to overwrite {self.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
|
762
796
|
)
|
797
|
+
self.reaction_species = updated_reaction_species
|
798
|
+
return
|
763
799
|
|
764
|
-
|
765
|
-
|
766
|
-
|
800
|
+
def infer_uncompartmentalized_species_location(self):
|
801
|
+
"""
|
802
|
+
Infer Uncompartmentalized Species Location
|
767
803
|
|
768
|
-
|
769
|
-
|
770
|
-
|
804
|
+
If the compartment of a subset of compartmentalized species
|
805
|
+
was not specified, infer an appropriate compartment from
|
806
|
+
other members of reactions they participate in.
|
807
|
+
|
808
|
+
This method modifies the SBML_dfs object in-place.
|
809
|
+
|
810
|
+
Returns
|
811
|
+
-------
|
812
|
+
None (modifies SBML_dfs object in-place)
|
813
|
+
"""
|
814
|
+
default_compartment = (
|
815
|
+
self.compartmentalized_species.value_counts(SBML_DFS.C_ID)
|
816
|
+
.rename("N")
|
817
|
+
.reset_index()
|
818
|
+
.sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
|
771
819
|
)
|
820
|
+
if not isinstance(default_compartment, str):
|
821
|
+
raise ValueError(
|
822
|
+
"No default compartment could be found - compartment "
|
823
|
+
"information may not be present"
|
824
|
+
)
|
772
825
|
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
826
|
+
# infer the compartments of species missing compartments
|
827
|
+
missing_compartment_scids = self.compartmentalized_species[
|
828
|
+
self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
829
|
+
].index.tolist()
|
830
|
+
if len(missing_compartment_scids) == 0:
|
831
|
+
logger.info(
|
832
|
+
"All compartmentalized species have compartments, "
|
833
|
+
"returning input SBML_dfs"
|
780
834
|
)
|
781
|
-
|
782
|
-
|
835
|
+
return self
|
836
|
+
|
837
|
+
participating_reactions = (
|
838
|
+
self.reaction_species[
|
839
|
+
self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
840
|
+
][SBML_DFS.R_ID]
|
841
|
+
.unique()
|
842
|
+
.tolist()
|
843
|
+
)
|
844
|
+
reaction_participants = self.reaction_species[
|
845
|
+
self.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
|
846
|
+
].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
|
847
|
+
reaction_participants = reaction_participants.merge(
|
848
|
+
self.compartmentalized_species[SBML_DFS.C_ID],
|
849
|
+
left_on=SBML_DFS.SC_ID,
|
850
|
+
right_index=True,
|
851
|
+
)
|
852
|
+
|
853
|
+
# find a default compartment to fall back on if all compartmental information is missing
|
854
|
+
primary_reaction_compartment = (
|
855
|
+
reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
|
856
|
+
.rename("N")
|
857
|
+
.reset_index()
|
858
|
+
.sort_values("N", ascending=False)
|
859
|
+
.groupby(SBML_DFS.R_ID)
|
860
|
+
.first()[SBML_DFS.C_ID]
|
783
861
|
.reset_index()
|
784
|
-
.melt(id_vars="fk_table")
|
785
|
-
.drop(["variable"], axis=1)
|
786
|
-
.rename(columns={"value": "key"})
|
787
862
|
)
|
788
863
|
|
789
|
-
|
864
|
+
inferred_compartmentalization = (
|
865
|
+
self.reaction_species[
|
866
|
+
self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
867
|
+
]
|
868
|
+
.merge(primary_reaction_compartment)
|
869
|
+
.value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
|
870
|
+
.rename("N")
|
871
|
+
.reset_index()
|
872
|
+
.sort_values("N", ascending=False)
|
873
|
+
.groupby(SBML_DFS.SC_ID)
|
874
|
+
.first()
|
875
|
+
.reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
|
876
|
+
)
|
877
|
+
logger.info(
|
878
|
+
f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
|
879
|
+
)
|
790
880
|
|
791
|
-
|
792
|
-
|
793
|
-
|
881
|
+
# define where a reaction is most likely to occur based on the compartmentalization of its participants
|
882
|
+
species_with_unknown_compartmentalization = set(
|
883
|
+
missing_compartment_scids
|
884
|
+
).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
|
885
|
+
if len(species_with_unknown_compartmentalization) != 0:
|
886
|
+
logger.warning(
|
887
|
+
f"{len(species_with_unknown_compartmentalization)} "
|
888
|
+
"species compartmentalization could not be inferred"
|
889
|
+
" from other reaction participants. Their compartmentalization "
|
890
|
+
f"will be set to the default of {default_compartment}"
|
794
891
|
)
|
795
|
-
if None in pk_table_keys:
|
796
|
-
raise ValueError(
|
797
|
-
f"{pk_fk_correspondences['pk_table'][i]} had "
|
798
|
-
"missing values in its index"
|
799
|
-
)
|
800
892
|
|
801
|
-
|
802
|
-
|
803
|
-
|
893
|
+
inferred_compartmentalization = pd.concat(
|
894
|
+
[
|
895
|
+
inferred_compartmentalization,
|
896
|
+
pd.DataFrame(
|
897
|
+
{
|
898
|
+
SBML_DFS.SC_ID: list(
|
899
|
+
species_with_unknown_compartmentalization
|
900
|
+
)
|
901
|
+
}
|
902
|
+
).assign(c_id=default_compartment),
|
804
903
|
]
|
805
904
|
)
|
806
|
-
if None in fk_table_keys:
|
807
|
-
raise ValueError(
|
808
|
-
f"{pk_fk_correspondences['fk_table'][i]} included "
|
809
|
-
f"missing {pk_fk_correspondences['key'][i]} values"
|
810
|
-
)
|
811
905
|
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
906
|
+
if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
|
907
|
+
raise ValueError(
|
908
|
+
f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
|
909
|
+
)
|
910
|
+
|
911
|
+
updated_compartmentalized_species = pd.concat(
|
912
|
+
[
|
913
|
+
self.compartmentalized_species[
|
914
|
+
~self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
915
|
+
],
|
916
|
+
self.compartmentalized_species[
|
917
|
+
self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
918
|
+
]
|
919
|
+
.drop(SBML_DFS.C_ID, axis=1)
|
920
|
+
.merge(
|
921
|
+
inferred_compartmentalization,
|
922
|
+
left_index=True,
|
923
|
+
right_on=SBML_DFS.SC_ID,
|
822
924
|
)
|
925
|
+
.set_index(SBML_DFS.SC_ID),
|
926
|
+
]
|
927
|
+
)
|
823
928
|
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
929
|
+
if (
|
930
|
+
updated_compartmentalized_species.shape[0]
|
931
|
+
!= self.compartmentalized_species.shape[0]
|
932
|
+
):
|
933
|
+
raise ValueError(
|
934
|
+
f"Trying to overwrite {self.compartmentalized_species.shape[0]}"
|
935
|
+
" compartmentalized species with "
|
936
|
+
f"{updated_compartmentalized_species.shape[0]}"
|
937
|
+
)
|
830
938
|
|
831
|
-
|
832
|
-
|
833
|
-
self._validate_reactions_data(v)
|
834
|
-
except ValueError as e:
|
835
|
-
raise ValueError(f"reactions data {k} was invalid.") from e
|
939
|
+
if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
|
940
|
+
raise ValueError("Some species compartments are still missing")
|
836
941
|
|
837
|
-
|
838
|
-
|
942
|
+
self.compartmentalized_species = updated_compartmentalized_species
|
943
|
+
return
|
839
944
|
|
840
|
-
def
|
945
|
+
def name_compartmentalized_species(self):
|
841
946
|
"""
|
842
|
-
|
947
|
+
Name Compartmentalized Species
|
843
948
|
|
844
|
-
|
845
|
-
|
846
|
-
2. If validation fails, tries to resolve the issue
|
847
|
-
3. Repeats until validation passes or issue cannot be resolved
|
949
|
+
Rename compartmentalized species if they have the same
|
950
|
+
name as their species. Modifies the SBML_dfs object in-place.
|
848
951
|
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
If validation fails and cannot be automatically resolved
|
952
|
+
Returns
|
953
|
+
-------
|
954
|
+
None (modifies SBML_dfs object in-place)
|
853
955
|
"""
|
956
|
+
augmented_cspecies = self.compartmentalized_species.merge(
|
957
|
+
self.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
|
958
|
+
).merge(
|
959
|
+
self.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
|
960
|
+
)
|
961
|
+
augmented_cspecies[SBML_DFS.SC_NAME] = [
|
962
|
+
f"{s} [{c}]" if sc == s else sc
|
963
|
+
for sc, c, s in zip(
|
964
|
+
augmented_cspecies[SBML_DFS.SC_NAME],
|
965
|
+
augmented_cspecies[SBML_DFS.C_NAME],
|
966
|
+
augmented_cspecies[SBML_DFS.S_NAME],
|
967
|
+
)
|
968
|
+
]
|
854
969
|
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
try:
|
860
|
-
self.validate()
|
861
|
-
validated = True
|
862
|
-
except Exception as e:
|
863
|
-
e_str = str(e)
|
864
|
-
if e_str == current_exception:
|
865
|
-
logger.warning(
|
866
|
-
"Automated resolution of an Exception was attempted but failed"
|
867
|
-
)
|
868
|
-
raise e
|
869
|
-
|
870
|
-
# try to resolve
|
871
|
-
self._attempt_resolve(e)
|
970
|
+
self.compartmentalized_species = augmented_cspecies.loc[
|
971
|
+
:, self.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
|
972
|
+
]
|
973
|
+
return
|
872
974
|
|
873
|
-
def
|
975
|
+
def reaction_formulas(
|
976
|
+
self, r_ids: Optional[Union[str, list[str]]] = None
|
977
|
+
) -> pd.Series:
|
874
978
|
"""
|
875
|
-
|
979
|
+
Reaction Summary
|
876
980
|
|
877
|
-
|
981
|
+
Return human-readable formulas for reactions.
|
982
|
+
|
983
|
+
Parameters:
|
878
984
|
----------
|
879
|
-
|
880
|
-
|
985
|
+
r_ids: [str], str or None
|
986
|
+
Reaction IDs or None for all reactions
|
881
987
|
|
882
988
|
Returns
|
883
|
-
|
884
|
-
pd.
|
885
|
-
The selected species data table
|
886
|
-
|
887
|
-
Raises
|
888
|
-
------
|
889
|
-
ValueError
|
890
|
-
If species_data_table is not found
|
989
|
+
----------
|
990
|
+
formula_strs: pd.Series
|
891
991
|
"""
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
992
|
+
|
993
|
+
validated_rids = self._validate_r_ids(r_ids)
|
994
|
+
|
995
|
+
matching_reaction_species = self.reaction_species[
|
996
|
+
self.reaction_species.r_id.isin(validated_rids)
|
997
|
+
].merge(
|
998
|
+
self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
999
|
+
)
|
1000
|
+
|
1001
|
+
# split into within compartment and cross-compartment reactions
|
1002
|
+
r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
|
1003
|
+
SBML_DFS.C_ID
|
1004
|
+
].nunique()
|
1005
|
+
|
1006
|
+
# identify reactions which work across compartments
|
1007
|
+
r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
|
1008
|
+
# there species must be labelled with the sc_name to specify where a species exists
|
1009
|
+
if r_id_cross_compartment.shape[0] > 0:
|
1010
|
+
rxn_eqtn_cross_compartment = (
|
1011
|
+
matching_reaction_species[
|
1012
|
+
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1013
|
+
r_id_cross_compartment.index
|
1014
|
+
)
|
1015
|
+
]
|
1016
|
+
.sort_values([SBML_DFS.SC_NAME])
|
1017
|
+
.groupby(SBML_DFS.R_ID)
|
1018
|
+
.apply(
|
1019
|
+
lambda x: sbml_dfs_utils.construct_formula_string(
|
1020
|
+
x, self.reactions, SBML_DFS.SC_NAME
|
1021
|
+
)
|
1022
|
+
)
|
1023
|
+
.rename("r_formula_str")
|
1024
|
+
)
|
1025
|
+
else:
|
1026
|
+
rxn_eqtn_cross_compartment = None
|
1027
|
+
|
1028
|
+
# identify reactions which occur within a single compartment; for these the reaction
|
1029
|
+
# can be labelled with the compartment and individual species can receive a more readable s_name
|
1030
|
+
r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
|
1031
|
+
if r_id_within_compartment.shape[0] > 0:
|
1032
|
+
# add s_name
|
1033
|
+
augmented_matching_reaction_species = (
|
1034
|
+
matching_reaction_species[
|
1035
|
+
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1036
|
+
r_id_within_compartment.index
|
1037
|
+
)
|
1038
|
+
]
|
1039
|
+
.merge(self.compartments, left_on=SBML_DFS.C_ID, right_index=True)
|
1040
|
+
.merge(self.species, left_on=SBML_DFS.S_ID, right_index=True)
|
1041
|
+
.sort_values([SBML_DFS.S_NAME])
|
1042
|
+
)
|
1043
|
+
# create formulas based on s_names of components
|
1044
|
+
rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
|
1045
|
+
[SBML_DFS.R_ID, SBML_DFS.C_NAME]
|
1046
|
+
).apply(
|
1047
|
+
lambda x: sbml_dfs_utils.construct_formula_string(
|
1048
|
+
x, self.reactions, SBML_DFS.S_NAME
|
1049
|
+
)
|
897
1050
|
)
|
1051
|
+
# add compartment for each reaction
|
1052
|
+
rxn_eqtn_within_compartment = pd.Series(
|
1053
|
+
[
|
1054
|
+
y + ": " + x
|
1055
|
+
for x, y in zip(
|
1056
|
+
rxn_eqtn_within_compartment,
|
1057
|
+
rxn_eqtn_within_compartment.index.get_level_values(
|
1058
|
+
SBML_DFS.C_NAME
|
1059
|
+
),
|
1060
|
+
)
|
1061
|
+
],
|
1062
|
+
index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
|
1063
|
+
).rename("r_formula_str")
|
1064
|
+
else:
|
1065
|
+
rxn_eqtn_within_compartment = None
|
898
1066
|
|
899
|
-
|
900
|
-
|
1067
|
+
formula_strs = pd.concat(
|
1068
|
+
[rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment]
|
1069
|
+
)
|
901
1070
|
|
902
|
-
|
1071
|
+
return formula_strs
|
1072
|
+
|
1073
|
+
def reaction_summaries(
|
1074
|
+
self, r_ids: Optional[Union[str, list[str]]] = None
|
1075
|
+
) -> pd.DataFrame:
|
903
1076
|
"""
|
904
|
-
|
1077
|
+
Reaction Summary
|
905
1078
|
|
906
|
-
|
907
|
-
object against the schema stored in self.schema.
|
1079
|
+
Return a summary of reactions.
|
908
1080
|
|
909
|
-
Parameters
|
1081
|
+
Parameters:
|
910
1082
|
----------
|
911
|
-
|
912
|
-
|
1083
|
+
r_ids: [str], str or None
|
1084
|
+
Reaction IDs or None for all reactions
|
913
1085
|
|
914
|
-
|
915
|
-
|
916
|
-
|
917
|
-
|
1086
|
+
Returns
|
1087
|
+
----------
|
1088
|
+
reaction_summaries_df: pd.DataFrame
|
1089
|
+
A table with r_id as an index and columns:
|
1090
|
+
- r_name: str, name of the reaction
|
1091
|
+
- r_formula_str: str, human-readable formula of the reaction
|
918
1092
|
"""
|
919
|
-
table_schema = self.schema[table]
|
920
|
-
table_data = getattr(self, table)
|
921
|
-
_perform_sbml_dfs_table_validation(table_data, table_schema, table)
|
922
1093
|
|
923
|
-
|
1094
|
+
validated_rids = self._validate_r_ids(r_ids)
|
1095
|
+
|
1096
|
+
participating_r_names = self.reactions.loc[validated_rids, SBML_DFS.R_NAME]
|
1097
|
+
participating_r_formulas = self.reaction_formulas(r_ids=validated_rids)
|
1098
|
+
reaction_summareis_df = pd.concat(
|
1099
|
+
[participating_r_names, participating_r_formulas], axis=1
|
1100
|
+
)
|
1101
|
+
|
1102
|
+
return reaction_summareis_df
|
1103
|
+
|
1104
|
+
def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
924
1105
|
"""
|
925
|
-
Remove
|
1106
|
+
Remove compartmentalized species and associated reactions.
|
1107
|
+
|
1108
|
+
Starting with a set of compartmentalized species, determine which reactions
|
1109
|
+
should be removed based on their removal. Then remove these reactions,
|
1110
|
+
compartmentalized species, and species.
|
926
1111
|
|
927
1112
|
Parameters
|
928
1113
|
----------
|
929
|
-
|
930
|
-
|
931
|
-
label : str
|
932
|
-
Label of the data to remove
|
933
|
-
|
934
|
-
Notes
|
935
|
-
-----
|
936
|
-
If the label does not exist, a warning will be logged that includes the existing labels.
|
937
|
-
"""
|
938
|
-
if entity_type not in ENTITIES_W_DATA:
|
939
|
-
raise ValueError("table_name must be either 'species' or 'reactions'")
|
940
|
-
|
941
|
-
data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
|
942
|
-
if label not in data_dict:
|
943
|
-
existing_labels = list(data_dict.keys())
|
944
|
-
logger.warning(
|
945
|
-
f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
|
946
|
-
f"Existing labels: {existing_labels}"
|
947
|
-
)
|
948
|
-
return
|
949
|
-
|
950
|
-
del data_dict[label]
|
951
|
-
|
952
|
-
def _remove_unused_cspecies(self):
|
953
|
-
"""Removes compartmentalized species that are no
|
954
|
-
longer part of any reactions"""
|
955
|
-
sc_ids = self._get_unused_cspecies()
|
956
|
-
self._remove_compartmentalized_species(sc_ids)
|
957
|
-
|
958
|
-
def _get_unused_cspecies(self) -> set[str]:
|
959
|
-
"""Returns a set of compartmentalized species
|
960
|
-
that are not part of any reactions"""
|
961
|
-
sc_ids = set(self.compartmentalized_species.index) - set(
|
962
|
-
self.reaction_species[SBML_DFS.SC_ID]
|
963
|
-
)
|
964
|
-
return sc_ids # type: ignore
|
965
|
-
|
966
|
-
def _remove_unused_species(self):
|
967
|
-
"""Removes species that are no longer part of any
|
968
|
-
compartmentalized species"""
|
969
|
-
s_ids = self._get_unused_species()
|
970
|
-
self._remove_species(s_ids)
|
971
|
-
|
972
|
-
def _get_unused_species(self) -> set[str]:
|
973
|
-
"""Returns a list of species that are not part of any reactions"""
|
974
|
-
s_ids = set(self.species.index) - set(
|
975
|
-
self.compartmentalized_species[SBML_DFS.S_ID]
|
976
|
-
)
|
977
|
-
return s_ids # type: ignore
|
978
|
-
|
979
|
-
def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
980
|
-
"""Removes compartmentalized species from the model
|
981
|
-
|
982
|
-
This should not be directly used by the user, as it can lead to
|
983
|
-
invalid reactions when removing species without a logic to decide
|
984
|
-
if the reaction needs to be removed as well.
|
985
|
-
|
986
|
-
Args:
|
987
|
-
sc_ids (Iterable[str]): the compartmentalized species to remove
|
1114
|
+
sc_ids : Iterable[str]
|
1115
|
+
IDs of compartmentalized species to remove
|
988
1116
|
"""
|
989
|
-
# Remove compartmentalized species
|
990
|
-
self.compartmentalized_species = self.compartmentalized_species.drop(
|
991
|
-
index=list(sc_ids)
|
992
|
-
)
|
993
|
-
# remove corresponding reactions_species
|
994
|
-
self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
|
995
|
-
|
996
|
-
def _remove_species(self, s_ids: Iterable[str]):
|
997
|
-
"""Removes species from the model
|
998
1117
|
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
|
1003
|
-
This removes the species and corresponding compartmentalized species and
|
1004
|
-
reactions_species.
|
1118
|
+
# find reactions which should be totally removed since they are losing critical species
|
1119
|
+
removed_reactions = self._find_underspecified_reactions_by_scids(sc_ids)
|
1120
|
+
self.remove_reactions(removed_reactions)
|
1005
1121
|
|
1006
|
-
Args:
|
1007
|
-
s_ids (Iterable[str]): the species to remove
|
1008
|
-
"""
|
1009
|
-
sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
|
1010
1122
|
self._remove_compartmentalized_species(sc_ids)
|
1011
|
-
# Remove species
|
1012
|
-
self.species = self.species.drop(index=list(s_ids))
|
1013
|
-
# remove data
|
1014
|
-
for k, data in self.species_data.items():
|
1015
|
-
self.species_data[k] = data.drop(index=list(s_ids))
|
1016
|
-
|
1017
|
-
def _validate_species_data(self, species_data_table: pd.DataFrame):
|
1018
|
-
"""Validates species data attribute
|
1019
|
-
|
1020
|
-
Args:
|
1021
|
-
species_data_table (pd.DataFrame): a species data table
|
1022
|
-
|
1023
|
-
Raises:
|
1024
|
-
ValueError: s_id not index name
|
1025
|
-
ValueError: s_id index contains duplicates
|
1026
|
-
ValueError: s_id not in species table
|
1027
|
-
"""
|
1028
|
-
_validate_matching_data(species_data_table, self.species)
|
1029
|
-
|
1030
|
-
def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
|
1031
|
-
"""Validates reactions data attribute
|
1032
1123
|
|
1033
|
-
|
1034
|
-
|
1124
|
+
# remove species (and their associated species data if all their cspecies have been lost)
|
1125
|
+
self._remove_unused_species()
|
1035
1126
|
|
1036
|
-
|
1037
|
-
ValueError: r_id not index name
|
1038
|
-
ValueError: r_id index contains duplicates
|
1039
|
-
ValueError: r_id not in reactions table
|
1127
|
+
def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
|
1040
1128
|
"""
|
1041
|
-
|
1042
|
-
|
1043
|
-
def _validate_reaction_species(self):
|
1044
|
-
if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
|
1045
|
-
raise ValueError(
|
1046
|
-
"All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
|
1047
|
-
)
|
1048
|
-
|
1049
|
-
# test for null SBO terms
|
1050
|
-
n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
|
1051
|
-
if n_null_sbo_terms != 0:
|
1052
|
-
raise ValueError(
|
1053
|
-
f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
|
1054
|
-
)
|
1055
|
-
|
1056
|
-
# find invalid SBO terms
|
1057
|
-
sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
|
1058
|
-
invalid_sbo_term_counts = sbo_counts[
|
1059
|
-
~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
|
1060
|
-
]
|
1061
|
-
|
1062
|
-
if invalid_sbo_term_counts.shape[0] != 0:
|
1063
|
-
invalid_sbo_counts_str = ", ".join(
|
1064
|
-
[f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
|
1065
|
-
)
|
1066
|
-
raise ValueError(
|
1067
|
-
f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
|
1068
|
-
f"defined {invalid_sbo_counts_str}"
|
1069
|
-
)
|
1070
|
-
|
1071
|
-
def _attempt_resolve(self, e):
|
1072
|
-
str_e = str(e)
|
1073
|
-
if str_e == "compartmentalized_species included missing c_id values":
|
1074
|
-
logger.warning(str_e)
|
1075
|
-
logger.warning(
|
1076
|
-
"Attempting to resolve with infer_uncompartmentalized_species_location()"
|
1077
|
-
)
|
1078
|
-
self = infer_uncompartmentalized_species_location(self)
|
1079
|
-
elif re.search("sbo_terms were not defined", str_e):
|
1080
|
-
logger.warning(str_e)
|
1081
|
-
logger.warning("Attempting to resolve with infer_sbo_terms()")
|
1082
|
-
self = infer_sbo_terms(self)
|
1083
|
-
else:
|
1084
|
-
logger.warning(
|
1085
|
-
"An error occurred which could not be automatically resolved"
|
1086
|
-
)
|
1087
|
-
raise e
|
1088
|
-
|
1089
|
-
|
1090
|
-
def species_status(s_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
|
1091
|
-
"""
|
1092
|
-
Species Status
|
1093
|
-
|
1094
|
-
Return all of the reaction's a species particpates in.
|
1095
|
-
|
1096
|
-
Parameters:
|
1097
|
-
s_id: str
|
1098
|
-
A species ID
|
1099
|
-
sbml_dfs: SBML_dfs
|
1100
|
-
|
1101
|
-
Returns:
|
1102
|
-
pd.DataFrame, one row reaction
|
1103
|
-
"""
|
1104
|
-
|
1105
|
-
matching_species = sbml_dfs.species.loc[s_id]
|
1106
|
-
|
1107
|
-
if not isinstance(matching_species, pd.Series):
|
1108
|
-
raise ValueError(f"{s_id} did not match a single species")
|
1109
|
-
|
1110
|
-
# find all rxns species particpate in
|
1111
|
-
|
1112
|
-
matching_compartmentalized_species = sbml_dfs.compartmentalized_species[
|
1113
|
-
sbml_dfs.compartmentalized_species.s_id.isin([s_id])
|
1114
|
-
]
|
1115
|
-
|
1116
|
-
rxns_participating = sbml_dfs.reaction_species[
|
1117
|
-
sbml_dfs.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
|
1118
|
-
]
|
1119
|
-
|
1120
|
-
# find all participants in these rxns
|
1121
|
-
|
1122
|
-
full_rxns_participating = sbml_dfs.reaction_species[
|
1123
|
-
sbml_dfs.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
|
1124
|
-
].merge(
|
1125
|
-
sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1126
|
-
)
|
1127
|
-
|
1128
|
-
reaction_descriptions = pd.concat(
|
1129
|
-
[
|
1130
|
-
reaction_summary(x, sbml_dfs)
|
1131
|
-
for x in set(full_rxns_participating[SBML_DFS.R_ID].tolist())
|
1132
|
-
]
|
1133
|
-
)
|
1134
|
-
|
1135
|
-
status = (
|
1136
|
-
full_rxns_participating.loc[
|
1137
|
-
full_rxns_participating[SBML_DFS.SC_ID].isin(
|
1138
|
-
matching_compartmentalized_species.index.values.tolist()
|
1139
|
-
),
|
1140
|
-
[SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
|
1141
|
-
]
|
1142
|
-
.merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
|
1143
|
-
.reset_index(drop=True)
|
1144
|
-
.drop(SBML_DFS.R_ID, axis=1)
|
1145
|
-
)
|
1146
|
-
|
1147
|
-
return status
|
1148
|
-
|
1149
|
-
|
1150
|
-
def reaction_summary(r_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
|
1151
|
-
"""
|
1152
|
-
Reaction Summary
|
1153
|
-
|
1154
|
-
Return a reaction's name and a human-readable formula.
|
1155
|
-
|
1156
|
-
Parameters:
|
1157
|
-
r_id: str
|
1158
|
-
A reaction ID
|
1159
|
-
sbml_dfs: SBML_dfs
|
1160
|
-
|
1161
|
-
Returns:
|
1162
|
-
one row pd.DataFrame
|
1163
|
-
"""
|
1164
|
-
|
1165
|
-
logger.warning(
|
1166
|
-
"reaction_summary is deprecated and will be removed in a future version of rcpr; "
|
1167
|
-
"please use reaction_summaries() instead"
|
1168
|
-
)
|
1169
|
-
|
1170
|
-
matching_reaction = sbml_dfs.reactions.loc[r_id]
|
1171
|
-
|
1172
|
-
if not isinstance(matching_reaction, pd.Series):
|
1173
|
-
raise ValueError(f"{r_id} did not match a single reaction")
|
1174
|
-
|
1175
|
-
matching_reaction = sbml_dfs.reactions.loc[r_id]
|
1176
|
-
|
1177
|
-
matching_reaction_species = sbml_dfs.reaction_species[
|
1178
|
-
sbml_dfs.reaction_species.r_id.isin([r_id])
|
1179
|
-
].merge(
|
1180
|
-
sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1181
|
-
)
|
1182
|
-
|
1183
|
-
# collapse all reaction species to a formula string
|
1184
|
-
|
1185
|
-
if len(matching_reaction_species[SBML_DFS.C_ID].unique()) == 1:
|
1186
|
-
augmented_matching_reaction_species = matching_reaction_species.merge(
|
1187
|
-
sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True
|
1188
|
-
).merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
|
1189
|
-
str_formula = (
|
1190
|
-
construct_formula_string(
|
1191
|
-
augmented_matching_reaction_species, sbml_dfs.reactions, SBML_DFS.S_NAME
|
1192
|
-
)
|
1193
|
-
+ " ["
|
1194
|
-
+ augmented_matching_reaction_species[SBML_DFS.C_NAME].iloc[0]
|
1195
|
-
+ "]"
|
1196
|
-
)
|
1197
|
-
else:
|
1198
|
-
str_formula = construct_formula_string(
|
1199
|
-
matching_reaction_species, sbml_dfs.reactions, SBML_DFS.SC_NAME
|
1200
|
-
)
|
1201
|
-
|
1202
|
-
output = pd.DataFrame(
|
1203
|
-
{
|
1204
|
-
SBML_DFS.R_NAME: matching_reaction[SBML_DFS.R_NAME],
|
1205
|
-
"r_formula_str": str_formula,
|
1206
|
-
},
|
1207
|
-
index=[r_id],
|
1208
|
-
)
|
1209
|
-
|
1210
|
-
output.index.name = SBML_DFS.R_ID
|
1211
|
-
|
1212
|
-
return output
|
1213
|
-
|
1214
|
-
|
1215
|
-
def reaction_summaries(sbml_dfs: SBML_dfs, r_ids=None) -> pd.Series:
|
1216
|
-
"""
|
1217
|
-
Reaction Summary
|
1218
|
-
|
1219
|
-
Return human-readable formulas for reactions.
|
1220
|
-
|
1221
|
-
Parameters:
|
1222
|
-
----------
|
1223
|
-
sbml_dfs: sbml.SBML_dfs
|
1224
|
-
A relational mechanistic model
|
1225
|
-
r_ids: [str], str or None
|
1226
|
-
Reaction IDs or None for all reactions
|
1227
|
-
|
1228
|
-
Returns:
|
1229
|
-
----------
|
1230
|
-
formula_strs: pd.Series
|
1231
|
-
"""
|
1232
|
-
|
1233
|
-
if isinstance(r_ids, str):
|
1234
|
-
r_ids = [r_ids]
|
1235
|
-
|
1236
|
-
if r_ids is None:
|
1237
|
-
matching_reactions = sbml_dfs.reactions
|
1238
|
-
else:
|
1239
|
-
matching_reactions = sbml_dfs.reactions.loc[r_ids]
|
1240
|
-
|
1241
|
-
matching_reaction_species = sbml_dfs.reaction_species[
|
1242
|
-
sbml_dfs.reaction_species.r_id.isin(matching_reactions.index)
|
1243
|
-
].merge(
|
1244
|
-
sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1245
|
-
)
|
1246
|
-
|
1247
|
-
# split into within compartment and cross-compartment reactions
|
1248
|
-
r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
|
1249
|
-
SBML_DFS.C_ID
|
1250
|
-
].nunique()
|
1251
|
-
|
1252
|
-
# identify reactions which work across compartments
|
1253
|
-
r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
|
1254
|
-
# there species must be labelled with the sc_name to specify where a species exists
|
1255
|
-
if r_id_cross_compartment.shape[0] > 0:
|
1256
|
-
rxn_eqtn_cross_compartment = (
|
1257
|
-
matching_reaction_species[
|
1258
|
-
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1259
|
-
r_id_cross_compartment.index
|
1260
|
-
)
|
1261
|
-
]
|
1262
|
-
.sort_values([SBML_DFS.SC_NAME])
|
1263
|
-
.groupby(SBML_DFS.R_ID)
|
1264
|
-
.apply(
|
1265
|
-
lambda x: construct_formula_string(
|
1266
|
-
x, sbml_dfs.reactions, SBML_DFS.SC_NAME
|
1267
|
-
)
|
1268
|
-
)
|
1269
|
-
.rename("r_formula_str")
|
1270
|
-
)
|
1271
|
-
else:
|
1272
|
-
rxn_eqtn_cross_compartment = None
|
1273
|
-
|
1274
|
-
# identify reactions which occur within a single compartment; for these the reaction
|
1275
|
-
# can be labelled with the compartment and individual species can receive a more readable s_name
|
1276
|
-
r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
|
1277
|
-
if r_id_within_compartment.shape[0] > 0:
|
1278
|
-
# add s_name
|
1279
|
-
augmented_matching_reaction_species = (
|
1280
|
-
matching_reaction_species[
|
1281
|
-
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1282
|
-
r_id_within_compartment.index
|
1283
|
-
)
|
1284
|
-
]
|
1285
|
-
.merge(sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True)
|
1286
|
-
.merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
|
1287
|
-
.sort_values([SBML_DFS.S_NAME])
|
1288
|
-
)
|
1289
|
-
# create formulas based on s_names of components
|
1290
|
-
rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
|
1291
|
-
[SBML_DFS.R_ID, SBML_DFS.C_NAME]
|
1292
|
-
).apply(
|
1293
|
-
lambda x: construct_formula_string(x, sbml_dfs.reactions, SBML_DFS.S_NAME)
|
1294
|
-
)
|
1295
|
-
# add compartment for each reaction
|
1296
|
-
rxn_eqtn_within_compartment = pd.Series(
|
1297
|
-
[
|
1298
|
-
y + ": " + x
|
1299
|
-
for x, y in zip(
|
1300
|
-
rxn_eqtn_within_compartment,
|
1301
|
-
rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.C_NAME),
|
1302
|
-
)
|
1303
|
-
],
|
1304
|
-
index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
|
1305
|
-
).rename("r_formula_str")
|
1306
|
-
else:
|
1307
|
-
rxn_eqtn_within_compartment = None
|
1308
|
-
|
1309
|
-
formula_strs = pd.concat([rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment])
|
1310
|
-
|
1311
|
-
return formula_strs
|
1312
|
-
|
1313
|
-
|
1314
|
-
def construct_formula_string(
    reaction_species_df: pd.DataFrame,
    reactions_df: pd.DataFrame,
    name_var: str,
) -> str:
    """
    Construct Formula String

    Convert a table of reaction species into a formula string

    Parameters:
    ----------
    reaction_species_df: pd.DataFrame
        Table containing a reaction's species. The stoichiometry, r_id and
        `name_var` columns are read; the table itself is not modified.
    reactions_df: pd.DataFrame
        sbml_dfs.reactions; the reversibility flag is looked up by the r_id
        found in the first row of `reaction_species_df`
    name_var: str
        Name of the column used to label species

    Returns:
    ----------
    formula_str: str
        String representation of a reaction's substrates, products and
        modifiers

    """

    stoichiometry = reaction_species_df[SBML_DFS.STOICHIOMETRY]

    # Labels live in a standalone Series so the caller's table (frequently a
    # groupby slice) never gains a "label" column as a side effect.
    labels = pd.Series(
        [
            add_stoi_to_species_name(stoi, name)
            for stoi, name in zip(stoichiometry, reaction_species_df[name_var])
        ],
        index=reaction_species_df.index,
        dtype=object,
    )

    # bool() converts a np.bool_ to a plain bool
    rxn_reversible = bool(
        reactions_df.loc[
            reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
        ]
    )
    arrow_type = " <-> " if rxn_reversible else " -> "

    # Positional boolean masks: negative stoichiometry marks substrates,
    # positive marks products and zero marks modifiers.
    substrates = " + ".join(labels[(stoichiometry < 0).to_numpy()].tolist())
    products = " + ".join(labels[(stoichiometry > 0).to_numpy()].tolist())
    modifiers = " + ".join(labels[(stoichiometry == 0).to_numpy()].tolist())

    if modifiers != "":
        # NOTE(review): the trailing "]" has no matching "[" - kept as-is
        # because consumers may depend on the exact string.
        modifiers = f" ---- modifiers: {modifiers}]"

    return f"{substrates}{arrow_type}{products}{modifiers}"
|
1382
|
-
|
1383
|
-
|
1384
|
-
def add_stoi_to_species_name(stoi: float | int, name: str) -> str:
|
1385
|
-
"""
|
1386
|
-
Add Stoi To Species Name
|
1387
|
-
|
1388
|
-
Add # of molecules to a species name
|
1389
|
-
|
1390
|
-
Parameters:
|
1391
|
-
----------
|
1392
|
-
stoi: float or int
|
1393
|
-
Number of molecules
|
1394
|
-
name: str
|
1395
|
-
Name of species
|
1396
|
-
|
1397
|
-
Returns:
|
1398
|
-
----------
|
1399
|
-
name: str
|
1400
|
-
Name containing number of species
|
1401
|
-
|
1402
|
-
"""
|
1403
|
-
|
1404
|
-
if stoi in [-1, 0, 1]:
|
1405
|
-
return name
|
1406
|
-
else:
|
1407
|
-
return str(abs(stoi)) + " " + name
|
1408
|
-
|
1409
|
-
|
1410
|
-
def filter_to_characteristic_species_ids(
|
1411
|
-
species_ids: pd.DataFrame,
|
1412
|
-
max_complex_size: int = 4,
|
1413
|
-
max_promiscuity: int = 20,
|
1414
|
-
defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
|
1415
|
-
) -> pd.DataFrame:
|
1416
|
-
"""
|
1417
|
-
Filter to Characteristic Species IDs
|
1418
|
-
|
1419
|
-
Remove identifiers corresponding to one component within a large protein
|
1420
|
-
complexes and non-characteristic annotations such as pubmed references and
|
1421
|
-
homologues.
|
1129
|
+
Remove reactions from the model.
|
1422
1130
|
|
1423
1131
|
Parameters
|
1424
1132
|
----------
|
1425
|
-
|
1426
|
-
|
1427
|
-
|
1428
|
-
|
1429
|
-
|
1430
|
-
|
1431
|
-
|
1432
|
-
|
1433
|
-
|
1434
|
-
|
1435
|
-
|
1436
|
-
|
1437
|
-
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1443
|
-
Input species filtered to characteristic identifiers
|
1444
|
-
|
1445
|
-
"""
|
1446
|
-
|
1447
|
-
if not isinstance(species_ids, pd.DataFrame):
|
1448
|
-
raise TypeError(
|
1449
|
-
f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
|
1450
|
-
)
|
1451
|
-
|
1452
|
-
if not isinstance(max_complex_size, int):
|
1453
|
-
raise TypeError(
|
1454
|
-
f"max_complex_size was a {type(max_complex_size)} but must be an int"
|
1455
|
-
)
|
1456
|
-
|
1457
|
-
if not isinstance(max_promiscuity, int):
|
1458
|
-
raise TypeError(
|
1459
|
-
f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
|
1460
|
-
)
|
1461
|
-
|
1462
|
-
if not isinstance(defining_biological_qualifiers, list):
|
1463
|
-
raise TypeError(
|
1464
|
-
f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
|
1465
|
-
)
|
1466
|
-
|
1467
|
-
# primary annotations of a species
|
1468
|
-
bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
|
1469
|
-
|
1470
|
-
# add components within modestly sized protein complexes
|
1471
|
-
# look at HAS_PART IDs
|
1472
|
-
bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
|
1473
|
-
|
1474
|
-
# number of species in a complex
|
1475
|
-
n_species_components = bqb_has_parts_species.value_counts(
|
1476
|
-
[IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
|
1477
|
-
)
|
1478
|
-
big_complex_sids = set(
|
1479
|
-
n_species_components[
|
1480
|
-
n_species_components > max_complex_size
|
1481
|
-
].index.get_level_values(SBML_DFS.S_ID)
|
1482
|
-
)
|
1483
|
-
|
1484
|
-
filtered_bqb_has_parts = _filter_promiscuous_components(
|
1485
|
-
bqb_has_parts_species, max_promiscuity
|
1486
|
-
)
|
1487
|
-
|
1488
|
-
# drop species parts if there are many components
|
1489
|
-
filtered_bqb_has_parts = filtered_bqb_has_parts[
|
1490
|
-
~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
|
1491
|
-
]
|
1492
|
-
|
1493
|
-
# combine primary identifiers and rare components
|
1494
|
-
characteristic_species_ids = pd.concat(
|
1495
|
-
[
|
1496
|
-
bqb_is_species,
|
1497
|
-
filtered_bqb_has_parts,
|
1498
|
-
]
|
1499
|
-
)
|
1500
|
-
|
1501
|
-
return characteristic_species_ids
|
1502
|
-
|
1503
|
-
|
1504
|
-
def infer_uncompartmentalized_species_location(sbml_dfs: SBML_dfs) -> SBML_dfs:
    """
    Infer Uncompartmentalized Species Location

    If the compartment of a subset of compartmentalized species
    was not specified, infer an appropriate compartment from
    other members of reactions they participate in

    Parameters:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational pathway model

    Returns:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational pathway model (with filled in species compartments).
        The input object's compartmentalized_species table is replaced and
        the same object is returned.

    Raises:
    ----------
    ValueError
        If no default compartment can be found, if the number of inferred
        compartments does not match the number of missing ones, or if the
        updated table differs in size from the original or still has
        missing compartments

    """

    # The most common compartment overall is the fallback for species whose
    # compartment cannot be inferred from their reaction partners.
    compartment_counts = (
        sbml_dfs.compartmentalized_species.value_counts(SBML_DFS.C_ID)
        .rename("N")
        .reset_index()
        # a stable sort keeps value_counts' order among ties, so the row
        # taken positionally below is deterministic
        .sort_values("N", ascending=False, kind="stable")
    )
    if compartment_counts.shape[0] == 0:
        # no compartment is defined at all; raise the intended error
        # rather than a KeyError from the lookup below
        raise ValueError(
            "No default compartment could be found - compartment "
            "information may not be present"
        )
    default_compartment = compartment_counts[SBML_DFS.C_ID].iloc[0]
    if not isinstance(default_compartment, str):
        raise ValueError(
            "No default compartment could be found - compartment "
            "information may not be present"
        )

    # infer the compartments of species missing compartments

    is_missing_compartment = sbml_dfs.compartmentalized_species[
        SBML_DFS.C_ID
    ].isnull()
    missing_compartment_scids = sbml_dfs.compartmentalized_species[
        is_missing_compartment
    ].index.tolist()
    if len(missing_compartment_scids) == 0:
        logger.info(
            "All compartmentalized species have compartments, "
            "returning input sbml_dfs"
        )
        return sbml_dfs

    # rows of reaction_species that involve a species missing its compartment
    is_missing_participant = sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(
        missing_compartment_scids
    )

    participating_reactions = (
        sbml_dfs.reaction_species[is_missing_participant][SBML_DFS.R_ID]
        .unique()
        .tolist()
    )
    # all members of those reactions, annotated with their compartments
    reaction_participants = sbml_dfs.reaction_species[
        sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
    ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
    reaction_participants = reaction_participants.merge(
        sbml_dfs.compartmentalized_species[SBML_DFS.C_ID],
        left_on=SBML_DFS.SC_ID,
        right_index=True,
    )

    # define where a reaction is most likely to occur based on the
    # compartmentalization of its participants
    primary_reaction_compartment = (
        reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
        .rename("N")
        .reset_index()
        .sort_values("N", ascending=False)
        .groupby(SBML_DFS.R_ID)
        .first()[SBML_DFS.C_ID]
        .reset_index()
    )

    # for each species missing a compartment, take the most frequent primary
    # compartment among the reactions it participates in
    inferred_compartmentalization = (
        sbml_dfs.reaction_species[is_missing_participant]
        .merge(primary_reaction_compartment)
        .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
        .rename("N")
        .reset_index()
        .sort_values("N", ascending=False)
        .groupby(SBML_DFS.SC_ID)
        .first()
        .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
    )
    logger.info(
        f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
    )

    # fall back to the default compartment when nothing could be inferred
    species_with_unknown_compartmentalization = set(
        missing_compartment_scids
    ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
    if len(species_with_unknown_compartmentalization) != 0:
        logger.warning(
            f"{len(species_with_unknown_compartmentalization)} "
            "species compartmentalization could not be inferred"
            " from other reaction particpants. Their compartmentalization "
            f"will be set to the default of {default_compartment}"
        )

        inferred_compartmentalization = pd.concat(
            [
                inferred_compartmentalization,
                pd.DataFrame(
                    {SBML_DFS.SC_ID: list(species_with_unknown_compartmentalization)}
                ).assign(**{SBML_DFS.C_ID: default_compartment}),
            ]
        )

    if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
        raise ValueError(
            f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
        )

    # recombine the untouched rows with the rows whose compartment was filled
    updated_compartmentalized_species = pd.concat(
        [
            sbml_dfs.compartmentalized_species[~is_missing_compartment],
            sbml_dfs.compartmentalized_species[is_missing_compartment]
            .drop(SBML_DFS.C_ID, axis=1)
            .merge(
                inferred_compartmentalization, left_index=True, right_on=SBML_DFS.SC_ID
            )
            .set_index(SBML_DFS.SC_ID),
        ]
    )

    if (
        updated_compartmentalized_species.shape[0]
        != sbml_dfs.compartmentalized_species.shape[0]
    ):
        raise ValueError(
            f"Trying to overwrite {sbml_dfs.compartmentalized_species.shape[0]}"
            " compartmentalized species with "
            f"{updated_compartmentalized_species.shape[0]}"
        )

    if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
        raise ValueError("Some species compartments are still missing")

    sbml_dfs.compartmentalized_species = updated_compartmentalized_species

    return sbml_dfs
|
1651
|
-
|
1652
|
-
|
1653
|
-
def infer_sbo_terms(sbml_dfs: SBML_dfs) -> SBML_dfs:
    """
    Infer SBO Terms

    Define SBO terms based on stoichiometry for reaction_species with missing terms

    Parameters:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational pathway model

    Returns:
    ----------
    sbml_dfs: sbml.SBML_dfs
        A relational pathway model (with missing/invalid reaction species sbo_terms resolved).
        The input object's reaction_species table is replaced and the same
        object is returned.

    Raises:
    ----------
    ValueError
        If any reaction species has a null sbo_term, or if the updated
        table has a different number of rows than the original

    """

    if not all(sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].notnull()):
        raise ValueError(
            "All sbml_dfs.reaction_species[SBML_DFS.SBO_TERM] must be not null"
        )

    # a term is valid when it is one of the keys of MINI_SBO_TO_NAME
    has_valid_term = sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(
        MINI_SBO_TO_NAME.keys()
    )
    valid_sbo_terms = sbml_dfs.reaction_species[has_valid_term]
    # explicit copy: these rows are edited below, and assigning into a
    # filtered slice of the original table is a chained assignment
    invalid_sbo_terms = sbml_dfs.reaction_species[~has_valid_term].copy()

    if invalid_sbo_terms.shape[0] == 0:
        logger.info("All sbo_terms were valid; returning input sbml_dfs")
        return sbml_dfs

    logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")

    # add missing/invalid terms based on stoichiometry:
    # negative -> reactant, positive -> product, zero -> stimulator
    stoichiometry = invalid_sbo_terms[SBML_DFS.STOICHIOMETRY]

    invalid_sbo_terms.loc[stoichiometry < 0, SBML_DFS.SBO_TERM] = MINI_SBO_FROM_NAME[
        SBOTERM_NAMES.REACTANT
    ]

    invalid_sbo_terms.loc[stoichiometry > 0, SBML_DFS.SBO_TERM] = MINI_SBO_FROM_NAME[
        SBOTERM_NAMES.PRODUCT
    ]

    invalid_sbo_terms.loc[stoichiometry == 0, SBML_DFS.SBO_TERM] = MINI_SBO_FROM_NAME[
        SBOTERM_NAMES.STIMULATOR
    ]

    updated_reaction_species = pd.concat(
        [valid_sbo_terms, invalid_sbo_terms]
    ).sort_index()

    if sbml_dfs.reaction_species.shape[0] != updated_reaction_species.shape[0]:
        raise ValueError(
            f"Trying to overwrite {sbml_dfs.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
        )
    sbml_dfs.reaction_species = updated_reaction_species

    return sbml_dfs
|
1713
|
-
|
1714
|
-
|
1715
|
-
def name_compartmentalized_species(sbml_dfs):
|
1716
|
-
"""
|
1717
|
-
Name Compartmentalized Species
|
1718
|
-
|
1719
|
-
Rename compartmentalized species if they have the same
|
1720
|
-
name as their species
|
1721
|
-
|
1722
|
-
Parameters
|
1723
|
-
----------
|
1724
|
-
sbml_dfs : SBML_dfs
|
1725
|
-
A model formed by aggregating pathways
|
1726
|
-
|
1727
|
-
Returns:
|
1728
|
-
----------
|
1729
|
-
sbml_dfs
|
1730
|
-
"""
|
1731
|
-
|
1732
|
-
augmented_cspecies = sbml_dfs.compartmentalized_species.merge(
|
1733
|
-
sbml_dfs.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
|
1734
|
-
).merge(
|
1735
|
-
sbml_dfs.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
|
1736
|
-
)
|
1737
|
-
augmented_cspecies[SBML_DFS.SC_NAME] = [
|
1738
|
-
f"{s} [{c}]" if sc == s else sc
|
1739
|
-
for sc, c, s in zip(
|
1740
|
-
augmented_cspecies[SBML_DFS.SC_NAME],
|
1741
|
-
augmented_cspecies[SBML_DFS.C_NAME],
|
1742
|
-
augmented_cspecies[SBML_DFS.S_NAME],
|
1743
|
-
)
|
1744
|
-
]
|
1133
|
+
r_ids : Iterable[str]
|
1134
|
+
IDs of reactions to remove
|
1135
|
+
remove_species : bool, optional
|
1136
|
+
Whether to remove species that are no longer part of any reactions,
|
1137
|
+
by default False
|
1138
|
+
"""
|
1139
|
+
# remove corresponding reactions_species
|
1140
|
+
self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
|
1141
|
+
# remove reactions
|
1142
|
+
self.reactions = self.reactions.drop(index=list(r_ids))
|
1143
|
+
# remove reactions_data
|
1144
|
+
if hasattr(self, "reactions_data"):
|
1145
|
+
for k, data in self.reactions_data.items():
|
1146
|
+
self.reactions_data[k] = data.drop(index=list(r_ids))
|
1147
|
+
# remove species if requested
|
1148
|
+
if remove_species:
|
1149
|
+
self._remove_unused_cspecies()
|
1150
|
+
self._remove_unused_species()
|
1745
1151
|
|
1746
|
-
|
1747
|
-
|
1748
|
-
|
1152
|
+
def remove_reactions_data(self, label: str):
    """
    Remove reactions data by label.

    Delegates to `_remove_entity_data` for the reactions entity type.

    Parameters
    ----------
    label : str
        Label of the reactions data entry to remove
    """
    entity_type = SBML_DFS.REACTIONS
    self._remove_entity_data(entity_type, label)
|
1749
1157
|
|
1750
|
-
|
1158
|
+
def remove_species_data(self, label: str):
    """
    Remove species data by label.

    Delegates to `_remove_entity_data` for the species entity type.

    Parameters
    ----------
    label : str
        Label of the species data entry to remove
    """
    entity_type = SBML_DFS.SPECIES
    self._remove_entity_data(entity_type, label)
|
1751
1163
|
|
1164
|
+
def search_by_ids(
|
1165
|
+
self,
|
1166
|
+
ids: list[str],
|
1167
|
+
entity_type: str,
|
1168
|
+
identifiers_df: pd.DataFrame,
|
1169
|
+
ontologies: None | set[str] = None,
|
1170
|
+
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
1171
|
+
"""
|
1172
|
+
Find entities and identifiers matching a set of query IDs.
|
1752
1173
|
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1758
|
-
|
1759
|
-
|
1760
|
-
|
1761
|
-
|
1762
|
-
|
1763
|
-
Export summaries of species identifiers and each table underlying
|
1764
|
-
an SBML_dfs pathway model
|
1765
|
-
|
1766
|
-
Params
|
1767
|
-
------
|
1768
|
-
model_prefix: str
|
1769
|
-
Label to prepend to all exported files
|
1770
|
-
sbml_dfs: sbml.SBML_dfs
|
1771
|
-
A pathway model
|
1772
|
-
outdir: str
|
1773
|
-
Path to an existing directory where results should be saved
|
1774
|
-
overwrite: bool
|
1775
|
-
Should the directory be overwritten if it already exists?
|
1776
|
-
dogmatic: bool
|
1777
|
-
If True then treat genes, transcript, and proteins as separate species. If False
|
1778
|
-
then treat them interchangeably.
|
1174
|
+
Parameters
|
1175
|
+
----------
|
1176
|
+
ids : List[str]
|
1177
|
+
List of identifiers to search for
|
1178
|
+
entity_type : str
|
1179
|
+
Type of entity to search (e.g., 'species', 'reactions')
|
1180
|
+
identifiers_df : pd.DataFrame
|
1181
|
+
DataFrame containing identifier mappings
|
1182
|
+
ontologies : Optional[Set[str]], optional
|
1183
|
+
Set of ontologies to filter by, by default None
|
1779
1184
|
|
1780
1185
|
Returns
|
1781
1186
|
-------
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
if not isinstance(model_prefix, str):
|
1787
|
-
raise TypeError(f"model_prefix was a {type(model_prefix)} " "and must be a str")
|
1788
|
-
if not isinstance(sbml_dfs, SBML_dfs):
|
1789
|
-
raise TypeError(
|
1790
|
-
f"sbml_dfs was a {type(sbml_dfs)} and must" " be an sbml.SBML_dfs"
|
1791
|
-
)
|
1792
|
-
|
1793
|
-
# filter to identifiers which make sense when mapping from ids -> species
|
1794
|
-
species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
|
1795
|
-
sbml_dfs,
|
1796
|
-
dogmatic=dogmatic,
|
1797
|
-
)
|
1798
|
-
|
1799
|
-
try:
|
1800
|
-
utils.initialize_dir(outdir, overwrite=overwrite)
|
1801
|
-
except FileExistsError:
|
1802
|
-
logger.warning(
|
1803
|
-
f"Directory {outdir} already exists and overwrite is False. "
|
1804
|
-
"Files will be added to the existing directory."
|
1805
|
-
)
|
1806
|
-
with open_fs(outdir, writeable=True) as fs:
|
1807
|
-
species_identifiers_path = (
|
1808
|
-
model_prefix + CPR_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
|
1809
|
-
)
|
1810
|
-
with fs.openbin(species_identifiers_path, "w") as f:
|
1811
|
-
species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
|
1812
|
-
f, sep="\t", index=False
|
1813
|
-
)
|
1814
|
-
|
1815
|
-
# export jsons
|
1816
|
-
species_path = model_prefix + CPR_STANDARD_OUTPUTS.SPECIES
|
1817
|
-
reactions_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTIONS
|
1818
|
-
reation_species_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTION_SPECIES
|
1819
|
-
compartments_path = model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTS
|
1820
|
-
compartmentalized_species_path = (
|
1821
|
-
model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
|
1822
|
-
)
|
1823
|
-
with fs.openbin(species_path, "w") as f:
|
1824
|
-
sbml_dfs.species[[SBML_DFS.S_NAME]].to_json(f)
|
1825
|
-
|
1826
|
-
with fs.openbin(reactions_path, "w") as f:
|
1827
|
-
sbml_dfs.reactions[[SBML_DFS.R_NAME]].to_json(f)
|
1828
|
-
|
1829
|
-
with fs.openbin(reation_species_path, "w") as f:
|
1830
|
-
sbml_dfs.reaction_species.to_json(f)
|
1831
|
-
|
1832
|
-
with fs.openbin(compartments_path, "w") as f:
|
1833
|
-
sbml_dfs.compartments[[SBML_DFS.C_NAME]].to_json(f)
|
1834
|
-
|
1835
|
-
with fs.openbin(compartmentalized_species_path, "w") as f:
|
1836
|
-
sbml_dfs.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
|
1837
|
-
f
|
1838
|
-
)
|
1839
|
-
|
1840
|
-
return None
|
1841
|
-
|
1842
|
-
|
1843
|
-
def sbml_dfs_from_edgelist(
|
1844
|
-
interaction_edgelist: pd.DataFrame,
|
1845
|
-
species_df: pd.DataFrame,
|
1846
|
-
compartments_df: pd.DataFrame,
|
1847
|
-
interaction_source: source.Source,
|
1848
|
-
upstream_stoichiometry: int = 0,
|
1849
|
-
downstream_stoichiometry: int = 1,
|
1850
|
-
downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
|
1851
|
-
keep_species_data: bool | str = False,
|
1852
|
-
keep_reactions_data: bool | str = False,
|
1853
|
-
) -> SBML_dfs:
|
1854
|
-
"""
|
1855
|
-
Create SBML_dfs from interaction edgelist.
|
1856
|
-
|
1857
|
-
Combines a set of molecular interactions into a mechanistic SBML_dfs model
|
1858
|
-
by processing interaction data, species information, and compartment definitions.
|
1187
|
+
Tuple[pd.DataFrame, pd.DataFrame]
|
1188
|
+
- Matching entities
|
1189
|
+
- Matching identifiers
|
1859
1190
|
|
1860
|
-
|
1861
|
-
|
1862
|
-
|
1863
|
-
|
1864
|
-
|
1865
|
-
|
1866
|
-
|
1867
|
-
|
1868
|
-
|
1869
|
-
|
1870
|
-
- r_Identifiers : identifiers.Identifiers, supporting identifiers
|
1871
|
-
- r_isreversible : bool, whether reaction is reversible
|
1872
|
-
species_df : pd.DataFrame
|
1873
|
-
Table defining molecular species with columns:
|
1874
|
-
- s_name : str, name of molecular species
|
1875
|
-
- s_Identifiers : identifiers.Identifiers, species identifiers
|
1876
|
-
compartments_df : pd.DataFrame
|
1877
|
-
Table defining compartments with columns:
|
1878
|
-
- c_name : str, name of compartment
|
1879
|
-
- c_Identifiers : identifiers.Identifiers, compartment identifiers
|
1880
|
-
interaction_source : source.Source
|
1881
|
-
Source object linking model entities to interaction source
|
1882
|
-
upstream_stoichiometry : int, default 0
|
1883
|
-
Stoichiometry of upstream species in reactions
|
1884
|
-
downstream_stoichiometry : int, default 1
|
1885
|
-
Stoichiometry of downstream species in reactions
|
1886
|
-
downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
|
1887
|
-
SBO term for downstream reactant type
|
1888
|
-
keep_species_data : bool or str, default False
|
1889
|
-
Whether to preserve extra species columns. If True, saves as 'source' label.
|
1890
|
-
If string, uses as custom label. If False, discards extra data.
|
1891
|
-
keep_reactions_data : bool or str, default False
|
1892
|
-
Whether to preserve extra reaction columns. If True, saves as 'source' label.
|
1893
|
-
If string, uses as custom label. If False, discards extra data.
|
1191
|
+
Raises
|
1192
|
+
------
|
1193
|
+
ValueError
|
1194
|
+
If entity_type is invalid or ontologies are invalid
|
1195
|
+
TypeError
|
1196
|
+
If ontologies is not a set
|
1197
|
+
"""
|
1198
|
+
# validate inputs
|
1199
|
+
entity_table = self.get_table(entity_type, required_attributes={"id"})
|
1200
|
+
entity_pk = self.schema[entity_type]["pk"]
|
1894
1201
|
|
1895
|
-
|
1896
|
-
|
1897
|
-
|
1898
|
-
|
1899
|
-
|
1900
|
-
|
1901
|
-
|
1902
|
-
|
1202
|
+
utils.match_pd_vars(
|
1203
|
+
identifiers_df,
|
1204
|
+
req_vars={
|
1205
|
+
entity_pk,
|
1206
|
+
IDENTIFIERS.ONTOLOGY,
|
1207
|
+
IDENTIFIERS.IDENTIFIER,
|
1208
|
+
IDENTIFIERS.URL,
|
1209
|
+
IDENTIFIERS.BQB,
|
1210
|
+
},
|
1211
|
+
allow_series=False,
|
1212
|
+
).assert_present()
|
1903
1213
|
|
1904
|
-
|
1905
|
-
|
1906
|
-
|
1907
|
-
|
1214
|
+
if ontologies is not None:
|
1215
|
+
if not isinstance(ontologies, set):
|
1216
|
+
# for clarity this should not be reachable based on type hints
|
1217
|
+
raise TypeError(
|
1218
|
+
f"ontologies must be a set, but got {type(ontologies).__name__}"
|
1219
|
+
)
|
1220
|
+
ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
|
1221
|
+
invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
|
1222
|
+
if len(invalid_ontologies) > 0:
|
1223
|
+
raise ValueError(
|
1224
|
+
f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
|
1225
|
+
f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
|
1226
|
+
)
|
1908
1227
|
|
1909
|
-
|
1910
|
-
|
1911
|
-
compartments_df, interaction_source
|
1912
|
-
)
|
1913
|
-
processed_species, species_data = _edgelist_process_species(
|
1914
|
-
species_df, interaction_source, extra_columns["species"]
|
1915
|
-
)
|
1228
|
+
# filter to just the identifiers matching the ontologies of interest
|
1229
|
+
identifiers_df = identifiers_df.query("ontology in @ontologies")
|
1916
1230
|
|
1917
|
-
|
1918
|
-
|
1919
|
-
|
1920
|
-
|
1921
|
-
processed_compartments,
|
1922
|
-
interaction_source,
|
1923
|
-
)
|
1231
|
+
matching_identifiers = identifiers_df.loc[
|
1232
|
+
identifiers_df["identifier"].isin(ids)
|
1233
|
+
]
|
1234
|
+
entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
|
1924
1235
|
|
1925
|
-
|
1926
|
-
reactions, reaction_species, reactions_data = (
|
1927
|
-
_edgelist_create_reactions_and_species(
|
1928
|
-
interaction_edgelist,
|
1929
|
-
comp_species,
|
1930
|
-
processed_species,
|
1931
|
-
processed_compartments,
|
1932
|
-
interaction_source,
|
1933
|
-
upstream_stoichiometry,
|
1934
|
-
downstream_stoichiometry,
|
1935
|
-
downstream_sbo_name,
|
1936
|
-
extra_columns["reactions"],
|
1937
|
-
)
|
1938
|
-
)
|
1236
|
+
return entity_subset, matching_identifiers
|
1939
1237
|
|
1940
|
-
|
1941
|
-
|
1942
|
-
|
1943
|
-
|
1944
|
-
|
1945
|
-
reactions,
|
1946
|
-
reaction_species,
|
1947
|
-
species_data,
|
1948
|
-
reactions_data,
|
1949
|
-
keep_species_data,
|
1950
|
-
keep_reactions_data,
|
1951
|
-
extra_columns,
|
1952
|
-
)
|
1238
|
+
def search_by_name(
    self, name: str, entity_type: str, partial_match: bool = True
) -> pd.DataFrame:
    """
    Find entities by exact or partial name match.

    Matching ignores case in both modes. Entities whose label is missing
    never match.

    Parameters
    ----------
    name : str
        Name to search for. With `partial_match=True` it is passed to
        pandas `Series.str.contains` as the pattern, so pandas' default
        pattern handling applies.
    entity_type : str
        Type of entity to search (e.g., 'species', 'reactions')
    partial_match : bool, optional
        Whether to allow partial string matches, by default True

    Returns
    -------
    pd.DataFrame
        Matching entities
    """
    entity_table = self.get_table(entity_type, required_attributes={"label"})
    label_attr = self.schema[entity_type]["label"]
    labels = entity_table[label_attr]

    if partial_match:
        # na=False: a missing label would otherwise put NaN in the mask,
        # which .loc rejects
        is_match = labels.str.contains(name, case=False, na=False)
    else:
        # lower-case both sides; comparing lower-cased labels to the raw
        # query could never match a query containing upper-case letters
        is_match = labels.str.lower() == name.lower()

    return entity_table.loc[is_match]
|
1958
1268
|
|
1959
|
-
def
|
1960
|
-
|
1269
|
+
def select_species_data(self, species_data_table: str) -> pd.DataFrame:
|
1270
|
+
"""
|
1271
|
+
Select a species data table from the SBML_dfs object.
|
1961
1272
|
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
1965
|
-
|
1966
|
-
return "drug"
|
1967
|
-
else:
|
1968
|
-
return "protein"
|
1969
|
-
else:
|
1970
|
-
return "unknown"
|
1971
|
-
|
1972
|
-
|
1973
|
-
def stub_ids(ids):
    """
    Convert a collection of identifier records into a DataFrame.

    Parameters
    ----------
    ids
        Sized collection of identifier records accepted by `pd.DataFrame`

    Returns
    -------
    pd.DataFrame
        `pd.DataFrame(ids)` when `ids` is non-empty; otherwise a single
        placeholder row of None under the ontology, identifier, url and
        bqb columns
    """
    if len(ids) != 0:
        return pd.DataFrame(ids)

    # an empty input still yields one row so the expected columns exist
    placeholder_columns = (
        IDENTIFIERS.ONTOLOGY,
        IDENTIFIERS.IDENTIFIER,
        IDENTIFIERS.URL,
        IDENTIFIERS.BQB,
    )
    return pd.DataFrame({column: [None] for column in placeholder_columns})
|
1273
|
+
Parameters
|
1274
|
+
----------
|
1275
|
+
species_data_table : str
|
1276
|
+
Name of the species data table to select
|
1985
1277
|
|
1278
|
+
Returns
|
1279
|
+
-------
|
1280
|
+
pd.DataFrame
|
1281
|
+
The selected species data table
|
1986
1282
|
|
1987
|
-
|
1988
|
-
|
1989
|
-
|
1283
|
+
Raises
|
1284
|
+
------
|
1285
|
+
ValueError
|
1286
|
+
If species_data_table is not found
|
1287
|
+
"""
|
1288
|
+
# Check if species_data_table exists in sbml_dfs.species_data
|
1289
|
+
if species_data_table not in self.species_data:
|
1290
|
+
raise ValueError(
|
1291
|
+
f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
|
1292
|
+
f"Available tables: {self.species_data.keys()}"
|
1293
|
+
)
|
1990
1294
|
|
1991
|
-
|
1992
|
-
|
1295
|
+
# Get the species data
|
1296
|
+
return self.species_data[species_data_table]
|
1993
1297
|
|
1994
|
-
|
1995
|
-
|
1298
|
+
def species_status(self, s_id: str) -> pd.DataFrame:
|
1299
|
+
"""
|
1300
|
+
Species Status
|
1996
1301
|
|
1997
|
-
|
1302
|
+
Return all of the reactions a species participates in.
|
1998
1303
|
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2002
|
-
.replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
|
2003
|
-
)
|
1304
|
+
Parameters:
|
1305
|
+
s_id: str
|
1306
|
+
A species ID
|
2004
1307
|
|
2005
|
-
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2012
|
-
|
2013
|
-
reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
|
1308
|
+
Returns:
|
1309
|
+
pd.DataFrame, one row per reaction the species participates in
|
1310
|
+
with columns:
|
1311
|
+
- sc_name: str, name of the compartmentalized species participating in the reaction
|
1312
|
+
- stoichiometry: float, stoichiometry of the species in the reaction
|
1313
|
+
- r_name: str, name of the reaction
|
1314
|
+
- r_formula_str: str, human-readable formula of the reaction
|
1315
|
+
"""
|
2014
1316
|
|
2015
|
-
|
1317
|
+
if s_id not in self.species.index:
|
1318
|
+
raise ValueError(f"{s_id} not found in species table")
|
2016
1319
|
|
1320
|
+
matching_species = self.species.loc[s_id]
|
2017
1321
|
|
2018
|
-
|
2019
|
-
|
2020
|
-
) -> pd.DataFrame:
|
1322
|
+
if not isinstance(matching_species, pd.Series):
|
1323
|
+
raise ValueError(f"{s_id} did not match a single species")
|
2021
1324
|
|
2022
|
-
|
2023
|
-
|
2024
|
-
|
2025
|
-
|
2026
|
-
)
|
2027
|
-
if "new" not in reaction_species_w_roles.columns:
|
2028
|
-
raise ValueError(
|
2029
|
-
"The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2030
|
-
)
|
2031
|
-
# check that new is a boolean column
|
2032
|
-
if reaction_species_w_roles["new"].dtype != bool:
|
2033
|
-
raise ValueError(
|
2034
|
-
"The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2035
|
-
)
|
1325
|
+
# find all rxns species participate in
|
1326
|
+
matching_compartmentalized_species = self.compartmentalized_species[
|
1327
|
+
self.compartmentalized_species.s_id.isin([s_id])
|
1328
|
+
]
|
2036
1329
|
|
2037
|
-
|
2038
|
-
|
2039
|
-
|
2040
|
-
.tolist()
|
2041
|
-
)
|
1330
|
+
rxns_participating = self.reaction_species[
|
1331
|
+
self.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
|
1332
|
+
]
|
2042
1333
|
|
2043
|
-
|
2044
|
-
|
2045
|
-
|
2046
|
-
|
1334
|
+
# find all participants in these rxns
|
1335
|
+
full_rxns_participating = self.reaction_species[
|
1336
|
+
self.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
|
1337
|
+
].merge(
|
1338
|
+
self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
2047
1339
|
)
|
2048
1340
|
|
2049
|
-
|
2050
|
-
|
2051
|
-
reaction_species_w_roles
|
2052
|
-
# drop already filtered reactions
|
2053
|
-
.query("r_id not in @reactions_with_lost_defining_members")
|
2054
|
-
.query("sbo_role == 'REQUIRED'")
|
2055
|
-
# which entries which have some required attribute have all False values for that attribute
|
2056
|
-
.groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
|
2057
|
-
.agg({"new": "any"})
|
2058
|
-
.query("new == False")
|
2059
|
-
.index.get_level_values(SBML_DFS.R_ID)
|
2060
|
-
)
|
1341
|
+
participating_rids = full_rxns_participating[SBML_DFS.R_ID].unique()
|
1342
|
+
reaction_descriptions = self.reaction_summaries(r_ids=participating_rids)
|
2061
1343
|
|
2062
|
-
|
2063
|
-
|
2064
|
-
|
2065
|
-
|
1344
|
+
status = (
|
1345
|
+
full_rxns_participating.loc[
|
1346
|
+
full_rxns_participating[SBML_DFS.SC_ID].isin(
|
1347
|
+
matching_compartmentalized_species.index.values.tolist()
|
1348
|
+
),
|
1349
|
+
[SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
|
1350
|
+
]
|
1351
|
+
.merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
|
1352
|
+
.reset_index(drop=True)
|
1353
|
+
.drop(SBML_DFS.R_ID, axis=1)
|
2066
1354
|
)
|
2067
1355
|
|
2068
|
-
|
2069
|
-
reactions_with_lost_requirements
|
2070
|
-
)
|
1356
|
+
return status
|
2071
1357
|
|
2072
|
-
|
1358
|
+
def validate(self):
|
1359
|
+
"""
|
1360
|
+
Validate the SBML_dfs structure and relationships.
|
2073
1361
|
|
1362
|
+
Checks:
|
1363
|
+
- Schema existence
|
1364
|
+
- Required tables presence
|
1365
|
+
- Individual table structure
|
1366
|
+
- Primary key uniqueness
|
1367
|
+
- Foreign key relationships
|
1368
|
+
- Optional data table validity
|
1369
|
+
- Reaction species validity
|
2074
1370
|
|
2075
|
-
|
2076
|
-
|
2077
|
-
|
2078
|
-
|
2079
|
-
|
1371
|
+
Raises
|
1372
|
+
------
|
1373
|
+
ValueError
|
1374
|
+
If any validation check fails
|
1375
|
+
"""
|
2080
1376
|
|
2081
|
-
|
2082
|
-
|
1377
|
+
if not hasattr(self, "schema"):
|
1378
|
+
raise ValueError("No schema found")
|
2083
1379
|
|
2084
|
-
|
2085
|
-
|
2086
|
-
A pathway representation
|
2087
|
-
sc_ids (list[str])
|
2088
|
-
A list of compartmentalized species ids (sc_ids) which will be removed.
|
1380
|
+
required_tables = self._required_entities
|
1381
|
+
schema_tables = set(self.schema.keys())
|
2089
1382
|
|
2090
|
-
|
2091
|
-
|
2092
|
-
|
2093
|
-
|
1383
|
+
extra_tables = schema_tables.difference(required_tables)
|
1384
|
+
if len(extra_tables) != 0:
|
1385
|
+
logger.debug(
|
1386
|
+
f"{len(extra_tables)} unexpected tables found: "
|
1387
|
+
f"{', '.join(extra_tables)}"
|
1388
|
+
)
|
2094
1389
|
|
2095
|
-
|
1390
|
+
missing_tables = required_tables.difference(schema_tables)
|
1391
|
+
if len(missing_tables) != 0:
|
1392
|
+
raise ValueError(
|
1393
|
+
f"Missing {len(missing_tables)} required tables: "
|
1394
|
+
f"{', '.join(missing_tables)}"
|
1395
|
+
)
|
2096
1396
|
|
2097
|
-
|
2098
|
-
|
2099
|
-
|
2100
|
-
)
|
1397
|
+
# check individual tables
|
1398
|
+
for table in required_tables:
|
1399
|
+
self._validate_table(table)
|
2101
1400
|
|
2102
|
-
|
2103
|
-
|
1401
|
+
# check whether pks and fks agree
|
1402
|
+
self._validate_pk_fk_correspondence()
|
2104
1403
|
|
2105
|
-
|
1404
|
+
# check optional data tables:
|
1405
|
+
for k, v in self.species_data.items():
|
1406
|
+
try:
|
1407
|
+
self._validate_species_data(v)
|
1408
|
+
except ValueError as e:
|
1409
|
+
raise ValueError(f"species data {k} was invalid.") from e
|
2106
1410
|
|
1411
|
+
for k, v in self.reactions_data.items():
|
1412
|
+
try:
|
1413
|
+
self._validate_reactions_data(v)
|
1414
|
+
except ValueError as e:
|
1415
|
+
raise ValueError(f"reactions data {k} was invalid.") from e
|
2107
1416
|
|
2108
|
-
|
2109
|
-
|
2110
|
-
Validate a standalone table against the SBML_dfs schema.
|
1417
|
+
# validate reaction_species sbo_terms and stoi
|
1418
|
+
self._validate_reaction_species()
|
2111
1419
|
|
2112
|
-
|
2113
|
-
|
2114
|
-
|
1420
|
+
# validate identifiers and sources
|
1421
|
+
self._validate_identifiers()
|
1422
|
+
self._validate_sources()
|
2115
1423
|
|
2116
|
-
|
2117
|
-
|
2118
|
-
|
2119
|
-
|
2120
|
-
|
2121
|
-
|
1424
|
+
def validate_and_resolve(self):
|
1425
|
+
"""
|
1426
|
+
Validate and attempt to automatically fix common issues.
|
1427
|
+
|
1428
|
+
This method iteratively:
|
1429
|
+
1. Attempts validation
|
1430
|
+
2. If validation fails, tries to resolve the issue
|
1431
|
+
3. Repeats until validation passes or issue cannot be resolved
|
2122
1432
|
|
2123
1433
|
Raises
|
2124
1434
|
------
|
2125
1435
|
ValueError
|
2126
|
-
|
2127
|
-
|
2128
|
-
|
2129
|
-
|
2130
|
-
|
2131
|
-
|
2132
|
-
|
1436
|
+
If validation fails and cannot be automatically resolved
|
1437
|
+
"""
|
1438
|
+
|
1439
|
+
current_exception = None
|
1440
|
+
validated = False
|
1441
|
+
|
1442
|
+
while not validated:
|
1443
|
+
try:
|
1444
|
+
self.validate()
|
1445
|
+
validated = True
|
1446
|
+
except Exception as e:
|
1447
|
+
e_str = str(e)
|
1448
|
+
if e_str == current_exception:
|
1449
|
+
logger.warning(
|
1450
|
+
"Automated resolution of an Exception was attempted but failed"
|
1451
|
+
)
|
1452
|
+
raise e
|
2133
1453
|
|
2134
|
-
|
2135
|
-
|
1454
|
+
# try to resolve
|
1455
|
+
self._attempt_resolve(e)
|
2136
1456
|
|
1457
|
+
# =============================================================================
|
1458
|
+
# PRIVATE METHODS (ALPHABETICAL ORDER)
|
1459
|
+
# =============================================================================
|
2137
1460
|
|
2138
|
-
def
|
2139
|
-
|
2140
|
-
|
2141
|
-
|
2142
|
-
|
2143
|
-
|
2144
|
-
|
1461
|
+
def _attempt_resolve(self, e):
|
1462
|
+
str_e = str(e)
|
1463
|
+
if str_e == "compartmentalized_species included missing c_id values":
|
1464
|
+
logger.warning(str_e)
|
1465
|
+
logger.warning(
|
1466
|
+
"Attempting to resolve with infer_uncompartmentalized_species_location()"
|
1467
|
+
)
|
1468
|
+
self.infer_uncompartmentalized_species_location()
|
1469
|
+
elif re.search("sbo_terms were not defined", str_e):
|
1470
|
+
logger.warning(str_e)
|
1471
|
+
logger.warning("Attempting to resolve with infer_sbo_terms()")
|
1472
|
+
self.infer_sbo_terms()
|
1473
|
+
else:
|
1474
|
+
logger.warning(
|
1475
|
+
"An error occurred which could not be automatically resolved"
|
1476
|
+
)
|
1477
|
+
raise e
|
2145
1478
|
|
2146
|
-
|
2147
|
-
|
1479
|
+
def _find_underspecified_reactions_by_scids(
|
1480
|
+
self, sc_ids: Iterable[str]
|
1481
|
+
) -> set[str]:
|
1482
|
+
"""
|
1483
|
+
Find Underspecified reactions
|
1484
|
+
|
1485
|
+
Identify reactions which should be removed if a set of molecular species are removed
|
1486
|
+
from the system.
|
2148
1487
|
|
2149
1488
|
Parameters
|
2150
1489
|
----------
|
2151
|
-
|
2152
|
-
|
2153
|
-
table_schema : dict
|
2154
|
-
Schema definition for the table
|
2155
|
-
table_name : str
|
2156
|
-
Name of the table (for error messages)
|
1490
|
+
sc_ids : list[str]
|
1491
|
+
A list of compartmentalized species ids (sc_ids) which will be removed.
|
2157
1492
|
|
2158
|
-
|
2159
|
-
|
2160
|
-
|
2161
|
-
|
2162
|
-
|
2163
|
-
|
2164
|
-
|
2165
|
-
|
2166
|
-
|
2167
|
-
|
2168
|
-
|
2169
|
-
|
2170
|
-
|
1493
|
+
Returns
|
1494
|
+
-------
|
1495
|
+
underspecified_reactions : set[str]
|
1496
|
+
A set of reactions which should be removed because they will not occur once
|
1497
|
+
"sc_ids" are removed.
|
1498
|
+
"""
|
1499
|
+
updated_reaction_species = self.reaction_species.copy()
|
1500
|
+
updated_reaction_species["new"] = ~updated_reaction_species[
|
1501
|
+
SBML_DFS.SC_ID
|
1502
|
+
].isin(sc_ids)
|
1503
|
+
updated_reaction_species = sbml_dfs_utils.add_sbo_role(updated_reaction_species)
|
1504
|
+
underspecified_reactions = sbml_dfs_utils.find_underspecified_reactions(
|
1505
|
+
updated_reaction_species
|
2171
1506
|
)
|
1507
|
+
return underspecified_reactions
|
2172
1508
|
|
2173
|
-
|
2174
|
-
|
2175
|
-
|
2176
|
-
|
2177
|
-
|
1509
|
+
def _get_unused_cspecies(self) -> set[str]:
|
1510
|
+
"""Returns a set of compartmentalized species
|
1511
|
+
that are not part of any reactions"""
|
1512
|
+
sc_ids = set(self.compartmentalized_species.index) - set(
|
1513
|
+
self.reaction_species[SBML_DFS.SC_ID]
|
2178
1514
|
)
|
1515
|
+
return sc_ids # type: ignore
|
2179
1516
|
|
2180
|
-
|
2181
|
-
|
2182
|
-
|
2183
|
-
|
2184
|
-
|
2185
|
-
example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
|
2186
|
-
raise ValueError(
|
2187
|
-
f"{duplicated_pks.shape[0]} primary keys were duplicated "
|
2188
|
-
f"including {', '.join(example_duplicates)}"
|
1517
|
+
def _get_unused_species(self) -> set[str]:
|
1518
|
+
"""Returns a list of species that are not part of any reactions"""
|
1519
|
+
s_ids = set(self.species.index) - set(
|
1520
|
+
self.compartmentalized_species[SBML_DFS.S_ID]
|
2189
1521
|
)
|
1522
|
+
return s_ids # type: ignore
|
2190
1523
|
|
2191
|
-
|
2192
|
-
|
2193
|
-
table_vars = set(list(table_data.columns))
|
1524
|
+
def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
1525
|
+
"""Removes compartmentalized species from the model
|
2194
1526
|
|
2195
|
-
|
2196
|
-
|
2197
|
-
|
2198
|
-
f"{len(extra_vars)} extra variables were found for {table_name}: "
|
2199
|
-
f"{', '.join(extra_vars)}"
|
2200
|
-
)
|
1527
|
+
This should not be directly used by the user, as it can lead to
|
1528
|
+
invalid reactions when removing species without a logic to decide
|
1529
|
+
if the reaction needs to be removed as well.
|
2201
1530
|
|
2202
|
-
|
2203
|
-
|
2204
|
-
|
2205
|
-
|
2206
|
-
|
1531
|
+
Args:
|
1532
|
+
sc_ids (Iterable[str]): the compartmentalized species to remove
|
1533
|
+
"""
|
1534
|
+
# Remove compartmentalized species
|
1535
|
+
self.compartmentalized_species = self.compartmentalized_species.drop(
|
1536
|
+
index=list(sc_ids)
|
2207
1537
|
)
|
1538
|
+
# remove corresponding reactions_species
|
1539
|
+
self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
|
2208
1540
|
|
2209
|
-
|
2210
|
-
|
2211
|
-
|
1541
|
+
def _remove_entity_data(self, entity_type: str, label: str) -> None:
|
1542
|
+
"""
|
1543
|
+
Remove data from species_data or reactions_data by table name and label.
|
2212
1544
|
|
1545
|
+
Parameters
|
1546
|
+
----------
|
1547
|
+
entity_type : str
|
1548
|
+
Name of the table to remove data from ('species' or 'reactions')
|
1549
|
+
label : str
|
1550
|
+
Label of the data to remove
|
2213
1551
|
|
2214
|
-
|
2215
|
-
|
2216
|
-
|
1552
|
+
Notes
|
1553
|
+
-----
|
1554
|
+
If the label does not exist, a warning will be logged that includes the existing labels.
|
1555
|
+
"""
|
1556
|
+
if entity_type not in ENTITIES_W_DATA:
|
1557
|
+
raise ValueError("table_name must be either 'species' or 'reactions'")
|
2217
1558
|
|
2218
|
-
|
2219
|
-
|
2220
|
-
|
2221
|
-
|
2222
|
-
|
2223
|
-
|
2224
|
-
|
2225
|
-
|
2226
|
-
data=[True] * len(promiscuous_component_identifiers_index),
|
2227
|
-
index=promiscuous_component_identifiers_index,
|
2228
|
-
name="is_shared_component",
|
2229
|
-
dtype=bool,
|
2230
|
-
)
|
1559
|
+
data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
|
1560
|
+
if label not in data_dict:
|
1561
|
+
existing_labels = list(data_dict.keys())
|
1562
|
+
logger.warning(
|
1563
|
+
f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
|
1564
|
+
f"Existing labels: {existing_labels}"
|
1565
|
+
)
|
1566
|
+
return
|
2231
1567
|
|
2232
|
-
|
2233
|
-
return bqb_has_parts_species
|
1568
|
+
del data_dict[label]
|
2234
1569
|
|
2235
|
-
|
2236
|
-
|
2237
|
-
left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
|
2238
|
-
right_index=True,
|
2239
|
-
how="left",
|
2240
|
-
)
|
1570
|
+
def _remove_species(self, s_ids: Iterable[str]):
|
1571
|
+
"""Removes species from the model
|
2241
1572
|
|
2242
|
-
|
2243
|
-
|
2244
|
-
|
2245
|
-
# drop identifiers shared as components across many species
|
2246
|
-
filtered_bqb_has_parts = filtered_bqb_has_parts[
|
2247
|
-
~filtered_bqb_has_parts["is_shared_component"]
|
2248
|
-
].drop(["is_shared_component"], axis=1)
|
1573
|
+
This should not be directly used by the user, as it can lead to
|
1574
|
+
invalid reactions when removing species without a logic to decide
|
1575
|
+
if the reaction needs to be removed as well.
|
2249
1576
|
|
2250
|
-
|
1577
|
+
This removes the species and corresponding compartmentalized species and
|
1578
|
+
reactions_species.
|
2251
1579
|
|
1580
|
+
Args:
|
1581
|
+
s_ids (Iterable[str]): the species to remove
|
1582
|
+
"""
|
1583
|
+
sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
|
1584
|
+
self._remove_compartmentalized_species(sc_ids)
|
1585
|
+
# Remove species
|
1586
|
+
self.species = self.species.drop(index=list(s_ids))
|
1587
|
+
# remove data
|
1588
|
+
for k, data in self.species_data.items():
|
1589
|
+
self.species_data[k] = data.drop(index=list(s_ids))
|
2252
1590
|
|
2253
|
-
def
|
2254
|
-
|
2255
|
-
|
2256
|
-
|
2257
|
-
)
|
2258
|
-
"""
|
2259
|
-
Validate input DataFrames have required columns.
|
1591
|
+
def _remove_unused_cspecies(self):
|
1592
|
+
"""Removes compartmentalized species that are no
|
1593
|
+
longer part of any reactions"""
|
1594
|
+
sc_ids = self._get_unused_cspecies()
|
1595
|
+
self._remove_compartmentalized_species(sc_ids)
|
2260
1596
|
|
2261
|
-
|
2262
|
-
|
2263
|
-
|
2264
|
-
|
2265
|
-
|
2266
|
-
Species data to validate
|
2267
|
-
compartments_df : pd.DataFrame
|
2268
|
-
Compartments data to validate
|
2269
|
-
"""
|
1597
|
+
def _remove_unused_species(self):
|
1598
|
+
"""Removes species that are no longer part of any
|
1599
|
+
compartmentalized species"""
|
1600
|
+
s_ids = self._get_unused_species()
|
1601
|
+
self._remove_species(s_ids)
|
2270
1602
|
|
2271
|
-
|
2272
|
-
|
2273
|
-
|
2274
|
-
|
2275
|
-
|
2276
|
-
|
2277
|
-
|
2278
|
-
|
2279
|
-
|
2280
|
-
|
1603
|
+
def _validate_identifiers(self):
|
1604
|
+
"""
|
1605
|
+
Validate identifiers in the model
|
1606
|
+
|
1607
|
+
Iterates through all tables and checks if the identifier columns are valid.
|
1608
|
+
|
1609
|
+
Raises:
|
1610
|
+
ValueError: missing identifiers in the table
|
1611
|
+
"""
|
1612
|
+
|
1613
|
+
SCHEMA = SBML_DFS_SCHEMA.SCHEMA
|
1614
|
+
for table in SBML_DFS_SCHEMA.SCHEMA.keys():
|
1615
|
+
if "id" not in SCHEMA[table].keys():
|
1616
|
+
continue
|
1617
|
+
id_series = self.get_table(table)[SCHEMA[table]["id"]]
|
1618
|
+
if id_series.isna().sum() > 0:
|
1619
|
+
missing_ids = id_series[id_series.isna()].index
|
1620
|
+
raise ValueError(
|
1621
|
+
f"{table} has {len(missing_ids)} missing ids: {missing_ids}"
|
1622
|
+
)
|
1623
|
+
|
1624
|
+
def _validate_pk_fk_correspondence(self):
|
1625
|
+
"""
|
1626
|
+
Check whether primary keys and foreign keys agree for all tables in the schema.
|
1627
|
+
Raises ValueError if any correspondence fails.
|
1628
|
+
"""
|
1629
|
+
|
1630
|
+
pk_df = pd.DataFrame(
|
1631
|
+
[{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
|
2281
1632
|
)
|
2282
1633
|
|
2283
|
-
|
2284
|
-
|
2285
|
-
|
2286
|
-
|
2287
|
-
|
2288
|
-
|
2289
|
-
|
2290
|
-
|
2291
|
-
|
2292
|
-
|
1634
|
+
fk_df = (
|
1635
|
+
pd.DataFrame(
|
1636
|
+
[
|
1637
|
+
{"fk_table": k, "fk": v["fk"]}
|
1638
|
+
for k, v in self.schema.items()
|
1639
|
+
if "fk" in v.keys()
|
1640
|
+
]
|
1641
|
+
)
|
1642
|
+
.set_index("fk_table")["fk"]
|
1643
|
+
.apply(pd.Series)
|
1644
|
+
.reset_index()
|
1645
|
+
.melt(id_vars="fk_table")
|
1646
|
+
.drop(["variable"], axis=1)
|
1647
|
+
.rename(columns={"value": "key"})
|
1648
|
+
)
|
1649
|
+
|
1650
|
+
pk_fk_correspondences = pk_df.merge(fk_df)
|
1651
|
+
|
1652
|
+
for i in range(0, pk_fk_correspondences.shape[0]):
|
1653
|
+
pk_table_keys = set(
|
1654
|
+
getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
|
1655
|
+
)
|
1656
|
+
if None in pk_table_keys:
|
1657
|
+
raise ValueError(
|
1658
|
+
f"{pk_fk_correspondences['pk_table'][i]} had "
|
1659
|
+
"missing values in its index"
|
1660
|
+
)
|
1661
|
+
|
1662
|
+
fk_table_keys = set(
|
1663
|
+
getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
|
1664
|
+
:, pk_fk_correspondences["key"][i]
|
1665
|
+
]
|
1666
|
+
)
|
1667
|
+
if None in fk_table_keys:
|
1668
|
+
raise ValueError(
|
1669
|
+
f"{pk_fk_correspondences['fk_table'][i]} included "
|
1670
|
+
f"missing {pk_fk_correspondences['key'][i]} values"
|
1671
|
+
)
|
1672
|
+
|
1673
|
+
# all foreign keys need to match a primary key
|
1674
|
+
extra_fks = fk_table_keys.difference(pk_table_keys)
|
1675
|
+
if len(extra_fks) != 0:
|
1676
|
+
raise ValueError(
|
1677
|
+
f"{len(extra_fks)} distinct "
|
1678
|
+
f"{pk_fk_correspondences['key'][i]} values were"
|
1679
|
+
f" found in {pk_fk_correspondences['fk_table'][i]} "
|
1680
|
+
f"but missing from {pk_fk_correspondences['pk_table'][i]}."
|
1681
|
+
" All foreign keys must have a matching primary key.\n\n"
|
1682
|
+
f"Extra key are: {', '.join(extra_fks)}"
|
1683
|
+
)
|
2293
1684
|
|
2294
|
-
|
2295
|
-
interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
|
2296
|
-
missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
|
2297
|
-
interaction_edgelist_columns
|
2298
|
-
)
|
2299
|
-
if len(missing_required_fields) > 0:
|
2300
|
-
raise ValueError(
|
2301
|
-
f"{', '.join(missing_required_fields)} are required "
|
2302
|
-
'variables in "interaction_edgelist" but were not '
|
2303
|
-
"present in the input file."
|
2304
|
-
)
|
1685
|
+
def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
|
2305
1686
|
|
2306
|
-
|
1687
|
+
if isinstance(r_ids, str):
|
1688
|
+
r_ids = [r_ids]
|
2307
1689
|
|
1690
|
+
if r_ids is None:
|
1691
|
+
return self.reactions.index.tolist()
|
1692
|
+
else:
|
1693
|
+
if not all(r_id in self.reactions.index for r_id in r_ids):
|
1694
|
+
raise ValueError(f"Reaction IDs {r_ids} not found in reactions table")
|
2308
1695
|
|
2309
|
-
|
2310
|
-
interaction_edgelist, species_df, keep_reactions_data, keep_species_data
|
2311
|
-
):
|
2312
|
-
"""
|
2313
|
-
Identify extra columns in input data that should be preserved.
|
1696
|
+
return r_ids
|
2314
1697
|
|
2315
|
-
|
2316
|
-
|
2317
|
-
|
2318
|
-
|
2319
|
-
|
2320
|
-
Species data containing potential extra columns
|
2321
|
-
keep_reactions_data : bool or str
|
2322
|
-
Whether to keep extra reaction columns
|
2323
|
-
keep_species_data : bool or str
|
2324
|
-
Whether to keep extra species columns
|
1698
|
+
def _validate_reaction_species(self):
|
1699
|
+
if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
|
1700
|
+
raise ValueError(
|
1701
|
+
"All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
|
1702
|
+
)
|
2325
1703
|
|
2326
|
-
|
2327
|
-
|
2328
|
-
|
2329
|
-
|
2330
|
-
|
2331
|
-
|
2332
|
-
extra_species_columns = []
|
2333
|
-
|
2334
|
-
if keep_reactions_data is not False:
|
2335
|
-
extra_reactions_columns = [
|
2336
|
-
c
|
2337
|
-
for c in interaction_edgelist.columns
|
2338
|
-
if c not in INTERACTION_EDGELIST_EXPECTED_VARS
|
2339
|
-
]
|
1704
|
+
# test for null SBO terms
|
1705
|
+
n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
|
1706
|
+
if n_null_sbo_terms != 0:
|
1707
|
+
raise ValueError(
|
1708
|
+
f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
|
1709
|
+
)
|
2340
1710
|
|
2341
|
-
|
2342
|
-
|
2343
|
-
|
2344
|
-
|
2345
|
-
if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
|
1711
|
+
# find invalid SBO terms
|
1712
|
+
sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
|
1713
|
+
invalid_sbo_term_counts = sbo_counts[
|
1714
|
+
~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
|
2346
1715
|
]
|
2347
1716
|
|
2348
|
-
|
2349
|
-
|
2350
|
-
|
2351
|
-
|
2352
|
-
|
2353
|
-
|
1717
|
+
if invalid_sbo_term_counts.shape[0] != 0:
|
1718
|
+
invalid_sbo_counts_str = ", ".join(
|
1719
|
+
[f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
|
1720
|
+
)
|
1721
|
+
raise ValueError(
|
1722
|
+
f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
|
1723
|
+
f"defined {invalid_sbo_counts_str}"
|
1724
|
+
)
|
2354
1725
|
|
2355
|
-
|
2356
|
-
|
2357
|
-
compartments_df : pd.DataFrame
|
2358
|
-
Raw compartments data
|
2359
|
-
interaction_source : source.Source
|
2360
|
-
Source object to assign to compartments
|
1726
|
+
def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
|
1727
|
+
"""Validates reactions data attribute
|
2361
1728
|
|
2362
|
-
|
2363
|
-
|
2364
|
-
pd.DataFrame
|
2365
|
-
Processed compartments with IDs, indexed by compartment ID
|
2366
|
-
"""
|
2367
|
-
compartments = compartments_df.copy()
|
2368
|
-
compartments[SBML_DFS.C_SOURCE] = interaction_source
|
2369
|
-
compartments[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
|
2370
|
-
range(compartments.shape[0]), SBML_DFS.C_ID
|
2371
|
-
)
|
2372
|
-
return compartments.set_index(SBML_DFS.C_ID)[
|
2373
|
-
[SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
|
2374
|
-
]
|
1729
|
+
Args:
|
1730
|
+
reactions_data_table (pd.DataFrame): a reactions data table
|
2375
1731
|
|
1732
|
+
Raises:
|
1733
|
+
ValueError: r_id not index name
|
1734
|
+
ValueError: r_id index contains duplicates
|
1735
|
+
ValueError: r_id not in reactions table
|
1736
|
+
"""
|
1737
|
+
sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
|
2376
1738
|
|
2377
|
-
def
|
2378
|
-
|
2379
|
-
|
1739
|
+
def _validate_sources(self):
|
1740
|
+
"""
|
1741
|
+
Validate sources in the model
|
2380
1742
|
|
2381
|
-
|
2382
|
-
----------
|
2383
|
-
species_df : pd.DataFrame
|
2384
|
-
Raw species data
|
2385
|
-
interaction_source : source.Source
|
2386
|
-
Source object to assign to species
|
2387
|
-
extra_species_columns : list
|
2388
|
-
Names of extra columns to preserve separately
|
1743
|
+
Iterates through all tables and checks if the source columns are valid.
|
2389
1744
|
|
2390
|
-
|
2391
|
-
|
2392
|
-
|
2393
|
-
Processed species DataFrame and species extra data DataFrame
|
2394
|
-
"""
|
2395
|
-
species = species_df.copy()
|
2396
|
-
species[SBML_DFS.S_SOURCE] = interaction_source
|
2397
|
-
species[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
|
2398
|
-
range(species.shape[0]), SBML_DFS.S_ID
|
2399
|
-
)
|
1745
|
+
Raises:
|
1746
|
+
ValueError: missing sources in the table
|
1747
|
+
"""
|
2400
1748
|
|
2401
|
-
|
2402
|
-
|
2403
|
-
|
2404
|
-
|
1749
|
+
SCHEMA = SBML_DFS_SCHEMA.SCHEMA
|
1750
|
+
for table in SBML_DFS_SCHEMA.SCHEMA.keys():
|
1751
|
+
if "source" not in SCHEMA[table].keys():
|
1752
|
+
continue
|
1753
|
+
source_series = self.get_table(table)[SCHEMA[table]["source"]]
|
1754
|
+
if source_series.isna().sum() > 0:
|
1755
|
+
missing_sources = source_series[source_series.isna()].index
|
1756
|
+
raise ValueError(
|
1757
|
+
f"{table} has {len(missing_sources)} missing sources: {missing_sources}"
|
1758
|
+
)
|
2405
1759
|
|
2406
|
-
|
2407
|
-
|
2408
|
-
processed_species = species_indexed[required_cols]
|
1760
|
+
def _validate_species_data(self, species_data_table: pd.DataFrame):
|
1761
|
+
"""Validates species data attribute
|
2409
1762
|
|
2410
|
-
|
1763
|
+
Args:
|
1764
|
+
species_data_table (pd.DataFrame): a species data table
|
2411
1765
|
|
1766
|
+
Raises:
|
1767
|
+
ValueError: s_id not index name
|
1768
|
+
ValueError: s_id index contains duplicates
|
1769
|
+
ValueError: s_id not in species table
|
1770
|
+
"""
|
1771
|
+
sbml_dfs_utils._validate_matching_data(species_data_table, self.species)
|
2412
1772
|
|
2413
|
-
def
|
2414
|
-
|
2415
|
-
|
2416
|
-
"""
|
2417
|
-
Create compartmentalized species from interactions.
|
1773
|
+
def _validate_table(self, table_name: str) -> None:
|
1774
|
+
"""
|
1775
|
+
Validate a table in this SBML_dfs object against its schema.
|
2418
1776
|
|
2419
|
-
|
2420
|
-
|
2421
|
-
interaction_edgelist : pd.DataFrame
|
2422
|
-
Interaction data containing species-compartment combinations
|
2423
|
-
species_df : pd.DataFrame
|
2424
|
-
Processed species data with IDs
|
2425
|
-
compartments_df : pd.DataFrame
|
2426
|
-
Processed compartments data with IDs
|
2427
|
-
interaction_source : source.Source
|
2428
|
-
Source object to assign to compartmentalized species
|
1777
|
+
This is an internal method that validates a table that is part of this SBML_dfs
|
1778
|
+
object against the schema stored in self.schema.
|
2429
1779
|
|
2430
|
-
|
2431
|
-
|
2432
|
-
|
2433
|
-
|
2434
|
-
"""
|
2435
|
-
# Get all distinct upstream and downstream compartmentalized species
|
2436
|
-
comp_species = pd.concat(
|
2437
|
-
[
|
2438
|
-
interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
|
2439
|
-
{
|
2440
|
-
"upstream_name": SBML_DFS.S_NAME,
|
2441
|
-
"upstream_compartment": SBML_DFS.C_NAME,
|
2442
|
-
},
|
2443
|
-
axis=1,
|
2444
|
-
),
|
2445
|
-
interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
|
2446
|
-
{
|
2447
|
-
"downstream_name": SBML_DFS.S_NAME,
|
2448
|
-
"downstream_compartment": SBML_DFS.C_NAME,
|
2449
|
-
},
|
2450
|
-
axis=1,
|
2451
|
-
),
|
2452
|
-
]
|
2453
|
-
).drop_duplicates()
|
1780
|
+
Parameters
|
1781
|
+
----------
|
1782
|
+
table : str
|
1783
|
+
Name of the table to validate
|
2454
1784
|
|
2455
|
-
|
2456
|
-
|
2457
|
-
|
2458
|
-
|
2459
|
-
|
2460
|
-
|
1785
|
+
Raises
|
1786
|
+
------
|
1787
|
+
ValueError
|
1788
|
+
If the table does not conform to its schema
|
1789
|
+
"""
|
1790
|
+
table_data = getattr(self, table_name)
|
2461
1791
|
|
2462
|
-
|
2463
|
-
_sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
|
1792
|
+
sbml_dfs_utils.validate_sbml_dfs_table(table_data, table_name)
|
2464
1793
|
|
2465
|
-
# Format compartmentalized species with names, source, and IDs
|
2466
|
-
comp_species_w_ids[SBML_DFS.SC_NAME] = [
|
2467
|
-
f"{s} [{c}]"
|
2468
|
-
for s, c in zip(
|
2469
|
-
comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
|
2470
|
-
)
|
2471
|
-
]
|
2472
|
-
comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
|
2473
|
-
comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
|
2474
|
-
range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
|
2475
|
-
)
|
2476
1794
|
|
2477
|
-
|
2478
|
-
|
2479
|
-
|
2480
|
-
|
2481
|
-
|
2482
|
-
|
2483
|
-
|
2484
|
-
|
2485
|
-
|
2486
|
-
|
2487
|
-
|
2488
|
-
upstream_stoichiometry,
|
2489
|
-
downstream_stoichiometry,
|
2490
|
-
downstream_sbo_name,
|
2491
|
-
extra_reactions_columns,
|
2492
|
-
):
|
1795
|
+
def sbml_dfs_from_edgelist(
|
1796
|
+
interaction_edgelist: pd.DataFrame,
|
1797
|
+
species_df: pd.DataFrame,
|
1798
|
+
compartments_df: pd.DataFrame,
|
1799
|
+
interaction_source: source.Source,
|
1800
|
+
upstream_stoichiometry: int = 0,
|
1801
|
+
downstream_stoichiometry: int = 1,
|
1802
|
+
downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
|
1803
|
+
keep_species_data: bool | str = False,
|
1804
|
+
keep_reactions_data: bool | str = False,
|
1805
|
+
) -> SBML_dfs:
|
2493
1806
|
"""
|
2494
|
-
Create
|
1807
|
+
Create SBML_dfs from interaction edgelist.
|
1808
|
+
|
1809
|
+
Combines a set of molecular interactions into a mechanistic SBML_dfs model
|
1810
|
+
by processing interaction data, species information, and compartment definitions.
|
2495
1811
|
|
2496
1812
|
Parameters
|
2497
1813
|
----------
|
2498
1814
|
interaction_edgelist : pd.DataFrame
|
2499
|
-
|
2500
|
-
|
2501
|
-
|
1815
|
+
Table containing molecular interactions with columns:
|
1816
|
+
- upstream_name : str, matches "s_name" from species_df
|
1817
|
+
- downstream_name : str, matches "s_name" from species_df
|
1818
|
+
- upstream_compartment : str, matches "c_name" from compartments_df
|
1819
|
+
- downstream_compartment : str, matches "c_name" from compartments_df
|
1820
|
+
- r_name : str, name for the interaction
|
1821
|
+
- sbo_term : str, SBO term defining interaction type
|
1822
|
+
- r_Identifiers : identifiers.Identifiers, supporting identifiers
|
1823
|
+
- r_isreversible : bool, whether reaction is reversible
|
2502
1824
|
species_df : pd.DataFrame
|
2503
|
-
|
1825
|
+
Table defining molecular species with columns:
|
1826
|
+
- s_name : str, name of molecular species
|
1827
|
+
- s_Identifiers : identifiers.Identifiers, species identifiers
|
2504
1828
|
compartments_df : pd.DataFrame
|
2505
|
-
|
1829
|
+
Table defining compartments with columns:
|
1830
|
+
- c_name : str, name of compartment
|
1831
|
+
- c_Identifiers : identifiers.Identifiers, compartment identifiers
|
2506
1832
|
interaction_source : source.Source
|
2507
|
-
Source object
|
2508
|
-
upstream_stoichiometry : int
|
2509
|
-
Stoichiometry
|
2510
|
-
downstream_stoichiometry : int
|
2511
|
-
Stoichiometry
|
2512
|
-
downstream_sbo_name : str
|
2513
|
-
SBO term
|
2514
|
-
|
2515
|
-
|
1833
|
+
Source object linking model entities to interaction source
|
1834
|
+
upstream_stoichiometry : int, default 0
|
1835
|
+
Stoichiometry of upstream species in reactions
|
1836
|
+
downstream_stoichiometry : int, default 1
|
1837
|
+
Stoichiometry of downstream species in reactions
|
1838
|
+
downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
|
1839
|
+
SBO term for downstream reactant type
|
1840
|
+
keep_species_data : bool or str, default False
|
1841
|
+
Whether to preserve extra species columns. If True, saves as 'source' label.
|
1842
|
+
If string, uses as custom label. If False, discards extra data.
|
1843
|
+
keep_reactions_data : bool or str, default False
|
1844
|
+
Whether to preserve extra reaction columns. If True, saves as 'source' label.
|
1845
|
+
If string, uses as custom label. If False, discards extra data.
|
2516
1846
|
|
2517
1847
|
Returns
|
2518
1848
|
-------
|
2519
|
-
|
2520
|
-
|
1849
|
+
SBML_dfs
|
1850
|
+
Validated SBML data structure containing compartments, species,
|
1851
|
+
compartmentalized species, reactions, and reaction species tables.
|
2521
1852
|
"""
|
2522
|
-
#
|
2523
|
-
|
2524
|
-
|
2525
|
-
.merge(species_df[SBML_DFS.S_NAME].reset_index())
|
2526
|
-
.merge(compartments_df[SBML_DFS.C_NAME].reset_index())
|
1853
|
+
# 1. Validate inputs
|
1854
|
+
sbml_dfs_utils._edgelist_validate_inputs(
|
1855
|
+
interaction_edgelist, species_df, compartments_df
|
2527
1856
|
)
|
2528
1857
|
|
2529
|
-
|
2530
|
-
|
2531
|
-
|
2532
|
-
|
2533
|
-
SBML_DFS.S_NAME: "upstream_name",
|
2534
|
-
SBML_DFS.C_NAME: "upstream_compartment",
|
2535
|
-
},
|
2536
|
-
axis=1,
|
2537
|
-
),
|
2538
|
-
how="left",
|
2539
|
-
).merge(
|
2540
|
-
comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
|
2541
|
-
{
|
2542
|
-
SBML_DFS.SC_ID: "sc_id_down",
|
2543
|
-
SBML_DFS.S_NAME: "downstream_name",
|
2544
|
-
SBML_DFS.C_NAME: "downstream_compartment",
|
2545
|
-
},
|
2546
|
-
axis=1,
|
2547
|
-
),
|
2548
|
-
how="left",
|
2549
|
-
)[
|
2550
|
-
REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
|
2551
|
-
]
|
2552
|
-
|
2553
|
-
# Validate merge didn't create duplicates
|
2554
|
-
if interaction_edgelist.shape[0] != interaction_w_cspecies.shape[0]:
|
2555
|
-
raise ValueError(
|
2556
|
-
f"Merging compartmentalized species resulted in row count change "
|
2557
|
-
f"from {interaction_edgelist.shape[0]} to {interaction_w_cspecies.shape[0]}"
|
2558
|
-
)
|
1858
|
+
# 2. Identify which extra columns to preserve
|
1859
|
+
extra_columns = sbml_dfs_utils._edgelist_identify_extra_columns(
|
1860
|
+
interaction_edgelist, species_df, keep_reactions_data, keep_species_data
|
1861
|
+
)
|
2559
1862
|
|
2560
|
-
#
|
2561
|
-
|
2562
|
-
|
1863
|
+
# 3. Process compartments and species tables
|
1864
|
+
processed_compartments = sbml_dfs_utils._edgelist_process_compartments(
|
1865
|
+
compartments_df, interaction_source
|
1866
|
+
)
|
1867
|
+
processed_species, species_data = sbml_dfs_utils._edgelist_process_species(
|
1868
|
+
species_df, interaction_source, extra_columns["species"]
|
2563
1869
|
)
|
2564
1870
|
|
2565
|
-
# Create
|
2566
|
-
|
2567
|
-
|
2568
|
-
|
2569
|
-
|
2570
|
-
|
2571
|
-
SBML_DFS.R_IDENTIFIERS,
|
2572
|
-
SBML_DFS.R_SOURCE,
|
2573
|
-
SBML_DFS.R_ISREVERSIBLE,
|
2574
|
-
]
|
2575
|
-
|
2576
|
-
reactions_df = interactions_copy.set_index(SBML_DFS.R_ID)[
|
2577
|
-
reactions_columns + extra_reactions_columns
|
2578
|
-
]
|
2579
|
-
|
2580
|
-
# Separate extra data
|
2581
|
-
reactions_data = reactions_df[extra_reactions_columns]
|
2582
|
-
reactions_df = reactions_df[reactions_columns]
|
2583
|
-
|
2584
|
-
# Create reaction species relationships - NOW r_id exists
|
2585
|
-
reaction_species_df = pd.concat(
|
2586
|
-
[
|
2587
|
-
# Upstream species (modifiers/stimulators/inhibitors)
|
2588
|
-
interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
|
2589
|
-
.assign(stoichiometry=upstream_stoichiometry)
|
2590
|
-
.rename({"sc_id_up": "sc_id"}, axis=1),
|
2591
|
-
# Downstream species (products)
|
2592
|
-
interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
|
2593
|
-
.assign(
|
2594
|
-
stoichiometry=downstream_stoichiometry,
|
2595
|
-
sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
|
2596
|
-
)
|
2597
|
-
.rename({"sc_id_down": "sc_id"}, axis=1),
|
2598
|
-
]
|
1871
|
+
# 4. Create compartmentalized species
|
1872
|
+
comp_species = sbml_dfs_utils._edgelist_create_compartmentalized_species(
|
1873
|
+
interaction_edgelist,
|
1874
|
+
processed_species,
|
1875
|
+
processed_compartments,
|
1876
|
+
interaction_source,
|
2599
1877
|
)
|
2600
1878
|
|
2601
|
-
|
2602
|
-
|
1879
|
+
# 5. Create reactions and reaction species
|
1880
|
+
reactions, reaction_species, reactions_data = (
|
1881
|
+
sbml_dfs_utils._edgelist_create_reactions_and_species(
|
1882
|
+
interaction_edgelist,
|
1883
|
+
comp_species,
|
1884
|
+
processed_species,
|
1885
|
+
processed_compartments,
|
1886
|
+
interaction_source,
|
1887
|
+
upstream_stoichiometry,
|
1888
|
+
downstream_stoichiometry,
|
1889
|
+
downstream_sbo_name,
|
1890
|
+
extra_columns["reactions"],
|
1891
|
+
)
|
2603
1892
|
)
|
2604
1893
|
|
2605
|
-
|
1894
|
+
# 6. Assemble final SBML_dfs object
|
1895
|
+
sbml_dfs = _edgelist_assemble_sbml_model(
|
1896
|
+
processed_compartments,
|
1897
|
+
processed_species,
|
1898
|
+
comp_species,
|
1899
|
+
reactions,
|
1900
|
+
reaction_species,
|
1901
|
+
species_data,
|
1902
|
+
reactions_data,
|
1903
|
+
keep_species_data,
|
1904
|
+
keep_reactions_data,
|
1905
|
+
extra_columns,
|
1906
|
+
)
|
2606
1907
|
|
2607
|
-
return
|
1908
|
+
return sbml_dfs
|
2608
1909
|
|
2609
1910
|
|
2610
1911
|
def _edgelist_assemble_sbml_model(
|
2611
|
-
compartments,
|
2612
|
-
species,
|
2613
|
-
comp_species,
|
2614
|
-
reactions,
|
2615
|
-
reaction_species,
|
1912
|
+
compartments: pd.DataFrame,
|
1913
|
+
species: pd.DataFrame,
|
1914
|
+
comp_species: pd.DataFrame,
|
1915
|
+
reactions: pd.DataFrame,
|
1916
|
+
reaction_species: pd.DataFrame,
|
2616
1917
|
species_data,
|
2617
1918
|
reactions_data,
|
2618
1919
|
keep_species_data,
|
2619
1920
|
keep_reactions_data,
|
2620
|
-
extra_columns,
|
2621
|
-
):
|
1921
|
+
extra_columns: dict[str, list[str]],
|
1922
|
+
) -> SBML_dfs:
|
2622
1923
|
"""
|
2623
1924
|
Assemble the final SBML_dfs object.
|
2624
1925
|
|
@@ -2675,128 +1976,3 @@ def _edgelist_assemble_sbml_model(
|
|
2675
1976
|
sbml_model.validate()
|
2676
1977
|
|
2677
1978
|
return sbml_model
|
2678
|
-
|
2679
|
-
|
2680
|
-
def _sbml_dfs_from_edgelist_check_cspecies_merge(
|
2681
|
-
merged_species: pd.DataFrame, original_species: pd.DataFrame
|
2682
|
-
) -> None:
|
2683
|
-
"""Check for a mismatch between the provided species data and species implied by the edgelist."""
|
2684
|
-
|
2685
|
-
# check for 1-many merge
|
2686
|
-
if merged_species.shape[0] != original_species.shape[0]:
|
2687
|
-
raise ValueError(
|
2688
|
-
"Merging compartmentalized species to species_df"
|
2689
|
-
" and compartments_df by names resulted in an "
|
2690
|
-
f"increase in the tables from {original_species.shape[0]}"
|
2691
|
-
f" to {merged_species.shape[0]} indicating that names were"
|
2692
|
-
" not unique"
|
2693
|
-
)
|
2694
|
-
|
2695
|
-
# check for missing species and compartments
|
2696
|
-
missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
|
2697
|
-
SBML_DFS.C_NAME
|
2698
|
-
].unique()
|
2699
|
-
if len(missing_compartments) >= 1:
|
2700
|
-
raise ValueError(
|
2701
|
-
f"{len(missing_compartments)} compartments were present in"
|
2702
|
-
' "interaction_edgelist" but not "compartments_df":'
|
2703
|
-
f" {', '.join(missing_compartments)}"
|
2704
|
-
)
|
2705
|
-
|
2706
|
-
missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
|
2707
|
-
SBML_DFS.S_NAME
|
2708
|
-
].unique()
|
2709
|
-
if len(missing_species) >= 1:
|
2710
|
-
raise ValueError(
|
2711
|
-
f"{len(missing_species)} species were present in "
|
2712
|
-
'"interaction_edgelist" but not "species_df":'
|
2713
|
-
f" {', '.join(missing_species)}"
|
2714
|
-
)
|
2715
|
-
|
2716
|
-
return None
|
2717
|
-
|
2718
|
-
|
2719
|
-
def _stub_compartments(
|
2720
|
-
stubbed_compartment: str = GENERIC_COMPARTMENT,
|
2721
|
-
) -> pd.DataFrame:
|
2722
|
-
"""Stub Compartments
|
2723
|
-
|
2724
|
-
Create a compartments table with only a single compartment
|
2725
|
-
|
2726
|
-
Args:
|
2727
|
-
stubbed_compartment (str): the name of a compartment which should match the
|
2728
|
-
keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS
|
2729
|
-
|
2730
|
-
Returns:
|
2731
|
-
compartments_df (pd.DataFrame): compartments dataframe
|
2732
|
-
"""
|
2733
|
-
|
2734
|
-
if stubbed_compartment not in COMPARTMENT_ALIASES.keys():
|
2735
|
-
raise ValueError(
|
2736
|
-
f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
|
2737
|
-
)
|
2738
|
-
|
2739
|
-
if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
|
2740
|
-
raise ValueError(
|
2741
|
-
f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
|
2742
|
-
)
|
2743
|
-
|
2744
|
-
stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
|
2745
|
-
|
2746
|
-
formatted_uri = identifiers.format_uri(
|
2747
|
-
uri=identifiers.create_uri_url(
|
2748
|
-
ontology=ONTOLOGIES.GO,
|
2749
|
-
identifier=stubbed_compartment_id,
|
2750
|
-
),
|
2751
|
-
biological_qualifier_type=BQB.IS,
|
2752
|
-
)
|
2753
|
-
|
2754
|
-
compartments_df = pd.DataFrame(
|
2755
|
-
{
|
2756
|
-
SBML_DFS.C_NAME: [stubbed_compartment],
|
2757
|
-
SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
|
2758
|
-
}
|
2759
|
-
)
|
2760
|
-
compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID) # type: ignore
|
2761
|
-
compartments_df.index.name = SBML_DFS.C_ID
|
2762
|
-
|
2763
|
-
return compartments_df
|
2764
|
-
|
2765
|
-
|
2766
|
-
def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
|
2767
|
-
"""Validates a table against a reference
|
2768
|
-
|
2769
|
-
This check if the table has the same index, no duplicates in the index
|
2770
|
-
and that all values in the index are in the reference table.
|
2771
|
-
|
2772
|
-
Args:
|
2773
|
-
data_table (pd.DataFrame): a table with data that should
|
2774
|
-
match the reference
|
2775
|
-
ref_table (pd.DataFrame): a reference table
|
2776
|
-
|
2777
|
-
Raises:
|
2778
|
-
ValueError: not same index name
|
2779
|
-
ValueError: index contains duplicates
|
2780
|
-
ValueError: index not subset of index of reactions table
|
2781
|
-
"""
|
2782
|
-
ref_index_name = ref_table.index.name
|
2783
|
-
if data_table.index.name != ref_index_name:
|
2784
|
-
raise ValueError(
|
2785
|
-
"the index name for reaction data table was not"
|
2786
|
-
f" {ref_index_name}: {data_table.index.name}"
|
2787
|
-
)
|
2788
|
-
ids = data_table.index
|
2789
|
-
if any(ids.duplicated()):
|
2790
|
-
raise ValueError(
|
2791
|
-
"the index for reaction data table " "contained duplicate values"
|
2792
|
-
)
|
2793
|
-
if not all(ids.isin(ref_table.index)):
|
2794
|
-
raise ValueError(
|
2795
|
-
"the index for reaction data table contained values"
|
2796
|
-
" not found in the reactions table"
|
2797
|
-
)
|
2798
|
-
if not isinstance(data_table, pd.DataFrame):
|
2799
|
-
raise TypeError(
|
2800
|
-
f"The data table was type {type(data_table).__name__}"
|
2801
|
-
" but must be a pd.DataFrame"
|
2802
|
-
)
|