napistu 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +18 -18
- napistu/consensus.py +3 -2
- napistu/constants.py +1 -1
- napistu/context/filtering.py +2 -1
- napistu/identifiers.py +3 -6
- napistu/ingestion/bigg.py +6 -6
- napistu/ingestion/string.py +2 -1
- napistu/ingestion/yeast.py +2 -1
- napistu/matching/interactions.py +4 -4
- napistu/modify/uncompartmentalize.py +1 -1
- napistu/network/net_create.py +1 -1
- napistu/network/paths.py +1 -1
- napistu/ontologies/dogma.py +2 -1
- napistu/sbml_dfs_core.py +1282 -2169
- napistu/sbml_dfs_utils.py +1082 -143
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/METADATA +2 -2
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/RECORD +28 -28
- tests/conftest.py +43 -0
- tests/test_consensus.py +88 -0
- tests/test_context_filtering.py +2 -2
- tests/test_ontologies_genodexito.py +3 -0
- tests/test_ontologies_mygene.py +3 -0
- tests/test_sbml_dfs_core.py +102 -203
- tests/test_sbml_dfs_utils.py +194 -36
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/WHEEL +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_core.py
CHANGED
@@ -7,8 +7,12 @@ from typing import Iterable
|
|
7
7
|
from typing import Mapping
|
8
8
|
from typing import MutableMapping
|
9
9
|
from typing import TYPE_CHECKING
|
10
|
+
from typing import Optional
|
11
|
+
from typing import Union
|
10
12
|
|
13
|
+
from fs import open_fs
|
11
14
|
import pandas as pd
|
15
|
+
|
12
16
|
from napistu import identifiers
|
13
17
|
from napistu import sbml_dfs_utils
|
14
18
|
from napistu import source
|
@@ -17,25 +21,14 @@ from napistu.ingestion import sbml
|
|
17
21
|
from napistu.constants import SBML_DFS
|
18
22
|
from napistu.constants import SBML_DFS_SCHEMA
|
19
23
|
from napistu.constants import IDENTIFIERS
|
20
|
-
from napistu.constants import
|
21
|
-
from napistu.constants import CPR_STANDARD_OUTPUTS
|
22
|
-
from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
|
24
|
+
from napistu.constants import NAPISTU_STANDARD_OUTPUTS
|
23
25
|
from napistu.constants import BQB_PRIORITIES
|
24
26
|
from napistu.constants import ONTOLOGY_PRIORITIES
|
25
|
-
from napistu.constants import BQB
|
26
|
-
from napistu.constants import BQB_DEFINING_ATTRS
|
27
27
|
from napistu.constants import MINI_SBO_FROM_NAME
|
28
28
|
from napistu.constants import MINI_SBO_TO_NAME
|
29
|
-
from napistu.constants import ONTOLOGIES
|
30
|
-
from napistu.constants import SBO_NAME_TO_ROLE
|
31
29
|
from napistu.constants import SBOTERM_NAMES
|
32
|
-
from napistu.constants import SBO_ROLES_DEFS
|
33
30
|
from napistu.constants import ENTITIES_W_DATA
|
34
31
|
from napistu.constants import ENTITIES_TO_ENTITY_DATA
|
35
|
-
from napistu.ingestion.constants import GENERIC_COMPARTMENT
|
36
|
-
from napistu.ingestion.constants import COMPARTMENT_ALIASES
|
37
|
-
from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
|
38
|
-
from fs import open_fs
|
39
32
|
|
40
33
|
logger = logging.getLogger(__name__)
|
41
34
|
|
@@ -65,26 +58,76 @@ class SBML_dfs:
|
|
65
58
|
schema : dict
|
66
59
|
Dictionary representing the structure of the other attributes and meaning of their variables
|
67
60
|
|
68
|
-
Methods
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
61
|
+
Public Methods (alphabetical)
|
62
|
+
----------------------------
|
63
|
+
add_reactions_data(label, data)
|
64
|
+
Add a new reactions data table to the model with validation.
|
65
|
+
add_species_data(label, data)
|
66
|
+
Add a new species data table to the model with validation.
|
67
|
+
export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
|
68
|
+
Export the SBML_dfs model and its tables to files in a specified directory.
|
69
|
+
get_characteristic_species_ids(dogmatic=True)
|
70
|
+
Return characteristic systematic identifiers for molecular species, optionally using a strict or loose definition.
|
76
71
|
get_cspecies_features()
|
77
|
-
|
78
|
-
get_species_features()
|
79
|
-
Get additional attributes of species
|
72
|
+
Compute and return additional features for compartmentalized species, such as degree and type.
|
80
73
|
get_identifiers(id_type)
|
81
|
-
|
82
|
-
|
83
|
-
|
74
|
+
Retrieve a table of identifiers for a specified entity type (e.g., species or reactions).
|
75
|
+
get_network_summary()
|
76
|
+
Return a dictionary of diagnostic statistics summarizing the network structure.
|
77
|
+
get_species_features()
|
78
|
+
Compute and return additional features for species, such as species type.
|
79
|
+
get_table(entity_type, required_attributes=None)
|
80
|
+
Retrieve a table for a given entity type, optionally validating required attributes.
|
81
|
+
get_uri_urls(entity_type, entity_ids=None, required_ontology=None)
|
82
|
+
Return reference URLs for specified entities, optionally filtered by ontology.
|
83
|
+
infer_sbo_terms()
|
84
|
+
Infer and fill in missing SBO terms for reaction species based on stoichiometry.
|
85
|
+
infer_uncompartmentalized_species_location()
|
86
|
+
Infer and assign compartments for compartmentalized species with missing compartment information.
|
87
|
+
name_compartmentalized_species()
|
88
|
+
Rename compartmentalized species to include compartment information if needed.
|
89
|
+
reaction_formulas(r_ids=None)
|
90
|
+
Generate human-readable reaction formulas for specified reactions.
|
91
|
+
reaction_summaries(r_ids=None)
|
92
|
+
Return a summary DataFrame for specified reactions, including names and formulas.
|
93
|
+
remove_compartmentalized_species(sc_ids)
|
94
|
+
Remove specified compartmentalized species and associated reactions from the model.
|
95
|
+
remove_reactions(r_ids, remove_species=False)
|
96
|
+
Remove specified reactions and optionally remove unused species.
|
97
|
+
remove_reactions_data(label)
|
98
|
+
Remove a reactions data table by label.
|
99
|
+
remove_species_data(label)
|
100
|
+
Remove a species data table by label.
|
101
|
+
search_by_ids(ids, entity_type, identifiers_df, ontologies=None)
|
102
|
+
Find entities and identifiers matching a set of query IDs.
|
103
|
+
search_by_name(name, entity_type, partial_match=True)
|
104
|
+
Find entities by exact or partial name match.
|
105
|
+
select_species_data(species_data_table)
|
106
|
+
Select a species data table from the SBML_dfs object by name.
|
107
|
+
species_status(s_id)
|
108
|
+
Return all reactions a species participates in, with stoichiometry and formula information.
|
84
109
|
validate()
|
85
|
-
Validate the SBML_dfs structure and relationships
|
110
|
+
Validate the SBML_dfs structure and relationships.
|
86
111
|
validate_and_resolve()
|
87
|
-
Validate and attempt to automatically fix common issues
|
112
|
+
Validate and attempt to automatically fix common issues.
|
113
|
+
|
114
|
+
Private/Hidden Methods (alphabetical, appear after public methods)
|
115
|
+
-----------------------------------------------------------------
|
116
|
+
_attempt_resolve(e)
|
117
|
+
_check_pk_fk_correspondence()
|
118
|
+
_find_underspecified_reactions_by_scids(sc_ids)
|
119
|
+
_get_unused_cspecies()
|
120
|
+
_get_unused_species()
|
121
|
+
_remove_compartmentalized_species(sc_ids)
|
122
|
+
_remove_entity_data(entity_type, label)
|
123
|
+
_remove_species(s_ids)
|
124
|
+
_remove_unused_cspecies()
|
125
|
+
_remove_unused_species()
|
126
|
+
_validate_r_ids(r_ids)
|
127
|
+
_validate_reaction_species()
|
128
|
+
_validate_reactions_data(reactions_data_table)
|
129
|
+
_validate_species_data(species_data_table)
|
130
|
+
_validate_table(table_name)
|
88
131
|
"""
|
89
132
|
|
90
133
|
compartments: pd.DataFrame
|
@@ -162,193 +205,176 @@ class SBML_dfs:
|
|
162
205
|
'"validate" = False so "resolve" will be ignored (eventhough it was True)'
|
163
206
|
)
|
164
207
|
|
165
|
-
|
166
|
-
|
167
|
-
|
208
|
+
# =============================================================================
|
209
|
+
# PUBLIC METHODS (ALPHABETICAL ORDER)
|
210
|
+
# =============================================================================
|
211
|
+
|
212
|
+
def add_reactions_data(self, label: str, data: pd.DataFrame):
|
168
213
|
"""
|
169
|
-
|
214
|
+
Add additional reaction data with validation.
|
170
215
|
|
171
216
|
Parameters
|
172
217
|
----------
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
Must be passed as a set, e.g. {'id'}, not a string.
|
178
|
-
|
179
|
-
Returns
|
180
|
-
-------
|
181
|
-
pd.DataFrame
|
182
|
-
The requested table
|
218
|
+
label : str
|
219
|
+
Label for the new data
|
220
|
+
data : pd.DataFrame
|
221
|
+
Data to add, must be indexed by reaction_id
|
183
222
|
|
184
223
|
Raises
|
185
224
|
------
|
186
225
|
ValueError
|
187
|
-
If
|
188
|
-
TypeError
|
189
|
-
If required_attributes is not a set
|
226
|
+
If the data is invalid or label already exists
|
190
227
|
"""
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
if entity_type not in schema.keys():
|
228
|
+
self._validate_reactions_data(data)
|
229
|
+
if label in self.reactions_data:
|
195
230
|
raise ValueError(
|
196
|
-
f"{
|
197
|
-
f"which are present are {', '.join(schema.keys())}"
|
198
|
-
)
|
199
|
-
|
200
|
-
if required_attributes is not None:
|
201
|
-
if not isinstance(required_attributes, set):
|
202
|
-
raise TypeError(
|
203
|
-
f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
|
204
|
-
"Did you pass a string instead of a set?"
|
205
|
-
)
|
206
|
-
|
207
|
-
# determine whether required_attributes are appropriate
|
208
|
-
VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
|
209
|
-
invalid_required_attributes = required_attributes.difference(
|
210
|
-
VALID_REQUIRED_ATTRIBUTES
|
231
|
+
f"{label} already exists in reactions_data. " "Drop it first."
|
211
232
|
)
|
233
|
+
self.reactions_data[label] = data
|
212
234
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
|
217
|
-
)
|
235
|
+
def add_species_data(self, label: str, data: pd.DataFrame):
|
236
|
+
"""
|
237
|
+
Add additional species data with validation.
|
218
238
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
f"The following required attributes are not present for the {entity_type} table: "
|
226
|
-
f"{', '.join(invalid_attrs)}."
|
227
|
-
)
|
239
|
+
Parameters
|
240
|
+
----------
|
241
|
+
label : str
|
242
|
+
Label for the new data
|
243
|
+
data : pd.DataFrame
|
244
|
+
Data to add, must be indexed by species_id
|
228
245
|
|
229
|
-
|
246
|
+
Raises
|
247
|
+
------
|
248
|
+
ValueError
|
249
|
+
If the data is invalid or label already exists
|
250
|
+
"""
|
251
|
+
self._validate_species_data(data)
|
252
|
+
if label in self.species_data:
|
253
|
+
raise ValueError(
|
254
|
+
f"{label} already exists in species_data. " "Drop it first."
|
255
|
+
)
|
256
|
+
self.species_data[label] = data
|
230
257
|
|
231
|
-
def
|
258
|
+
def export_sbml_dfs(
|
232
259
|
self,
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
) ->
|
260
|
+
model_prefix: str,
|
261
|
+
outdir: str,
|
262
|
+
overwrite: bool = False,
|
263
|
+
dogmatic: bool = True,
|
264
|
+
) -> None:
|
238
265
|
"""
|
239
|
-
|
266
|
+
Export SBML_dfs
|
240
267
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
268
|
+
Export summaries of species identifiers and each table underlying
|
269
|
+
an SBML_dfs pathway model
|
270
|
+
|
271
|
+
Params
|
272
|
+
------
|
273
|
+
model_prefix: str
|
274
|
+
Label to prepend to all exported files
|
275
|
+
outdir: str
|
276
|
+
Path to an existing directory where results should be saved
|
277
|
+
overwrite: bool
|
278
|
+
Should the directory be overwritten if it already exists?
|
279
|
+
dogmatic: bool
|
280
|
+
If True then treat genes, transcript, and proteins as separate species. If False
|
281
|
+
then treat them interchangeably.
|
251
282
|
|
252
283
|
Returns
|
253
284
|
-------
|
254
|
-
|
255
|
-
- Matching entities
|
256
|
-
- Matching identifiers
|
257
|
-
|
258
|
-
Raises
|
259
|
-
------
|
260
|
-
ValueError
|
261
|
-
If entity_type is invalid or ontologies are invalid
|
262
|
-
TypeError
|
263
|
-
If ontologies is not a set
|
285
|
+
None
|
264
286
|
"""
|
265
|
-
|
266
|
-
|
267
|
-
|
287
|
+
if not isinstance(model_prefix, str):
|
288
|
+
raise TypeError(
|
289
|
+
f"model_prefix was a {type(model_prefix)} " "and must be a str"
|
290
|
+
)
|
291
|
+
if not isinstance(self, SBML_dfs):
|
292
|
+
raise TypeError(
|
293
|
+
f"sbml_dfs was a {type(self)} and must" " be an sbml.SBML_dfs"
|
294
|
+
)
|
268
295
|
|
269
|
-
|
270
|
-
|
271
|
-
req_vars={
|
272
|
-
entity_pk,
|
273
|
-
IDENTIFIERS.ONTOLOGY,
|
274
|
-
IDENTIFIERS.IDENTIFIER,
|
275
|
-
IDENTIFIERS.URL,
|
276
|
-
IDENTIFIERS.BQB,
|
277
|
-
},
|
278
|
-
allow_series=False,
|
279
|
-
).assert_present()
|
296
|
+
# filter to identifiers which make sense when mapping from ids -> species
|
297
|
+
species_identifiers = self.get_characteristic_species_ids(dogmatic=dogmatic)
|
280
298
|
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
299
|
+
try:
|
300
|
+
utils.initialize_dir(outdir, overwrite=overwrite)
|
301
|
+
except FileExistsError:
|
302
|
+
logger.warning(
|
303
|
+
f"Directory {outdir} already exists and overwrite is False. "
|
304
|
+
"Files will be added to the existing directory."
|
305
|
+
)
|
306
|
+
with open_fs(outdir, writeable=True) as fs:
|
307
|
+
species_identifiers_path = (
|
308
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
|
309
|
+
)
|
310
|
+
with fs.openbin(species_identifiers_path, "w") as f:
|
311
|
+
species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
|
312
|
+
f, sep="\t", index=False
|
293
313
|
)
|
294
314
|
|
295
|
-
#
|
296
|
-
|
315
|
+
# export jsons
|
316
|
+
species_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES
|
317
|
+
reactions_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTIONS
|
318
|
+
reation_species_path = (
|
319
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTION_SPECIES
|
320
|
+
)
|
321
|
+
compartments_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTS
|
322
|
+
compartmentalized_species_path = (
|
323
|
+
model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
|
324
|
+
)
|
325
|
+
with fs.openbin(species_path, "w") as f:
|
326
|
+
self.species[[SBML_DFS.S_NAME]].to_json(f)
|
297
327
|
|
298
|
-
|
299
|
-
|
300
|
-
]
|
301
|
-
entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
|
328
|
+
with fs.openbin(reactions_path, "w") as f:
|
329
|
+
self.reactions[[SBML_DFS.R_NAME]].to_json(f)
|
302
330
|
|
303
|
-
|
331
|
+
with fs.openbin(reation_species_path, "w") as f:
|
332
|
+
self.reaction_species.to_json(f)
|
304
333
|
|
305
|
-
|
306
|
-
|
307
|
-
|
334
|
+
with fs.openbin(compartments_path, "w") as f:
|
335
|
+
self.compartments[[SBML_DFS.C_NAME]].to_json(f)
|
336
|
+
|
337
|
+
with fs.openbin(compartmentalized_species_path, "w") as f:
|
338
|
+
self.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
|
339
|
+
f
|
340
|
+
)
|
341
|
+
|
342
|
+
return None
|
343
|
+
|
344
|
+
def get_characteristic_species_ids(self, dogmatic: bool = True) -> pd.DataFrame:
|
308
345
|
"""
|
309
|
-
|
346
|
+
Get Characteristic Species IDs
|
347
|
+
|
348
|
+
List the systematic identifiers which are characteristic of molecular species, e.g., excluding subcomponents, and optionally, treating proteins, transcripts, and genes equivalently.
|
310
349
|
|
311
350
|
Parameters
|
312
351
|
----------
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
partial_match : bool, optional
|
318
|
-
Whether to allow partial string matches, by default True
|
352
|
+
sbml_dfs : sbml_dfs_core.SBML_dfs
|
353
|
+
The SBML_dfs object.
|
354
|
+
dogmatic : bool, default=True
|
355
|
+
Whether to use the dogmatic flag to determine which BQB attributes are valid.
|
319
356
|
|
320
357
|
Returns
|
321
358
|
-------
|
322
359
|
pd.DataFrame
|
323
|
-
|
360
|
+
A DataFrame containing the systematic identifiers which are characteristic of molecular species.
|
324
361
|
"""
|
325
|
-
entity_table = self.get_table(entity_type, required_attributes={"label"})
|
326
|
-
label_attr = self.schema[entity_type]["label"]
|
327
362
|
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
else:
|
333
|
-
matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
|
334
|
-
return matches
|
363
|
+
# select valid BQB attributes based on dogmatic flag
|
364
|
+
defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(
|
365
|
+
dogmatic
|
366
|
+
)
|
335
367
|
|
336
|
-
|
337
|
-
|
338
|
-
Get additional attributes of species.
|
368
|
+
# pre-summarize ontologies
|
369
|
+
species_identifiers = self.get_identifiers(SBML_DFS.SPECIES)
|
339
370
|
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
- species_type: Classification of the species (e.g., metabolite, protein)
|
345
|
-
"""
|
346
|
-
species = self.species
|
347
|
-
augmented_species = species.assign(
|
348
|
-
**{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
|
371
|
+
# drop some BQB_HAS_PART annotations
|
372
|
+
species_identifiers = sbml_dfs_utils.filter_to_characteristic_species_ids(
|
373
|
+
species_identifiers,
|
374
|
+
defining_biological_qualifiers=defining_biological_qualifiers,
|
349
375
|
)
|
350
376
|
|
351
|
-
return
|
377
|
+
return species_identifiers
|
352
378
|
|
353
379
|
def get_cspecies_features(self) -> pd.DataFrame:
|
354
380
|
"""
|
@@ -445,113 +471,28 @@ class SBML_dfs:
|
|
445
471
|
|
446
472
|
return named_identifiers
|
447
473
|
|
448
|
-
def
|
449
|
-
self,
|
450
|
-
entity_type: str,
|
451
|
-
entity_ids: Iterable[str] | None = None,
|
452
|
-
required_ontology: str | None = None,
|
453
|
-
) -> pd.Series:
|
474
|
+
def get_network_summary(self) -> Mapping[str, Any]:
|
454
475
|
"""
|
455
|
-
Get
|
456
|
-
|
457
|
-
Parameters
|
458
|
-
----------
|
459
|
-
entity_type : str
|
460
|
-
Type of entity to get URLs for (e.g., 'species', 'reactions')
|
461
|
-
entity_ids : Optional[Iterable[str]], optional
|
462
|
-
Specific entities to get URLs for, by default None (all entities)
|
463
|
-
required_ontology : Optional[str], optional
|
464
|
-
Specific ontology to get URLs from, by default None
|
476
|
+
Get diagnostic statistics about the network.
|
465
477
|
|
466
478
|
Returns
|
467
479
|
-------
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
if entity_type not in valid_entity_types:
|
486
|
-
raise ValueError(
|
487
|
-
f"{entity_type} is an invalid entity_type; valid types "
|
488
|
-
f"are {', '.join(valid_entity_types)}"
|
489
|
-
)
|
490
|
-
|
491
|
-
entity_table = getattr(self, entity_type)
|
492
|
-
|
493
|
-
if entity_ids is not None:
|
494
|
-
# ensure that entity_ids are unique and then convert back to list
|
495
|
-
# to support pandas indexing
|
496
|
-
entity_ids = list(set(entity_ids))
|
497
|
-
|
498
|
-
# filter to a subset of identifiers if one is provided
|
499
|
-
entity_table = entity_table.loc[entity_ids]
|
500
|
-
|
501
|
-
# create a dataframe of all identifiers for the select entities
|
502
|
-
all_ids = pd.concat(
|
503
|
-
[
|
504
|
-
sbml_dfs_utils._stub_ids(
|
505
|
-
entity_table[schema[entity_type]["id"]].iloc[i].ids
|
506
|
-
).assign(id=entity_table.index[i])
|
507
|
-
for i in range(0, entity_table.shape[0])
|
508
|
-
]
|
509
|
-
).rename(columns={"id": schema[entity_type]["pk"]})
|
510
|
-
|
511
|
-
# set priorities for ontologies and bqb terms
|
512
|
-
|
513
|
-
if required_ontology is None:
|
514
|
-
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
515
|
-
ONTOLOGY_PRIORITIES, how="left"
|
516
|
-
)
|
517
|
-
else:
|
518
|
-
ontology_priorities = pd.DataFrame(
|
519
|
-
[{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
|
520
|
-
)
|
521
|
-
# if only a single ontology is sought then just return matching entries
|
522
|
-
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
523
|
-
ontology_priorities, how="inner"
|
524
|
-
)
|
525
|
-
|
526
|
-
uri_urls = (
|
527
|
-
all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
|
528
|
-
.groupby(schema[entity_type]["pk"])
|
529
|
-
.first()[IDENTIFIERS.URL]
|
530
|
-
)
|
531
|
-
return uri_urls
|
532
|
-
|
533
|
-
def get_network_summary(self) -> Mapping[str, Any]:
|
534
|
-
"""
|
535
|
-
Get diagnostic statistics about the network.
|
536
|
-
|
537
|
-
Returns
|
538
|
-
-------
|
539
|
-
Mapping[str, Any]
|
540
|
-
Dictionary of diagnostic statistics including:
|
541
|
-
- n_species_types: Number of species types
|
542
|
-
- dict_n_species_per_type: Number of species per type
|
543
|
-
- n_species: Number of species
|
544
|
-
- n_cspecies: Number of compartmentalized species
|
545
|
-
- n_reaction_species: Number of reaction species
|
546
|
-
- n_reactions: Number of reactions
|
547
|
-
- n_compartments: Number of compartments
|
548
|
-
- dict_n_species_per_compartment: Number of species per compartment
|
549
|
-
- stats_species_per_reaction: Statistics on reactands per reaction
|
550
|
-
- top10_species_per_reaction: Top 10 reactions by number of reactands
|
551
|
-
- stats_degree: Statistics on species connectivity
|
552
|
-
- top10_degree: Top 10 species by connectivity
|
553
|
-
- stats_identifiers_per_species: Statistics on identifiers per species
|
554
|
-
- top10_identifiers_per_species: Top 10 species by number of identifiers
|
480
|
+
Mapping[str, Any]
|
481
|
+
Dictionary of diagnostic statistics including:
|
482
|
+
- n_species_types: Number of species types
|
483
|
+
- dict_n_species_per_type: Number of species per type
|
484
|
+
- n_species: Number of species
|
485
|
+
- n_cspecies: Number of compartmentalized species
|
486
|
+
- n_reaction_species: Number of reaction species
|
487
|
+
- n_reactions: Number of reactions
|
488
|
+
- n_compartments: Number of compartments
|
489
|
+
- dict_n_species_per_compartment: Number of species per compartment
|
490
|
+
- stats_species_per_reaction: Statistics on reactands per reaction
|
491
|
+
- top10_species_per_reaction: Top 10 reactions by number of reactands
|
492
|
+
- stats_degree: Statistics on species connectivity
|
493
|
+
- top10_degree: Top 10 species by connectivity
|
494
|
+
- stats_identifiers_per_species: Statistics on identifiers per species
|
495
|
+
- top10_identifiers_per_species: Top 10 species by number of identifiers
|
555
496
|
"""
|
556
497
|
stats: MutableMapping[str, Any] = {}
|
557
498
|
species_features = self.get_species_features()
|
@@ -616,2009 +557,1306 @@ class SBML_dfs:
|
|
616
557
|
|
617
558
|
return stats
|
618
559
|
|
619
|
-
def
|
560
|
+
def get_species_features(self) -> pd.DataFrame:
|
620
561
|
"""
|
621
|
-
|
622
|
-
|
623
|
-
Parameters
|
624
|
-
----------
|
625
|
-
label : str
|
626
|
-
Label for the new data
|
627
|
-
data : pd.DataFrame
|
628
|
-
Data to add, must be indexed by species_id
|
562
|
+
Get additional attributes of species.
|
629
563
|
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
564
|
+
Returns
|
565
|
+
-------
|
566
|
+
pd.DataFrame
|
567
|
+
Species with additional features including:
|
568
|
+
- species_type: Classification of the species (e.g., metabolite, protein)
|
634
569
|
"""
|
635
|
-
self.
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
570
|
+
species = self.species
|
571
|
+
augmented_species = species.assign(
|
572
|
+
**{
|
573
|
+
"species_type": lambda d: d["s_Identifiers"].apply(
|
574
|
+
sbml_dfs_utils.species_type_types
|
575
|
+
)
|
576
|
+
}
|
577
|
+
)
|
641
578
|
|
642
|
-
|
643
|
-
"""
|
644
|
-
Remove species data by label.
|
645
|
-
"""
|
646
|
-
self._remove_entity_data(SBML_DFS.SPECIES, label)
|
579
|
+
return augmented_species
|
647
580
|
|
648
|
-
def
|
581
|
+
def get_table(
|
582
|
+
self, entity_type: str, required_attributes: None | set[str] = None
|
583
|
+
) -> pd.DataFrame:
|
649
584
|
"""
|
650
|
-
|
585
|
+
Get a table from the SBML_dfs object with optional attribute validation.
|
651
586
|
|
652
587
|
Parameters
|
653
588
|
----------
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
589
|
+
entity_type : str
|
590
|
+
The type of entity table to retrieve (e.g., 'species', 'reactions')
|
591
|
+
required_attributes : Optional[Set[str]], optional
|
592
|
+
Set of attributes that must be present in the table, by default None.
|
593
|
+
Must be passed as a set, e.g. {'id'}, not a string.
|
594
|
+
|
595
|
+
Returns
|
596
|
+
-------
|
597
|
+
pd.DataFrame
|
598
|
+
The requested table
|
658
599
|
|
659
600
|
Raises
|
660
601
|
------
|
661
602
|
ValueError
|
662
|
-
If
|
603
|
+
If entity_type is invalid or required attributes are missing
|
604
|
+
TypeError
|
605
|
+
If required_attributes is not a set
|
663
606
|
"""
|
664
|
-
self._validate_reactions_data(data)
|
665
|
-
if label in self.reactions_data:
|
666
|
-
raise ValueError(
|
667
|
-
f"{label} already exists in reactions_data. Drop it first."
|
668
|
-
)
|
669
|
-
self.reactions_data[label] = data
|
670
607
|
|
671
|
-
|
672
|
-
"""
|
673
|
-
Remove reactions data by label.
|
674
|
-
"""
|
675
|
-
self._remove_entity_data(SBML_DFS.REACTIONS, label)
|
608
|
+
schema = self.schema
|
676
609
|
|
677
|
-
|
678
|
-
|
679
|
-
|
610
|
+
if entity_type not in schema.keys():
|
611
|
+
raise ValueError(
|
612
|
+
f"{entity_type} does not match a table in the SBML_dfs object. The tables "
|
613
|
+
f"which are present are {', '.join(schema.keys())}"
|
614
|
+
)
|
680
615
|
|
681
|
-
|
682
|
-
|
683
|
-
|
616
|
+
if required_attributes is not None:
|
617
|
+
if not isinstance(required_attributes, set):
|
618
|
+
raise TypeError(
|
619
|
+
f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
|
620
|
+
"Did you pass a string instead of a set?"
|
621
|
+
)
|
684
622
|
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
623
|
+
# determine whether required_attributes are appropriate
|
624
|
+
VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
|
625
|
+
invalid_required_attributes = required_attributes.difference(
|
626
|
+
VALID_REQUIRED_ATTRIBUTES
|
627
|
+
)
|
690
628
|
|
691
|
-
|
692
|
-
|
693
|
-
|
629
|
+
if len(invalid_required_attributes) > 0:
|
630
|
+
raise ValueError(
|
631
|
+
f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
|
632
|
+
f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
|
633
|
+
)
|
694
634
|
|
695
|
-
|
635
|
+
# determine if required_attributes are satisfied
|
636
|
+
invalid_attrs = [
|
637
|
+
s for s in required_attributes if s not in schema[entity_type].keys()
|
638
|
+
]
|
639
|
+
if len(invalid_attrs) > 0:
|
640
|
+
raise ValueError(
|
641
|
+
f"The following required attributes are not present for the {entity_type} table: "
|
642
|
+
f"{', '.join(invalid_attrs)}."
|
643
|
+
)
|
696
644
|
|
697
|
-
|
698
|
-
self._remove_unused_species()
|
645
|
+
return getattr(self, entity_type)
|
699
646
|
|
700
|
-
def
|
647
|
+
def get_uri_urls(
|
648
|
+
self,
|
649
|
+
entity_type: str,
|
650
|
+
entity_ids: Iterable[str] | None = None,
|
651
|
+
required_ontology: str | None = None,
|
652
|
+
) -> pd.Series:
|
701
653
|
"""
|
702
|
-
|
654
|
+
Get reference URLs for specified entities.
|
703
655
|
|
704
656
|
Parameters
|
705
657
|
----------
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
# remove corresponding reactions_species
|
713
|
-
self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
|
714
|
-
# remove reactions
|
715
|
-
self.reactions = self.reactions.drop(index=list(r_ids))
|
716
|
-
# remove reactions_data
|
717
|
-
if hasattr(self, "reactions_data"):
|
718
|
-
for k, data in self.reactions_data.items():
|
719
|
-
self.reactions_data[k] = data.drop(index=list(r_ids))
|
720
|
-
# remove species if requested
|
721
|
-
if remove_species:
|
722
|
-
self._remove_unused_cspecies()
|
723
|
-
self._remove_unused_species()
|
724
|
-
|
725
|
-
def validate(self):
|
726
|
-
"""
|
727
|
-
Validate the SBML_dfs structure and relationships.
|
658
|
+
entity_type : str
|
659
|
+
Type of entity to get URLs for (e.g., 'species', 'reactions')
|
660
|
+
entity_ids : Optional[Iterable[str]], optional
|
661
|
+
Specific entities to get URLs for, by default None (all entities)
|
662
|
+
required_ontology : Optional[str], optional
|
663
|
+
Specific ontology to get URLs from, by default None
|
728
664
|
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
- Primary key uniqueness
|
734
|
-
- Foreign key relationships
|
735
|
-
- Optional data table validity
|
736
|
-
- Reaction species validity
|
665
|
+
Returns
|
666
|
+
-------
|
667
|
+
pd.Series
|
668
|
+
Series mapping entity IDs to their reference URLs
|
737
669
|
|
738
670
|
Raises
|
739
671
|
------
|
740
672
|
ValueError
|
741
|
-
If
|
673
|
+
If entity_type is invalid
|
742
674
|
"""
|
675
|
+
schema = self.schema
|
743
676
|
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
677
|
+
# valid entities and their identifier variables
|
678
|
+
valid_entity_types = [
|
679
|
+
SBML_DFS.COMPARTMENTS,
|
680
|
+
SBML_DFS.SPECIES,
|
681
|
+
SBML_DFS.REACTIONS,
|
682
|
+
]
|
749
683
|
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
f"{
|
754
|
-
f"{', '.join(extra_tables)}"
|
684
|
+
if entity_type not in valid_entity_types:
|
685
|
+
raise ValueError(
|
686
|
+
f"{entity_type} is an invalid entity_type; valid types "
|
687
|
+
f"are {', '.join(valid_entity_types)}"
|
755
688
|
)
|
756
689
|
|
757
|
-
|
758
|
-
|
690
|
+
entity_table = getattr(self, entity_type)
|
691
|
+
|
692
|
+
if entity_ids is not None:
|
693
|
+
# ensure that entity_ids are unique and then convert back to list
|
694
|
+
# to support pandas indexing
|
695
|
+
entity_ids = list(set(entity_ids))
|
696
|
+
|
697
|
+
# filter to a subset of identifiers if one is provided
|
698
|
+
entity_table = entity_table.loc[entity_ids]
|
699
|
+
|
700
|
+
# create a dataframe of all identifiers for the select entities
|
701
|
+
all_ids = pd.concat(
|
702
|
+
[
|
703
|
+
sbml_dfs_utils._id_dict_to_df(
|
704
|
+
entity_table[schema[entity_type]["id"]].iloc[i].ids
|
705
|
+
).assign(id=entity_table.index[i])
|
706
|
+
for i in range(0, entity_table.shape[0])
|
707
|
+
]
|
708
|
+
).rename(columns={"id": schema[entity_type]["pk"]})
|
709
|
+
|
710
|
+
# set priorities for ontologies and bqb terms
|
711
|
+
|
712
|
+
if required_ontology is None:
|
713
|
+
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
714
|
+
ONTOLOGY_PRIORITIES, how="left"
|
715
|
+
)
|
716
|
+
else:
|
717
|
+
ontology_priorities = pd.DataFrame(
|
718
|
+
[{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
|
719
|
+
)
|
720
|
+
# if only a single ontology is sought then just return matching entries
|
721
|
+
all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
|
722
|
+
ontology_priorities, how="inner"
|
723
|
+
)
|
724
|
+
|
725
|
+
uri_urls = (
|
726
|
+
all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
|
727
|
+
.groupby(schema[entity_type]["pk"])
|
728
|
+
.first()[IDENTIFIERS.URL]
|
729
|
+
)
|
730
|
+
return uri_urls
|
731
|
+
|
732
|
+
def infer_sbo_terms(self):
|
733
|
+
"""
|
734
|
+
Infer SBO Terms
|
735
|
+
|
736
|
+
Define SBO terms based on stoichiometry for reaction_species with missing terms.
|
737
|
+
Modifies the SBML_dfs object in-place.
|
738
|
+
|
739
|
+
Returns
|
740
|
+
-------
|
741
|
+
None (modifies SBML_dfs object in-place)
|
742
|
+
"""
|
743
|
+
valid_sbo_terms = self.reaction_species[
|
744
|
+
self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
|
745
|
+
]
|
746
|
+
|
747
|
+
invalid_sbo_terms = self.reaction_species[
|
748
|
+
~self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
|
749
|
+
]
|
750
|
+
|
751
|
+
if not all(self.reaction_species[SBML_DFS.SBO_TERM].notnull()):
|
752
|
+
raise ValueError("All reaction_species[SBML_DFS.SBO_TERM] must be not null")
|
753
|
+
if invalid_sbo_terms.shape[0] == 0:
|
754
|
+
logger.info("All sbo_terms were valid; nothing to update.")
|
755
|
+
return
|
756
|
+
|
757
|
+
logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
|
758
|
+
|
759
|
+
# add missing/invalid terms based on stoichiometry
|
760
|
+
invalid_sbo_terms.loc[
|
761
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
|
762
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
|
763
|
+
|
764
|
+
invalid_sbo_terms.loc[
|
765
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
|
766
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
|
767
|
+
|
768
|
+
invalid_sbo_terms.loc[
|
769
|
+
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
|
770
|
+
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
|
771
|
+
|
772
|
+
updated_reaction_species = pd.concat(
|
773
|
+
[valid_sbo_terms, invalid_sbo_terms]
|
774
|
+
).sort_index()
|
775
|
+
|
776
|
+
if self.reaction_species.shape[0] != updated_reaction_species.shape[0]:
|
759
777
|
raise ValueError(
|
760
|
-
f"
|
761
|
-
f"{', '.join(missing_tables)}"
|
778
|
+
f"Trying to overwrite {self.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
|
762
779
|
)
|
780
|
+
self.reaction_species = updated_reaction_species
|
781
|
+
return
|
763
782
|
|
764
|
-
|
765
|
-
|
766
|
-
|
783
|
+
def infer_uncompartmentalized_species_location(self):
|
784
|
+
"""
|
785
|
+
Infer Uncompartmentalized Species Location
|
767
786
|
|
768
|
-
|
769
|
-
|
770
|
-
|
787
|
+
If the compartment of a subset of compartmentalized species
|
788
|
+
was not specified, infer an appropriate compartment from
|
789
|
+
other members of reactions they participate in.
|
790
|
+
|
791
|
+
This method modifies the SBML_dfs object in-place.
|
792
|
+
|
793
|
+
Returns
|
794
|
+
-------
|
795
|
+
None (modifies SBML_dfs object in-place)
|
796
|
+
"""
|
797
|
+
default_compartment = (
|
798
|
+
self.compartmentalized_species.value_counts(SBML_DFS.C_ID)
|
799
|
+
.rename("N")
|
800
|
+
.reset_index()
|
801
|
+
.sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
|
771
802
|
)
|
803
|
+
if not isinstance(default_compartment, str):
|
804
|
+
raise ValueError(
|
805
|
+
"No default compartment could be found - compartment "
|
806
|
+
"information may not be present"
|
807
|
+
)
|
772
808
|
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
809
|
+
# infer the compartments of species missing compartments
|
810
|
+
missing_compartment_scids = self.compartmentalized_species[
|
811
|
+
self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
812
|
+
].index.tolist()
|
813
|
+
if len(missing_compartment_scids) == 0:
|
814
|
+
logger.info(
|
815
|
+
"All compartmentalized species have compartments, "
|
816
|
+
"returning input SBML_dfs"
|
780
817
|
)
|
781
|
-
|
782
|
-
|
818
|
+
return self
|
819
|
+
|
820
|
+
participating_reactions = (
|
821
|
+
self.reaction_species[
|
822
|
+
self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
823
|
+
][SBML_DFS.R_ID]
|
824
|
+
.unique()
|
825
|
+
.tolist()
|
826
|
+
)
|
827
|
+
reaction_participants = self.reaction_species[
|
828
|
+
self.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
|
829
|
+
].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
|
830
|
+
reaction_participants = reaction_participants.merge(
|
831
|
+
self.compartmentalized_species[SBML_DFS.C_ID],
|
832
|
+
left_on=SBML_DFS.SC_ID,
|
833
|
+
right_index=True,
|
834
|
+
)
|
835
|
+
|
836
|
+
# find a default compartment to fall back on if all compartmental information is missing
|
837
|
+
primary_reaction_compartment = (
|
838
|
+
reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
|
839
|
+
.rename("N")
|
840
|
+
.reset_index()
|
841
|
+
.sort_values("N", ascending=False)
|
842
|
+
.groupby(SBML_DFS.R_ID)
|
843
|
+
.first()[SBML_DFS.C_ID]
|
783
844
|
.reset_index()
|
784
|
-
.melt(id_vars="fk_table")
|
785
|
-
.drop(["variable"], axis=1)
|
786
|
-
.rename(columns={"value": "key"})
|
787
845
|
)
|
788
846
|
|
789
|
-
|
847
|
+
inferred_compartmentalization = (
|
848
|
+
self.reaction_species[
|
849
|
+
self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
850
|
+
]
|
851
|
+
.merge(primary_reaction_compartment)
|
852
|
+
.value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
|
853
|
+
.rename("N")
|
854
|
+
.reset_index()
|
855
|
+
.sort_values("N", ascending=False)
|
856
|
+
.groupby(SBML_DFS.SC_ID)
|
857
|
+
.first()
|
858
|
+
.reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
|
859
|
+
)
|
860
|
+
logger.info(
|
861
|
+
f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
|
862
|
+
)
|
790
863
|
|
791
|
-
|
792
|
-
|
793
|
-
|
864
|
+
# define where a reaction is most likely to occur based on the compartmentalization of its participants
|
865
|
+
species_with_unknown_compartmentalization = set(
|
866
|
+
missing_compartment_scids
|
867
|
+
).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
|
868
|
+
if len(species_with_unknown_compartmentalization) != 0:
|
869
|
+
logger.warning(
|
870
|
+
f"{len(species_with_unknown_compartmentalization)} "
|
871
|
+
"species compartmentalization could not be inferred"
|
872
|
+
" from other reaction participants. Their compartmentalization "
|
873
|
+
f"will be set to the default of {default_compartment}"
|
794
874
|
)
|
795
|
-
if None in pk_table_keys:
|
796
|
-
raise ValueError(
|
797
|
-
f"{pk_fk_correspondences['pk_table'][i]} had "
|
798
|
-
"missing values in its index"
|
799
|
-
)
|
800
875
|
|
801
|
-
|
802
|
-
|
803
|
-
|
876
|
+
inferred_compartmentalization = pd.concat(
|
877
|
+
[
|
878
|
+
inferred_compartmentalization,
|
879
|
+
pd.DataFrame(
|
880
|
+
{
|
881
|
+
SBML_DFS.SC_ID: list(
|
882
|
+
species_with_unknown_compartmentalization
|
883
|
+
)
|
884
|
+
}
|
885
|
+
).assign(c_id=default_compartment),
|
804
886
|
]
|
805
887
|
)
|
806
|
-
if None in fk_table_keys:
|
807
|
-
raise ValueError(
|
808
|
-
f"{pk_fk_correspondences['fk_table'][i]} included "
|
809
|
-
f"missing {pk_fk_correspondences['key'][i]} values"
|
810
|
-
)
|
811
888
|
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
889
|
+
if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
|
890
|
+
raise ValueError(
|
891
|
+
f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
|
892
|
+
)
|
893
|
+
|
894
|
+
updated_compartmentalized_species = pd.concat(
|
895
|
+
[
|
896
|
+
self.compartmentalized_species[
|
897
|
+
~self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
898
|
+
],
|
899
|
+
self.compartmentalized_species[
|
900
|
+
self.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
901
|
+
]
|
902
|
+
.drop(SBML_DFS.C_ID, axis=1)
|
903
|
+
.merge(
|
904
|
+
inferred_compartmentalization,
|
905
|
+
left_index=True,
|
906
|
+
right_on=SBML_DFS.SC_ID,
|
822
907
|
)
|
908
|
+
.set_index(SBML_DFS.SC_ID),
|
909
|
+
]
|
910
|
+
)
|
823
911
|
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
912
|
+
if (
|
913
|
+
updated_compartmentalized_species.shape[0]
|
914
|
+
!= self.compartmentalized_species.shape[0]
|
915
|
+
):
|
916
|
+
raise ValueError(
|
917
|
+
f"Trying to overwrite {self.compartmentalized_species.shape[0]}"
|
918
|
+
" compartmentalized species with "
|
919
|
+
f"{updated_compartmentalized_species.shape[0]}"
|
920
|
+
)
|
830
921
|
|
831
|
-
|
832
|
-
|
833
|
-
self._validate_reactions_data(v)
|
834
|
-
except ValueError as e:
|
835
|
-
raise ValueError(f"reactions data {k} was invalid.") from e
|
922
|
+
if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
|
923
|
+
raise ValueError("Some species compartments are still missing")
|
836
924
|
|
837
|
-
|
838
|
-
|
925
|
+
self.compartmentalized_species = updated_compartmentalized_species
|
926
|
+
return
|
839
927
|
|
840
|
-
def
|
928
|
+
def name_compartmentalized_species(self):
|
841
929
|
"""
|
842
|
-
|
930
|
+
Name Compartmentalized Species
|
843
931
|
|
844
|
-
|
845
|
-
|
846
|
-
2. If validation fails, tries to resolve the issue
|
847
|
-
3. Repeats until validation passes or issue cannot be resolved
|
932
|
+
Rename compartmentalized species if they have the same
|
933
|
+
name as their species. Modifies the SBML_dfs object in-place.
|
848
934
|
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
If validation fails and cannot be automatically resolved
|
935
|
+
Returns
|
936
|
+
-------
|
937
|
+
None (modifies SBML_dfs object in-place)
|
853
938
|
"""
|
939
|
+
augmented_cspecies = self.compartmentalized_species.merge(
|
940
|
+
self.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
|
941
|
+
).merge(
|
942
|
+
self.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
|
943
|
+
)
|
944
|
+
augmented_cspecies[SBML_DFS.SC_NAME] = [
|
945
|
+
f"{s} [{c}]" if sc == s else sc
|
946
|
+
for sc, c, s in zip(
|
947
|
+
augmented_cspecies[SBML_DFS.SC_NAME],
|
948
|
+
augmented_cspecies[SBML_DFS.C_NAME],
|
949
|
+
augmented_cspecies[SBML_DFS.S_NAME],
|
950
|
+
)
|
951
|
+
]
|
854
952
|
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
try:
|
860
|
-
self.validate()
|
861
|
-
validated = True
|
862
|
-
except Exception as e:
|
863
|
-
e_str = str(e)
|
864
|
-
if e_str == current_exception:
|
865
|
-
logger.warning(
|
866
|
-
"Automated resolution of an Exception was attempted but failed"
|
867
|
-
)
|
868
|
-
raise e
|
869
|
-
|
870
|
-
# try to resolve
|
871
|
-
self._attempt_resolve(e)
|
953
|
+
self.compartmentalized_species = augmented_cspecies.loc[
|
954
|
+
:, self.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
|
955
|
+
]
|
956
|
+
return
|
872
957
|
|
873
|
-
def
|
958
|
+
def reaction_formulas(
|
959
|
+
self, r_ids: Optional[Union[str, list[str]]] = None
|
960
|
+
) -> pd.Series:
|
874
961
|
"""
|
875
|
-
|
962
|
+
Reaction Summary
|
876
963
|
|
877
|
-
|
964
|
+
Return human-readable formulas for reactions.
|
965
|
+
|
966
|
+
Parameters:
|
878
967
|
----------
|
879
|
-
|
880
|
-
|
968
|
+
r_ids: [str], str or None
|
969
|
+
Reaction IDs or None for all reactions
|
881
970
|
|
882
971
|
Returns
|
883
|
-
|
884
|
-
pd.
|
885
|
-
The selected species data table
|
886
|
-
|
887
|
-
Raises
|
888
|
-
------
|
889
|
-
ValueError
|
890
|
-
If species_data_table is not found
|
972
|
+
----------
|
973
|
+
formula_strs: pd.Series
|
891
974
|
"""
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
975
|
+
|
976
|
+
validated_rids = self._validate_r_ids(r_ids)
|
977
|
+
|
978
|
+
matching_reaction_species = self.reaction_species[
|
979
|
+
self.reaction_species.r_id.isin(validated_rids)
|
980
|
+
].merge(
|
981
|
+
self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
982
|
+
)
|
983
|
+
|
984
|
+
# split into within compartment and cross-compartment reactions
|
985
|
+
r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
|
986
|
+
SBML_DFS.C_ID
|
987
|
+
].nunique()
|
988
|
+
|
989
|
+
# identify reactions which work across compartments
|
990
|
+
r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
|
991
|
+
# there species must be labelled with the sc_name to specify where a species exists
|
992
|
+
if r_id_cross_compartment.shape[0] > 0:
|
993
|
+
rxn_eqtn_cross_compartment = (
|
994
|
+
matching_reaction_species[
|
995
|
+
matching_reaction_species[SBML_DFS.R_ID].isin(
|
996
|
+
r_id_cross_compartment.index
|
997
|
+
)
|
998
|
+
]
|
999
|
+
.sort_values([SBML_DFS.SC_NAME])
|
1000
|
+
.groupby(SBML_DFS.R_ID)
|
1001
|
+
.apply(
|
1002
|
+
lambda x: sbml_dfs_utils.construct_formula_string(
|
1003
|
+
x, self.reactions, SBML_DFS.SC_NAME
|
1004
|
+
)
|
1005
|
+
)
|
1006
|
+
.rename("r_formula_str")
|
1007
|
+
)
|
1008
|
+
else:
|
1009
|
+
rxn_eqtn_cross_compartment = None
|
1010
|
+
|
1011
|
+
# identify reactions which occur within a single compartment; for these the reaction
|
1012
|
+
# can be labelled with the compartment and individual species can receive a more readable s_name
|
1013
|
+
r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
|
1014
|
+
if r_id_within_compartment.shape[0] > 0:
|
1015
|
+
# add s_name
|
1016
|
+
augmented_matching_reaction_species = (
|
1017
|
+
matching_reaction_species[
|
1018
|
+
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1019
|
+
r_id_within_compartment.index
|
1020
|
+
)
|
1021
|
+
]
|
1022
|
+
.merge(self.compartments, left_on=SBML_DFS.C_ID, right_index=True)
|
1023
|
+
.merge(self.species, left_on=SBML_DFS.S_ID, right_index=True)
|
1024
|
+
.sort_values([SBML_DFS.S_NAME])
|
897
1025
|
)
|
1026
|
+
# create formulas based on s_names of components
|
1027
|
+
rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
|
1028
|
+
[SBML_DFS.R_ID, SBML_DFS.C_NAME]
|
1029
|
+
).apply(
|
1030
|
+
lambda x: sbml_dfs_utils.construct_formula_string(
|
1031
|
+
x, self.reactions, SBML_DFS.S_NAME
|
1032
|
+
)
|
1033
|
+
)
|
1034
|
+
# add compartment for each reaction
|
1035
|
+
rxn_eqtn_within_compartment = pd.Series(
|
1036
|
+
[
|
1037
|
+
y + ": " + x
|
1038
|
+
for x, y in zip(
|
1039
|
+
rxn_eqtn_within_compartment,
|
1040
|
+
rxn_eqtn_within_compartment.index.get_level_values(
|
1041
|
+
SBML_DFS.C_NAME
|
1042
|
+
),
|
1043
|
+
)
|
1044
|
+
],
|
1045
|
+
index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
|
1046
|
+
).rename("r_formula_str")
|
1047
|
+
else:
|
1048
|
+
rxn_eqtn_within_compartment = None
|
898
1049
|
|
899
|
-
|
900
|
-
|
1050
|
+
formula_strs = pd.concat(
|
1051
|
+
[rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment]
|
1052
|
+
)
|
901
1053
|
|
902
|
-
|
1054
|
+
return formula_strs
|
1055
|
+
|
1056
|
+
def reaction_summaries(
|
1057
|
+
self, r_ids: Optional[Union[str, list[str]]] = None
|
1058
|
+
) -> pd.DataFrame:
|
903
1059
|
"""
|
904
|
-
|
1060
|
+
Reaction Summary
|
905
1061
|
|
906
|
-
|
907
|
-
object against the schema stored in self.schema.
|
1062
|
+
Return a summary of reactions.
|
908
1063
|
|
909
|
-
Parameters
|
1064
|
+
Parameters:
|
910
1065
|
----------
|
911
|
-
|
912
|
-
|
1066
|
+
r_ids: [str], str or None
|
1067
|
+
Reaction IDs or None for all reactions
|
913
1068
|
|
914
|
-
|
915
|
-
|
916
|
-
|
917
|
-
|
1069
|
+
Returns
|
1070
|
+
----------
|
1071
|
+
reaction_summaries_df: pd.DataFrame
|
1072
|
+
A table with r_id as an index and columns:
|
1073
|
+
- r_name: str, name of the reaction
|
1074
|
+
- r_formula_str: str, human-readable formula of the reaction
|
918
1075
|
"""
|
919
|
-
table_schema = self.schema[table]
|
920
|
-
table_data = getattr(self, table)
|
921
|
-
_perform_sbml_dfs_table_validation(table_data, table_schema, table)
|
922
1076
|
|
923
|
-
|
924
|
-
"""
|
925
|
-
Remove data from species_data or reactions_data by table name and label.
|
1077
|
+
validated_rids = self._validate_r_ids(r_ids)
|
926
1078
|
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
Label of the data to remove
|
1079
|
+
participating_r_names = self.reactions.loc[validated_rids, SBML_DFS.R_NAME]
|
1080
|
+
participating_r_formulas = self.reaction_formulas(r_ids=validated_rids)
|
1081
|
+
reaction_summareis_df = pd.concat(
|
1082
|
+
[participating_r_names, participating_r_formulas], axis=1
|
1083
|
+
)
|
933
1084
|
|
934
|
-
|
935
|
-
|
936
|
-
|
1085
|
+
return reaction_summareis_df
|
1086
|
+
|
1087
|
+
def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
937
1088
|
"""
|
938
|
-
|
939
|
-
raise ValueError("table_name must be either 'species' or 'reactions'")
|
1089
|
+
Remove compartmentalized species and associated reactions.
|
940
1090
|
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
logger.warning(
|
945
|
-
f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
|
946
|
-
f"Existing labels: {existing_labels}"
|
947
|
-
)
|
948
|
-
return
|
949
|
-
|
950
|
-
del data_dict[label]
|
951
|
-
|
952
|
-
def _remove_unused_cspecies(self):
|
953
|
-
"""Removes compartmentalized species that are no
|
954
|
-
longer part of any reactions"""
|
955
|
-
sc_ids = self._get_unused_cspecies()
|
956
|
-
self._remove_compartmentalized_species(sc_ids)
|
1091
|
+
Starting with a set of compartmentalized species, determine which reactions
|
1092
|
+
should be removed based on their removal. Then remove these reactions,
|
1093
|
+
compartmentalized species, and species.
|
957
1094
|
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
)
|
964
|
-
return sc_ids # type: ignore
|
1095
|
+
Parameters
|
1096
|
+
----------
|
1097
|
+
sc_ids : Iterable[str]
|
1098
|
+
IDs of compartmentalized species to remove
|
1099
|
+
"""
|
965
1100
|
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
-
s_ids = self._get_unused_species()
|
970
|
-
self._remove_species(s_ids)
|
1101
|
+
# find reactions which should be totally removed since they are losing critical species
|
1102
|
+
removed_reactions = self._find_underspecified_reactions_by_scids(sc_ids)
|
1103
|
+
self.remove_reactions(removed_reactions)
|
971
1104
|
|
972
|
-
|
973
|
-
"""Returns a list of species that are not part of any reactions"""
|
974
|
-
s_ids = set(self.species.index) - set(
|
975
|
-
self.compartmentalized_species[SBML_DFS.S_ID]
|
976
|
-
)
|
977
|
-
return s_ids # type: ignore
|
1105
|
+
self._remove_compartmentalized_species(sc_ids)
|
978
1106
|
|
979
|
-
|
980
|
-
|
1107
|
+
# remove species (and their associated species data if all their cspecies have been lost)
|
1108
|
+
self._remove_unused_species()
|
981
1109
|
|
982
|
-
|
983
|
-
|
984
|
-
|
1110
|
+
def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
|
1111
|
+
"""
|
1112
|
+
Remove reactions from the model.
|
985
1113
|
|
986
|
-
|
987
|
-
|
1114
|
+
Parameters
|
1115
|
+
----------
|
1116
|
+
r_ids : Iterable[str]
|
1117
|
+
IDs of reactions to remove
|
1118
|
+
remove_species : bool, optional
|
1119
|
+
Whether to remove species that are no longer part of any reactions,
|
1120
|
+
by default False
|
988
1121
|
"""
|
989
|
-
# Remove compartmentalized species
|
990
|
-
self.compartmentalized_species = self.compartmentalized_species.drop(
|
991
|
-
index=list(sc_ids)
|
992
|
-
)
|
993
1122
|
# remove corresponding reactions_species
|
994
|
-
self.reaction_species = self.reaction_species.query("
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1123
|
+
self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
|
1124
|
+
# remove reactions
|
1125
|
+
self.reactions = self.reactions.drop(index=list(r_ids))
|
1126
|
+
# remove reactions_data
|
1127
|
+
if hasattr(self, "reactions_data"):
|
1128
|
+
for k, data in self.reactions_data.items():
|
1129
|
+
self.reactions_data[k] = data.drop(index=list(r_ids))
|
1130
|
+
# remove species if requested
|
1131
|
+
if remove_species:
|
1132
|
+
self._remove_unused_cspecies()
|
1133
|
+
self._remove_unused_species()
|
1005
1134
|
|
1006
|
-
|
1007
|
-
s_ids (Iterable[str]): the species to remove
|
1135
|
+
def remove_reactions_data(self, label: str):
|
1008
1136
|
"""
|
1009
|
-
|
1010
|
-
|
1011
|
-
|
1012
|
-
self.species = self.species.drop(index=list(s_ids))
|
1013
|
-
# remove data
|
1014
|
-
for k, data in self.species_data.items():
|
1015
|
-
self.species_data[k] = data.drop(index=list(s_ids))
|
1016
|
-
|
1017
|
-
def _validate_species_data(self, species_data_table: pd.DataFrame):
|
1018
|
-
"""Validates species data attribute
|
1137
|
+
Remove reactions data by label.
|
1138
|
+
"""
|
1139
|
+
self._remove_entity_data(SBML_DFS.REACTIONS, label)
|
1019
1140
|
|
1020
|
-
|
1021
|
-
|
1141
|
+
def remove_species_data(self, label: str):
|
1142
|
+
"""
|
1143
|
+
Remove species data by label.
|
1144
|
+
"""
|
1145
|
+
self._remove_entity_data(SBML_DFS.SPECIES, label)
|
1022
1146
|
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1147
|
+
def search_by_ids(
|
1148
|
+
self,
|
1149
|
+
ids: list[str],
|
1150
|
+
entity_type: str,
|
1151
|
+
identifiers_df: pd.DataFrame,
|
1152
|
+
ontologies: None | set[str] = None,
|
1153
|
+
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
1027
1154
|
"""
|
1028
|
-
|
1155
|
+
Find entities and identifiers matching a set of query IDs.
|
1029
1156
|
|
1030
|
-
|
1031
|
-
|
1157
|
+
Parameters
|
1158
|
+
----------
|
1159
|
+
ids : List[str]
|
1160
|
+
List of identifiers to search for
|
1161
|
+
entity_type : str
|
1162
|
+
Type of entity to search (e.g., 'species', 'reactions')
|
1163
|
+
identifiers_df : pd.DataFrame
|
1164
|
+
DataFrame containing identifier mappings
|
1165
|
+
ontologies : Optional[Set[str]], optional
|
1166
|
+
Set of ontologies to filter by, by default None
|
1032
1167
|
|
1033
|
-
|
1034
|
-
|
1168
|
+
Returns
|
1169
|
+
-------
|
1170
|
+
Tuple[pd.DataFrame, pd.DataFrame]
|
1171
|
+
- Matching entities
|
1172
|
+
- Matching identifiers
|
1035
1173
|
|
1036
|
-
Raises
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1174
|
+
Raises
|
1175
|
+
------
|
1176
|
+
ValueError
|
1177
|
+
If entity_type is invalid or ontologies are invalid
|
1178
|
+
TypeError
|
1179
|
+
If ontologies is not a set
|
1040
1180
|
"""
|
1041
|
-
|
1181
|
+
# validate inputs
|
1182
|
+
entity_table = self.get_table(entity_type, required_attributes={"id"})
|
1183
|
+
entity_pk = self.schema[entity_type]["pk"]
|
1042
1184
|
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1047
|
-
|
1185
|
+
utils.match_pd_vars(
|
1186
|
+
identifiers_df,
|
1187
|
+
req_vars={
|
1188
|
+
entity_pk,
|
1189
|
+
IDENTIFIERS.ONTOLOGY,
|
1190
|
+
IDENTIFIERS.IDENTIFIER,
|
1191
|
+
IDENTIFIERS.URL,
|
1192
|
+
IDENTIFIERS.BQB,
|
1193
|
+
},
|
1194
|
+
allow_series=False,
|
1195
|
+
).assert_present()
|
1048
1196
|
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1197
|
+
if ontologies is not None:
|
1198
|
+
if not isinstance(ontologies, set):
|
1199
|
+
# for clarity this should not be reachable based on type hints
|
1200
|
+
raise TypeError(
|
1201
|
+
f"ontologies must be a set, but got {type(ontologies).__name__}"
|
1202
|
+
)
|
1203
|
+
ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
|
1204
|
+
invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
|
1205
|
+
if len(invalid_ontologies) > 0:
|
1206
|
+
raise ValueError(
|
1207
|
+
f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
|
1208
|
+
f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
|
1209
|
+
)
|
1055
1210
|
|
1056
|
-
|
1057
|
-
|
1058
|
-
invalid_sbo_term_counts = sbo_counts[
|
1059
|
-
~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
|
1060
|
-
]
|
1211
|
+
# fitler to just to identifiers matchign the ontologies of interest
|
1212
|
+
identifiers_df = identifiers_df.query("ontology in @ontologies")
|
1061
1213
|
|
1062
|
-
|
1063
|
-
|
1064
|
-
|
1065
|
-
|
1066
|
-
raise ValueError(
|
1067
|
-
f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
|
1068
|
-
f"defined {invalid_sbo_counts_str}"
|
1069
|
-
)
|
1214
|
+
matching_identifiers = identifiers_df.loc[
|
1215
|
+
identifiers_df["identifier"].isin(ids)
|
1216
|
+
]
|
1217
|
+
entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
|
1070
1218
|
|
1071
|
-
|
1072
|
-
str_e = str(e)
|
1073
|
-
if str_e == "compartmentalized_species included missing c_id values":
|
1074
|
-
logger.warning(str_e)
|
1075
|
-
logger.warning(
|
1076
|
-
"Attempting to resolve with infer_uncompartmentalized_species_location()"
|
1077
|
-
)
|
1078
|
-
self = infer_uncompartmentalized_species_location(self)
|
1079
|
-
elif re.search("sbo_terms were not defined", str_e):
|
1080
|
-
logger.warning(str_e)
|
1081
|
-
logger.warning("Attempting to resolve with infer_sbo_terms()")
|
1082
|
-
self = infer_sbo_terms(self)
|
1083
|
-
else:
|
1084
|
-
logger.warning(
|
1085
|
-
"An error occurred which could not be automatically resolved"
|
1086
|
-
)
|
1087
|
-
raise e
|
1219
|
+
return entity_subset, matching_identifiers
|
1088
1220
|
|
1221
|
+
def search_by_name(
|
1222
|
+
self, name: str, entity_type: str, partial_match: bool = True
|
1223
|
+
) -> pd.DataFrame:
|
1224
|
+
"""
|
1225
|
+
Find entities by exact or partial name match.
|
1089
1226
|
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1227
|
+
Parameters
|
1228
|
+
----------
|
1229
|
+
name : str
|
1230
|
+
Name to search for
|
1231
|
+
entity_type : str
|
1232
|
+
Type of entity to search (e.g., 'species', 'reactions')
|
1233
|
+
partial_match : bool, optional
|
1234
|
+
Whether to allow partial string matches, by default True
|
1093
1235
|
|
1094
|
-
|
1236
|
+
Returns
|
1237
|
+
-------
|
1238
|
+
pd.DataFrame
|
1239
|
+
Matching entities
|
1240
|
+
"""
|
1241
|
+
entity_table = self.get_table(entity_type, required_attributes={"label"})
|
1242
|
+
label_attr = self.schema[entity_type]["label"]
|
1095
1243
|
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1244
|
+
if partial_match:
|
1245
|
+
matches = entity_table.loc[
|
1246
|
+
entity_table[label_attr].str.contains(name, case=False)
|
1247
|
+
]
|
1248
|
+
else:
|
1249
|
+
matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
|
1250
|
+
return matches
|
1100
1251
|
|
1101
|
-
|
1102
|
-
|
1103
|
-
|
1104
|
-
|
1105
|
-
matching_species = sbml_dfs.species.loc[s_id]
|
1106
|
-
|
1107
|
-
if not isinstance(matching_species, pd.Series):
|
1108
|
-
raise ValueError(f"{s_id} did not match a single species")
|
1109
|
-
|
1110
|
-
# find all rxns species particpate in
|
1111
|
-
|
1112
|
-
matching_compartmentalized_species = sbml_dfs.compartmentalized_species[
|
1113
|
-
sbml_dfs.compartmentalized_species.s_id.isin([s_id])
|
1114
|
-
]
|
1115
|
-
|
1116
|
-
rxns_participating = sbml_dfs.reaction_species[
|
1117
|
-
sbml_dfs.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
|
1118
|
-
]
|
1119
|
-
|
1120
|
-
# find all participants in these rxns
|
1121
|
-
|
1122
|
-
full_rxns_participating = sbml_dfs.reaction_species[
|
1123
|
-
sbml_dfs.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
|
1124
|
-
].merge(
|
1125
|
-
sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1126
|
-
)
|
1127
|
-
|
1128
|
-
reaction_descriptions = pd.concat(
|
1129
|
-
[
|
1130
|
-
reaction_summary(x, sbml_dfs)
|
1131
|
-
for x in set(full_rxns_participating[SBML_DFS.R_ID].tolist())
|
1132
|
-
]
|
1133
|
-
)
|
1134
|
-
|
1135
|
-
status = (
|
1136
|
-
full_rxns_participating.loc[
|
1137
|
-
full_rxns_participating[SBML_DFS.SC_ID].isin(
|
1138
|
-
matching_compartmentalized_species.index.values.tolist()
|
1139
|
-
),
|
1140
|
-
[SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
|
1141
|
-
]
|
1142
|
-
.merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
|
1143
|
-
.reset_index(drop=True)
|
1144
|
-
.drop(SBML_DFS.R_ID, axis=1)
|
1145
|
-
)
|
1146
|
-
|
1147
|
-
return status
|
1148
|
-
|
1149
|
-
|
1150
|
-
def reaction_summary(r_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
|
1151
|
-
"""
|
1152
|
-
Reaction Summary
|
1153
|
-
|
1154
|
-
Return a reaction's name and a human-readable formula.
|
1155
|
-
|
1156
|
-
Parameters:
|
1157
|
-
r_id: str
|
1158
|
-
A reaction ID
|
1159
|
-
sbml_dfs: SBML_dfs
|
1160
|
-
|
1161
|
-
Returns:
|
1162
|
-
one row pd.DataFrame
|
1163
|
-
"""
|
1164
|
-
|
1165
|
-
logger.warning(
|
1166
|
-
"reaction_summary is deprecated and will be removed in a future version of rcpr; "
|
1167
|
-
"please use reaction_summaries() instead"
|
1168
|
-
)
|
1169
|
-
|
1170
|
-
matching_reaction = sbml_dfs.reactions.loc[r_id]
|
1171
|
-
|
1172
|
-
if not isinstance(matching_reaction, pd.Series):
|
1173
|
-
raise ValueError(f"{r_id} did not match a single reaction")
|
1174
|
-
|
1175
|
-
matching_reaction = sbml_dfs.reactions.loc[r_id]
|
1176
|
-
|
1177
|
-
matching_reaction_species = sbml_dfs.reaction_species[
|
1178
|
-
sbml_dfs.reaction_species.r_id.isin([r_id])
|
1179
|
-
].merge(
|
1180
|
-
sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1181
|
-
)
|
1182
|
-
|
1183
|
-
# collapse all reaction species to a formula string
|
1184
|
-
|
1185
|
-
if len(matching_reaction_species[SBML_DFS.C_ID].unique()) == 1:
|
1186
|
-
augmented_matching_reaction_species = matching_reaction_species.merge(
|
1187
|
-
sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True
|
1188
|
-
).merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
|
1189
|
-
str_formula = (
|
1190
|
-
construct_formula_string(
|
1191
|
-
augmented_matching_reaction_species, sbml_dfs.reactions, SBML_DFS.S_NAME
|
1192
|
-
)
|
1193
|
-
+ " ["
|
1194
|
-
+ augmented_matching_reaction_species[SBML_DFS.C_NAME].iloc[0]
|
1195
|
-
+ "]"
|
1196
|
-
)
|
1197
|
-
else:
|
1198
|
-
str_formula = construct_formula_string(
|
1199
|
-
matching_reaction_species, sbml_dfs.reactions, SBML_DFS.SC_NAME
|
1200
|
-
)
|
1201
|
-
|
1202
|
-
output = pd.DataFrame(
|
1203
|
-
{
|
1204
|
-
SBML_DFS.R_NAME: matching_reaction[SBML_DFS.R_NAME],
|
1205
|
-
"r_formula_str": str_formula,
|
1206
|
-
},
|
1207
|
-
index=[r_id],
|
1208
|
-
)
|
1209
|
-
|
1210
|
-
output.index.name = SBML_DFS.R_ID
|
1211
|
-
|
1212
|
-
return output
|
1213
|
-
|
1214
|
-
|
1215
|
-
def reaction_summaries(sbml_dfs: SBML_dfs, r_ids=None) -> pd.Series:
|
1216
|
-
"""
|
1217
|
-
Reaction Summary
|
1218
|
-
|
1219
|
-
Return human-readable formulas for reactions.
|
1220
|
-
|
1221
|
-
Parameters:
|
1222
|
-
----------
|
1223
|
-
sbml_dfs: sbml.SBML_dfs
|
1224
|
-
A relational mechanistic model
|
1225
|
-
r_ids: [str], str or None
|
1226
|
-
Reaction IDs or None for all reactions
|
1227
|
-
|
1228
|
-
Returns:
|
1229
|
-
----------
|
1230
|
-
formula_strs: pd.Series
|
1231
|
-
"""
|
1232
|
-
|
1233
|
-
if isinstance(r_ids, str):
|
1234
|
-
r_ids = [r_ids]
|
1235
|
-
|
1236
|
-
if r_ids is None:
|
1237
|
-
matching_reactions = sbml_dfs.reactions
|
1238
|
-
else:
|
1239
|
-
matching_reactions = sbml_dfs.reactions.loc[r_ids]
|
1240
|
-
|
1241
|
-
matching_reaction_species = sbml_dfs.reaction_species[
|
1242
|
-
sbml_dfs.reaction_species.r_id.isin(matching_reactions.index)
|
1243
|
-
].merge(
|
1244
|
-
sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
1245
|
-
)
|
1246
|
-
|
1247
|
-
# split into within compartment and cross-compartment reactions
|
1248
|
-
r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
|
1249
|
-
SBML_DFS.C_ID
|
1250
|
-
].nunique()
|
1251
|
-
|
1252
|
-
# identify reactions which work across compartments
|
1253
|
-
r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
|
1254
|
-
# there species must be labelled with the sc_name to specify where a species exists
|
1255
|
-
if r_id_cross_compartment.shape[0] > 0:
|
1256
|
-
rxn_eqtn_cross_compartment = (
|
1257
|
-
matching_reaction_species[
|
1258
|
-
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1259
|
-
r_id_cross_compartment.index
|
1260
|
-
)
|
1261
|
-
]
|
1262
|
-
.sort_values([SBML_DFS.SC_NAME])
|
1263
|
-
.groupby(SBML_DFS.R_ID)
|
1264
|
-
.apply(
|
1265
|
-
lambda x: construct_formula_string(
|
1266
|
-
x, sbml_dfs.reactions, SBML_DFS.SC_NAME
|
1267
|
-
)
|
1268
|
-
)
|
1269
|
-
.rename("r_formula_str")
|
1270
|
-
)
|
1271
|
-
else:
|
1272
|
-
rxn_eqtn_cross_compartment = None
|
1273
|
-
|
1274
|
-
# identify reactions which occur within a single compartment; for these the reaction
|
1275
|
-
# can be labelled with the compartment and individual species can receive a more readable s_name
|
1276
|
-
r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
|
1277
|
-
if r_id_within_compartment.shape[0] > 0:
|
1278
|
-
# add s_name
|
1279
|
-
augmented_matching_reaction_species = (
|
1280
|
-
matching_reaction_species[
|
1281
|
-
matching_reaction_species[SBML_DFS.R_ID].isin(
|
1282
|
-
r_id_within_compartment.index
|
1283
|
-
)
|
1284
|
-
]
|
1285
|
-
.merge(sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True)
|
1286
|
-
.merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
|
1287
|
-
.sort_values([SBML_DFS.S_NAME])
|
1288
|
-
)
|
1289
|
-
# create formulas based on s_names of components
|
1290
|
-
rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
|
1291
|
-
[SBML_DFS.R_ID, SBML_DFS.C_NAME]
|
1292
|
-
).apply(
|
1293
|
-
lambda x: construct_formula_string(x, sbml_dfs.reactions, SBML_DFS.S_NAME)
|
1294
|
-
)
|
1295
|
-
# add compartment for each reaction
|
1296
|
-
rxn_eqtn_within_compartment = pd.Series(
|
1297
|
-
[
|
1298
|
-
y + ": " + x
|
1299
|
-
for x, y in zip(
|
1300
|
-
rxn_eqtn_within_compartment,
|
1301
|
-
rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.C_NAME),
|
1302
|
-
)
|
1303
|
-
],
|
1304
|
-
index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
|
1305
|
-
).rename("r_formula_str")
|
1306
|
-
else:
|
1307
|
-
rxn_eqtn_within_compartment = None
|
1308
|
-
|
1309
|
-
formula_strs = pd.concat([rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment])
|
1310
|
-
|
1311
|
-
return formula_strs
|
1312
|
-
|
1313
|
-
|
1314
|
-
def construct_formula_string(
|
1315
|
-
reaction_species_df: pd.DataFrame,
|
1316
|
-
reactions_df: pd.DataFrame,
|
1317
|
-
name_var: str,
|
1318
|
-
) -> str:
|
1319
|
-
"""
|
1320
|
-
Construct Formula String
|
1321
|
-
|
1322
|
-
Convert a table of reaction species into a formula string
|
1323
|
-
|
1324
|
-
Parameters:
|
1325
|
-
----------
|
1326
|
-
reaction_species_df: pd.DataFrame
|
1327
|
-
Table containing a reactions' species
|
1328
|
-
reactions_df: pd.DataFrame
|
1329
|
-
smbl.reactions
|
1330
|
-
name_var: str
|
1331
|
-
Name used to label species
|
1332
|
-
|
1333
|
-
Returns:
|
1334
|
-
----------
|
1335
|
-
formula_str: str
|
1336
|
-
String representation of a reactions substrates, products and
|
1337
|
-
modifiers
|
1338
|
-
|
1339
|
-
"""
|
1340
|
-
|
1341
|
-
reaction_species_df["label"] = [
|
1342
|
-
add_stoi_to_species_name(x, y)
|
1343
|
-
for x, y in zip(
|
1344
|
-
reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
|
1345
|
-
)
|
1346
|
-
]
|
1347
|
-
|
1348
|
-
rxn_reversible = bool(
|
1349
|
-
reactions_df.loc[
|
1350
|
-
reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
|
1351
|
-
]
|
1352
|
-
) # convert from a np.bool_ to bool if needed
|
1353
|
-
if not isinstance(rxn_reversible, bool):
|
1354
|
-
raise TypeError(
|
1355
|
-
f"rxn_reversible must be a bool, but got {type(rxn_reversible).__name__}"
|
1356
|
-
)
|
1357
|
-
|
1358
|
-
if rxn_reversible:
|
1359
|
-
arrow_type = " <-> "
|
1360
|
-
else:
|
1361
|
-
arrow_type = " -> "
|
1362
|
-
|
1363
|
-
substrates = " + ".join(
|
1364
|
-
reaction_species_df["label"][
|
1365
|
-
reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
|
1366
|
-
].tolist()
|
1367
|
-
)
|
1368
|
-
products = " + ".join(
|
1369
|
-
reaction_species_df["label"][
|
1370
|
-
reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
|
1371
|
-
].tolist()
|
1372
|
-
)
|
1373
|
-
modifiers = " + ".join(
|
1374
|
-
reaction_species_df["label"][
|
1375
|
-
reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
|
1376
|
-
].tolist()
|
1377
|
-
)
|
1378
|
-
if modifiers != "":
|
1379
|
-
modifiers = f" ---- modifiers: {modifiers}]"
|
1380
|
-
|
1381
|
-
return f"{substrates}{arrow_type}{products}{modifiers}"
|
1382
|
-
|
1383
|
-
|
1384
|
-
def add_stoi_to_species_name(stoi: float | int, name: str) -> str:
|
1385
|
-
"""
|
1386
|
-
Add Stoi To Species Name
|
1387
|
-
|
1388
|
-
Add # of molecules to a species name
|
1389
|
-
|
1390
|
-
Parameters:
|
1391
|
-
----------
|
1392
|
-
stoi: float or int
|
1393
|
-
Number of molecules
|
1394
|
-
name: str
|
1395
|
-
Name of species
|
1396
|
-
|
1397
|
-
Returns:
|
1398
|
-
----------
|
1399
|
-
name: str
|
1400
|
-
Name containing number of species
|
1401
|
-
|
1402
|
-
"""
|
1403
|
-
|
1404
|
-
if stoi in [-1, 0, 1]:
|
1405
|
-
return name
|
1406
|
-
else:
|
1407
|
-
return str(abs(stoi)) + " " + name
|
1408
|
-
|
1409
|
-
|
1410
|
-
def filter_to_characteristic_species_ids(
|
1411
|
-
species_ids: pd.DataFrame,
|
1412
|
-
max_complex_size: int = 4,
|
1413
|
-
max_promiscuity: int = 20,
|
1414
|
-
defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
|
1415
|
-
) -> pd.DataFrame:
|
1416
|
-
"""
|
1417
|
-
Filter to Characteristic Species IDs
|
1418
|
-
|
1419
|
-
Remove identifiers corresponding to one component within a large protein
|
1420
|
-
complexes and non-characteristic annotations such as pubmed references and
|
1421
|
-
homologues.
|
1252
|
+
def select_species_data(self, species_data_table: str) -> pd.DataFrame:
|
1253
|
+
"""
|
1254
|
+
Select a species data table from the SBML_dfs object.
|
1422
1255
|
|
1423
1256
|
Parameters
|
1424
1257
|
----------
|
1425
|
-
|
1426
|
-
|
1427
|
-
max_complex_size: int
|
1428
|
-
The largest size of a complex, where BQB_HAS_PART terms will be retained.
|
1429
|
-
In most cases, complexes are handled with specific formation and
|
1430
|
-
dissolutation reactions,but these identifiers will be pulled in when
|
1431
|
-
searching by identifiers or searching the identifiers associated with a
|
1432
|
-
species against an external resource such as Open Targets.
|
1433
|
-
max_promiscuity: int
|
1434
|
-
Maximum number of species where a single molecule can act as a
|
1435
|
-
BQB_HAS_PART component associated with a single identifier (and common ontology).
|
1436
|
-
defining_biological_qualifiers (list[str]):
|
1437
|
-
BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
|
1438
|
-
permissive settings would include homologs, different forms of the same gene.
|
1439
|
-
|
1440
|
-
Returns:
|
1441
|
-
--------
|
1442
|
-
species_id: pd.DataFrame
|
1443
|
-
Input species filtered to characteristic identifiers
|
1444
|
-
|
1445
|
-
"""
|
1446
|
-
|
1447
|
-
if not isinstance(species_ids, pd.DataFrame):
|
1448
|
-
raise TypeError(
|
1449
|
-
f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
|
1450
|
-
)
|
1451
|
-
|
1452
|
-
if not isinstance(max_complex_size, int):
|
1453
|
-
raise TypeError(
|
1454
|
-
f"max_complex_size was a {type(max_complex_size)} but must be an int"
|
1455
|
-
)
|
1456
|
-
|
1457
|
-
if not isinstance(max_promiscuity, int):
|
1458
|
-
raise TypeError(
|
1459
|
-
f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
|
1460
|
-
)
|
1461
|
-
|
1462
|
-
if not isinstance(defining_biological_qualifiers, list):
|
1463
|
-
raise TypeError(
|
1464
|
-
f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
|
1465
|
-
)
|
1466
|
-
|
1467
|
-
# primary annotations of a species
|
1468
|
-
bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
|
1469
|
-
|
1470
|
-
# add components within modestly sized protein complexes
|
1471
|
-
# look at HAS_PART IDs
|
1472
|
-
bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
|
1473
|
-
|
1474
|
-
# number of species in a complex
|
1475
|
-
n_species_components = bqb_has_parts_species.value_counts(
|
1476
|
-
[IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
|
1477
|
-
)
|
1478
|
-
big_complex_sids = set(
|
1479
|
-
n_species_components[
|
1480
|
-
n_species_components > max_complex_size
|
1481
|
-
].index.get_level_values(SBML_DFS.S_ID)
|
1482
|
-
)
|
1483
|
-
|
1484
|
-
filtered_bqb_has_parts = _filter_promiscuous_components(
|
1485
|
-
bqb_has_parts_species, max_promiscuity
|
1486
|
-
)
|
1487
|
-
|
1488
|
-
# drop species parts if there are many components
|
1489
|
-
filtered_bqb_has_parts = filtered_bqb_has_parts[
|
1490
|
-
~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
|
1491
|
-
]
|
1492
|
-
|
1493
|
-
# combine primary identifiers and rare components
|
1494
|
-
characteristic_species_ids = pd.concat(
|
1495
|
-
[
|
1496
|
-
bqb_is_species,
|
1497
|
-
filtered_bqb_has_parts,
|
1498
|
-
]
|
1499
|
-
)
|
1500
|
-
|
1501
|
-
return characteristic_species_ids
|
1502
|
-
|
1503
|
-
|
1504
|
-
def infer_uncompartmentalized_species_location(sbml_dfs: SBML_dfs) -> SBML_dfs:
|
1505
|
-
"""
|
1506
|
-
Infer Uncompartmentalized Species Location
|
1507
|
-
|
1508
|
-
If the compartment of a subset of compartmentalized species
|
1509
|
-
was not specified, infer an appropriate compartment from
|
1510
|
-
other members of reactions they particpate in
|
1511
|
-
|
1512
|
-
Parameters:
|
1513
|
-
----------
|
1514
|
-
sbml_dfs: sbml.SBML_dfs
|
1515
|
-
A relational pathway model
|
1516
|
-
|
1517
|
-
Returns:
|
1518
|
-
----------
|
1519
|
-
sbml_dfs: sbml.SBML_dfs
|
1520
|
-
A relational pathway model (with filled in species compartments)
|
1521
|
-
|
1522
|
-
"""
|
1523
|
-
|
1524
|
-
default_compartment = (
|
1525
|
-
sbml_dfs.compartmentalized_species.value_counts(SBML_DFS.C_ID)
|
1526
|
-
.rename("N")
|
1527
|
-
.reset_index()
|
1528
|
-
.sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
|
1529
|
-
)
|
1530
|
-
if not isinstance(default_compartment, str):
|
1531
|
-
raise ValueError(
|
1532
|
-
"No default compartment could be found - compartment "
|
1533
|
-
"information may not be present"
|
1534
|
-
)
|
1535
|
-
|
1536
|
-
# infer the compartments of species missing compartments
|
1537
|
-
|
1538
|
-
missing_compartment_scids = sbml_dfs.compartmentalized_species[
|
1539
|
-
sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
1540
|
-
].index.tolist()
|
1541
|
-
if len(missing_compartment_scids) == 0:
|
1542
|
-
logger.info(
|
1543
|
-
"All compartmentalized species have compartments, "
|
1544
|
-
"returning input sbml_dfs"
|
1545
|
-
)
|
1546
|
-
return sbml_dfs
|
1547
|
-
|
1548
|
-
participating_reactions = (
|
1549
|
-
sbml_dfs.reaction_species[
|
1550
|
-
sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
1551
|
-
][SBML_DFS.R_ID]
|
1552
|
-
.unique()
|
1553
|
-
.tolist()
|
1554
|
-
)
|
1555
|
-
reaction_participants = sbml_dfs.reaction_species[
|
1556
|
-
sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
|
1557
|
-
].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
|
1558
|
-
reaction_participants = reaction_participants.merge(
|
1559
|
-
sbml_dfs.compartmentalized_species[SBML_DFS.C_ID],
|
1560
|
-
left_on=SBML_DFS.SC_ID,
|
1561
|
-
right_index=True,
|
1562
|
-
)
|
1563
|
-
|
1564
|
-
# find a default compartment to fall back on if all compartmental information is missing
|
1565
|
-
|
1566
|
-
primary_reaction_compartment = (
|
1567
|
-
reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
|
1568
|
-
.rename("N")
|
1569
|
-
.reset_index()
|
1570
|
-
.sort_values("N", ascending=False)
|
1571
|
-
.groupby(SBML_DFS.R_ID)
|
1572
|
-
.first()[SBML_DFS.C_ID]
|
1573
|
-
.reset_index()
|
1574
|
-
)
|
1575
|
-
|
1576
|
-
inferred_compartmentalization = (
|
1577
|
-
sbml_dfs.reaction_species[
|
1578
|
-
sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
|
1579
|
-
]
|
1580
|
-
.merge(primary_reaction_compartment)
|
1581
|
-
.value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
|
1582
|
-
.rename("N")
|
1583
|
-
.reset_index()
|
1584
|
-
.sort_values("N", ascending=False)
|
1585
|
-
.groupby(SBML_DFS.SC_ID)
|
1586
|
-
.first()
|
1587
|
-
.reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
|
1588
|
-
)
|
1589
|
-
logger.info(
|
1590
|
-
f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
|
1591
|
-
)
|
1592
|
-
|
1593
|
-
# define where a reaction is most likely to occur based on the compartmentalization of its particpants
|
1594
|
-
species_with_unknown_compartmentalization = set(
|
1595
|
-
missing_compartment_scids
|
1596
|
-
).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
|
1597
|
-
if len(species_with_unknown_compartmentalization) != 0:
|
1598
|
-
logger.warning(
|
1599
|
-
f"{len(species_with_unknown_compartmentalization)} "
|
1600
|
-
"species compartmentalization could not be inferred"
|
1601
|
-
" from other reaction particpants. Their compartmentalization "
|
1602
|
-
f"will be set to the default of {default_compartment}"
|
1603
|
-
)
|
1604
|
-
|
1605
|
-
inferred_compartmentalization = pd.concat(
|
1606
|
-
[
|
1607
|
-
inferred_compartmentalization,
|
1608
|
-
pd.DataFrame(
|
1609
|
-
{SBML_DFS.SC_ID: list(species_with_unknown_compartmentalization)}
|
1610
|
-
).assign(c_id=default_compartment),
|
1611
|
-
]
|
1612
|
-
)
|
1613
|
-
|
1614
|
-
if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
|
1615
|
-
raise ValueError(
|
1616
|
-
f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
|
1617
|
-
)
|
1618
|
-
|
1619
|
-
updated_compartmentalized_species = pd.concat(
|
1620
|
-
[
|
1621
|
-
sbml_dfs.compartmentalized_species[
|
1622
|
-
~sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
1623
|
-
],
|
1624
|
-
sbml_dfs.compartmentalized_species[
|
1625
|
-
sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
|
1626
|
-
]
|
1627
|
-
.drop(SBML_DFS.C_ID, axis=1)
|
1628
|
-
.merge(
|
1629
|
-
inferred_compartmentalization, left_index=True, right_on=SBML_DFS.SC_ID
|
1630
|
-
)
|
1631
|
-
.set_index(SBML_DFS.SC_ID),
|
1632
|
-
]
|
1633
|
-
)
|
1634
|
-
|
1635
|
-
if (
|
1636
|
-
updated_compartmentalized_species.shape[0]
|
1637
|
-
!= sbml_dfs.compartmentalized_species.shape[0]
|
1638
|
-
):
|
1639
|
-
raise ValueError(
|
1640
|
-
f"Trying to overwrite {sbml_dfs.compartmentalized_species.shape[0]}"
|
1641
|
-
" compartmentalized species with "
|
1642
|
-
f"{updated_compartmentalized_species.shape[0]}"
|
1643
|
-
)
|
1644
|
-
|
1645
|
-
if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
|
1646
|
-
raise ValueError("Some species compartments are still missing")
|
1647
|
-
|
1648
|
-
sbml_dfs.compartmentalized_species = updated_compartmentalized_species
|
1649
|
-
|
1650
|
-
return sbml_dfs
|
1651
|
-
|
1652
|
-
|
1653
|
-
def infer_sbo_terms(sbml_dfs: SBML_dfs) -> SBML_dfs:
|
1654
|
-
"""
|
1655
|
-
Infer SBO Terms
|
1656
|
-
|
1657
|
-
Define SBO terms based on stoichiometry for reaction_species with missing terms
|
1658
|
-
|
1659
|
-
Parameters:
|
1660
|
-
----------
|
1661
|
-
sbml_dfs: sbml.SBML_dfs
|
1662
|
-
A relational pathway model
|
1663
|
-
|
1664
|
-
Returns:
|
1665
|
-
----------
|
1666
|
-
sbml_dfs: sbml.SBML_dfs
|
1667
|
-
A relational pathway model (with missing/invalid reaction species sbo_terms resolved)
|
1668
|
-
|
1669
|
-
"""
|
1670
|
-
|
1671
|
-
valid_sbo_terms = sbml_dfs.reaction_species[
|
1672
|
-
sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
|
1673
|
-
]
|
1674
|
-
|
1675
|
-
invalid_sbo_terms = sbml_dfs.reaction_species[
|
1676
|
-
~sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
|
1677
|
-
]
|
1678
|
-
|
1679
|
-
if not all(sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].notnull()):
|
1680
|
-
raise ValueError(
|
1681
|
-
"All sbml_dfs.reaction_species[SBML_DFS.SBO_TERM] must be not null"
|
1682
|
-
)
|
1683
|
-
if invalid_sbo_terms.shape[0] == 0:
|
1684
|
-
logger.info("All sbo_terms were valid; returning input sbml_dfs")
|
1685
|
-
return sbml_dfs
|
1686
|
-
|
1687
|
-
logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
|
1688
|
-
|
1689
|
-
# add missing/invalid terms based on stoichiometry
|
1690
|
-
invalid_sbo_terms.loc[
|
1691
|
-
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
|
1692
|
-
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
|
1693
|
-
|
1694
|
-
invalid_sbo_terms.loc[
|
1695
|
-
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
|
1696
|
-
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
|
1697
|
-
|
1698
|
-
invalid_sbo_terms.loc[
|
1699
|
-
invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
|
1700
|
-
] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
|
1701
|
-
|
1702
|
-
updated_reaction_species = pd.concat(
|
1703
|
-
[valid_sbo_terms, invalid_sbo_terms]
|
1704
|
-
).sort_index()
|
1705
|
-
|
1706
|
-
if sbml_dfs.reaction_species.shape[0] != updated_reaction_species.shape[0]:
|
1707
|
-
raise ValueError(
|
1708
|
-
f"Trying to overwrite {sbml_dfs.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
|
1709
|
-
)
|
1710
|
-
sbml_dfs.reaction_species = updated_reaction_species
|
1711
|
-
|
1712
|
-
return sbml_dfs
|
1713
|
-
|
1714
|
-
|
1715
|
-
def name_compartmentalized_species(sbml_dfs):
|
1716
|
-
"""
|
1717
|
-
Name Compartmentalized Species
|
1718
|
-
|
1719
|
-
Rename compartmentalized species if they have the same
|
1720
|
-
name as their species
|
1721
|
-
|
1722
|
-
Parameters
|
1723
|
-
----------
|
1724
|
-
sbml_dfs : SBML_dfs
|
1725
|
-
A model formed by aggregating pathways
|
1726
|
-
|
1727
|
-
Returns:
|
1728
|
-
----------
|
1729
|
-
sbml_dfs
|
1730
|
-
"""
|
1731
|
-
|
1732
|
-
augmented_cspecies = sbml_dfs.compartmentalized_species.merge(
|
1733
|
-
sbml_dfs.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
|
1734
|
-
).merge(
|
1735
|
-
sbml_dfs.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
|
1736
|
-
)
|
1737
|
-
augmented_cspecies[SBML_DFS.SC_NAME] = [
|
1738
|
-
f"{s} [{c}]" if sc == s else sc
|
1739
|
-
for sc, c, s in zip(
|
1740
|
-
augmented_cspecies[SBML_DFS.SC_NAME],
|
1741
|
-
augmented_cspecies[SBML_DFS.C_NAME],
|
1742
|
-
augmented_cspecies[SBML_DFS.S_NAME],
|
1743
|
-
)
|
1744
|
-
]
|
1745
|
-
|
1746
|
-
sbml_dfs.compartmentalized_species = augmented_cspecies.loc[
|
1747
|
-
:, sbml_dfs.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
|
1748
|
-
]
|
1749
|
-
|
1750
|
-
return sbml_dfs
|
1751
|
-
|
1752
|
-
|
1753
|
-
def export_sbml_dfs(
|
1754
|
-
model_prefix: str,
|
1755
|
-
sbml_dfs: SBML_dfs,
|
1756
|
-
outdir: str,
|
1757
|
-
overwrite: bool = False,
|
1758
|
-
dogmatic: bool = True,
|
1759
|
-
) -> None:
|
1760
|
-
"""
|
1761
|
-
Export SBML_dfs
|
1762
|
-
|
1763
|
-
Export summaries of species identifiers and each table underlying
|
1764
|
-
an SBML_dfs pathway model
|
1765
|
-
|
1766
|
-
Params
|
1767
|
-
------
|
1768
|
-
model_prefix: str
|
1769
|
-
Label to prepend to all exported files
|
1770
|
-
sbml_dfs: sbml.SBML_dfs
|
1771
|
-
A pathway model
|
1772
|
-
outdir: str
|
1773
|
-
Path to an existing directory where results should be saved
|
1774
|
-
overwrite: bool
|
1775
|
-
Should the directory be overwritten if it already exists?
|
1776
|
-
dogmatic: bool
|
1777
|
-
If True then treat genes, transcript, and proteins as separate species. If False
|
1778
|
-
then treat them interchangeably.
|
1258
|
+
species_data_table : str
|
1259
|
+
Name of the species data table to select
|
1779
1260
|
|
1780
1261
|
Returns
|
1781
1262
|
-------
|
1782
|
-
|
1783
|
-
|
1784
|
-
"""
|
1785
|
-
|
1786
|
-
if not isinstance(model_prefix, str):
|
1787
|
-
raise TypeError(f"model_prefix was a {type(model_prefix)} " "and must be a str")
|
1788
|
-
if not isinstance(sbml_dfs, SBML_dfs):
|
1789
|
-
raise TypeError(
|
1790
|
-
f"sbml_dfs was a {type(sbml_dfs)} and must" " be an sbml.SBML_dfs"
|
1791
|
-
)
|
1792
|
-
|
1793
|
-
# filter to identifiers which make sense when mapping from ids -> species
|
1794
|
-
species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
|
1795
|
-
sbml_dfs,
|
1796
|
-
dogmatic=dogmatic,
|
1797
|
-
)
|
1798
|
-
|
1799
|
-
try:
|
1800
|
-
utils.initialize_dir(outdir, overwrite=overwrite)
|
1801
|
-
except FileExistsError:
|
1802
|
-
logger.warning(
|
1803
|
-
f"Directory {outdir} already exists and overwrite is False. "
|
1804
|
-
"Files will be added to the existing directory."
|
1805
|
-
)
|
1806
|
-
with open_fs(outdir, writeable=True) as fs:
|
1807
|
-
species_identifiers_path = (
|
1808
|
-
model_prefix + CPR_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
|
1809
|
-
)
|
1810
|
-
with fs.openbin(species_identifiers_path, "w") as f:
|
1811
|
-
species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
|
1812
|
-
f, sep="\t", index=False
|
1813
|
-
)
|
1814
|
-
|
1815
|
-
# export jsons
|
1816
|
-
species_path = model_prefix + CPR_STANDARD_OUTPUTS.SPECIES
|
1817
|
-
reactions_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTIONS
|
1818
|
-
reation_species_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTION_SPECIES
|
1819
|
-
compartments_path = model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTS
|
1820
|
-
compartmentalized_species_path = (
|
1821
|
-
model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
|
1822
|
-
)
|
1823
|
-
with fs.openbin(species_path, "w") as f:
|
1824
|
-
sbml_dfs.species[[SBML_DFS.S_NAME]].to_json(f)
|
1825
|
-
|
1826
|
-
with fs.openbin(reactions_path, "w") as f:
|
1827
|
-
sbml_dfs.reactions[[SBML_DFS.R_NAME]].to_json(f)
|
1828
|
-
|
1829
|
-
with fs.openbin(reation_species_path, "w") as f:
|
1830
|
-
sbml_dfs.reaction_species.to_json(f)
|
1831
|
-
|
1832
|
-
with fs.openbin(compartments_path, "w") as f:
|
1833
|
-
sbml_dfs.compartments[[SBML_DFS.C_NAME]].to_json(f)
|
1834
|
-
|
1835
|
-
with fs.openbin(compartmentalized_species_path, "w") as f:
|
1836
|
-
sbml_dfs.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
|
1837
|
-
f
|
1838
|
-
)
|
1839
|
-
|
1840
|
-
return None
|
1841
|
-
|
1842
|
-
|
1843
|
-
def sbml_dfs_from_edgelist(
|
1844
|
-
interaction_edgelist: pd.DataFrame,
|
1845
|
-
species_df: pd.DataFrame,
|
1846
|
-
compartments_df: pd.DataFrame,
|
1847
|
-
interaction_source: source.Source,
|
1848
|
-
upstream_stoichiometry: int = 0,
|
1849
|
-
downstream_stoichiometry: int = 1,
|
1850
|
-
downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
|
1851
|
-
keep_species_data: bool | str = False,
|
1852
|
-
keep_reactions_data: bool | str = False,
|
1853
|
-
) -> SBML_dfs:
|
1854
|
-
"""
|
1855
|
-
Create SBML_dfs from interaction edgelist.
|
1856
|
-
|
1857
|
-
Combines a set of molecular interactions into a mechanistic SBML_dfs model
|
1858
|
-
by processing interaction data, species information, and compartment definitions.
|
1859
|
-
|
1860
|
-
Parameters
|
1861
|
-
----------
|
1862
|
-
interaction_edgelist : pd.DataFrame
|
1863
|
-
Table containing molecular interactions with columns:
|
1864
|
-
- upstream_name : str, matches "s_name" from species_df
|
1865
|
-
- downstream_name : str, matches "s_name" from species_df
|
1866
|
-
- upstream_compartment : str, matches "c_name" from compartments_df
|
1867
|
-
- downstream_compartment : str, matches "c_name" from compartments_df
|
1868
|
-
- r_name : str, name for the interaction
|
1869
|
-
- sbo_term : str, SBO term defining interaction type
|
1870
|
-
- r_Identifiers : identifiers.Identifiers, supporting identifiers
|
1871
|
-
- r_isreversible : bool, whether reaction is reversible
|
1872
|
-
species_df : pd.DataFrame
|
1873
|
-
Table defining molecular species with columns:
|
1874
|
-
- s_name : str, name of molecular species
|
1875
|
-
- s_Identifiers : identifiers.Identifiers, species identifiers
|
1876
|
-
compartments_df : pd.DataFrame
|
1877
|
-
Table defining compartments with columns:
|
1878
|
-
- c_name : str, name of compartment
|
1879
|
-
- c_Identifiers : identifiers.Identifiers, compartment identifiers
|
1880
|
-
interaction_source : source.Source
|
1881
|
-
Source object linking model entities to interaction source
|
1882
|
-
upstream_stoichiometry : int, default 0
|
1883
|
-
Stoichiometry of upstream species in reactions
|
1884
|
-
downstream_stoichiometry : int, default 1
|
1885
|
-
Stoichiometry of downstream species in reactions
|
1886
|
-
downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
|
1887
|
-
SBO term for downstream reactant type
|
1888
|
-
keep_species_data : bool or str, default False
|
1889
|
-
Whether to preserve extra species columns. If True, saves as 'source' label.
|
1890
|
-
If string, uses as custom label. If False, discards extra data.
|
1891
|
-
keep_reactions_data : bool or str, default False
|
1892
|
-
Whether to preserve extra reaction columns. If True, saves as 'source' label.
|
1893
|
-
If string, uses as custom label. If False, discards extra data.
|
1894
|
-
|
1895
|
-
Returns
|
1896
|
-
-------
|
1897
|
-
SBML_dfs
|
1898
|
-
Validated SBML data structure containing compartments, species,
|
1899
|
-
compartmentalized species, reactions, and reaction species tables.
|
1900
|
-
"""
|
1901
|
-
# 1. Validate inputs
|
1902
|
-
_edgelist_validate_inputs(interaction_edgelist, species_df, compartments_df)
|
1903
|
-
|
1904
|
-
# 2. Identify which extra columns to preserve
|
1905
|
-
extra_columns = _edgelist_identify_extra_columns(
|
1906
|
-
interaction_edgelist, species_df, keep_reactions_data, keep_species_data
|
1907
|
-
)
|
1908
|
-
|
1909
|
-
# 3. Process compartments and species tables
|
1910
|
-
processed_compartments = _edgelist_process_compartments(
|
1911
|
-
compartments_df, interaction_source
|
1912
|
-
)
|
1913
|
-
processed_species, species_data = _edgelist_process_species(
|
1914
|
-
species_df, interaction_source, extra_columns["species"]
|
1915
|
-
)
|
1916
|
-
|
1917
|
-
# 4. Create compartmentalized species
|
1918
|
-
comp_species = _edgelist_create_compartmentalized_species(
|
1919
|
-
interaction_edgelist,
|
1920
|
-
processed_species,
|
1921
|
-
processed_compartments,
|
1922
|
-
interaction_source,
|
1923
|
-
)
|
1924
|
-
|
1925
|
-
# 5. Create reactions and reaction species
|
1926
|
-
reactions, reaction_species, reactions_data = (
|
1927
|
-
_edgelist_create_reactions_and_species(
|
1928
|
-
interaction_edgelist,
|
1929
|
-
comp_species,
|
1930
|
-
processed_species,
|
1931
|
-
processed_compartments,
|
1932
|
-
interaction_source,
|
1933
|
-
upstream_stoichiometry,
|
1934
|
-
downstream_stoichiometry,
|
1935
|
-
downstream_sbo_name,
|
1936
|
-
extra_columns["reactions"],
|
1937
|
-
)
|
1938
|
-
)
|
1939
|
-
|
1940
|
-
# 6. Assemble final SBML_dfs object
|
1941
|
-
sbml_model = _edgelist_assemble_sbml_model(
|
1942
|
-
processed_compartments,
|
1943
|
-
processed_species,
|
1944
|
-
comp_species,
|
1945
|
-
reactions,
|
1946
|
-
reaction_species,
|
1947
|
-
species_data,
|
1948
|
-
reactions_data,
|
1949
|
-
keep_species_data,
|
1950
|
-
keep_reactions_data,
|
1951
|
-
extra_columns,
|
1952
|
-
)
|
1953
|
-
|
1954
|
-
return sbml_model
|
1955
|
-
|
1956
|
-
return sbml_model
|
1957
|
-
|
1958
|
-
|
1959
|
-
def species_type_types(x):
|
1960
|
-
"""Assign a high-level molecule type to a molecular species"""
|
1961
|
-
|
1962
|
-
if isinstance(x, identifiers.Identifiers):
|
1963
|
-
if x.filter(["chebi"]):
|
1964
|
-
return "metabolite"
|
1965
|
-
elif x.filter(["molodex"]):
|
1966
|
-
return "drug"
|
1967
|
-
else:
|
1968
|
-
return "protein"
|
1969
|
-
else:
|
1970
|
-
return "unknown"
|
1971
|
-
|
1972
|
-
|
1973
|
-
def stub_ids(ids):
|
1974
|
-
if len(ids) == 0:
|
1975
|
-
return pd.DataFrame(
|
1976
|
-
{
|
1977
|
-
IDENTIFIERS.ONTOLOGY: [None],
|
1978
|
-
IDENTIFIERS.IDENTIFIER: [None],
|
1979
|
-
IDENTIFIERS.URL: [None],
|
1980
|
-
IDENTIFIERS.BQB: [None],
|
1981
|
-
}
|
1982
|
-
)
|
1983
|
-
else:
|
1984
|
-
return pd.DataFrame(ids)
|
1985
|
-
|
1263
|
+
pd.DataFrame
|
1264
|
+
The selected species data table
|
1986
1265
|
|
1987
|
-
|
1988
|
-
|
1989
|
-
|
1266
|
+
Raises
|
1267
|
+
------
|
1268
|
+
ValueError
|
1269
|
+
If species_data_table is not found
|
1270
|
+
"""
|
1271
|
+
# Check if species_data_table exists in sbml_dfs.species_data
|
1272
|
+
if species_data_table not in self.species_data:
|
1273
|
+
raise ValueError(
|
1274
|
+
f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
|
1275
|
+
f"Available tables: {self.species_data.keys()}"
|
1276
|
+
)
|
1990
1277
|
|
1991
|
-
|
1992
|
-
|
1278
|
+
# Get the species data
|
1279
|
+
return self.species_data[species_data_table]
|
1993
1280
|
|
1994
|
-
|
1995
|
-
|
1281
|
+
def species_status(self, s_id: str) -> pd.DataFrame:
|
1282
|
+
"""
|
1283
|
+
Species Status
|
1996
1284
|
|
1997
|
-
|
1285
|
+
Return all of the reactions a species participates in.
|
1998
1286
|
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2002
|
-
.replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
|
2003
|
-
)
|
1287
|
+
Parameters:
|
1288
|
+
s_id: str
|
1289
|
+
A species ID
|
2004
1290
|
|
2005
|
-
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2012
|
-
|
2013
|
-
reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
|
1291
|
+
Returns:
|
1292
|
+
pd.DataFrame, one row per reaction the species participates in
|
1293
|
+
with columns:
|
1294
|
+
- sc_name: str, name of the compartment the species participates in
|
1295
|
+
- stoichiometry: float, stoichiometry of the species in the reaction
|
1296
|
+
- r_name: str, name of the reaction
|
1297
|
+
- r_formula_str: str, human-readable formula of the reaction
|
1298
|
+
"""
|
2014
1299
|
|
2015
|
-
|
1300
|
+
if s_id not in self.species.index:
|
1301
|
+
raise ValueError(f"{s_id} not found in species table")
|
2016
1302
|
|
1303
|
+
matching_species = self.species.loc[s_id]
|
2017
1304
|
|
2018
|
-
|
2019
|
-
|
2020
|
-
) -> pd.DataFrame:
|
1305
|
+
if not isinstance(matching_species, pd.Series):
|
1306
|
+
raise ValueError(f"{s_id} did not match a single species")
|
2021
1307
|
|
2022
|
-
|
2023
|
-
|
2024
|
-
|
2025
|
-
|
2026
|
-
)
|
2027
|
-
if "new" not in reaction_species_w_roles.columns:
|
2028
|
-
raise ValueError(
|
2029
|
-
"The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2030
|
-
)
|
2031
|
-
# check that new is a boolean column
|
2032
|
-
if reaction_species_w_roles["new"].dtype != bool:
|
2033
|
-
raise ValueError(
|
2034
|
-
"The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
2035
|
-
)
|
1308
|
+
# find all rxns species participate in
|
1309
|
+
matching_compartmentalized_species = self.compartmentalized_species[
|
1310
|
+
self.compartmentalized_species.s_id.isin([s_id])
|
1311
|
+
]
|
2036
1312
|
|
2037
|
-
|
2038
|
-
|
2039
|
-
|
2040
|
-
.tolist()
|
2041
|
-
)
|
1313
|
+
rxns_participating = self.reaction_species[
|
1314
|
+
self.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
|
1315
|
+
]
|
2042
1316
|
|
2043
|
-
|
2044
|
-
|
2045
|
-
|
2046
|
-
|
1317
|
+
# find all participants in these rxns
|
1318
|
+
full_rxns_participating = self.reaction_species[
|
1319
|
+
self.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
|
1320
|
+
].merge(
|
1321
|
+
self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
|
2047
1322
|
)
|
2048
1323
|
|
2049
|
-
|
2050
|
-
|
2051
|
-
reaction_species_w_roles
|
2052
|
-
# drop already filtered reactions
|
2053
|
-
.query("r_id not in @reactions_with_lost_defining_members")
|
2054
|
-
.query("sbo_role == 'REQUIRED'")
|
2055
|
-
# which entries which have some required attribute have all False values for that attribute
|
2056
|
-
.groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
|
2057
|
-
.agg({"new": "any"})
|
2058
|
-
.query("new == False")
|
2059
|
-
.index.get_level_values(SBML_DFS.R_ID)
|
2060
|
-
)
|
1324
|
+
participating_rids = full_rxns_participating[SBML_DFS.R_ID].unique()
|
1325
|
+
reaction_descriptions = self.reaction_summaries(r_ids=participating_rids)
|
2061
1326
|
|
2062
|
-
|
2063
|
-
|
2064
|
-
|
2065
|
-
|
1327
|
+
status = (
|
1328
|
+
full_rxns_participating.loc[
|
1329
|
+
full_rxns_participating[SBML_DFS.SC_ID].isin(
|
1330
|
+
matching_compartmentalized_species.index.values.tolist()
|
1331
|
+
),
|
1332
|
+
[SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
|
1333
|
+
]
|
1334
|
+
.merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
|
1335
|
+
.reset_index(drop=True)
|
1336
|
+
.drop(SBML_DFS.R_ID, axis=1)
|
2066
1337
|
)
|
2067
1338
|
|
2068
|
-
|
2069
|
-
reactions_with_lost_requirements
|
2070
|
-
)
|
1339
|
+
return status
|
2071
1340
|
|
2072
|
-
|
1341
|
+
def validate(self):
|
1342
|
+
"""
|
1343
|
+
Validate the SBML_dfs structure and relationships.
|
2073
1344
|
|
1345
|
+
Checks:
|
1346
|
+
- Schema existence
|
1347
|
+
- Required tables presence
|
1348
|
+
- Individual table structure
|
1349
|
+
- Primary key uniqueness
|
1350
|
+
- Foreign key relationships
|
1351
|
+
- Optional data table validity
|
1352
|
+
- Reaction species validity
|
2074
1353
|
|
2075
|
-
|
2076
|
-
|
2077
|
-
|
2078
|
-
|
2079
|
-
|
1354
|
+
Raises
|
1355
|
+
------
|
1356
|
+
ValueError
|
1357
|
+
If any validation check fails
|
1358
|
+
"""
|
2080
1359
|
|
2081
|
-
|
2082
|
-
|
1360
|
+
if not hasattr(self, "schema"):
|
1361
|
+
raise ValueError("No schema found")
|
2083
1362
|
|
2084
|
-
|
2085
|
-
|
2086
|
-
A pathway representation
|
2087
|
-
sc_ids (list[str])
|
2088
|
-
A list of compartmentalized species ids (sc_ids) which will be removed.
|
1363
|
+
required_tables = self._required_entities
|
1364
|
+
schema_tables = set(self.schema.keys())
|
2089
1365
|
|
2090
|
-
|
2091
|
-
|
2092
|
-
|
2093
|
-
|
1366
|
+
extra_tables = schema_tables.difference(required_tables)
|
1367
|
+
if len(extra_tables) != 0:
|
1368
|
+
logger.debug(
|
1369
|
+
f"{len(extra_tables)} unexpected tables found: "
|
1370
|
+
f"{', '.join(extra_tables)}"
|
1371
|
+
)
|
2094
1372
|
|
2095
|
-
|
1373
|
+
missing_tables = required_tables.difference(schema_tables)
|
1374
|
+
if len(missing_tables) != 0:
|
1375
|
+
raise ValueError(
|
1376
|
+
f"Missing {len(missing_tables)} required tables: "
|
1377
|
+
f"{', '.join(missing_tables)}"
|
1378
|
+
)
|
2096
1379
|
|
2097
|
-
|
2098
|
-
|
2099
|
-
|
2100
|
-
)
|
1380
|
+
# check individual tables
|
1381
|
+
for table in required_tables:
|
1382
|
+
self._validate_table(table)
|
2101
1383
|
|
2102
|
-
|
2103
|
-
|
1384
|
+
# check whether pks and fks agree
|
1385
|
+
self._check_pk_fk_correspondence()
|
2104
1386
|
|
2105
|
-
|
1387
|
+
# check optional data tables:
|
1388
|
+
for k, v in self.species_data.items():
|
1389
|
+
try:
|
1390
|
+
self._validate_species_data(v)
|
1391
|
+
except ValueError as e:
|
1392
|
+
raise ValueError(f"species data {k} was invalid.") from e
|
2106
1393
|
|
1394
|
+
for k, v in self.reactions_data.items():
|
1395
|
+
try:
|
1396
|
+
self._validate_reactions_data(v)
|
1397
|
+
except ValueError as e:
|
1398
|
+
raise ValueError(f"reactions data {k} was invalid.") from e
|
2107
1399
|
|
2108
|
-
|
2109
|
-
|
2110
|
-
Validate a standalone table against the SBML_dfs schema.
|
1400
|
+
# validate reaction_species sbo_terms and stoi
|
1401
|
+
self._validate_reaction_species()
|
2111
1402
|
|
2112
|
-
|
2113
|
-
|
2114
|
-
|
1403
|
+
def validate_and_resolve(self):
|
1404
|
+
"""
|
1405
|
+
Validate and attempt to automatically fix common issues.
|
2115
1406
|
|
2116
|
-
|
2117
|
-
|
2118
|
-
|
2119
|
-
|
2120
|
-
table_name : str
|
2121
|
-
Name of the table in the SBML_dfs schema
|
1407
|
+
This method iteratively:
|
1408
|
+
1. Attempts validation
|
1409
|
+
2. If validation fails, tries to resolve the issue
|
1410
|
+
3. Repeats until validation passes or issue cannot be resolved
|
2122
1411
|
|
2123
1412
|
Raises
|
2124
1413
|
------
|
2125
1414
|
ValueError
|
2126
|
-
|
2127
|
-
|
2128
|
-
if table_name not in SBML_DFS_SCHEMA.SCHEMA:
|
2129
|
-
raise ValueError(
|
2130
|
-
f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
|
2131
|
-
f"Valid tables are: {', '.join(SBML_DFS_SCHEMA.SCHEMA.keys())}"
|
2132
|
-
)
|
1415
|
+
If validation fails and cannot be automatically resolved
|
1416
|
+
"""
|
2133
1417
|
|
2134
|
-
|
2135
|
-
|
1418
|
+
current_exception = None
|
1419
|
+
validated = False
|
2136
1420
|
|
1421
|
+
while not validated:
|
1422
|
+
try:
|
1423
|
+
self.validate()
|
1424
|
+
validated = True
|
1425
|
+
except Exception as e:
|
1426
|
+
e_str = str(e)
|
1427
|
+
if e_str == current_exception:
|
1428
|
+
logger.warning(
|
1429
|
+
"Automated resolution of an Exception was attempted but failed"
|
1430
|
+
)
|
1431
|
+
raise e
|
2137
1432
|
|
2138
|
-
|
2139
|
-
|
2140
|
-
table_schema: dict,
|
2141
|
-
table_name: str,
|
2142
|
-
) -> None:
|
2143
|
-
"""
|
2144
|
-
Core validation logic for SBML_dfs tables.
|
1433
|
+
# try to resolve
|
1434
|
+
self._attempt_resolve(e)
|
2145
1435
|
|
2146
|
-
|
2147
|
-
|
1436
|
+
# =============================================================================
|
1437
|
+
# PRIVATE METHODS (ALPHABETICAL ORDER)
|
1438
|
+
# =============================================================================
|
2148
1439
|
|
2149
|
-
|
2150
|
-
|
2151
|
-
|
2152
|
-
|
2153
|
-
|
2154
|
-
|
2155
|
-
|
2156
|
-
|
1440
|
+
def _attempt_resolve(self, e):
|
1441
|
+
str_e = str(e)
|
1442
|
+
if str_e == "compartmentalized_species included missing c_id values":
|
1443
|
+
logger.warning(str_e)
|
1444
|
+
logger.warning(
|
1445
|
+
"Attempting to resolve with infer_uncompartmentalized_species_location()"
|
1446
|
+
)
|
1447
|
+
self.infer_uncompartmentalized_species_location()
|
1448
|
+
elif re.search("sbo_terms were not defined", str_e):
|
1449
|
+
logger.warning(str_e)
|
1450
|
+
logger.warning("Attempting to resolve with infer_sbo_terms()")
|
1451
|
+
self.infer_sbo_terms()
|
1452
|
+
else:
|
1453
|
+
logger.warning(
|
1454
|
+
"An error occurred which could not be automatically resolved"
|
1455
|
+
)
|
1456
|
+
raise e
|
2157
1457
|
|
2158
|
-
|
2159
|
-
|
2160
|
-
|
2161
|
-
|
2162
|
-
|
2163
|
-
|
2164
|
-
|
2165
|
-
|
2166
|
-
- Empty table
|
2167
|
-
"""
|
2168
|
-
if not isinstance(table_data, pd.DataFrame):
|
2169
|
-
raise ValueError(
|
2170
|
-
f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
|
1458
|
+
def _check_pk_fk_correspondence(self):
|
1459
|
+
"""
|
1460
|
+
Check whether primary keys and foreign keys agree for all tables in the schema.
|
1461
|
+
Raises ValueError if any correspondence fails.
|
1462
|
+
"""
|
1463
|
+
|
1464
|
+
pk_df = pd.DataFrame(
|
1465
|
+
[{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
|
2171
1466
|
)
|
2172
1467
|
|
2173
|
-
|
2174
|
-
|
2175
|
-
|
2176
|
-
|
2177
|
-
|
1468
|
+
fk_df = (
|
1469
|
+
pd.DataFrame(
|
1470
|
+
[
|
1471
|
+
{"fk_table": k, "fk": v["fk"]}
|
1472
|
+
for k, v in self.schema.items()
|
1473
|
+
if "fk" in v.keys()
|
1474
|
+
]
|
1475
|
+
)
|
1476
|
+
.set_index("fk_table")["fk"]
|
1477
|
+
.apply(pd.Series)
|
1478
|
+
.reset_index()
|
1479
|
+
.melt(id_vars="fk_table")
|
1480
|
+
.drop(["variable"], axis=1)
|
1481
|
+
.rename(columns={"value": "key"})
|
2178
1482
|
)
|
2179
1483
|
|
2180
|
-
|
2181
|
-
if len(set(table_data.index.tolist())) != table_data.shape[0]:
|
2182
|
-
duplicated_pks = table_data.index.value_counts()
|
2183
|
-
duplicated_pks = duplicated_pks[duplicated_pks > 1]
|
1484
|
+
pk_fk_correspondences = pk_df.merge(fk_df)
|
2184
1485
|
|
2185
|
-
|
2186
|
-
|
2187
|
-
|
2188
|
-
|
2189
|
-
|
1486
|
+
for i in range(0, pk_fk_correspondences.shape[0]):
|
1487
|
+
pk_table_keys = set(
|
1488
|
+
getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
|
1489
|
+
)
|
1490
|
+
if None in pk_table_keys:
|
1491
|
+
raise ValueError(
|
1492
|
+
f"{pk_fk_correspondences['pk_table'][i]} had "
|
1493
|
+
"missing values in its index"
|
1494
|
+
)
|
2190
1495
|
|
2191
|
-
|
2192
|
-
|
2193
|
-
|
1496
|
+
fk_table_keys = set(
|
1497
|
+
getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
|
1498
|
+
:, pk_fk_correspondences["key"][i]
|
1499
|
+
]
|
1500
|
+
)
|
1501
|
+
if None in fk_table_keys:
|
1502
|
+
raise ValueError(
|
1503
|
+
f"{pk_fk_correspondences['fk_table'][i]} included "
|
1504
|
+
f"missing {pk_fk_correspondences['key'][i]} values"
|
1505
|
+
)
|
2194
1506
|
|
2195
|
-
|
2196
|
-
|
2197
|
-
|
2198
|
-
|
2199
|
-
|
2200
|
-
|
1507
|
+
# all foreign keys need to match a primary key
|
1508
|
+
extra_fks = fk_table_keys.difference(pk_table_keys)
|
1509
|
+
if len(extra_fks) != 0:
|
1510
|
+
raise ValueError(
|
1511
|
+
f"{len(extra_fks)} distinct "
|
1512
|
+
f"{pk_fk_correspondences['key'][i]} values were"
|
1513
|
+
f" found in {pk_fk_correspondences['fk_table'][i]} "
|
1514
|
+
f"but missing from {pk_fk_correspondences['pk_table'][i]}."
|
1515
|
+
" All foreign keys must have a matching primary key.\n\n"
|
1516
|
+
f"Extra key are: {', '.join(extra_fks)}"
|
1517
|
+
)
|
2201
1518
|
|
2202
|
-
|
2203
|
-
|
2204
|
-
|
2205
|
-
|
2206
|
-
|
2207
|
-
)
|
1519
|
+
def _find_underspecified_reactions_by_scids(
|
1520
|
+
self, sc_ids: Iterable[str]
|
1521
|
+
) -> set[str]:
|
1522
|
+
"""
|
1523
|
+
Find Underspecified reactions
|
2208
1524
|
|
2209
|
-
|
2210
|
-
|
2211
|
-
raise ValueError(f"{table_name} contained no entries")
|
1525
|
+
Identify reactions which should be removed if a set of molecular species are removed
|
1526
|
+
from the system.
|
2212
1527
|
|
1528
|
+
Parameters
|
1529
|
+
----------
|
1530
|
+
sc_ids : list[str]
|
1531
|
+
A list of compartmentalized species ids (sc_ids) which will be removed.
|
2213
1532
|
|
2214
|
-
|
2215
|
-
|
2216
|
-
|
1533
|
+
Returns
|
1534
|
+
-------
|
1535
|
+
underspecified_reactions : set[str]
|
1536
|
+
A set of reactions which should be removed because they will not occur once
|
1537
|
+
"sc_ids" are removed.
|
1538
|
+
"""
|
1539
|
+
updated_reaction_species = self.reaction_species.copy()
|
1540
|
+
updated_reaction_species["new"] = ~updated_reaction_species[
|
1541
|
+
SBML_DFS.SC_ID
|
1542
|
+
].isin(sc_ids)
|
1543
|
+
updated_reaction_species = sbml_dfs_utils.add_sbo_role(updated_reaction_species)
|
1544
|
+
underspecified_reactions = sbml_dfs_utils.find_underspecified_reactions(
|
1545
|
+
updated_reaction_species
|
1546
|
+
)
|
1547
|
+
return underspecified_reactions
|
2217
1548
|
|
2218
|
-
|
2219
|
-
|
2220
|
-
|
2221
|
-
|
2222
|
-
|
2223
|
-
|
2224
|
-
|
2225
|
-
promiscuous_component_identifiers = pd.Series(
|
2226
|
-
data=[True] * len(promiscuous_component_identifiers_index),
|
2227
|
-
index=promiscuous_component_identifiers_index,
|
2228
|
-
name="is_shared_component",
|
2229
|
-
dtype=bool,
|
2230
|
-
)
|
1549
|
+
def _get_unused_cspecies(self) -> set[str]:
|
1550
|
+
"""Returns a set of compartmentalized species
|
1551
|
+
that are not part of any reactions"""
|
1552
|
+
sc_ids = set(self.compartmentalized_species.index) - set(
|
1553
|
+
self.reaction_species[SBML_DFS.SC_ID]
|
1554
|
+
)
|
1555
|
+
return sc_ids # type: ignore
|
2231
1556
|
|
2232
|
-
|
2233
|
-
|
1557
|
+
def _get_unused_species(self) -> set[str]:
|
1558
|
+
"""Returns a list of species that are not part of any reactions"""
|
1559
|
+
s_ids = set(self.species.index) - set(
|
1560
|
+
self.compartmentalized_species[SBML_DFS.S_ID]
|
1561
|
+
)
|
1562
|
+
return s_ids # type: ignore
|
2234
1563
|
|
2235
|
-
|
2236
|
-
|
2237
|
-
left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
|
2238
|
-
right_index=True,
|
2239
|
-
how="left",
|
2240
|
-
)
|
1564
|
+
def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
1565
|
+
"""Removes compartmentalized species from the model
|
2241
1566
|
|
2242
|
-
|
2243
|
-
|
2244
|
-
|
2245
|
-
# drop identifiers shared as components across many species
|
2246
|
-
filtered_bqb_has_parts = filtered_bqb_has_parts[
|
2247
|
-
~filtered_bqb_has_parts["is_shared_component"]
|
2248
|
-
].drop(["is_shared_component"], axis=1)
|
1567
|
+
This should not be directly used by the user, as it can lead to
|
1568
|
+
invalid reactions when removing species without a logic to decide
|
1569
|
+
if the reaction needs to be removed as well.
|
2249
1570
|
|
2250
|
-
|
1571
|
+
Args:
|
1572
|
+
sc_ids (Iterable[str]): the compartmentalized species to remove
|
1573
|
+
"""
|
1574
|
+
# Remove compartmentalized species
|
1575
|
+
self.compartmentalized_species = self.compartmentalized_species.drop(
|
1576
|
+
index=list(sc_ids)
|
1577
|
+
)
|
1578
|
+
# remove corresponding reactions_species
|
1579
|
+
self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
|
2251
1580
|
|
1581
|
+
def _remove_entity_data(self, entity_type: str, label: str) -> None:
|
1582
|
+
"""
|
1583
|
+
Remove data from species_data or reactions_data by table name and label.
|
2252
1584
|
|
2253
|
-
|
2254
|
-
|
2255
|
-
|
2256
|
-
|
2257
|
-
|
2258
|
-
|
2259
|
-
Validate input DataFrames have required columns.
|
1585
|
+
Parameters
|
1586
|
+
----------
|
1587
|
+
entity_type : str
|
1588
|
+
Name of the table to remove data from ('species' or 'reactions')
|
1589
|
+
label : str
|
1590
|
+
Label of the data to remove
|
2260
1591
|
|
2261
|
-
|
2262
|
-
|
2263
|
-
|
2264
|
-
|
2265
|
-
|
2266
|
-
|
2267
|
-
compartments_df : pd.DataFrame
|
2268
|
-
Compartments data to validate
|
2269
|
-
"""
|
1592
|
+
Notes
|
1593
|
+
-----
|
1594
|
+
If the label does not exist, a warning will be logged that includes the existing labels.
|
1595
|
+
"""
|
1596
|
+
if entity_type not in ENTITIES_W_DATA:
|
1597
|
+
raise ValueError("table_name must be either 'species' or 'reactions'")
|
2270
1598
|
|
2271
|
-
|
2272
|
-
|
2273
|
-
|
2274
|
-
|
2275
|
-
|
2276
|
-
|
2277
|
-
|
2278
|
-
|
2279
|
-
f"{', '.join(missing_required_fields)} are required variables"
|
2280
|
-
' in "compartments_df" but were not present in the input file.'
|
2281
|
-
)
|
1599
|
+
data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
|
1600
|
+
if label not in data_dict:
|
1601
|
+
existing_labels = list(data_dict.keys())
|
1602
|
+
logger.warning(
|
1603
|
+
f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
|
1604
|
+
f"Existing labels: {existing_labels}"
|
1605
|
+
)
|
1606
|
+
return
|
2282
1607
|
|
2283
|
-
|
2284
|
-
species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
|
2285
|
-
species_df_columns = set(species_df.columns.tolist())
|
2286
|
-
missing_required_fields = species_df_expected_vars.difference(species_df_columns)
|
2287
|
-
if len(missing_required_fields) > 0:
|
2288
|
-
raise ValueError(
|
2289
|
-
f"{', '.join(missing_required_fields)} are required"
|
2290
|
-
' variables in "species_df" but were not present '
|
2291
|
-
"in the input file."
|
2292
|
-
)
|
1608
|
+
del data_dict[label]
|
2293
1609
|
|
2294
|
-
|
2295
|
-
|
2296
|
-
missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
|
2297
|
-
interaction_edgelist_columns
|
2298
|
-
)
|
2299
|
-
if len(missing_required_fields) > 0:
|
2300
|
-
raise ValueError(
|
2301
|
-
f"{', '.join(missing_required_fields)} are required "
|
2302
|
-
'variables in "interaction_edgelist" but were not '
|
2303
|
-
"present in the input file."
|
2304
|
-
)
|
1610
|
+
def _remove_species(self, s_ids: Iterable[str]):
|
1611
|
+
"""Removes species from the model
|
2305
1612
|
|
2306
|
-
|
1613
|
+
This should not be directly used by the user, as it can lead to
|
1614
|
+
invalid reactions when removing species without a logic to decide
|
1615
|
+
if the reaction needs to be removed as well.
|
2307
1616
|
|
1617
|
+
This removes the species and corresponding compartmentalized species and
|
1618
|
+
reactions_species.
|
2308
1619
|
|
2309
|
-
|
2310
|
-
|
2311
|
-
|
2312
|
-
|
2313
|
-
|
1620
|
+
Args:
|
1621
|
+
s_ids (Iterable[str]): the species to remove
|
1622
|
+
"""
|
1623
|
+
sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
|
1624
|
+
self._remove_compartmentalized_species(sc_ids)
|
1625
|
+
# Remove species
|
1626
|
+
self.species = self.species.drop(index=list(s_ids))
|
1627
|
+
# remove data
|
1628
|
+
for k, data in self.species_data.items():
|
1629
|
+
self.species_data[k] = data.drop(index=list(s_ids))
|
2314
1630
|
|
2315
|
-
|
2316
|
-
|
2317
|
-
|
2318
|
-
|
2319
|
-
|
2320
|
-
Species data containing potential extra columns
|
2321
|
-
keep_reactions_data : bool or str
|
2322
|
-
Whether to keep extra reaction columns
|
2323
|
-
keep_species_data : bool or str
|
2324
|
-
Whether to keep extra species columns
|
1631
|
+
def _remove_unused_cspecies(self):
|
1632
|
+
"""Removes compartmentalized species that are no
|
1633
|
+
longer part of any reactions"""
|
1634
|
+
sc_ids = self._get_unused_cspecies()
|
1635
|
+
self._remove_compartmentalized_species(sc_ids)
|
2325
1636
|
|
2326
|
-
|
2327
|
-
|
2328
|
-
|
2329
|
-
|
2330
|
-
|
2331
|
-
extra_reactions_columns = []
|
2332
|
-
extra_species_columns = []
|
2333
|
-
|
2334
|
-
if keep_reactions_data is not False:
|
2335
|
-
extra_reactions_columns = [
|
2336
|
-
c
|
2337
|
-
for c in interaction_edgelist.columns
|
2338
|
-
if c not in INTERACTION_EDGELIST_EXPECTED_VARS
|
2339
|
-
]
|
1637
|
+
def _remove_unused_species(self):
|
1638
|
+
"""Removes species that are no longer part of any
|
1639
|
+
compartmentalized species"""
|
1640
|
+
s_ids = self._get_unused_species()
|
1641
|
+
self._remove_species(s_ids)
|
2340
1642
|
|
2341
|
-
|
2342
|
-
extra_species_columns = [
|
2343
|
-
c
|
2344
|
-
for c in species_df.columns
|
2345
|
-
if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
|
2346
|
-
]
|
1643
|
+
def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
|
2347
1644
|
|
2348
|
-
|
1645
|
+
if isinstance(r_ids, str):
|
1646
|
+
r_ids = [r_ids]
|
2349
1647
|
|
1648
|
+
if r_ids is None:
|
1649
|
+
return self.reactions.index.tolist()
|
1650
|
+
else:
|
1651
|
+
if not all(r_id in self.reactions.index for r_id in r_ids):
|
1652
|
+
raise ValueError(f"Reaction IDs {r_ids} not found in reactions table")
|
2350
1653
|
|
2351
|
-
|
2352
|
-
"""
|
2353
|
-
Format compartments DataFrame with source and ID columns.
|
1654
|
+
return r_ids
|
2354
1655
|
|
2355
|
-
|
2356
|
-
|
2357
|
-
|
2358
|
-
|
2359
|
-
|
2360
|
-
Source object to assign to compartments
|
1656
|
+
def _validate_reaction_species(self):
|
1657
|
+
if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
|
1658
|
+
raise ValueError(
|
1659
|
+
"All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
|
1660
|
+
)
|
2361
1661
|
|
2362
|
-
|
2363
|
-
|
2364
|
-
|
2365
|
-
|
2366
|
-
|
2367
|
-
|
2368
|
-
compartments[SBML_DFS.C_SOURCE] = interaction_source
|
2369
|
-
compartments[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
|
2370
|
-
range(compartments.shape[0]), SBML_DFS.C_ID
|
2371
|
-
)
|
2372
|
-
return compartments.set_index(SBML_DFS.C_ID)[
|
2373
|
-
[SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
|
2374
|
-
]
|
1662
|
+
# test for null SBO terms
|
1663
|
+
n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
|
1664
|
+
if n_null_sbo_terms != 0:
|
1665
|
+
raise ValueError(
|
1666
|
+
f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
|
1667
|
+
)
|
2375
1668
|
|
1669
|
+
# find invalid SBO terms
|
1670
|
+
sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
|
1671
|
+
invalid_sbo_term_counts = sbo_counts[
|
1672
|
+
~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
|
1673
|
+
]
|
2376
1674
|
|
2377
|
-
|
2378
|
-
|
2379
|
-
|
1675
|
+
if invalid_sbo_term_counts.shape[0] != 0:
|
1676
|
+
invalid_sbo_counts_str = ", ".join(
|
1677
|
+
[f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
|
1678
|
+
)
|
1679
|
+
raise ValueError(
|
1680
|
+
f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
|
1681
|
+
f"defined {invalid_sbo_counts_str}"
|
1682
|
+
)
|
2380
1683
|
|
2381
|
-
|
2382
|
-
|
2383
|
-
species_df : pd.DataFrame
|
2384
|
-
Raw species data
|
2385
|
-
interaction_source : source.Source
|
2386
|
-
Source object to assign to species
|
2387
|
-
extra_species_columns : list
|
2388
|
-
Names of extra columns to preserve separately
|
1684
|
+
def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
|
1685
|
+
"""Validates reactions data attribute
|
2389
1686
|
|
2390
|
-
|
2391
|
-
|
2392
|
-
tuple of pd.DataFrame
|
2393
|
-
Processed species DataFrame and species extra data DataFrame
|
2394
|
-
"""
|
2395
|
-
species = species_df.copy()
|
2396
|
-
species[SBML_DFS.S_SOURCE] = interaction_source
|
2397
|
-
species[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
|
2398
|
-
range(species.shape[0]), SBML_DFS.S_ID
|
2399
|
-
)
|
1687
|
+
Args:
|
1688
|
+
reactions_data_table (pd.DataFrame): a reactions data table
|
2400
1689
|
|
2401
|
-
|
2402
|
-
|
2403
|
-
|
2404
|
-
|
1690
|
+
Raises:
|
1691
|
+
ValueError: r_id not index name
|
1692
|
+
ValueError: r_id index contains duplicates
|
1693
|
+
ValueError: r_id not in reactions table
|
1694
|
+
"""
|
1695
|
+
sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
|
2405
1696
|
|
2406
|
-
|
2407
|
-
|
2408
|
-
processed_species = species_indexed[required_cols]
|
1697
|
+
def _validate_species_data(self, species_data_table: pd.DataFrame):
|
1698
|
+
"""Validates species data attribute
|
2409
1699
|
|
2410
|
-
|
1700
|
+
Args:
|
1701
|
+
species_data_table (pd.DataFrame): a species data table
|
2411
1702
|
|
1703
|
+
Raises:
|
1704
|
+
ValueError: s_id not index name
|
1705
|
+
ValueError: s_id index contains duplicates
|
1706
|
+
ValueError: s_id not in species table
|
1707
|
+
"""
|
1708
|
+
sbml_dfs_utils._validate_matching_data(species_data_table, self.species)
|
2412
1709
|
|
2413
|
-
def
|
2414
|
-
|
2415
|
-
|
2416
|
-
"""
|
2417
|
-
Create compartmentalized species from interactions.
|
1710
|
+
def _validate_table(self, table_name: str) -> None:
|
1711
|
+
"""
|
1712
|
+
Validate a table in this SBML_dfs object against its schema.
|
2418
1713
|
|
2419
|
-
|
2420
|
-
|
2421
|
-
interaction_edgelist : pd.DataFrame
|
2422
|
-
Interaction data containing species-compartment combinations
|
2423
|
-
species_df : pd.DataFrame
|
2424
|
-
Processed species data with IDs
|
2425
|
-
compartments_df : pd.DataFrame
|
2426
|
-
Processed compartments data with IDs
|
2427
|
-
interaction_source : source.Source
|
2428
|
-
Source object to assign to compartmentalized species
|
1714
|
+
This is an internal method that validates a table that is part of this SBML_dfs
|
1715
|
+
object against the schema stored in self.schema.
|
2429
1716
|
|
2430
|
-
|
2431
|
-
|
2432
|
-
|
2433
|
-
|
2434
|
-
"""
|
2435
|
-
# Get all distinct upstream and downstream compartmentalized species
|
2436
|
-
comp_species = pd.concat(
|
2437
|
-
[
|
2438
|
-
interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
|
2439
|
-
{
|
2440
|
-
"upstream_name": SBML_DFS.S_NAME,
|
2441
|
-
"upstream_compartment": SBML_DFS.C_NAME,
|
2442
|
-
},
|
2443
|
-
axis=1,
|
2444
|
-
),
|
2445
|
-
interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
|
2446
|
-
{
|
2447
|
-
"downstream_name": SBML_DFS.S_NAME,
|
2448
|
-
"downstream_compartment": SBML_DFS.C_NAME,
|
2449
|
-
},
|
2450
|
-
axis=1,
|
2451
|
-
),
|
2452
|
-
]
|
2453
|
-
).drop_duplicates()
|
1717
|
+
Parameters
|
1718
|
+
----------
|
1719
|
+
table : str
|
1720
|
+
Name of the table to validate
|
2454
1721
|
|
2455
|
-
|
2456
|
-
|
2457
|
-
|
2458
|
-
|
2459
|
-
|
2460
|
-
|
1722
|
+
Raises
|
1723
|
+
------
|
1724
|
+
ValueError
|
1725
|
+
If the table does not conform to its schema
|
1726
|
+
"""
|
1727
|
+
table_data = getattr(self, table_name)
|
2461
1728
|
|
2462
|
-
|
2463
|
-
_sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
|
1729
|
+
sbml_dfs_utils.validate_sbml_dfs_table(table_data, table_name)
|
2464
1730
|
|
2465
|
-
# Format compartmentalized species with names, source, and IDs
|
2466
|
-
comp_species_w_ids[SBML_DFS.SC_NAME] = [
|
2467
|
-
f"{s} [{c}]"
|
2468
|
-
for s, c in zip(
|
2469
|
-
comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
|
2470
|
-
)
|
2471
|
-
]
|
2472
|
-
comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
|
2473
|
-
comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
|
2474
|
-
range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
|
2475
|
-
)
|
2476
1731
|
|
2477
|
-
|
2478
|
-
|
2479
|
-
|
2480
|
-
|
2481
|
-
|
2482
|
-
|
2483
|
-
|
2484
|
-
|
2485
|
-
|
2486
|
-
|
2487
|
-
|
2488
|
-
upstream_stoichiometry,
|
2489
|
-
downstream_stoichiometry,
|
2490
|
-
downstream_sbo_name,
|
2491
|
-
extra_reactions_columns,
|
2492
|
-
):
|
1732
|
+
def sbml_dfs_from_edgelist(
|
1733
|
+
interaction_edgelist: pd.DataFrame,
|
1734
|
+
species_df: pd.DataFrame,
|
1735
|
+
compartments_df: pd.DataFrame,
|
1736
|
+
interaction_source: source.Source,
|
1737
|
+
upstream_stoichiometry: int = 0,
|
1738
|
+
downstream_stoichiometry: int = 1,
|
1739
|
+
downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
|
1740
|
+
keep_species_data: bool | str = False,
|
1741
|
+
keep_reactions_data: bool | str = False,
|
1742
|
+
) -> SBML_dfs:
|
2493
1743
|
"""
|
2494
|
-
Create
|
1744
|
+
Create SBML_dfs from interaction edgelist.
|
1745
|
+
|
1746
|
+
Combines a set of molecular interactions into a mechanistic SBML_dfs model
|
1747
|
+
by processing interaction data, species information, and compartment definitions.
|
2495
1748
|
|
2496
1749
|
Parameters
|
2497
1750
|
----------
|
2498
1751
|
interaction_edgelist : pd.DataFrame
|
2499
|
-
|
2500
|
-
|
2501
|
-
|
1752
|
+
Table containing molecular interactions with columns:
|
1753
|
+
- upstream_name : str, matches "s_name" from species_df
|
1754
|
+
- downstream_name : str, matches "s_name" from species_df
|
1755
|
+
- upstream_compartment : str, matches "c_name" from compartments_df
|
1756
|
+
- downstream_compartment : str, matches "c_name" from compartments_df
|
1757
|
+
- r_name : str, name for the interaction
|
1758
|
+
- sbo_term : str, SBO term defining interaction type
|
1759
|
+
- r_Identifiers : identifiers.Identifiers, supporting identifiers
|
1760
|
+
- r_isreversible : bool, whether reaction is reversible
|
2502
1761
|
species_df : pd.DataFrame
|
2503
|
-
|
1762
|
+
Table defining molecular species with columns:
|
1763
|
+
- s_name : str, name of molecular species
|
1764
|
+
- s_Identifiers : identifiers.Identifiers, species identifiers
|
2504
1765
|
compartments_df : pd.DataFrame
|
2505
|
-
|
1766
|
+
Table defining compartments with columns:
|
1767
|
+
- c_name : str, name of compartment
|
1768
|
+
- c_Identifiers : identifiers.Identifiers, compartment identifiers
|
2506
1769
|
interaction_source : source.Source
|
2507
|
-
Source object
|
2508
|
-
upstream_stoichiometry : int
|
2509
|
-
Stoichiometry
|
2510
|
-
downstream_stoichiometry : int
|
2511
|
-
Stoichiometry
|
2512
|
-
downstream_sbo_name : str
|
2513
|
-
SBO term
|
2514
|
-
|
2515
|
-
|
1770
|
+
Source object linking model entities to interaction source
|
1771
|
+
upstream_stoichiometry : int, default 0
|
1772
|
+
Stoichiometry of upstream species in reactions
|
1773
|
+
downstream_stoichiometry : int, default 1
|
1774
|
+
Stoichiometry of downstream species in reactions
|
1775
|
+
downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
|
1776
|
+
SBO term for downstream reactant type
|
1777
|
+
keep_species_data : bool or str, default False
|
1778
|
+
Whether to preserve extra species columns. If True, saves as 'source' label.
|
1779
|
+
If string, uses as custom label. If False, discards extra data.
|
1780
|
+
keep_reactions_data : bool or str, default False
|
1781
|
+
Whether to preserve extra reaction columns. If True, saves as 'source' label.
|
1782
|
+
If string, uses as custom label. If False, discards extra data.
|
2516
1783
|
|
2517
1784
|
Returns
|
2518
1785
|
-------
|
2519
|
-
|
2520
|
-
|
1786
|
+
SBML_dfs
|
1787
|
+
Validated SBML data structure containing compartments, species,
|
1788
|
+
compartmentalized species, reactions, and reaction species tables.
|
2521
1789
|
"""
|
2522
|
-
#
|
2523
|
-
|
2524
|
-
|
2525
|
-
.merge(species_df[SBML_DFS.S_NAME].reset_index())
|
2526
|
-
.merge(compartments_df[SBML_DFS.C_NAME].reset_index())
|
1790
|
+
# 1. Validate inputs
|
1791
|
+
sbml_dfs_utils._edgelist_validate_inputs(
|
1792
|
+
interaction_edgelist, species_df, compartments_df
|
2527
1793
|
)
|
2528
1794
|
|
2529
|
-
|
2530
|
-
|
2531
|
-
|
2532
|
-
|
2533
|
-
SBML_DFS.S_NAME: "upstream_name",
|
2534
|
-
SBML_DFS.C_NAME: "upstream_compartment",
|
2535
|
-
},
|
2536
|
-
axis=1,
|
2537
|
-
),
|
2538
|
-
how="left",
|
2539
|
-
).merge(
|
2540
|
-
comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
|
2541
|
-
{
|
2542
|
-
SBML_DFS.SC_ID: "sc_id_down",
|
2543
|
-
SBML_DFS.S_NAME: "downstream_name",
|
2544
|
-
SBML_DFS.C_NAME: "downstream_compartment",
|
2545
|
-
},
|
2546
|
-
axis=1,
|
2547
|
-
),
|
2548
|
-
how="left",
|
2549
|
-
)[
|
2550
|
-
REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
|
2551
|
-
]
|
2552
|
-
|
2553
|
-
# Validate merge didn't create duplicates
|
2554
|
-
if interaction_edgelist.shape[0] != interaction_w_cspecies.shape[0]:
|
2555
|
-
raise ValueError(
|
2556
|
-
f"Merging compartmentalized species resulted in row count change "
|
2557
|
-
f"from {interaction_edgelist.shape[0]} to {interaction_w_cspecies.shape[0]}"
|
2558
|
-
)
|
1795
|
+
# 2. Identify which extra columns to preserve
|
1796
|
+
extra_columns = sbml_dfs_utils._edgelist_identify_extra_columns(
|
1797
|
+
interaction_edgelist, species_df, keep_reactions_data, keep_species_data
|
1798
|
+
)
|
2559
1799
|
|
2560
|
-
#
|
2561
|
-
|
2562
|
-
|
1800
|
+
# 3. Process compartments and species tables
|
1801
|
+
processed_compartments = sbml_dfs_utils._edgelist_process_compartments(
|
1802
|
+
compartments_df, interaction_source
|
1803
|
+
)
|
1804
|
+
processed_species, species_data = sbml_dfs_utils._edgelist_process_species(
|
1805
|
+
species_df, interaction_source, extra_columns["species"]
|
2563
1806
|
)
|
2564
1807
|
|
2565
|
-
# Create
|
2566
|
-
|
2567
|
-
|
2568
|
-
|
2569
|
-
|
2570
|
-
|
2571
|
-
SBML_DFS.R_IDENTIFIERS,
|
2572
|
-
SBML_DFS.R_SOURCE,
|
2573
|
-
SBML_DFS.R_ISREVERSIBLE,
|
2574
|
-
]
|
2575
|
-
|
2576
|
-
reactions_df = interactions_copy.set_index(SBML_DFS.R_ID)[
|
2577
|
-
reactions_columns + extra_reactions_columns
|
2578
|
-
]
|
2579
|
-
|
2580
|
-
# Separate extra data
|
2581
|
-
reactions_data = reactions_df[extra_reactions_columns]
|
2582
|
-
reactions_df = reactions_df[reactions_columns]
|
2583
|
-
|
2584
|
-
# Create reaction species relationships - NOW r_id exists
|
2585
|
-
reaction_species_df = pd.concat(
|
2586
|
-
[
|
2587
|
-
# Upstream species (modifiers/stimulators/inhibitors)
|
2588
|
-
interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
|
2589
|
-
.assign(stoichiometry=upstream_stoichiometry)
|
2590
|
-
.rename({"sc_id_up": "sc_id"}, axis=1),
|
2591
|
-
# Downstream species (products)
|
2592
|
-
interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
|
2593
|
-
.assign(
|
2594
|
-
stoichiometry=downstream_stoichiometry,
|
2595
|
-
sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
|
2596
|
-
)
|
2597
|
-
.rename({"sc_id_down": "sc_id"}, axis=1),
|
2598
|
-
]
|
1808
|
+
# 4. Create compartmentalized species
|
1809
|
+
comp_species = sbml_dfs_utils._edgelist_create_compartmentalized_species(
|
1810
|
+
interaction_edgelist,
|
1811
|
+
processed_species,
|
1812
|
+
processed_compartments,
|
1813
|
+
interaction_source,
|
2599
1814
|
)
|
2600
1815
|
|
2601
|
-
|
2602
|
-
|
1816
|
+
# 5. Create reactions and reaction species
|
1817
|
+
reactions, reaction_species, reactions_data = (
|
1818
|
+
sbml_dfs_utils._edgelist_create_reactions_and_species(
|
1819
|
+
interaction_edgelist,
|
1820
|
+
comp_species,
|
1821
|
+
processed_species,
|
1822
|
+
processed_compartments,
|
1823
|
+
interaction_source,
|
1824
|
+
upstream_stoichiometry,
|
1825
|
+
downstream_stoichiometry,
|
1826
|
+
downstream_sbo_name,
|
1827
|
+
extra_columns["reactions"],
|
1828
|
+
)
|
2603
1829
|
)
|
2604
1830
|
|
2605
|
-
|
1831
|
+
# 6. Assemble final SBML_dfs object
|
1832
|
+
sbml_dfs = _edgelist_assemble_sbml_model(
|
1833
|
+
processed_compartments,
|
1834
|
+
processed_species,
|
1835
|
+
comp_species,
|
1836
|
+
reactions,
|
1837
|
+
reaction_species,
|
1838
|
+
species_data,
|
1839
|
+
reactions_data,
|
1840
|
+
keep_species_data,
|
1841
|
+
keep_reactions_data,
|
1842
|
+
extra_columns,
|
1843
|
+
)
|
2606
1844
|
|
2607
|
-
return
|
1845
|
+
return sbml_dfs
|
2608
1846
|
|
2609
1847
|
|
2610
1848
|
def _edgelist_assemble_sbml_model(
|
2611
|
-
compartments,
|
2612
|
-
species,
|
2613
|
-
comp_species,
|
2614
|
-
reactions,
|
2615
|
-
reaction_species,
|
1849
|
+
compartments: pd.DataFrame,
|
1850
|
+
species: pd.DataFrame,
|
1851
|
+
comp_species: pd.DataFrame,
|
1852
|
+
reactions: pd.DataFrame,
|
1853
|
+
reaction_species: pd.DataFrame,
|
2616
1854
|
species_data,
|
2617
1855
|
reactions_data,
|
2618
1856
|
keep_species_data,
|
2619
1857
|
keep_reactions_data,
|
2620
|
-
extra_columns,
|
2621
|
-
):
|
1858
|
+
extra_columns: dict[str, list[str]],
|
1859
|
+
) -> SBML_dfs:
|
2622
1860
|
"""
|
2623
1861
|
Assemble the final SBML_dfs object.
|
2624
1862
|
|
@@ -2675,128 +1913,3 @@ def _edgelist_assemble_sbml_model(
|
|
2675
1913
|
sbml_model.validate()
|
2676
1914
|
|
2677
1915
|
return sbml_model
|
2678
|
-
|
2679
|
-
|
2680
|
-
def _sbml_dfs_from_edgelist_check_cspecies_merge(
|
2681
|
-
merged_species: pd.DataFrame, original_species: pd.DataFrame
|
2682
|
-
) -> None:
|
2683
|
-
"""Check for a mismatch between the provided species data and species implied by the edgelist."""
|
2684
|
-
|
2685
|
-
# check for 1-many merge
|
2686
|
-
if merged_species.shape[0] != original_species.shape[0]:
|
2687
|
-
raise ValueError(
|
2688
|
-
"Merging compartmentalized species to species_df"
|
2689
|
-
" and compartments_df by names resulted in an "
|
2690
|
-
f"increase in the tables from {original_species.shape[0]}"
|
2691
|
-
f" to {merged_species.shape[0]} indicating that names were"
|
2692
|
-
" not unique"
|
2693
|
-
)
|
2694
|
-
|
2695
|
-
# check for missing species and compartments
|
2696
|
-
missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
|
2697
|
-
SBML_DFS.C_NAME
|
2698
|
-
].unique()
|
2699
|
-
if len(missing_compartments) >= 1:
|
2700
|
-
raise ValueError(
|
2701
|
-
f"{len(missing_compartments)} compartments were present in"
|
2702
|
-
' "interaction_edgelist" but not "compartments_df":'
|
2703
|
-
f" {', '.join(missing_compartments)}"
|
2704
|
-
)
|
2705
|
-
|
2706
|
-
missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
|
2707
|
-
SBML_DFS.S_NAME
|
2708
|
-
].unique()
|
2709
|
-
if len(missing_species) >= 1:
|
2710
|
-
raise ValueError(
|
2711
|
-
f"{len(missing_species)} species were present in "
|
2712
|
-
'"interaction_edgelist" but not "species_df":'
|
2713
|
-
f" {', '.join(missing_species)}"
|
2714
|
-
)
|
2715
|
-
|
2716
|
-
return None
|
2717
|
-
|
2718
|
-
|
2719
|
-
def _stub_compartments(
|
2720
|
-
stubbed_compartment: str = GENERIC_COMPARTMENT,
|
2721
|
-
) -> pd.DataFrame:
|
2722
|
-
"""Stub Compartments
|
2723
|
-
|
2724
|
-
Create a compartments table with only a single compartment
|
2725
|
-
|
2726
|
-
Args:
|
2727
|
-
stubbed_compartment (str): the name of a compartment which should match the
|
2728
|
-
keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS
|
2729
|
-
|
2730
|
-
Returns:
|
2731
|
-
compartments_df (pd.DataFrame): compartments dataframe
|
2732
|
-
"""
|
2733
|
-
|
2734
|
-
if stubbed_compartment not in COMPARTMENT_ALIASES.keys():
|
2735
|
-
raise ValueError(
|
2736
|
-
f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
|
2737
|
-
)
|
2738
|
-
|
2739
|
-
if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
|
2740
|
-
raise ValueError(
|
2741
|
-
f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
|
2742
|
-
)
|
2743
|
-
|
2744
|
-
stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
|
2745
|
-
|
2746
|
-
formatted_uri = identifiers.format_uri(
|
2747
|
-
uri=identifiers.create_uri_url(
|
2748
|
-
ontology=ONTOLOGIES.GO,
|
2749
|
-
identifier=stubbed_compartment_id,
|
2750
|
-
),
|
2751
|
-
biological_qualifier_type=BQB.IS,
|
2752
|
-
)
|
2753
|
-
|
2754
|
-
compartments_df = pd.DataFrame(
|
2755
|
-
{
|
2756
|
-
SBML_DFS.C_NAME: [stubbed_compartment],
|
2757
|
-
SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
|
2758
|
-
}
|
2759
|
-
)
|
2760
|
-
compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID) # type: ignore
|
2761
|
-
compartments_df.index.name = SBML_DFS.C_ID
|
2762
|
-
|
2763
|
-
return compartments_df
|
2764
|
-
|
2765
|
-
|
2766
|
-
def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
|
2767
|
-
"""Validates a table against a reference
|
2768
|
-
|
2769
|
-
This check if the table has the same index, no duplicates in the index
|
2770
|
-
and that all values in the index are in the reference table.
|
2771
|
-
|
2772
|
-
Args:
|
2773
|
-
data_table (pd.DataFrame): a table with data that should
|
2774
|
-
match the reference
|
2775
|
-
ref_table (pd.DataFrame): a reference table
|
2776
|
-
|
2777
|
-
Raises:
|
2778
|
-
ValueError: not same index name
|
2779
|
-
ValueError: index contains duplicates
|
2780
|
-
ValueError: index not subset of index of reactions table
|
2781
|
-
"""
|
2782
|
-
ref_index_name = ref_table.index.name
|
2783
|
-
if data_table.index.name != ref_index_name:
|
2784
|
-
raise ValueError(
|
2785
|
-
"the index name for reaction data table was not"
|
2786
|
-
f" {ref_index_name}: {data_table.index.name}"
|
2787
|
-
)
|
2788
|
-
ids = data_table.index
|
2789
|
-
if any(ids.duplicated()):
|
2790
|
-
raise ValueError(
|
2791
|
-
"the index for reaction data table " "contained duplicate values"
|
2792
|
-
)
|
2793
|
-
if not all(ids.isin(ref_table.index)):
|
2794
|
-
raise ValueError(
|
2795
|
-
"the index for reaction data table contained values"
|
2796
|
-
" not found in the reactions table"
|
2797
|
-
)
|
2798
|
-
if not isinstance(data_table, pd.DataFrame):
|
2799
|
-
raise TypeError(
|
2800
|
-
f"The data table was type {type(data_table).__name__}"
|
2801
|
-
" but must be a pd.DataFrame"
|
2802
|
-
)
|