napistu 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. napistu/__main__.py +38 -27
  2. napistu/consensus.py +22 -27
  3. napistu/constants.py +91 -65
  4. napistu/context/filtering.py +2 -1
  5. napistu/identifiers.py +3 -6
  6. napistu/indices.py +3 -1
  7. napistu/ingestion/bigg.py +6 -6
  8. napistu/ingestion/sbml.py +298 -295
  9. napistu/ingestion/string.py +16 -19
  10. napistu/ingestion/trrust.py +22 -27
  11. napistu/ingestion/yeast.py +2 -1
  12. napistu/matching/interactions.py +4 -4
  13. napistu/matching/species.py +1 -1
  14. napistu/modify/uncompartmentalize.py +1 -1
  15. napistu/network/net_create.py +1 -1
  16. napistu/network/paths.py +1 -1
  17. napistu/ontologies/dogma.py +2 -1
  18. napistu/ontologies/genodexito.py +5 -1
  19. napistu/ontologies/renaming.py +4 -0
  20. napistu/sbml_dfs_core.py +1343 -2167
  21. napistu/sbml_dfs_utils.py +1086 -143
  22. napistu/utils.py +52 -41
  23. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/METADATA +2 -2
  24. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/RECORD +40 -40
  25. tests/conftest.py +113 -13
  26. tests/test_consensus.py +161 -4
  27. tests/test_context_filtering.py +2 -2
  28. tests/test_gaps.py +26 -15
  29. tests/test_network_net_create.py +1 -1
  30. tests/test_network_precompute.py +1 -1
  31. tests/test_ontologies_genodexito.py +3 -0
  32. tests/test_ontologies_mygene.py +3 -0
  33. tests/test_ontologies_renaming.py +28 -24
  34. tests/test_sbml_dfs_core.py +260 -211
  35. tests/test_sbml_dfs_utils.py +194 -36
  36. tests/test_utils.py +19 -0
  37. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/WHEEL +0 -0
  38. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/entry_points.txt +0 -0
  39. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/licenses/LICENSE +0 -0
  40. {napistu-0.3.5.dist-info → napistu-0.3.7.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_core.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import copy
3
4
  import logging
4
5
  import re
5
6
  from typing import Any
@@ -7,8 +8,12 @@ from typing import Iterable
7
8
  from typing import Mapping
8
9
  from typing import MutableMapping
9
10
  from typing import TYPE_CHECKING
11
+ from typing import Optional
12
+ from typing import Union
10
13
 
14
+ from fs import open_fs
11
15
  import pandas as pd
16
+
12
17
  from napistu import identifiers
13
18
  from napistu import sbml_dfs_utils
14
19
  from napistu import source
@@ -17,25 +22,14 @@ from napistu.ingestion import sbml
17
22
  from napistu.constants import SBML_DFS
18
23
  from napistu.constants import SBML_DFS_SCHEMA
19
24
  from napistu.constants import IDENTIFIERS
20
- from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
21
- from napistu.constants import CPR_STANDARD_OUTPUTS
22
- from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
25
+ from napistu.constants import NAPISTU_STANDARD_OUTPUTS
23
26
  from napistu.constants import BQB_PRIORITIES
24
27
  from napistu.constants import ONTOLOGY_PRIORITIES
25
- from napistu.constants import BQB
26
- from napistu.constants import BQB_DEFINING_ATTRS
27
28
  from napistu.constants import MINI_SBO_FROM_NAME
28
29
  from napistu.constants import MINI_SBO_TO_NAME
29
- from napistu.constants import ONTOLOGIES
30
- from napistu.constants import SBO_NAME_TO_ROLE
31
30
  from napistu.constants import SBOTERM_NAMES
32
- from napistu.constants import SBO_ROLES_DEFS
33
31
  from napistu.constants import ENTITIES_W_DATA
34
32
  from napistu.constants import ENTITIES_TO_ENTITY_DATA
35
- from napistu.ingestion.constants import GENERIC_COMPARTMENT
36
- from napistu.ingestion.constants import COMPARTMENT_ALIASES
37
- from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
38
- from fs import open_fs
39
33
 
40
34
  logger = logging.getLogger(__name__)
41
35
 
@@ -65,26 +59,80 @@ class SBML_dfs:
65
59
  schema : dict
66
60
  Dictionary representing the structure of the other attributes and meaning of their variables
67
61
 
68
- Methods
69
- -------
70
- get_table(entity_type, required_attributes)
71
- Get a table from the SBML_dfs object with optional attribute validation
72
- search_by_ids(ids, entity_type, identifiers_df, ontologies)
73
- Find entities and identifiers matching a set of query IDs
74
- search_by_name(name, entity_type, partial_match)
75
- Find entities by exact or partial name match
62
+ Public Methods (alphabetical)
63
+ ----------------------------
64
+ add_reactions_data(label, data)
65
+ Add a new reactions data table to the model with validation.
66
+ add_species_data(label, data)
67
+ Add a new species data table to the model with validation.
68
+ copy()
69
+ Return a deep copy of the SBML_dfs object.
70
+ export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
71
+ Export the SBML_dfs model and its tables to files in a specified directory.
72
+ get_characteristic_species_ids(dogmatic=True)
73
+ Return characteristic systematic identifiers for molecular species, optionally using a strict or loose definition.
76
74
  get_cspecies_features()
77
- Get additional attributes of compartmentalized species
78
- get_species_features()
79
- Get additional attributes of species
75
+ Compute and return additional features for compartmentalized species, such as degree and type.
80
76
  get_identifiers(id_type)
81
- Get identifiers from a specified entity type
82
- get_uri_urls(entity_type, entity_ids)
83
- Get reference URLs for specified entities
77
+ Retrieve a table of identifiers for a specified entity type (e.g., species or reactions).
78
+ get_network_summary()
79
+ Return a dictionary of diagnostic statistics summarizing the network structure.
80
+ get_species_features()
81
+ Compute and return additional features for species, such as species type.
82
+ get_table(entity_type, required_attributes=None)
83
+ Retrieve a table for a given entity type, optionally validating required attributes.
84
+ get_uri_urls(entity_type, entity_ids=None, required_ontology=None)
85
+ Return reference URLs for specified entities, optionally filtered by ontology.
86
+ infer_sbo_terms()
87
+ Infer and fill in missing SBO terms for reaction species based on stoichiometry.
88
+ infer_uncompartmentalized_species_location()
89
+ Infer and assign compartments for compartmentalized species with missing compartment information.
90
+ name_compartmentalized_species()
91
+ Rename compartmentalized species to include compartment information if needed.
92
+ reaction_formulas(r_ids=None)
93
+ Generate human-readable reaction formulas for specified reactions.
94
+ reaction_summaries(r_ids=None)
95
+ Return a summary DataFrame for specified reactions, including names and formulas.
96
+ remove_compartmentalized_species(sc_ids)
97
+ Remove specified compartmentalized species and associated reactions from the model.
98
+ remove_reactions(r_ids, remove_species=False)
99
+ Remove specified reactions and optionally remove unused species.
100
+ remove_reactions_data(label)
101
+ Remove a reactions data table by label.
102
+ remove_species_data(label)
103
+ Remove a species data table by label.
104
+ search_by_ids(ids, entity_type, identifiers_df, ontologies=None)
105
+ Find entities and identifiers matching a set of query IDs.
106
+ search_by_name(name, entity_type, partial_match=True)
107
+ Find entities by exact or partial name match.
108
+ select_species_data(species_data_table)
109
+ Select a species data table from the SBML_dfs object by name.
110
+ species_status(s_id)
111
+ Return all reactions a species participates in, with stoichiometry and formula information.
84
112
  validate()
85
- Validate the SBML_dfs structure and relationships
113
+ Validate the SBML_dfs structure and relationships.
86
114
  validate_and_resolve()
87
- Validate and attempt to automatically fix common issues
115
+ Validate and attempt to automatically fix common issues.
116
+
117
+ Private/Hidden Methods (alphabetical, appear after public methods)
118
+ -----------------------------------------------------------------
119
+ _attempt_resolve(e)
120
+ _find_underspecified_reactions_by_scids(sc_ids)
121
+ _get_unused_cspecies()
122
+ _get_unused_species()
123
+ _remove_compartmentalized_species(sc_ids)
124
+ _remove_entity_data(entity_type, label)
125
+ _remove_species(s_ids)
126
+ _remove_unused_cspecies()
127
+ _remove_unused_species()
128
+ _validate_identifiers()
129
+ _validate_pk_fk_correspondence()
130
+ _validate_r_ids(r_ids)
131
+ _validate_reaction_species()
132
+ _validate_reactions_data(reactions_data_table)
133
+ _validate_sources()
134
+ _validate_species_data(species_data_table)
135
+ _validate_table(table_name)
88
136
  """
89
137
 
90
138
  compartments: pd.DataFrame
@@ -162,193 +210,187 @@ class SBML_dfs:
162
210
  '"validate" = False so "resolve" will be ignored (eventhough it was True)'
163
211
  )
164
212
 
165
- def get_table(
166
- self, entity_type: str, required_attributes: None | set[str] = None
167
- ) -> pd.DataFrame:
213
+ # =============================================================================
214
+ # PUBLIC METHODS (ALPHABETICAL ORDER)
215
+ # =============================================================================
216
+
217
+ def add_reactions_data(self, label: str, data: pd.DataFrame):
168
218
  """
169
- Get a table from the SBML_dfs object with optional attribute validation.
219
+ Add additional reaction data with validation.
170
220
 
171
221
  Parameters
172
222
  ----------
173
- entity_type : str
174
- The type of entity table to retrieve (e.g., 'species', 'reactions')
175
- required_attributes : Optional[Set[str]], optional
176
- Set of attributes that must be present in the table, by default None.
177
- Must be passed as a set, e.g. {'id'}, not a string.
178
-
179
- Returns
180
- -------
181
- pd.DataFrame
182
- The requested table
223
+ label : str
224
+ Label for the new data
225
+ data : pd.DataFrame
226
+ Data to add, must be indexed by reaction_id
183
227
 
184
228
  Raises
185
229
  ------
186
230
  ValueError
187
- If entity_type is invalid or required attributes are missing
188
- TypeError
189
- If required_attributes is not a set
231
+ If the data is invalid or label already exists
190
232
  """
191
-
192
- schema = self.schema
193
-
194
- if entity_type not in schema.keys():
233
+ self._validate_reactions_data(data)
234
+ if label in self.reactions_data:
195
235
  raise ValueError(
196
- f"{entity_type} does not match a table in the SBML_dfs object. The tables "
197
- f"which are present are {', '.join(schema.keys())}"
236
+ f"{label} already exists in reactions_data. " "Drop it first."
198
237
  )
238
+ self.reactions_data[label] = data
199
239
 
200
- if required_attributes is not None:
201
- if not isinstance(required_attributes, set):
202
- raise TypeError(
203
- f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
204
- "Did you pass a string instead of a set?"
205
- )
240
+ def add_species_data(self, label: str, data: pd.DataFrame):
241
+ """
242
+ Add additional species data with validation.
206
243
 
207
- # determine whether required_attributes are appropriate
208
- VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
209
- invalid_required_attributes = required_attributes.difference(
210
- VALID_REQUIRED_ATTRIBUTES
211
- )
244
+ Parameters
245
+ ----------
246
+ label : str
247
+ Label for the new data
248
+ data : pd.DataFrame
249
+ Data to add, must be indexed by species_id
212
250
 
213
- if len(invalid_required_attributes) > 0:
214
- raise ValueError(
215
- f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
216
- f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
217
- )
251
+ Raises
252
+ ------
253
+ ValueError
254
+ If the data is invalid or label already exists
255
+ """
256
+ self._validate_species_data(data)
257
+ if label in self.species_data:
258
+ raise ValueError(
259
+ f"{label} already exists in species_data. " "Drop it first."
260
+ )
261
+ self.species_data[label] = data
218
262
 
219
- # determine if required_attributes are satisified
220
- invalid_attrs = [
221
- s for s in required_attributes if s not in schema[entity_type].keys()
222
- ]
223
- if len(invalid_attrs) > 0:
224
- raise ValueError(
225
- f"The following required attributes are not present for the {entity_type} table: "
226
- f"{', '.join(invalid_attrs)}."
227
- )
263
+ def copy(self):
264
+ """
265
+ Return a deep copy of the SBML_dfs object.
228
266
 
229
- return getattr(self, entity_type)
267
+ Returns
268
+ -------
269
+ SBML_dfs
270
+ A deep copy of the current SBML_dfs object.
271
+ """
272
+ return copy.deepcopy(self)
230
273
 
231
- def search_by_ids(
274
+ def export_sbml_dfs(
232
275
  self,
233
- ids: list[str],
234
- entity_type: str,
235
- identifiers_df: pd.DataFrame,
236
- ontologies: None | set[str] = None,
237
- ) -> tuple[pd.DataFrame, pd.DataFrame]:
276
+ model_prefix: str,
277
+ outdir: str,
278
+ overwrite: bool = False,
279
+ dogmatic: bool = True,
280
+ ) -> None:
238
281
  """
239
- Find entities and identifiers matching a set of query IDs.
282
+ Export SBML_dfs
240
283
 
241
- Parameters
242
- ----------
243
- ids : List[str]
244
- List of identifiers to search for
245
- entity_type : str
246
- Type of entity to search (e.g., 'species', 'reactions')
247
- identifiers_df : pd.DataFrame
248
- DataFrame containing identifier mappings
249
- ontologies : Optional[Set[str]], optional
250
- Set of ontologies to filter by, by default None
284
+ Export summaries of species identifiers and each table underlying
285
+ an SBML_dfs pathway model
286
+
287
+ Params
288
+ ------
289
+ model_prefix: str
290
+ Label to prepend to all exported files
291
+ outdir: str
292
+ Path to an existing directory where results should be saved
293
+ overwrite: bool
294
+ Should the directory be overwritten if it already exists?
295
+ dogmatic: bool
296
+ If True then treat genes, transcript, and proteins as separate species. If False
297
+ then treat them interchangeably.
251
298
 
252
299
  Returns
253
300
  -------
254
- Tuple[pd.DataFrame, pd.DataFrame]
255
- - Matching entities
256
- - Matching identifiers
257
-
258
- Raises
259
- ------
260
- ValueError
261
- If entity_type is invalid or ontologies are invalid
262
- TypeError
263
- If ontologies is not a set
301
+ None
264
302
  """
265
- # validate inputs
266
- entity_table = self.get_table(entity_type, required_attributes={"id"})
267
- entity_pk = self.schema[entity_type]["pk"]
303
+ if not isinstance(model_prefix, str):
304
+ raise TypeError(
305
+ f"model_prefix was a {type(model_prefix)} " "and must be a str"
306
+ )
307
+ if not isinstance(self, SBML_dfs):
308
+ raise TypeError(
309
+ f"sbml_dfs was a {type(self)} and must" " be an sbml.SBML_dfs"
310
+ )
268
311
 
269
- utils.match_pd_vars(
270
- identifiers_df,
271
- req_vars={
272
- entity_pk,
273
- IDENTIFIERS.ONTOLOGY,
274
- IDENTIFIERS.IDENTIFIER,
275
- IDENTIFIERS.URL,
276
- IDENTIFIERS.BQB,
277
- },
278
- allow_series=False,
279
- ).assert_present()
312
+ # filter to identifiers which make sense when mapping from ids -> species
313
+ species_identifiers = self.get_characteristic_species_ids(dogmatic=dogmatic)
280
314
 
281
- if ontologies is not None:
282
- if not isinstance(ontologies, set):
283
- # for clarity this should not be reachable based on type hints
284
- raise TypeError(
285
- f"ontologies must be a set, but got {type(ontologies).__name__}"
286
- )
287
- ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
288
- invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
289
- if len(invalid_ontologies) > 0:
290
- raise ValueError(
291
- f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
292
- f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
315
+ try:
316
+ utils.initialize_dir(outdir, overwrite=overwrite)
317
+ except FileExistsError:
318
+ logger.warning(
319
+ f"Directory {outdir} already exists and overwrite is False. "
320
+ "Files will be added to the existing directory."
321
+ )
322
+ with open_fs(outdir, writeable=True) as fs:
323
+ species_identifiers_path = (
324
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
325
+ )
326
+ with fs.openbin(species_identifiers_path, "w") as f:
327
+ species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
328
+ f, sep="\t", index=False
293
329
  )
294
330
 
295
- # fitler to just to identifiers matchign the ontologies of interest
296
- identifiers_df = identifiers_df.query("ontology in @ontologies")
331
+ # export jsons
332
+ species_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.SPECIES
333
+ reactions_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTIONS
334
+ reation_species_path = (
335
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.REACTION_SPECIES
336
+ )
337
+ compartments_path = model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTS
338
+ compartmentalized_species_path = (
339
+ model_prefix + NAPISTU_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
340
+ )
341
+ with fs.openbin(species_path, "w") as f:
342
+ self.species[[SBML_DFS.S_NAME]].to_json(f)
297
343
 
298
- matching_identifiers = identifiers_df.loc[
299
- identifiers_df["identifier"].isin(ids)
300
- ]
301
- entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
344
+ with fs.openbin(reactions_path, "w") as f:
345
+ self.reactions[[SBML_DFS.R_NAME]].to_json(f)
302
346
 
303
- return entity_subset, matching_identifiers
347
+ with fs.openbin(reation_species_path, "w") as f:
348
+ self.reaction_species.to_json(f)
304
349
 
305
- def search_by_name(
306
- self, name: str, entity_type: str, partial_match: bool = True
307
- ) -> pd.DataFrame:
350
+ with fs.openbin(compartments_path, "w") as f:
351
+ self.compartments[[SBML_DFS.C_NAME]].to_json(f)
352
+
353
+ with fs.openbin(compartmentalized_species_path, "w") as f:
354
+ self.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
355
+ f
356
+ )
357
+
358
+ return None
359
+
360
+ def get_characteristic_species_ids(self, dogmatic: bool = True) -> pd.DataFrame:
308
361
  """
309
- Find entities by exact or partial name match.
362
+ Get Characteristic Species IDs
363
+
364
+ List the systematic identifiers which are characteristic of molecular species, e.g., excluding subcomponents, and optionally, treating proteins, transcripts, and genes equiavlently.
310
365
 
311
366
  Parameters
312
367
  ----------
313
- name : str
314
- Name to search for
315
- entity_type : str
316
- Type of entity to search (e.g., 'species', 'reactions')
317
- partial_match : bool, optional
318
- Whether to allow partial string matches, by default True
368
+ sbml_dfs : sbml_dfs_core.SBML_dfs
369
+ The SBML_dfs object.
370
+ dogmatic : bool, default=True
371
+ Whether to use the dogmatic flag to determine which BQB attributes are valid.
319
372
 
320
373
  Returns
321
374
  -------
322
375
  pd.DataFrame
323
- Matching entities
376
+ A DataFrame containing the systematic identifiers which are characteristic of molecular species.
324
377
  """
325
- entity_table = self.get_table(entity_type, required_attributes={"label"})
326
- label_attr = self.schema[entity_type]["label"]
327
378
 
328
- if partial_match:
329
- matches = entity_table.loc[
330
- entity_table[label_attr].str.contains(name, case=False)
331
- ]
332
- else:
333
- matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
334
- return matches
379
+ # select valid BQB attributes based on dogmatic flag
380
+ defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(
381
+ dogmatic
382
+ )
335
383
 
336
- def get_species_features(self) -> pd.DataFrame:
337
- """
338
- Get additional attributes of species.
384
+ # pre-summarize ontologies
385
+ species_identifiers = self.get_identifiers(SBML_DFS.SPECIES)
339
386
 
340
- Returns
341
- -------
342
- pd.DataFrame
343
- Species with additional features including:
344
- - species_type: Classification of the species (e.g., metabolite, protein)
345
- """
346
- species = self.species
347
- augmented_species = species.assign(
348
- **{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
387
+ # drop some BQB_HAS_PART annotations
388
+ species_identifiers = sbml_dfs_utils.filter_to_characteristic_species_ids(
389
+ species_identifiers,
390
+ defining_biological_qualifiers=defining_biological_qualifiers,
349
391
  )
350
392
 
351
- return augmented_species
393
+ return species_identifiers
352
394
 
353
395
  def get_cspecies_features(self) -> pd.DataFrame:
354
396
  """
@@ -414,7 +456,7 @@ class SBML_dfs:
414
456
  If id_type is invalid or identifiers are malformed
415
457
  """
416
458
  selected_table = self.get_table(id_type, {"id"})
417
- schema = self.schema
459
+ schema = SBML_DFS_SCHEMA.SCHEMA
418
460
 
419
461
  identifiers_dict = dict()
420
462
  for sysid in selected_table.index:
@@ -432,6 +474,7 @@ class SBML_dfs:
432
474
  if not identifiers_dict:
433
475
  # Return empty DataFrame with expected columns if nothing found
434
476
  return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
477
+
435
478
  identifiers_tbl = pd.concat(identifiers_dict)
436
479
 
437
480
  identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
@@ -445,113 +488,28 @@ class SBML_dfs:
445
488
 
446
489
  return named_identifiers
447
490
 
448
- def get_uri_urls(
449
- self,
450
- entity_type: str,
451
- entity_ids: Iterable[str] | None = None,
452
- required_ontology: str | None = None,
453
- ) -> pd.Series:
491
+ def get_network_summary(self) -> Mapping[str, Any]:
454
492
  """
455
- Get reference URLs for specified entities.
456
-
457
- Parameters
458
- ----------
459
- entity_type : str
460
- Type of entity to get URLs for (e.g., 'species', 'reactions')
461
- entity_ids : Optional[Iterable[str]], optional
462
- Specific entities to get URLs for, by default None (all entities)
463
- required_ontology : Optional[str], optional
464
- Specific ontology to get URLs from, by default None
493
+ Get diagnostic statistics about the network.
465
494
 
466
495
  Returns
467
496
  -------
468
- pd.Series
469
- Series mapping entity IDs to their reference URLs
470
-
471
- Raises
472
- ------
473
- ValueError
474
- If entity_type is invalid
475
- """
476
- schema = self.schema
477
-
478
- # valid entities and their identifier variables
479
- valid_entity_types = [
480
- SBML_DFS.COMPARTMENTS,
481
- SBML_DFS.SPECIES,
482
- SBML_DFS.REACTIONS,
483
- ]
484
-
485
- if entity_type not in valid_entity_types:
486
- raise ValueError(
487
- f"{entity_type} is an invalid entity_type; valid types "
488
- f"are {', '.join(valid_entity_types)}"
489
- )
490
-
491
- entity_table = getattr(self, entity_type)
492
-
493
- if entity_ids is not None:
494
- # ensure that entity_ids are unique and then convert back to list
495
- # to support pandas indexing
496
- entity_ids = list(set(entity_ids))
497
-
498
- # filter to a subset of identifiers if one is provided
499
- entity_table = entity_table.loc[entity_ids]
500
-
501
- # create a dataframe of all identifiers for the select entities
502
- all_ids = pd.concat(
503
- [
504
- sbml_dfs_utils._stub_ids(
505
- entity_table[schema[entity_type]["id"]].iloc[i].ids
506
- ).assign(id=entity_table.index[i])
507
- for i in range(0, entity_table.shape[0])
508
- ]
509
- ).rename(columns={"id": schema[entity_type]["pk"]})
510
-
511
- # set priorities for ontologies and bqb terms
512
-
513
- if required_ontology is None:
514
- all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
515
- ONTOLOGY_PRIORITIES, how="left"
516
- )
517
- else:
518
- ontology_priorities = pd.DataFrame(
519
- [{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
520
- )
521
- # if only a single ontology is sought then just return matching entries
522
- all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
523
- ontology_priorities, how="inner"
524
- )
525
-
526
- uri_urls = (
527
- all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
528
- .groupby(schema[entity_type]["pk"])
529
- .first()[IDENTIFIERS.URL]
530
- )
531
- return uri_urls
532
-
533
- def get_network_summary(self) -> Mapping[str, Any]:
534
- """
535
- Get diagnostic statistics about the network.
536
-
537
- Returns
538
- -------
539
- Mapping[str, Any]
540
- Dictionary of diagnostic statistics including:
541
- - n_species_types: Number of species types
542
- - dict_n_species_per_type: Number of species per type
543
- - n_species: Number of species
544
- - n_cspecies: Number of compartmentalized species
545
- - n_reaction_species: Number of reaction species
546
- - n_reactions: Number of reactions
547
- - n_compartments: Number of compartments
548
- - dict_n_species_per_compartment: Number of species per compartment
549
- - stats_species_per_reaction: Statistics on reactands per reaction
550
- - top10_species_per_reaction: Top 10 reactions by number of reactands
551
- - stats_degree: Statistics on species connectivity
552
- - top10_degree: Top 10 species by connectivity
553
- - stats_identifiers_per_species: Statistics on identifiers per species
554
- - top10_identifiers_per_species: Top 10 species by number of identifiers
497
+ Mapping[str, Any]
498
+ Dictionary of diagnostic statistics including:
499
+ - n_species_types: Number of species types
500
+ - dict_n_species_per_type: Number of species per type
501
+ - n_species: Number of species
502
+ - n_cspecies: Number of compartmentalized species
503
+ - n_reaction_species: Number of reaction species
504
+ - n_reactions: Number of reactions
505
+ - n_compartments: Number of compartments
506
+ - dict_n_species_per_compartment: Number of species per compartment
507
+ - stats_species_per_reaction: Statistics on reactands per reaction
508
+ - top10_species_per_reaction: Top 10 reactions by number of reactands
509
+ - stats_degree: Statistics on species connectivity
510
+ - top10_degree: Top 10 species by connectivity
511
+ - stats_identifiers_per_species: Statistics on identifiers per species
512
+ - top10_identifiers_per_species: Top 10 species by number of identifiers
555
513
  """
556
514
  stats: MutableMapping[str, Any] = {}
557
515
  species_features = self.get_species_features()
@@ -616,2009 +574,1352 @@ class SBML_dfs:
616
574
 
617
575
  return stats
618
576
 
619
- def add_species_data(self, label: str, data: pd.DataFrame):
577
+ def get_species_features(self) -> pd.DataFrame:
620
578
  """
621
- Add additional species data with validation.
622
-
623
- Parameters
624
- ----------
625
- label : str
626
- Label for the new data
627
- data : pd.DataFrame
628
- Data to add, must be indexed by species_id
579
+ Get additional attributes of species.
629
580
 
630
- Raises
631
- ------
632
- ValueError
633
- If the data is invalid or label already exists
581
+ Returns
582
+ -------
583
+ pd.DataFrame
584
+ Species with additional features including:
585
+ - species_type: Classification of the species (e.g., metabolite, protein)
634
586
  """
635
- self._validate_species_data(data)
636
- if label in self.species_data:
637
- raise ValueError(
638
- f"{label} already exists in species_data. " "Drop it first."
639
- )
640
- self.species_data[label] = data
587
+ species = self.species
588
+ augmented_species = species.assign(
589
+ **{
590
+ "species_type": lambda d: d["s_Identifiers"].apply(
591
+ sbml_dfs_utils.species_type_types
592
+ )
593
+ }
594
+ )
641
595
 
642
- def remove_species_data(self, label: str):
643
- """
644
- Remove species data by label.
645
- """
646
- self._remove_entity_data(SBML_DFS.SPECIES, label)
596
+ return augmented_species
647
597
 
648
- def add_reactions_data(self, label: str, data: pd.DataFrame):
598
+ def get_table(
599
+ self, entity_type: str, required_attributes: None | set[str] = None
600
+ ) -> pd.DataFrame:
649
601
  """
650
- Add additional reaction data with validation.
602
+ Get a table from the SBML_dfs object with optional attribute validation.
651
603
 
652
604
  Parameters
653
605
  ----------
654
- label : str
655
- Label for the new data
656
- data : pd.DataFrame
657
- Data to add, must be indexed by reaction_id
606
+ entity_type : str
607
+ The type of entity table to retrieve (e.g., 'species', 'reactions')
608
+ required_attributes : Optional[Set[str]], optional
609
+ Set of attributes that must be present in the table, by default None.
610
+ Must be passed as a set, e.g. {'id'}, not a string.
611
+
612
+ Returns
613
+ -------
614
+ pd.DataFrame
615
+ The requested table
658
616
 
659
617
  Raises
660
618
  ------
661
619
  ValueError
662
- If the data is invalid or label already exists
620
+ If entity_type is invalid or required attributes are missing
621
+ TypeError
622
+ If required_attributes is not a set
663
623
  """
664
- self._validate_reactions_data(data)
665
- if label in self.reactions_data:
666
- raise ValueError(
667
- f"{label} already exists in reactions_data. Drop it first."
668
- )
669
- self.reactions_data[label] = data
670
624
 
671
- def remove_reactions_data(self, label: str):
672
- """
673
- Remove reactions data by label.
674
- """
675
- self._remove_entity_data(SBML_DFS.REACTIONS, label)
625
+ schema = self.schema
676
626
 
677
- def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
678
- """
679
- Remove compartmentalized species and associated reactions.
627
+ if entity_type not in schema.keys():
628
+ raise ValueError(
629
+ f"{entity_type} does not match a table in the SBML_dfs object. The tables "
630
+ f"which are present are {', '.join(schema.keys())}"
631
+ )
680
632
 
681
- Starting with a set of compartmentalized species, determine which reactions
682
- should be removed based on their removal. Then remove these reactions,
683
- compartmentalized species, and species.
633
+ if required_attributes is not None:
634
+ if not isinstance(required_attributes, set):
635
+ raise TypeError(
636
+ f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
637
+ "Did you pass a string instead of a set?"
638
+ )
684
639
 
685
- Parameters
686
- ----------
687
- sc_ids : Iterable[str]
688
- IDs of compartmentalized species to remove
689
- """
640
+ # determine whether required_attributes are appropriate
641
+ VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
642
+ invalid_required_attributes = required_attributes.difference(
643
+ VALID_REQUIRED_ATTRIBUTES
644
+ )
690
645
 
691
- # find reactions which should be totally removed since they are losing critical species
692
- removed_reactions = _find_underspecified_reactions_by_scids(self, sc_ids)
693
- self.remove_reactions(removed_reactions)
646
+ if len(invalid_required_attributes) > 0:
647
+ raise ValueError(
648
+ f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
649
+ f"Requiered attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
650
+ )
694
651
 
695
- self._remove_compartmentalized_species(sc_ids)
652
+ # determine if required_attributes are satisified
653
+ invalid_attrs = [
654
+ s for s in required_attributes if s not in schema[entity_type].keys()
655
+ ]
656
+ if len(invalid_attrs) > 0:
657
+ raise ValueError(
658
+ f"The following required attributes are not present for the {entity_type} table: "
659
+ f"{', '.join(invalid_attrs)}."
660
+ )
696
661
 
697
- # remove species (and their associated species data if all their cspecies have been lost)
698
- self._remove_unused_species()
662
+ return getattr(self, entity_type)
699
663
 
700
- def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
664
+ def get_uri_urls(
665
+ self,
666
+ entity_type: str,
667
+ entity_ids: Iterable[str] | None = None,
668
+ required_ontology: str | None = None,
669
+ ) -> pd.Series:
701
670
  """
702
- Remove reactions from the model.
671
+ Get reference URLs for specified entities.
703
672
 
704
673
  Parameters
705
674
  ----------
706
- r_ids : Iterable[str]
707
- IDs of reactions to remove
708
- remove_species : bool, optional
709
- Whether to remove species that are no longer part of any reactions,
710
- by default False
711
- """
712
- # remove corresponding reactions_species
713
- self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
714
- # remove reactions
715
- self.reactions = self.reactions.drop(index=list(r_ids))
716
- # remove reactions_data
717
- if hasattr(self, "reactions_data"):
718
- for k, data in self.reactions_data.items():
719
- self.reactions_data[k] = data.drop(index=list(r_ids))
720
- # remove species if requested
721
- if remove_species:
722
- self._remove_unused_cspecies()
723
- self._remove_unused_species()
724
-
725
- def validate(self):
726
- """
727
- Validate the SBML_dfs structure and relationships.
675
+ entity_type : str
676
+ Type of entity to get URLs for (e.g., 'species', 'reactions')
677
+ entity_ids : Optional[Iterable[str]], optional
678
+ Specific entities to get URLs for, by default None (all entities)
679
+ required_ontology : Optional[str], optional
680
+ Specific ontology to get URLs from, by default None
728
681
 
729
- Checks:
730
- - Schema existence
731
- - Required tables presence
732
- - Individual table structure
733
- - Primary key uniqueness
734
- - Foreign key relationships
735
- - Optional data table validity
736
- - Reaction species validity
682
+ Returns
683
+ -------
684
+ pd.Series
685
+ Series mapping entity IDs to their reference URLs
737
686
 
738
687
  Raises
739
688
  ------
740
689
  ValueError
741
- If any validation check fails
690
+ If entity_type is invalid
742
691
  """
692
+ schema = self.schema
743
693
 
744
- if not hasattr(self, "schema"):
745
- raise ValueError("No schema found")
694
+ # valid entities and their identifier variables
695
+ valid_entity_types = [
696
+ SBML_DFS.COMPARTMENTS,
697
+ SBML_DFS.SPECIES,
698
+ SBML_DFS.REACTIONS,
699
+ ]
746
700
 
747
- required_tables = self._required_entities
748
- schema_tables = set(self.schema.keys())
701
+ if entity_type not in valid_entity_types:
702
+ raise ValueError(
703
+ f"{entity_type} is an invalid entity_type; valid types "
704
+ f"are {', '.join(valid_entity_types)}"
705
+ )
749
706
 
750
- extra_tables = schema_tables.difference(required_tables)
751
- if len(extra_tables) != 0:
752
- logger.debug(
753
- f"{len(extra_tables)} unexpected tables found: "
754
- f"{', '.join(extra_tables)}"
707
+ entity_table = getattr(self, entity_type)
708
+
709
+ if entity_ids is not None:
710
+ # ensure that entity_ids are unique and then convert back to list
711
+ # to support pandas indexing
712
+ entity_ids = list(set(entity_ids))
713
+
714
+ # filter to a subset of identifiers if one is provided
715
+ entity_table = entity_table.loc[entity_ids]
716
+
717
+ # create a dataframe of all identifiers for the select entities
718
+ all_ids = pd.concat(
719
+ [
720
+ sbml_dfs_utils._id_dict_to_df(
721
+ entity_table[schema[entity_type]["id"]].iloc[i].ids
722
+ ).assign(id=entity_table.index[i])
723
+ for i in range(0, entity_table.shape[0])
724
+ ]
725
+ ).rename(columns={"id": schema[entity_type]["pk"]})
726
+
727
+ # set priorities for ontologies and bqb terms
728
+
729
+ if required_ontology is None:
730
+ all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
731
+ ONTOLOGY_PRIORITIES, how="left"
732
+ )
733
+ else:
734
+ ontology_priorities = pd.DataFrame(
735
+ [{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
736
+ )
737
+ # if only a single ontology is sought then just return matching entries
738
+ all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
739
+ ontology_priorities, how="inner"
755
740
  )
756
741
 
757
- missing_tables = required_tables.difference(schema_tables)
758
- if len(missing_tables) != 0:
742
+ uri_urls = (
743
+ all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
744
+ .groupby(schema[entity_type]["pk"])
745
+ .first()[IDENTIFIERS.URL]
746
+ )
747
+ return uri_urls
748
+
749
+ def infer_sbo_terms(self):
750
+ """
751
+ Infer SBO Terms
752
+
753
+ Define SBO terms based on stoichiometry for reaction_species with missing terms.
754
+ Modifies the SBML_dfs object in-place.
755
+
756
+ Returns
757
+ -------
758
+ None (modifies SBML_dfs object in-place)
759
+ """
760
+ valid_sbo_terms = self.reaction_species[
761
+ self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
762
+ ]
763
+
764
+ invalid_sbo_terms = self.reaction_species[
765
+ ~self.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
766
+ ]
767
+
768
+ if not all(self.reaction_species[SBML_DFS.SBO_TERM].notnull()):
769
+ raise ValueError("All reaction_species[SBML_DFS.SBO_TERM] must be not null")
770
+ if invalid_sbo_terms.shape[0] == 0:
771
+ logger.info("All sbo_terms were valid; nothing to update.")
772
+ return
773
+
774
+ logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
775
+
776
+ # add missing/invalid terms based on stoichiometry
777
+ invalid_sbo_terms.loc[
778
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
779
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
780
+
781
+ invalid_sbo_terms.loc[
782
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
783
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
784
+
785
+ invalid_sbo_terms.loc[
786
+ invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
787
+ ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
788
+
789
+ updated_reaction_species = pd.concat(
790
+ [valid_sbo_terms, invalid_sbo_terms]
791
+ ).sort_index()
792
+
793
+ if self.reaction_species.shape[0] != updated_reaction_species.shape[0]:
759
794
  raise ValueError(
760
- f"Missing {len(missing_tables)} required tables: "
761
- f"{', '.join(missing_tables)}"
795
+ f"Trying to overwrite {self.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
762
796
  )
797
+ self.reaction_species = updated_reaction_species
798
+ return
763
799
 
764
- # check individual tables
765
- for table in required_tables:
766
- self._validate_table(table)
800
+ def infer_uncompartmentalized_species_location(self):
801
+ """
802
+ Infer Uncompartmentalized Species Location
767
803
 
768
- # check whether pks and fks agree
769
- pk_df = pd.DataFrame(
770
- [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
804
+ If the compartment of a subset of compartmentalized species
805
+ was not specified, infer an appropriate compartment from
806
+ other members of reactions they participate in.
807
+
808
+ This method modifies the SBML_dfs object in-place.
809
+
810
+ Returns
811
+ -------
812
+ None (modifies SBML_dfs object in-place)
813
+ """
814
+ default_compartment = (
815
+ self.compartmentalized_species.value_counts(SBML_DFS.C_ID)
816
+ .rename("N")
817
+ .reset_index()
818
+ .sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
771
819
  )
820
+ if not isinstance(default_compartment, str):
821
+ raise ValueError(
822
+ "No default compartment could be found - compartment "
823
+ "information may not be present"
824
+ )
772
825
 
773
- fk_df = (
774
- pd.DataFrame(
775
- [
776
- {"fk_table": k, "fk": v["fk"]}
777
- for k, v in self.schema.items()
778
- if "fk" in v.keys()
779
- ]
826
+ # infer the compartments of species missing compartments
827
+ missing_compartment_scids = self.compartmentalized_species[
828
+ self.compartmentalized_species[SBML_DFS.C_ID].isnull()
829
+ ].index.tolist()
830
+ if len(missing_compartment_scids) == 0:
831
+ logger.info(
832
+ "All compartmentalized species have compartments, "
833
+ "returning input SBML_dfs"
780
834
  )
781
- .set_index("fk_table")["fk"]
782
- .apply(pd.Series)
835
+ return self
836
+
837
+ participating_reactions = (
838
+ self.reaction_species[
839
+ self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
840
+ ][SBML_DFS.R_ID]
841
+ .unique()
842
+ .tolist()
843
+ )
844
+ reaction_participants = self.reaction_species[
845
+ self.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
846
+ ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
847
+ reaction_participants = reaction_participants.merge(
848
+ self.compartmentalized_species[SBML_DFS.C_ID],
849
+ left_on=SBML_DFS.SC_ID,
850
+ right_index=True,
851
+ )
852
+
853
+ # find a default compartment to fall back on if all compartmental information is missing
854
+ primary_reaction_compartment = (
855
+ reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
856
+ .rename("N")
857
+ .reset_index()
858
+ .sort_values("N", ascending=False)
859
+ .groupby(SBML_DFS.R_ID)
860
+ .first()[SBML_DFS.C_ID]
783
861
  .reset_index()
784
- .melt(id_vars="fk_table")
785
- .drop(["variable"], axis=1)
786
- .rename(columns={"value": "key"})
787
862
  )
788
863
 
789
- pk_fk_correspondences = pk_df.merge(fk_df)
864
+ inferred_compartmentalization = (
865
+ self.reaction_species[
866
+ self.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
867
+ ]
868
+ .merge(primary_reaction_compartment)
869
+ .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
870
+ .rename("N")
871
+ .reset_index()
872
+ .sort_values("N", ascending=False)
873
+ .groupby(SBML_DFS.SC_ID)
874
+ .first()
875
+ .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
876
+ )
877
+ logger.info(
878
+ f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
879
+ )
790
880
 
791
- for i in range(0, pk_fk_correspondences.shape[0]):
792
- pk_table_keys = set(
793
- getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
881
+ # define where a reaction is most likely to occur based on the compartmentalization of its participants
882
+ species_with_unknown_compartmentalization = set(
883
+ missing_compartment_scids
884
+ ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
885
+ if len(species_with_unknown_compartmentalization) != 0:
886
+ logger.warning(
887
+ f"{len(species_with_unknown_compartmentalization)} "
888
+ "species compartmentalization could not be inferred"
889
+ " from other reaction participants. Their compartmentalization "
890
+ f"will be set to the default of {default_compartment}"
794
891
  )
795
- if None in pk_table_keys:
796
- raise ValueError(
797
- f"{pk_fk_correspondences['pk_table'][i]} had "
798
- "missing values in its index"
799
- )
800
892
 
801
- fk_table_keys = set(
802
- getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
803
- :, pk_fk_correspondences["key"][i]
893
+ inferred_compartmentalization = pd.concat(
894
+ [
895
+ inferred_compartmentalization,
896
+ pd.DataFrame(
897
+ {
898
+ SBML_DFS.SC_ID: list(
899
+ species_with_unknown_compartmentalization
900
+ )
901
+ }
902
+ ).assign(c_id=default_compartment),
804
903
  ]
805
904
  )
806
- if None in fk_table_keys:
807
- raise ValueError(
808
- f"{pk_fk_correspondences['fk_table'][i]} included "
809
- f"missing {pk_fk_correspondences['key'][i]} values"
810
- )
811
905
 
812
- # all foreign keys need to match a primary key
813
- extra_fks = fk_table_keys.difference(pk_table_keys)
814
- if len(extra_fks) != 0:
815
- raise ValueError(
816
- f"{len(extra_fks)} distinct "
817
- f"{pk_fk_correspondences['key'][i]} values were"
818
- f" found in {pk_fk_correspondences['fk_table'][i]} "
819
- f"but missing from {pk_fk_correspondences['pk_table'][i]}."
820
- " All foreign keys must have a matching primary key.\n\n"
821
- f"Extra key are: {', '.join(extra_fks)}"
906
+ if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
907
+ raise ValueError(
908
+ f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
909
+ )
910
+
911
+ updated_compartmentalized_species = pd.concat(
912
+ [
913
+ self.compartmentalized_species[
914
+ ~self.compartmentalized_species[SBML_DFS.C_ID].isnull()
915
+ ],
916
+ self.compartmentalized_species[
917
+ self.compartmentalized_species[SBML_DFS.C_ID].isnull()
918
+ ]
919
+ .drop(SBML_DFS.C_ID, axis=1)
920
+ .merge(
921
+ inferred_compartmentalization,
922
+ left_index=True,
923
+ right_on=SBML_DFS.SC_ID,
822
924
  )
925
+ .set_index(SBML_DFS.SC_ID),
926
+ ]
927
+ )
823
928
 
824
- # check optional data tables:
825
- for k, v in self.species_data.items():
826
- try:
827
- self._validate_species_data(v)
828
- except ValueError as e:
829
- raise ValueError(f"species data {k} was invalid.") from e
929
+ if (
930
+ updated_compartmentalized_species.shape[0]
931
+ != self.compartmentalized_species.shape[0]
932
+ ):
933
+ raise ValueError(
934
+ f"Trying to overwrite {self.compartmentalized_species.shape[0]}"
935
+ " compartmentalized species with "
936
+ f"{updated_compartmentalized_species.shape[0]}"
937
+ )
830
938
 
831
- for k, v in self.reactions_data.items():
832
- try:
833
- self._validate_reactions_data(v)
834
- except ValueError as e:
835
- raise ValueError(f"reactions data {k} was invalid.") from e
939
+ if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
940
+ raise ValueError("Some species compartments are still missing")
836
941
 
837
- # validate reaction_species sbo_terms and stoi
838
- self._validate_reaction_species()
942
+ self.compartmentalized_species = updated_compartmentalized_species
943
+ return
839
944
 
840
- def validate_and_resolve(self):
945
+ def name_compartmentalized_species(self):
841
946
  """
842
- Validate and attempt to automatically fix common issues.
947
+ Name Compartmentalized Species
843
948
 
844
- This method iteratively:
845
- 1. Attempts validation
846
- 2. If validation fails, tries to resolve the issue
847
- 3. Repeats until validation passes or issue cannot be resolved
949
+ Rename compartmentalized species if they have the same
950
+ name as their species. Modifies the SBML_dfs object in-place.
848
951
 
849
- Raises
850
- ------
851
- ValueError
852
- If validation fails and cannot be automatically resolved
952
+ Returns
953
+ -------
954
+ None (modifies SBML_dfs object in-place)
853
955
  """
956
+ augmented_cspecies = self.compartmentalized_species.merge(
957
+ self.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
958
+ ).merge(
959
+ self.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
960
+ )
961
+ augmented_cspecies[SBML_DFS.SC_NAME] = [
962
+ f"{s} [{c}]" if sc == s else sc
963
+ for sc, c, s in zip(
964
+ augmented_cspecies[SBML_DFS.SC_NAME],
965
+ augmented_cspecies[SBML_DFS.C_NAME],
966
+ augmented_cspecies[SBML_DFS.S_NAME],
967
+ )
968
+ ]
854
969
 
855
- current_exception = None
856
- validated = False
857
-
858
- while not validated:
859
- try:
860
- self.validate()
861
- validated = True
862
- except Exception as e:
863
- e_str = str(e)
864
- if e_str == current_exception:
865
- logger.warning(
866
- "Automated resolution of an Exception was attempted but failed"
867
- )
868
- raise e
869
-
870
- # try to resolve
871
- self._attempt_resolve(e)
970
+ self.compartmentalized_species = augmented_cspecies.loc[
971
+ :, self.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
972
+ ]
973
+ return
872
974
 
873
- def select_species_data(self, species_data_table: str) -> pd.DataFrame:
975
+ def reaction_formulas(
976
+ self, r_ids: Optional[Union[str, list[str]]] = None
977
+ ) -> pd.Series:
874
978
  """
875
- Select a species data table from the SBML_dfs object.
979
+ Reaction Summary
876
980
 
877
- Parameters
981
+ Return human-readable formulas for reactions.
982
+
983
+ Parameters:
878
984
  ----------
879
- species_data_table : str
880
- Name of the species data table to select
985
+ r_ids: [str], str or None
986
+ Reaction IDs or None for all reactions
881
987
 
882
988
  Returns
883
- -------
884
- pd.DataFrame
885
- The selected species data table
886
-
887
- Raises
888
- ------
889
- ValueError
890
- If species_data_table is not found
989
+ ----------
990
+ formula_strs: pd.Series
891
991
  """
892
- # Check if species_data_table exists in sbml_dfs.species_data
893
- if species_data_table not in self.species_data:
894
- raise ValueError(
895
- f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
896
- f"Available tables: {self.species_data.keys()}"
992
+
993
+ validated_rids = self._validate_r_ids(r_ids)
994
+
995
+ matching_reaction_species = self.reaction_species[
996
+ self.reaction_species.r_id.isin(validated_rids)
997
+ ].merge(
998
+ self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
999
+ )
1000
+
1001
+ # split into within compartment and cross-compartment reactions
1002
+ r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
1003
+ SBML_DFS.C_ID
1004
+ ].nunique()
1005
+
1006
+ # identify reactions which work across compartments
1007
+ r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
1008
+ # there species must be labelled with the sc_name to specify where a species exists
1009
+ if r_id_cross_compartment.shape[0] > 0:
1010
+ rxn_eqtn_cross_compartment = (
1011
+ matching_reaction_species[
1012
+ matching_reaction_species[SBML_DFS.R_ID].isin(
1013
+ r_id_cross_compartment.index
1014
+ )
1015
+ ]
1016
+ .sort_values([SBML_DFS.SC_NAME])
1017
+ .groupby(SBML_DFS.R_ID)
1018
+ .apply(
1019
+ lambda x: sbml_dfs_utils.construct_formula_string(
1020
+ x, self.reactions, SBML_DFS.SC_NAME
1021
+ )
1022
+ )
1023
+ .rename("r_formula_str")
1024
+ )
1025
+ else:
1026
+ rxn_eqtn_cross_compartment = None
1027
+
1028
+ # identify reactions which occur within a single compartment; for these the reaction
1029
+ # can be labelled with the compartment and individual species can receive a more readable s_name
1030
+ r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
1031
+ if r_id_within_compartment.shape[0] > 0:
1032
+ # add s_name
1033
+ augmented_matching_reaction_species = (
1034
+ matching_reaction_species[
1035
+ matching_reaction_species[SBML_DFS.R_ID].isin(
1036
+ r_id_within_compartment.index
1037
+ )
1038
+ ]
1039
+ .merge(self.compartments, left_on=SBML_DFS.C_ID, right_index=True)
1040
+ .merge(self.species, left_on=SBML_DFS.S_ID, right_index=True)
1041
+ .sort_values([SBML_DFS.S_NAME])
1042
+ )
1043
+ # create formulas based on s_names of components
1044
+ rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
1045
+ [SBML_DFS.R_ID, SBML_DFS.C_NAME]
1046
+ ).apply(
1047
+ lambda x: sbml_dfs_utils.construct_formula_string(
1048
+ x, self.reactions, SBML_DFS.S_NAME
1049
+ )
897
1050
  )
1051
+ # add compartment for each reaction
1052
+ rxn_eqtn_within_compartment = pd.Series(
1053
+ [
1054
+ y + ": " + x
1055
+ for x, y in zip(
1056
+ rxn_eqtn_within_compartment,
1057
+ rxn_eqtn_within_compartment.index.get_level_values(
1058
+ SBML_DFS.C_NAME
1059
+ ),
1060
+ )
1061
+ ],
1062
+ index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
1063
+ ).rename("r_formula_str")
1064
+ else:
1065
+ rxn_eqtn_within_compartment = None
898
1066
 
899
- # Get the species data
900
- return self.species_data[species_data_table]
1067
+ formula_strs = pd.concat(
1068
+ [rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment]
1069
+ )
901
1070
 
902
- def _validate_table(self, table: str) -> None:
1071
+ return formula_strs
1072
+
1073
+ def reaction_summaries(
1074
+ self, r_ids: Optional[Union[str, list[str]]] = None
1075
+ ) -> pd.DataFrame:
903
1076
  """
904
- Validate a table in this SBML_dfs object against its schema.
1077
+ Reaction Summary
905
1078
 
906
- This is an internal method that validates a table that is part of this SBML_dfs
907
- object against the schema stored in self.schema.
1079
+ Return a summary of reactions.
908
1080
 
909
- Parameters
1081
+ Parameters:
910
1082
  ----------
911
- table : str
912
- Name of the table to validate
1083
+ r_ids: [str], str or None
1084
+ Reaction IDs or None for all reactions
913
1085
 
914
- Raises
915
- ------
916
- ValueError
917
- If the table does not conform to its schema
1086
+ Returns
1087
+ ----------
1088
+ reaction_summaries_df: pd.DataFrame
1089
+ A table with r_id as an index and columns:
1090
+ - r_name: str, name of the reaction
1091
+ - r_formula_str: str, human-readable formula of the reaction
918
1092
  """
919
- table_schema = self.schema[table]
920
- table_data = getattr(self, table)
921
- _perform_sbml_dfs_table_validation(table_data, table_schema, table)
922
1093
 
923
- def _remove_entity_data(self, entity_type: str, label: str) -> None:
1094
+ validated_rids = self._validate_r_ids(r_ids)
1095
+
1096
+ participating_r_names = self.reactions.loc[validated_rids, SBML_DFS.R_NAME]
1097
+ participating_r_formulas = self.reaction_formulas(r_ids=validated_rids)
1098
+ reaction_summareis_df = pd.concat(
1099
+ [participating_r_names, participating_r_formulas], axis=1
1100
+ )
1101
+
1102
+ return reaction_summareis_df
1103
+
1104
+ def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
924
1105
  """
925
- Remove data from species_data or reactions_data by table name and label.
1106
+ Remove compartmentalized species and associated reactions.
1107
+
1108
+ Starting with a set of compartmentalized species, determine which reactions
1109
+ should be removed based on their removal. Then remove these reactions,
1110
+ compartmentalized species, and species.
926
1111
 
927
1112
  Parameters
928
1113
  ----------
929
- entity_type : str
930
- Name of the table to remove data from ('species' or 'reactions')
931
- label : str
932
- Label of the data to remove
933
-
934
- Notes
935
- -----
936
- If the label does not exist, a warning will be logged that includes the existing labels.
937
- """
938
- if entity_type not in ENTITIES_W_DATA:
939
- raise ValueError("table_name must be either 'species' or 'reactions'")
940
-
941
- data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
942
- if label not in data_dict:
943
- existing_labels = list(data_dict.keys())
944
- logger.warning(
945
- f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
946
- f"Existing labels: {existing_labels}"
947
- )
948
- return
949
-
950
- del data_dict[label]
951
-
952
- def _remove_unused_cspecies(self):
953
- """Removes compartmentalized species that are no
954
- longer part of any reactions"""
955
- sc_ids = self._get_unused_cspecies()
956
- self._remove_compartmentalized_species(sc_ids)
957
-
958
- def _get_unused_cspecies(self) -> set[str]:
959
- """Returns a set of compartmentalized species
960
- that are not part of any reactions"""
961
- sc_ids = set(self.compartmentalized_species.index) - set(
962
- self.reaction_species[SBML_DFS.SC_ID]
963
- )
964
- return sc_ids # type: ignore
965
-
966
- def _remove_unused_species(self):
967
- """Removes species that are no longer part of any
968
- compartmentalized species"""
969
- s_ids = self._get_unused_species()
970
- self._remove_species(s_ids)
971
-
972
- def _get_unused_species(self) -> set[str]:
973
- """Returns a list of species that are not part of any reactions"""
974
- s_ids = set(self.species.index) - set(
975
- self.compartmentalized_species[SBML_DFS.S_ID]
976
- )
977
- return s_ids # type: ignore
978
-
979
- def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
980
- """Removes compartmentalized species from the model
981
-
982
- This should not be directly used by the user, as it can lead to
983
- invalid reactions when removing species without a logic to decide
984
- if the reaction needs to be removed as well.
985
-
986
- Args:
987
- sc_ids (Iterable[str]): the compartmentalized species to remove
1114
+ sc_ids : Iterable[str]
1115
+ IDs of compartmentalized species to remove
988
1116
  """
989
- # Remove compartmentalized species
990
- self.compartmentalized_species = self.compartmentalized_species.drop(
991
- index=list(sc_ids)
992
- )
993
- # remove corresponding reactions_species
994
- self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
995
-
996
- def _remove_species(self, s_ids: Iterable[str]):
997
- """Removes species from the model
998
1117
 
999
- This should not be directly used by the user, as it can lead to
1000
- invalid reactions when removing species without a logic to decide
1001
- if the reaction needs to be removed as well.
1002
-
1003
- This removes the species and corresponding compartmentalized species and
1004
- reactions_species.
1118
+ # find reactions which should be totally removed since they are losing critical species
1119
+ removed_reactions = self._find_underspecified_reactions_by_scids(sc_ids)
1120
+ self.remove_reactions(removed_reactions)
1005
1121
 
1006
- Args:
1007
- s_ids (Iterable[str]): the species to remove
1008
- """
1009
- sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
1010
1122
  self._remove_compartmentalized_species(sc_ids)
1011
- # Remove species
1012
- self.species = self.species.drop(index=list(s_ids))
1013
- # remove data
1014
- for k, data in self.species_data.items():
1015
- self.species_data[k] = data.drop(index=list(s_ids))
1016
-
1017
- def _validate_species_data(self, species_data_table: pd.DataFrame):
1018
- """Validates species data attribute
1019
-
1020
- Args:
1021
- species_data_table (pd.DataFrame): a species data table
1022
-
1023
- Raises:
1024
- ValueError: s_id not index name
1025
- ValueError: s_id index contains duplicates
1026
- ValueError: s_id not in species table
1027
- """
1028
- _validate_matching_data(species_data_table, self.species)
1029
-
1030
- def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
1031
- """Validates reactions data attribute
1032
1123
 
1033
- Args:
1034
- reactions_data_table (pd.DataFrame): a reactions data table
1124
+ # remove species (and their associated species data if all their cspecies have been lost)
1125
+ self._remove_unused_species()
1035
1126
 
1036
- Raises:
1037
- ValueError: r_id not index name
1038
- ValueError: r_id index contains duplicates
1039
- ValueError: r_id not in reactions table
1127
+ def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
1040
1128
  """
1041
- _validate_matching_data(reactions_data_table, self.reactions)
1042
-
1043
- def _validate_reaction_species(self):
1044
- if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
1045
- raise ValueError(
1046
- "All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
1047
- )
1048
-
1049
- # test for null SBO terms
1050
- n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
1051
- if n_null_sbo_terms != 0:
1052
- raise ValueError(
1053
- f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
1054
- )
1055
-
1056
- # find invalid SBO terms
1057
- sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
1058
- invalid_sbo_term_counts = sbo_counts[
1059
- ~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
1060
- ]
1061
-
1062
- if invalid_sbo_term_counts.shape[0] != 0:
1063
- invalid_sbo_counts_str = ", ".join(
1064
- [f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
1065
- )
1066
- raise ValueError(
1067
- f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
1068
- f"defined {invalid_sbo_counts_str}"
1069
- )
1070
-
1071
- def _attempt_resolve(self, e):
1072
- str_e = str(e)
1073
- if str_e == "compartmentalized_species included missing c_id values":
1074
- logger.warning(str_e)
1075
- logger.warning(
1076
- "Attempting to resolve with infer_uncompartmentalized_species_location()"
1077
- )
1078
- self = infer_uncompartmentalized_species_location(self)
1079
- elif re.search("sbo_terms were not defined", str_e):
1080
- logger.warning(str_e)
1081
- logger.warning("Attempting to resolve with infer_sbo_terms()")
1082
- self = infer_sbo_terms(self)
1083
- else:
1084
- logger.warning(
1085
- "An error occurred which could not be automatically resolved"
1086
- )
1087
- raise e
1088
-
1089
-
1090
- def species_status(s_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
1091
- """
1092
- Species Status
1093
-
1094
- Return all of the reaction's a species particpates in.
1095
-
1096
- Parameters:
1097
- s_id: str
1098
- A species ID
1099
- sbml_dfs: SBML_dfs
1100
-
1101
- Returns:
1102
- pd.DataFrame, one row reaction
1103
- """
1104
-
1105
- matching_species = sbml_dfs.species.loc[s_id]
1106
-
1107
- if not isinstance(matching_species, pd.Series):
1108
- raise ValueError(f"{s_id} did not match a single species")
1109
-
1110
- # find all rxns species particpate in
1111
-
1112
- matching_compartmentalized_species = sbml_dfs.compartmentalized_species[
1113
- sbml_dfs.compartmentalized_species.s_id.isin([s_id])
1114
- ]
1115
-
1116
- rxns_participating = sbml_dfs.reaction_species[
1117
- sbml_dfs.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
1118
- ]
1119
-
1120
- # find all participants in these rxns
1121
-
1122
- full_rxns_participating = sbml_dfs.reaction_species[
1123
- sbml_dfs.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
1124
- ].merge(
1125
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1126
- )
1127
-
1128
- reaction_descriptions = pd.concat(
1129
- [
1130
- reaction_summary(x, sbml_dfs)
1131
- for x in set(full_rxns_participating[SBML_DFS.R_ID].tolist())
1132
- ]
1133
- )
1134
-
1135
- status = (
1136
- full_rxns_participating.loc[
1137
- full_rxns_participating[SBML_DFS.SC_ID].isin(
1138
- matching_compartmentalized_species.index.values.tolist()
1139
- ),
1140
- [SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
1141
- ]
1142
- .merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
1143
- .reset_index(drop=True)
1144
- .drop(SBML_DFS.R_ID, axis=1)
1145
- )
1146
-
1147
- return status
1148
-
1149
-
1150
- def reaction_summary(r_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
1151
- """
1152
- Reaction Summary
1153
-
1154
- Return a reaction's name and a human-readable formula.
1155
-
1156
- Parameters:
1157
- r_id: str
1158
- A reaction ID
1159
- sbml_dfs: SBML_dfs
1160
-
1161
- Returns:
1162
- one row pd.DataFrame
1163
- """
1164
-
1165
- logger.warning(
1166
- "reaction_summary is deprecated and will be removed in a future version of rcpr; "
1167
- "please use reaction_summaries() instead"
1168
- )
1169
-
1170
- matching_reaction = sbml_dfs.reactions.loc[r_id]
1171
-
1172
- if not isinstance(matching_reaction, pd.Series):
1173
- raise ValueError(f"{r_id} did not match a single reaction")
1174
-
1175
- matching_reaction = sbml_dfs.reactions.loc[r_id]
1176
-
1177
- matching_reaction_species = sbml_dfs.reaction_species[
1178
- sbml_dfs.reaction_species.r_id.isin([r_id])
1179
- ].merge(
1180
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1181
- )
1182
-
1183
- # collapse all reaction species to a formula string
1184
-
1185
- if len(matching_reaction_species[SBML_DFS.C_ID].unique()) == 1:
1186
- augmented_matching_reaction_species = matching_reaction_species.merge(
1187
- sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True
1188
- ).merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
1189
- str_formula = (
1190
- construct_formula_string(
1191
- augmented_matching_reaction_species, sbml_dfs.reactions, SBML_DFS.S_NAME
1192
- )
1193
- + " ["
1194
- + augmented_matching_reaction_species[SBML_DFS.C_NAME].iloc[0]
1195
- + "]"
1196
- )
1197
- else:
1198
- str_formula = construct_formula_string(
1199
- matching_reaction_species, sbml_dfs.reactions, SBML_DFS.SC_NAME
1200
- )
1201
-
1202
- output = pd.DataFrame(
1203
- {
1204
- SBML_DFS.R_NAME: matching_reaction[SBML_DFS.R_NAME],
1205
- "r_formula_str": str_formula,
1206
- },
1207
- index=[r_id],
1208
- )
1209
-
1210
- output.index.name = SBML_DFS.R_ID
1211
-
1212
- return output
1213
-
1214
-
1215
- def reaction_summaries(sbml_dfs: SBML_dfs, r_ids=None) -> pd.Series:
1216
- """
1217
- Reaction Summary
1218
-
1219
- Return human-readable formulas for reactions.
1220
-
1221
- Parameters:
1222
- ----------
1223
- sbml_dfs: sbml.SBML_dfs
1224
- A relational mechanistic model
1225
- r_ids: [str], str or None
1226
- Reaction IDs or None for all reactions
1227
-
1228
- Returns:
1229
- ----------
1230
- formula_strs: pd.Series
1231
- """
1232
-
1233
- if isinstance(r_ids, str):
1234
- r_ids = [r_ids]
1235
-
1236
- if r_ids is None:
1237
- matching_reactions = sbml_dfs.reactions
1238
- else:
1239
- matching_reactions = sbml_dfs.reactions.loc[r_ids]
1240
-
1241
- matching_reaction_species = sbml_dfs.reaction_species[
1242
- sbml_dfs.reaction_species.r_id.isin(matching_reactions.index)
1243
- ].merge(
1244
- sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
1245
- )
1246
-
1247
- # split into within compartment and cross-compartment reactions
1248
- r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
1249
- SBML_DFS.C_ID
1250
- ].nunique()
1251
-
1252
- # identify reactions which work across compartments
1253
- r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
1254
- # there species must be labelled with the sc_name to specify where a species exists
1255
- if r_id_cross_compartment.shape[0] > 0:
1256
- rxn_eqtn_cross_compartment = (
1257
- matching_reaction_species[
1258
- matching_reaction_species[SBML_DFS.R_ID].isin(
1259
- r_id_cross_compartment.index
1260
- )
1261
- ]
1262
- .sort_values([SBML_DFS.SC_NAME])
1263
- .groupby(SBML_DFS.R_ID)
1264
- .apply(
1265
- lambda x: construct_formula_string(
1266
- x, sbml_dfs.reactions, SBML_DFS.SC_NAME
1267
- )
1268
- )
1269
- .rename("r_formula_str")
1270
- )
1271
- else:
1272
- rxn_eqtn_cross_compartment = None
1273
-
1274
- # identify reactions which occur within a single compartment; for these the reaction
1275
- # can be labelled with the compartment and individual species can receive a more readable s_name
1276
- r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
1277
- if r_id_within_compartment.shape[0] > 0:
1278
- # add s_name
1279
- augmented_matching_reaction_species = (
1280
- matching_reaction_species[
1281
- matching_reaction_species[SBML_DFS.R_ID].isin(
1282
- r_id_within_compartment.index
1283
- )
1284
- ]
1285
- .merge(sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True)
1286
- .merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
1287
- .sort_values([SBML_DFS.S_NAME])
1288
- )
1289
- # create formulas based on s_names of components
1290
- rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
1291
- [SBML_DFS.R_ID, SBML_DFS.C_NAME]
1292
- ).apply(
1293
- lambda x: construct_formula_string(x, sbml_dfs.reactions, SBML_DFS.S_NAME)
1294
- )
1295
- # add compartment for each reaction
1296
- rxn_eqtn_within_compartment = pd.Series(
1297
- [
1298
- y + ": " + x
1299
- for x, y in zip(
1300
- rxn_eqtn_within_compartment,
1301
- rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.C_NAME),
1302
- )
1303
- ],
1304
- index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
1305
- ).rename("r_formula_str")
1306
- else:
1307
- rxn_eqtn_within_compartment = None
1308
-
1309
- formula_strs = pd.concat([rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment])
1310
-
1311
- return formula_strs
1312
-
1313
-
1314
- def construct_formula_string(
1315
- reaction_species_df: pd.DataFrame,
1316
- reactions_df: pd.DataFrame,
1317
- name_var: str,
1318
- ) -> str:
1319
- """
1320
- Construct Formula String
1321
-
1322
- Convert a table of reaction species into a formula string
1323
-
1324
- Parameters:
1325
- ----------
1326
- reaction_species_df: pd.DataFrame
1327
- Table containing a reactions' species
1328
- reactions_df: pd.DataFrame
1329
- smbl.reactions
1330
- name_var: str
1331
- Name used to label species
1332
-
1333
- Returns:
1334
- ----------
1335
- formula_str: str
1336
- String representation of a reactions substrates, products and
1337
- modifiers
1338
-
1339
- """
1340
-
1341
- reaction_species_df["label"] = [
1342
- add_stoi_to_species_name(x, y)
1343
- for x, y in zip(
1344
- reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
1345
- )
1346
- ]
1347
-
1348
- rxn_reversible = bool(
1349
- reactions_df.loc[
1350
- reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
1351
- ]
1352
- ) # convert from a np.bool_ to bool if needed
1353
- if not isinstance(rxn_reversible, bool):
1354
- raise TypeError(
1355
- f"rxn_reversible must be a bool, but got {type(rxn_reversible).__name__}"
1356
- )
1357
-
1358
- if rxn_reversible:
1359
- arrow_type = " <-> "
1360
- else:
1361
- arrow_type = " -> "
1362
-
1363
- substrates = " + ".join(
1364
- reaction_species_df["label"][
1365
- reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
1366
- ].tolist()
1367
- )
1368
- products = " + ".join(
1369
- reaction_species_df["label"][
1370
- reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
1371
- ].tolist()
1372
- )
1373
- modifiers = " + ".join(
1374
- reaction_species_df["label"][
1375
- reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
1376
- ].tolist()
1377
- )
1378
- if modifiers != "":
1379
- modifiers = f" ---- modifiers: {modifiers}]"
1380
-
1381
- return f"{substrates}{arrow_type}{products}{modifiers}"
1382
-
1383
-
1384
- def add_stoi_to_species_name(stoi: float | int, name: str) -> str:
1385
- """
1386
- Add Stoi To Species Name
1387
-
1388
- Add # of molecules to a species name
1389
-
1390
- Parameters:
1391
- ----------
1392
- stoi: float or int
1393
- Number of molecules
1394
- name: str
1395
- Name of species
1396
-
1397
- Returns:
1398
- ----------
1399
- name: str
1400
- Name containing number of species
1401
-
1402
- """
1403
-
1404
- if stoi in [-1, 0, 1]:
1405
- return name
1406
- else:
1407
- return str(abs(stoi)) + " " + name
1408
-
1409
-
1410
- def filter_to_characteristic_species_ids(
1411
- species_ids: pd.DataFrame,
1412
- max_complex_size: int = 4,
1413
- max_promiscuity: int = 20,
1414
- defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
1415
- ) -> pd.DataFrame:
1416
- """
1417
- Filter to Characteristic Species IDs
1418
-
1419
- Remove identifiers corresponding to one component within a large protein
1420
- complexes and non-characteristic annotations such as pubmed references and
1421
- homologues.
1129
+ Remove reactions from the model.
1422
1130
 
1423
1131
  Parameters
1424
1132
  ----------
1425
- species_ids: pd.DataFrame
1426
- A table of identifiers produced by sdbml_dfs.get_identifiers("species")
1427
- max_complex_size: int
1428
- The largest size of a complex, where BQB_HAS_PART terms will be retained.
1429
- In most cases, complexes are handled with specific formation and
1430
- dissolutation reactions,but these identifiers will be pulled in when
1431
- searching by identifiers or searching the identifiers associated with a
1432
- species against an external resource such as Open Targets.
1433
- max_promiscuity: int
1434
- Maximum number of species where a single molecule can act as a
1435
- BQB_HAS_PART component associated with a single identifier (and common ontology).
1436
- defining_biological_qualifiers (list[str]):
1437
- BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
1438
- permissive settings would include homologs, different forms of the same gene.
1439
-
1440
- Returns:
1441
- --------
1442
- species_id: pd.DataFrame
1443
- Input species filtered to characteristic identifiers
1444
-
1445
- """
1446
-
1447
- if not isinstance(species_ids, pd.DataFrame):
1448
- raise TypeError(
1449
- f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
1450
- )
1451
-
1452
- if not isinstance(max_complex_size, int):
1453
- raise TypeError(
1454
- f"max_complex_size was a {type(max_complex_size)} but must be an int"
1455
- )
1456
-
1457
- if not isinstance(max_promiscuity, int):
1458
- raise TypeError(
1459
- f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
1460
- )
1461
-
1462
- if not isinstance(defining_biological_qualifiers, list):
1463
- raise TypeError(
1464
- f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
1465
- )
1466
-
1467
- # primary annotations of a species
1468
- bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
1469
-
1470
- # add components within modestly sized protein complexes
1471
- # look at HAS_PART IDs
1472
- bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
1473
-
1474
- # number of species in a complex
1475
- n_species_components = bqb_has_parts_species.value_counts(
1476
- [IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
1477
- )
1478
- big_complex_sids = set(
1479
- n_species_components[
1480
- n_species_components > max_complex_size
1481
- ].index.get_level_values(SBML_DFS.S_ID)
1482
- )
1483
-
1484
- filtered_bqb_has_parts = _filter_promiscuous_components(
1485
- bqb_has_parts_species, max_promiscuity
1486
- )
1487
-
1488
- # drop species parts if there are many components
1489
- filtered_bqb_has_parts = filtered_bqb_has_parts[
1490
- ~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
1491
- ]
1492
-
1493
- # combine primary identifiers and rare components
1494
- characteristic_species_ids = pd.concat(
1495
- [
1496
- bqb_is_species,
1497
- filtered_bqb_has_parts,
1498
- ]
1499
- )
1500
-
1501
- return characteristic_species_ids
1502
-
1503
-
1504
- def infer_uncompartmentalized_species_location(sbml_dfs: SBML_dfs) -> SBML_dfs:
1505
- """
1506
- Infer Uncompartmentalized Species Location
1507
-
1508
- If the compartment of a subset of compartmentalized species
1509
- was not specified, infer an appropriate compartment from
1510
- other members of reactions they particpate in
1511
-
1512
- Parameters:
1513
- ----------
1514
- sbml_dfs: sbml.SBML_dfs
1515
- A relational pathway model
1516
-
1517
- Returns:
1518
- ----------
1519
- sbml_dfs: sbml.SBML_dfs
1520
- A relational pathway model (with filled in species compartments)
1521
-
1522
- """
1523
-
1524
- default_compartment = (
1525
- sbml_dfs.compartmentalized_species.value_counts(SBML_DFS.C_ID)
1526
- .rename("N")
1527
- .reset_index()
1528
- .sort_values("N", ascending=False)[SBML_DFS.C_ID][0]
1529
- )
1530
- if not isinstance(default_compartment, str):
1531
- raise ValueError(
1532
- "No default compartment could be found - compartment "
1533
- "information may not be present"
1534
- )
1535
-
1536
- # infer the compartments of species missing compartments
1537
-
1538
- missing_compartment_scids = sbml_dfs.compartmentalized_species[
1539
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1540
- ].index.tolist()
1541
- if len(missing_compartment_scids) == 0:
1542
- logger.info(
1543
- "All compartmentalized species have compartments, "
1544
- "returning input sbml_dfs"
1545
- )
1546
- return sbml_dfs
1547
-
1548
- participating_reactions = (
1549
- sbml_dfs.reaction_species[
1550
- sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
1551
- ][SBML_DFS.R_ID]
1552
- .unique()
1553
- .tolist()
1554
- )
1555
- reaction_participants = sbml_dfs.reaction_species[
1556
- sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
1557
- ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
1558
- reaction_participants = reaction_participants.merge(
1559
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID],
1560
- left_on=SBML_DFS.SC_ID,
1561
- right_index=True,
1562
- )
1563
-
1564
- # find a default compartment to fall back on if all compartmental information is missing
1565
-
1566
- primary_reaction_compartment = (
1567
- reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
1568
- .rename("N")
1569
- .reset_index()
1570
- .sort_values("N", ascending=False)
1571
- .groupby(SBML_DFS.R_ID)
1572
- .first()[SBML_DFS.C_ID]
1573
- .reset_index()
1574
- )
1575
-
1576
- inferred_compartmentalization = (
1577
- sbml_dfs.reaction_species[
1578
- sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
1579
- ]
1580
- .merge(primary_reaction_compartment)
1581
- .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
1582
- .rename("N")
1583
- .reset_index()
1584
- .sort_values("N", ascending=False)
1585
- .groupby(SBML_DFS.SC_ID)
1586
- .first()
1587
- .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
1588
- )
1589
- logger.info(
1590
- f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
1591
- )
1592
-
1593
- # define where a reaction is most likely to occur based on the compartmentalization of its particpants
1594
- species_with_unknown_compartmentalization = set(
1595
- missing_compartment_scids
1596
- ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
1597
- if len(species_with_unknown_compartmentalization) != 0:
1598
- logger.warning(
1599
- f"{len(species_with_unknown_compartmentalization)} "
1600
- "species compartmentalization could not be inferred"
1601
- " from other reaction particpants. Their compartmentalization "
1602
- f"will be set to the default of {default_compartment}"
1603
- )
1604
-
1605
- inferred_compartmentalization = pd.concat(
1606
- [
1607
- inferred_compartmentalization,
1608
- pd.DataFrame(
1609
- {SBML_DFS.SC_ID: list(species_with_unknown_compartmentalization)}
1610
- ).assign(c_id=default_compartment),
1611
- ]
1612
- )
1613
-
1614
- if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
1615
- raise ValueError(
1616
- f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
1617
- )
1618
-
1619
- updated_compartmentalized_species = pd.concat(
1620
- [
1621
- sbml_dfs.compartmentalized_species[
1622
- ~sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1623
- ],
1624
- sbml_dfs.compartmentalized_species[
1625
- sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
1626
- ]
1627
- .drop(SBML_DFS.C_ID, axis=1)
1628
- .merge(
1629
- inferred_compartmentalization, left_index=True, right_on=SBML_DFS.SC_ID
1630
- )
1631
- .set_index(SBML_DFS.SC_ID),
1632
- ]
1633
- )
1634
-
1635
- if (
1636
- updated_compartmentalized_species.shape[0]
1637
- != sbml_dfs.compartmentalized_species.shape[0]
1638
- ):
1639
- raise ValueError(
1640
- f"Trying to overwrite {sbml_dfs.compartmentalized_species.shape[0]}"
1641
- " compartmentalized species with "
1642
- f"{updated_compartmentalized_species.shape[0]}"
1643
- )
1644
-
1645
- if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
1646
- raise ValueError("Some species compartments are still missing")
1647
-
1648
- sbml_dfs.compartmentalized_species = updated_compartmentalized_species
1649
-
1650
- return sbml_dfs
1651
-
1652
-
1653
- def infer_sbo_terms(sbml_dfs: SBML_dfs) -> SBML_dfs:
1654
- """
1655
- Infer SBO Terms
1656
-
1657
- Define SBO terms based on stoichiometry for reaction_species with missing terms
1658
-
1659
- Parameters:
1660
- ----------
1661
- sbml_dfs: sbml.SBML_dfs
1662
- A relational pathway model
1663
-
1664
- Returns:
1665
- ----------
1666
- sbml_dfs: sbml.SBML_dfs
1667
- A relational pathway model (with missing/invalid reaction species sbo_terms resolved)
1668
-
1669
- """
1670
-
1671
- valid_sbo_terms = sbml_dfs.reaction_species[
1672
- sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
1673
- ]
1674
-
1675
- invalid_sbo_terms = sbml_dfs.reaction_species[
1676
- ~sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
1677
- ]
1678
-
1679
- if not all(sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].notnull()):
1680
- raise ValueError(
1681
- "All sbml_dfs.reaction_species[SBML_DFS.SBO_TERM] must be not null"
1682
- )
1683
- if invalid_sbo_terms.shape[0] == 0:
1684
- logger.info("All sbo_terms were valid; returning input sbml_dfs")
1685
- return sbml_dfs
1686
-
1687
- logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
1688
-
1689
- # add missing/invalid terms based on stoichiometry
1690
- invalid_sbo_terms.loc[
1691
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
1692
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
1693
-
1694
- invalid_sbo_terms.loc[
1695
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
1696
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
1697
-
1698
- invalid_sbo_terms.loc[
1699
- invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
1700
- ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
1701
-
1702
- updated_reaction_species = pd.concat(
1703
- [valid_sbo_terms, invalid_sbo_terms]
1704
- ).sort_index()
1705
-
1706
- if sbml_dfs.reaction_species.shape[0] != updated_reaction_species.shape[0]:
1707
- raise ValueError(
1708
- f"Trying to overwrite {sbml_dfs.reaction_species.shape[0]} reaction_species with {updated_reaction_species.shape[0]}"
1709
- )
1710
- sbml_dfs.reaction_species = updated_reaction_species
1711
-
1712
- return sbml_dfs
1713
-
1714
-
1715
- def name_compartmentalized_species(sbml_dfs):
1716
- """
1717
- Name Compartmentalized Species
1718
-
1719
- Rename compartmentalized species if they have the same
1720
- name as their species
1721
-
1722
- Parameters
1723
- ----------
1724
- sbml_dfs : SBML_dfs
1725
- A model formed by aggregating pathways
1726
-
1727
- Returns:
1728
- ----------
1729
- sbml_dfs
1730
- """
1731
-
1732
- augmented_cspecies = sbml_dfs.compartmentalized_species.merge(
1733
- sbml_dfs.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
1734
- ).merge(
1735
- sbml_dfs.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
1736
- )
1737
- augmented_cspecies[SBML_DFS.SC_NAME] = [
1738
- f"{s} [{c}]" if sc == s else sc
1739
- for sc, c, s in zip(
1740
- augmented_cspecies[SBML_DFS.SC_NAME],
1741
- augmented_cspecies[SBML_DFS.C_NAME],
1742
- augmented_cspecies[SBML_DFS.S_NAME],
1743
- )
1744
- ]
1133
+ r_ids : Iterable[str]
1134
+ IDs of reactions to remove
1135
+ remove_species : bool, optional
1136
+ Whether to remove species that are no longer part of any reactions,
1137
+ by default False
1138
+ """
1139
+ # remove corresponding reactions_species
1140
+ self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
1141
+ # remove reactions
1142
+ self.reactions = self.reactions.drop(index=list(r_ids))
1143
+ # remove reactions_data
1144
+ if hasattr(self, "reactions_data"):
1145
+ for k, data in self.reactions_data.items():
1146
+ self.reactions_data[k] = data.drop(index=list(r_ids))
1147
+ # remove species if requested
1148
+ if remove_species:
1149
+ self._remove_unused_cspecies()
1150
+ self._remove_unused_species()
1745
1151
 
1746
- sbml_dfs.compartmentalized_species = augmented_cspecies.loc[
1747
- :, sbml_dfs.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
1748
- ]
1152
+ def remove_reactions_data(self, label: str):
1153
+ """
1154
+ Remove reactions data by label.
1155
+ """
1156
+ self._remove_entity_data(SBML_DFS.REACTIONS, label)
1749
1157
 
1750
- return sbml_dfs
1158
+ def remove_species_data(self, label: str):
1159
+ """
1160
+ Remove species data by label.
1161
+ """
1162
+ self._remove_entity_data(SBML_DFS.SPECIES, label)
1751
1163
 
1164
+ def search_by_ids(
1165
+ self,
1166
+ ids: list[str],
1167
+ entity_type: str,
1168
+ identifiers_df: pd.DataFrame,
1169
+ ontologies: None | set[str] = None,
1170
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
1171
+ """
1172
+ Find entities and identifiers matching a set of query IDs.
1752
1173
 
1753
- def export_sbml_dfs(
1754
- model_prefix: str,
1755
- sbml_dfs: SBML_dfs,
1756
- outdir: str,
1757
- overwrite: bool = False,
1758
- dogmatic: bool = True,
1759
- ) -> None:
1760
- """
1761
- Export SBML_dfs
1762
-
1763
- Export summaries of species identifiers and each table underlying
1764
- an SBML_dfs pathway model
1765
-
1766
- Params
1767
- ------
1768
- model_prefix: str
1769
- Label to prepend to all exported files
1770
- sbml_dfs: sbml.SBML_dfs
1771
- A pathway model
1772
- outdir: str
1773
- Path to an existing directory where results should be saved
1774
- overwrite: bool
1775
- Should the directory be overwritten if it already exists?
1776
- dogmatic: bool
1777
- If True then treat genes, transcript, and proteins as separate species. If False
1778
- then treat them interchangeably.
1174
+ Parameters
1175
+ ----------
1176
+ ids : List[str]
1177
+ List of identifiers to search for
1178
+ entity_type : str
1179
+ Type of entity to search (e.g., 'species', 'reactions')
1180
+ identifiers_df : pd.DataFrame
1181
+ DataFrame containing identifier mappings
1182
+ ontologies : Optional[Set[str]], optional
1183
+ Set of ontologies to filter by, by default None
1779
1184
 
1780
1185
  Returns
1781
1186
  -------
1782
- None
1783
-
1784
- """
1785
-
1786
- if not isinstance(model_prefix, str):
1787
- raise TypeError(f"model_prefix was a {type(model_prefix)} " "and must be a str")
1788
- if not isinstance(sbml_dfs, SBML_dfs):
1789
- raise TypeError(
1790
- f"sbml_dfs was a {type(sbml_dfs)} and must" " be an sbml.SBML_dfs"
1791
- )
1792
-
1793
- # filter to identifiers which make sense when mapping from ids -> species
1794
- species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
1795
- sbml_dfs,
1796
- dogmatic=dogmatic,
1797
- )
1798
-
1799
- try:
1800
- utils.initialize_dir(outdir, overwrite=overwrite)
1801
- except FileExistsError:
1802
- logger.warning(
1803
- f"Directory {outdir} already exists and overwrite is False. "
1804
- "Files will be added to the existing directory."
1805
- )
1806
- with open_fs(outdir, writeable=True) as fs:
1807
- species_identifiers_path = (
1808
- model_prefix + CPR_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
1809
- )
1810
- with fs.openbin(species_identifiers_path, "w") as f:
1811
- species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
1812
- f, sep="\t", index=False
1813
- )
1814
-
1815
- # export jsons
1816
- species_path = model_prefix + CPR_STANDARD_OUTPUTS.SPECIES
1817
- reactions_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTIONS
1818
- reation_species_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTION_SPECIES
1819
- compartments_path = model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTS
1820
- compartmentalized_species_path = (
1821
- model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
1822
- )
1823
- with fs.openbin(species_path, "w") as f:
1824
- sbml_dfs.species[[SBML_DFS.S_NAME]].to_json(f)
1825
-
1826
- with fs.openbin(reactions_path, "w") as f:
1827
- sbml_dfs.reactions[[SBML_DFS.R_NAME]].to_json(f)
1828
-
1829
- with fs.openbin(reation_species_path, "w") as f:
1830
- sbml_dfs.reaction_species.to_json(f)
1831
-
1832
- with fs.openbin(compartments_path, "w") as f:
1833
- sbml_dfs.compartments[[SBML_DFS.C_NAME]].to_json(f)
1834
-
1835
- with fs.openbin(compartmentalized_species_path, "w") as f:
1836
- sbml_dfs.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
1837
- f
1838
- )
1839
-
1840
- return None
1841
-
1842
-
1843
- def sbml_dfs_from_edgelist(
1844
- interaction_edgelist: pd.DataFrame,
1845
- species_df: pd.DataFrame,
1846
- compartments_df: pd.DataFrame,
1847
- interaction_source: source.Source,
1848
- upstream_stoichiometry: int = 0,
1849
- downstream_stoichiometry: int = 1,
1850
- downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
1851
- keep_species_data: bool | str = False,
1852
- keep_reactions_data: bool | str = False,
1853
- ) -> SBML_dfs:
1854
- """
1855
- Create SBML_dfs from interaction edgelist.
1856
-
1857
- Combines a set of molecular interactions into a mechanistic SBML_dfs model
1858
- by processing interaction data, species information, and compartment definitions.
1187
+ Tuple[pd.DataFrame, pd.DataFrame]
1188
+ - Matching entities
1189
+ - Matching identifiers
1859
1190
 
1860
- Parameters
1861
- ----------
1862
- interaction_edgelist : pd.DataFrame
1863
- Table containing molecular interactions with columns:
1864
- - upstream_name : str, matches "s_name" from species_df
1865
- - downstream_name : str, matches "s_name" from species_df
1866
- - upstream_compartment : str, matches "c_name" from compartments_df
1867
- - downstream_compartment : str, matches "c_name" from compartments_df
1868
- - r_name : str, name for the interaction
1869
- - sbo_term : str, SBO term defining interaction type
1870
- - r_Identifiers : identifiers.Identifiers, supporting identifiers
1871
- - r_isreversible : bool, whether reaction is reversible
1872
- species_df : pd.DataFrame
1873
- Table defining molecular species with columns:
1874
- - s_name : str, name of molecular species
1875
- - s_Identifiers : identifiers.Identifiers, species identifiers
1876
- compartments_df : pd.DataFrame
1877
- Table defining compartments with columns:
1878
- - c_name : str, name of compartment
1879
- - c_Identifiers : identifiers.Identifiers, compartment identifiers
1880
- interaction_source : source.Source
1881
- Source object linking model entities to interaction source
1882
- upstream_stoichiometry : int, default 0
1883
- Stoichiometry of upstream species in reactions
1884
- downstream_stoichiometry : int, default 1
1885
- Stoichiometry of downstream species in reactions
1886
- downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
1887
- SBO term for downstream reactant type
1888
- keep_species_data : bool or str, default False
1889
- Whether to preserve extra species columns. If True, saves as 'source' label.
1890
- If string, uses as custom label. If False, discards extra data.
1891
- keep_reactions_data : bool or str, default False
1892
- Whether to preserve extra reaction columns. If True, saves as 'source' label.
1893
- If string, uses as custom label. If False, discards extra data.
1191
+ Raises
1192
+ ------
1193
+ ValueError
1194
+ If entity_type is invalid or ontologies are invalid
1195
+ TypeError
1196
+ If ontologies is not a set
1197
+ """
1198
+ # validate inputs
1199
+ entity_table = self.get_table(entity_type, required_attributes={"id"})
1200
+ entity_pk = self.schema[entity_type]["pk"]
1894
1201
 
1895
- Returns
1896
- -------
1897
- SBML_dfs
1898
- Validated SBML data structure containing compartments, species,
1899
- compartmentalized species, reactions, and reaction species tables.
1900
- """
1901
- # 1. Validate inputs
1902
- _edgelist_validate_inputs(interaction_edgelist, species_df, compartments_df)
1202
+ utils.match_pd_vars(
1203
+ identifiers_df,
1204
+ req_vars={
1205
+ entity_pk,
1206
+ IDENTIFIERS.ONTOLOGY,
1207
+ IDENTIFIERS.IDENTIFIER,
1208
+ IDENTIFIERS.URL,
1209
+ IDENTIFIERS.BQB,
1210
+ },
1211
+ allow_series=False,
1212
+ ).assert_present()
1903
1213
 
1904
- # 2. Identify which extra columns to preserve
1905
- extra_columns = _edgelist_identify_extra_columns(
1906
- interaction_edgelist, species_df, keep_reactions_data, keep_species_data
1907
- )
1214
+ if ontologies is not None:
1215
+ if not isinstance(ontologies, set):
1216
+ # for clarity this should not be reachable based on type hints
1217
+ raise TypeError(
1218
+ f"ontologies must be a set, but got {type(ontologies).__name__}"
1219
+ )
1220
+ ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
1221
+ invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
1222
+ if len(invalid_ontologies) > 0:
1223
+ raise ValueError(
1224
+ f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
1225
+ f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
1226
+ )
1908
1227
 
1909
- # 3. Process compartments and species tables
1910
- processed_compartments = _edgelist_process_compartments(
1911
- compartments_df, interaction_source
1912
- )
1913
- processed_species, species_data = _edgelist_process_species(
1914
- species_df, interaction_source, extra_columns["species"]
1915
- )
1228
+ # filter to just the identifiers matching the ontologies of interest
1229
+ identifiers_df = identifiers_df.query("ontology in @ontologies")
1916
1230
 
1917
- # 4. Create compartmentalized species
1918
- comp_species = _edgelist_create_compartmentalized_species(
1919
- interaction_edgelist,
1920
- processed_species,
1921
- processed_compartments,
1922
- interaction_source,
1923
- )
1231
+ matching_identifiers = identifiers_df.loc[
1232
+ identifiers_df["identifier"].isin(ids)
1233
+ ]
1234
+ entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
1924
1235
 
1925
- # 5. Create reactions and reaction species
1926
- reactions, reaction_species, reactions_data = (
1927
- _edgelist_create_reactions_and_species(
1928
- interaction_edgelist,
1929
- comp_species,
1930
- processed_species,
1931
- processed_compartments,
1932
- interaction_source,
1933
- upstream_stoichiometry,
1934
- downstream_stoichiometry,
1935
- downstream_sbo_name,
1936
- extra_columns["reactions"],
1937
- )
1938
- )
1236
+ return entity_subset, matching_identifiers
1939
1237
 
1940
- # 6. Assemble final SBML_dfs object
1941
- sbml_model = _edgelist_assemble_sbml_model(
1942
- processed_compartments,
1943
- processed_species,
1944
- comp_species,
1945
- reactions,
1946
- reaction_species,
1947
- species_data,
1948
- reactions_data,
1949
- keep_species_data,
1950
- keep_reactions_data,
1951
- extra_columns,
1952
- )
1238
+ def search_by_name(
1239
+ self, name: str, entity_type: str, partial_match: bool = True
1240
+ ) -> pd.DataFrame:
1241
+ """
1242
+ Find entities by exact or partial name match.
1953
1243
 
1954
- return sbml_model
1244
+ Parameters
1245
+ ----------
1246
+ name : str
1247
+ Name to search for
1248
+ entity_type : str
1249
+ Type of entity to search (e.g., 'species', 'reactions')
1250
+ partial_match : bool, optional
1251
+ Whether to allow partial string matches, by default True
1955
1252
 
1956
- return sbml_model
1253
+ Returns
1254
+ -------
1255
+ pd.DataFrame
1256
+ Matching entities
1257
+ """
1258
+ entity_table = self.get_table(entity_type, required_attributes={"label"})
1259
+ label_attr = self.schema[entity_type]["label"]
1957
1260
 
1261
+ if partial_match:
1262
+ matches = entity_table.loc[
1263
+ entity_table[label_attr].str.contains(name, case=False)
1264
+ ]
1265
+ else:
1266
+ matches = entity_table.loc[entity_table[label_attr].str.lower() == name]
1267
+ return matches
1958
1268
 
1959
- def species_type_types(x):
1960
- """Assign a high-level molecule type to a molecular species"""
1269
+ def select_species_data(self, species_data_table: str) -> pd.DataFrame:
1270
+ """
1271
+ Select a species data table from the SBML_dfs object.
1961
1272
 
1962
- if isinstance(x, identifiers.Identifiers):
1963
- if x.filter(["chebi"]):
1964
- return "metabolite"
1965
- elif x.filter(["molodex"]):
1966
- return "drug"
1967
- else:
1968
- return "protein"
1969
- else:
1970
- return "unknown"
1971
-
1972
-
1973
- def stub_ids(ids):
1974
- if len(ids) == 0:
1975
- return pd.DataFrame(
1976
- {
1977
- IDENTIFIERS.ONTOLOGY: [None],
1978
- IDENTIFIERS.IDENTIFIER: [None],
1979
- IDENTIFIERS.URL: [None],
1980
- IDENTIFIERS.BQB: [None],
1981
- }
1982
- )
1983
- else:
1984
- return pd.DataFrame(ids)
1273
+ Parameters
1274
+ ----------
1275
+ species_data_table : str
1276
+ Name of the species data table to select
1985
1277
 
1278
+ Returns
1279
+ -------
1280
+ pd.DataFrame
1281
+ The selected species data table
1986
1282
 
1987
- def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
1988
- """
1989
- Add an sbo_role column to the reaction_species table.
1283
+ Raises
1284
+ ------
1285
+ ValueError
1286
+ If species_data_table is not found
1287
+ """
1288
+ # Check if species_data_table exists in sbml_dfs.species_data
1289
+ if species_data_table not in self.species_data:
1290
+ raise ValueError(
1291
+ f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
1292
+ f"Available tables: {self.species_data.keys()}"
1293
+ )
1990
1294
 
1991
- The sbo_role column is a string column that contains the SBO role of the reaction species.
1992
- The values in the sbo_role column are taken from the sbo_term column.
1295
+ # Get the species data
1296
+ return self.species_data[species_data_table]
1993
1297
 
1994
- The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
1995
- """
1298
+ def species_status(self, s_id: str) -> pd.DataFrame:
1299
+ """
1300
+ Species Status
1996
1301
 
1997
- validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
1302
+ Return all of the reactions a species participates in.
1998
1303
 
1999
- reaction_species = (
2000
- reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
2001
- .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
2002
- .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
2003
- )
1304
+ Parameters:
1305
+ s_id: str
1306
+ A species ID
2004
1307
 
2005
- undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
2006
- SBO_NAME_TO_ROLE.values()
2007
- )
2008
- if len(undefined_roles) > 0:
2009
- logger.warning(
2010
- f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
2011
- )
2012
- mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
2013
- reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
1308
+ Returns:
1309
+ pd.DataFrame, one row per reaction the species participates in
1310
+ with columns:
1311
+ - sc_name: str, name of the compartment the species participates in
1312
+ - stoichiometry: float, stoichiometry of the species in the reaction
1313
+ - r_name: str, name of the reaction
1314
+ - r_formula_str: str, human-readable formula of the reaction
1315
+ """
2014
1316
 
2015
- return reaction_species
1317
+ if s_id not in self.species.index:
1318
+ raise ValueError(f"{s_id} not found in species table")
2016
1319
 
1320
+ matching_species = self.species.loc[s_id]
2017
1321
 
2018
- def find_underspecified_reactions(
2019
- reaction_species_w_roles: pd.DataFrame,
2020
- ) -> pd.DataFrame:
1322
+ if not isinstance(matching_species, pd.Series):
1323
+ raise ValueError(f"{s_id} did not match a single species")
2021
1324
 
2022
- # check that both sbo_role and "new" are present
2023
- if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
2024
- raise ValueError(
2025
- "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
2026
- )
2027
- if "new" not in reaction_species_w_roles.columns:
2028
- raise ValueError(
2029
- "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2030
- )
2031
- # check that new is a boolean column
2032
- if reaction_species_w_roles["new"].dtype != bool:
2033
- raise ValueError(
2034
- "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2035
- )
1325
+ # find all rxns species participate in
1326
+ matching_compartmentalized_species = self.compartmentalized_species[
1327
+ self.compartmentalized_species.s_id.isin([s_id])
1328
+ ]
2036
1329
 
2037
- reactions_with_lost_defining_members = set(
2038
- reaction_species_w_roles.query("~new")
2039
- .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
2040
- .tolist()
2041
- )
1330
+ rxns_participating = self.reaction_species[
1331
+ self.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
1332
+ ]
2042
1333
 
2043
- N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
2044
- if N_reactions_with_lost_defining_members > 0:
2045
- logger.info(
2046
- f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
1334
+ # find all participants in these rxns
1335
+ full_rxns_participating = self.reaction_species[
1336
+ self.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
1337
+ ].merge(
1338
+ self.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
2047
1339
  )
2048
1340
 
2049
- # find the cases where all "new" values for a given (r_id, sbo_term) are False
2050
- reactions_with_lost_requirements = set(
2051
- reaction_species_w_roles
2052
- # drop already filtered reactions
2053
- .query("r_id not in @reactions_with_lost_defining_members")
2054
- .query("sbo_role == 'REQUIRED'")
2055
- # which entries which have some required attribute have all False values for that attribute
2056
- .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
2057
- .agg({"new": "any"})
2058
- .query("new == False")
2059
- .index.get_level_values(SBML_DFS.R_ID)
2060
- )
1341
+ participating_rids = full_rxns_participating[SBML_DFS.R_ID].unique()
1342
+ reaction_descriptions = self.reaction_summaries(r_ids=participating_rids)
2061
1343
 
2062
- N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
2063
- if N_reactions_with_lost_requirements > 0:
2064
- logger.info(
2065
- f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
1344
+ status = (
1345
+ full_rxns_participating.loc[
1346
+ full_rxns_participating[SBML_DFS.SC_ID].isin(
1347
+ matching_compartmentalized_species.index.values.tolist()
1348
+ ),
1349
+ [SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
1350
+ ]
1351
+ .merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
1352
+ .reset_index(drop=True)
1353
+ .drop(SBML_DFS.R_ID, axis=1)
2066
1354
  )
2067
1355
 
2068
- underspecified_reactions = reactions_with_lost_defining_members.union(
2069
- reactions_with_lost_requirements
2070
- )
1356
+ return status
2071
1357
 
2072
- return underspecified_reactions
1358
+ def validate(self):
1359
+ """
1360
+ Validate the SBML_dfs structure and relationships.
2073
1361
 
1362
+ Checks:
1363
+ - Schema existence
1364
+ - Required tables presence
1365
+ - Individual table structure
1366
+ - Primary key uniqueness
1367
+ - Foreign key relationships
1368
+ - Optional data table validity
1369
+ - Reaction species validity
2074
1370
 
2075
- def _find_underspecified_reactions_by_scids(
2076
- sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
2077
- ) -> set[str]:
2078
- """
2079
- Find Underspecified reactions
1371
+ Raises
1372
+ ------
1373
+ ValueError
1374
+ If any validation check fails
1375
+ """
2080
1376
 
2081
- Identity reactions which should be removed if a set of molecular species are removed
2082
- from the system.
1377
+ if not hasattr(self, "schema"):
1378
+ raise ValueError("No schema found")
2083
1379
 
2084
- Params:
2085
- sbml_dfs (SBML_dfs):
2086
- A pathway representation
2087
- sc_ids (list[str])
2088
- A list of compartmentalized species ids (sc_ids) which will be removed.
1380
+ required_tables = self._required_entities
1381
+ schema_tables = set(self.schema.keys())
2089
1382
 
2090
- Returns:
2091
- underspecified_reactions (set[str]):
2092
- A list of reactions which should be removed because they will not occur once
2093
- \"sc_ids\" are removed.
1383
+ extra_tables = schema_tables.difference(required_tables)
1384
+ if len(extra_tables) != 0:
1385
+ logger.debug(
1386
+ f"{len(extra_tables)} unexpected tables found: "
1387
+ f"{', '.join(extra_tables)}"
1388
+ )
2094
1389
 
2095
- """
1390
+ missing_tables = required_tables.difference(schema_tables)
1391
+ if len(missing_tables) != 0:
1392
+ raise ValueError(
1393
+ f"Missing {len(missing_tables)} required tables: "
1394
+ f"{', '.join(missing_tables)}"
1395
+ )
2096
1396
 
2097
- updated_reaction_species = sbml_dfs.reaction_species.copy()
2098
- updated_reaction_species["new"] = ~updated_reaction_species[SBML_DFS.SC_ID].isin(
2099
- sc_ids
2100
- )
1397
+ # check individual tables
1398
+ for table in required_tables:
1399
+ self._validate_table(table)
2101
1400
 
2102
- updated_reaction_species = add_sbo_role(updated_reaction_species)
2103
- underspecified_reactions = find_underspecified_reactions(updated_reaction_species)
1401
+ # check whether pks and fks agree
1402
+ self._validate_pk_fk_correspondence()
2104
1403
 
2105
- return underspecified_reactions
1404
+ # check optional data tables:
1405
+ for k, v in self.species_data.items():
1406
+ try:
1407
+ self._validate_species_data(v)
1408
+ except ValueError as e:
1409
+ raise ValueError(f"species data {k} was invalid.") from e
2106
1410
 
1411
+ for k, v in self.reactions_data.items():
1412
+ try:
1413
+ self._validate_reactions_data(v)
1414
+ except ValueError as e:
1415
+ raise ValueError(f"reactions data {k} was invalid.") from e
2107
1416
 
2108
- def validate_sbml_dfs_table(table_data: pd.DataFrame, table_name: str) -> None:
2109
- """
2110
- Validate a standalone table against the SBML_dfs schema.
1417
+ # validate reaction_species sbo_terms and stoi
1418
+ self._validate_reaction_species()
2111
1419
 
2112
- This function validates a table against the schema defined in SBML_DFS_SCHEMA,
2113
- without requiring an SBML_dfs object. Useful for validating tables before
2114
- creating an SBML_dfs object.
1420
+ # validate identifiers and sources
1421
+ self._validate_identifiers()
1422
+ self._validate_sources()
2115
1423
 
2116
- Parameters
2117
- ----------
2118
- table_data : pd.DataFrame
2119
- The table to validate
2120
- table_name : str
2121
- Name of the table in the SBML_dfs schema
1424
+ def validate_and_resolve(self):
1425
+ """
1426
+ Validate and attempt to automatically fix common issues.
1427
+
1428
+ This method iteratively:
1429
+ 1. Attempts validation
1430
+ 2. If validation fails, tries to resolve the issue
1431
+ 3. Repeats until validation passes or issue cannot be resolved
2122
1432
 
2123
1433
  Raises
2124
1434
  ------
2125
1435
  ValueError
2126
- If table_name is not in schema or validation fails
2127
- """
2128
- if table_name not in SBML_DFS_SCHEMA.SCHEMA:
2129
- raise ValueError(
2130
- f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
2131
- f"Valid tables are: {', '.join(SBML_DFS_SCHEMA.SCHEMA.keys())}"
2132
- )
1436
+ If validation fails and cannot be automatically resolved
1437
+ """
1438
+
1439
+ current_exception = None
1440
+ validated = False
1441
+
1442
+ while not validated:
1443
+ try:
1444
+ self.validate()
1445
+ validated = True
1446
+ except Exception as e:
1447
+ e_str = str(e)
1448
+ if e_str == current_exception:
1449
+ logger.warning(
1450
+ "Automated resolution of an Exception was attempted but failed"
1451
+ )
1452
+ raise e
2133
1453
 
2134
- table_schema = SBML_DFS_SCHEMA.SCHEMA[table_name]
2135
- _perform_sbml_dfs_table_validation(table_data, table_schema, table_name)
1454
+ # try to resolve
1455
+ self._attempt_resolve(e)
2136
1456
 
1457
+ # =============================================================================
1458
+ # PRIVATE METHODS (ALPHABETICAL ORDER)
1459
+ # =============================================================================
2137
1460
 
2138
- def _perform_sbml_dfs_table_validation(
2139
- table_data: pd.DataFrame,
2140
- table_schema: dict,
2141
- table_name: str,
2142
- ) -> None:
2143
- """
2144
- Core validation logic for SBML_dfs tables.
1461
+ def _attempt_resolve(self, e):
1462
+ str_e = str(e)
1463
+ if str_e == "compartmentalized_species included missing c_id values":
1464
+ logger.warning(str_e)
1465
+ logger.warning(
1466
+ "Attempting to resolve with infer_uncompartmentalized_species_location()"
1467
+ )
1468
+ self.infer_uncompartmentalized_species_location()
1469
+ elif re.search("sbo_terms were not defined", str_e):
1470
+ logger.warning(str_e)
1471
+ logger.warning("Attempting to resolve with infer_sbo_terms()")
1472
+ self.infer_sbo_terms()
1473
+ else:
1474
+ logger.warning(
1475
+ "An error occurred which could not be automatically resolved"
1476
+ )
1477
+ raise e
2145
1478
 
2146
- This function performs the actual validation checks for any table against its schema,
2147
- regardless of whether it's part of an SBML_dfs object or standalone.
1479
+ def _find_underspecified_reactions_by_scids(
1480
+ self, sc_ids: Iterable[str]
1481
+ ) -> set[str]:
1482
+ """
1483
+ Find Underspecified reactions
1484
+
1485
+ Identify reactions which should be removed if a set of molecular species are removed
1486
+ from the system.
2148
1487
 
2149
1488
  Parameters
2150
1489
  ----------
2151
- table_data : pd.DataFrame
2152
- The table data to validate
2153
- table_schema : dict
2154
- Schema definition for the table
2155
- table_name : str
2156
- Name of the table (for error messages)
1490
+ sc_ids : list[str]
1491
+ A list of compartmentalized species ids (sc_ids) which will be removed.
2157
1492
 
2158
- Raises
2159
- ------
2160
- ValueError
2161
- If the table does not conform to its schema:
2162
- - Not a DataFrame
2163
- - Wrong index name
2164
- - Duplicate primary keys
2165
- - Missing required variables
2166
- - Empty table
2167
- """
2168
- if not isinstance(table_data, pd.DataFrame):
2169
- raise ValueError(
2170
- f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
1493
+ Returns
1494
+ -------
1495
+ underspecified_reactions : set[str]
1496
+ A set of reactions which should be removed because they will not occur once
1497
+ "sc_ids" are removed.
1498
+ """
1499
+ updated_reaction_species = self.reaction_species.copy()
1500
+ updated_reaction_species["new"] = ~updated_reaction_species[
1501
+ SBML_DFS.SC_ID
1502
+ ].isin(sc_ids)
1503
+ updated_reaction_species = sbml_dfs_utils.add_sbo_role(updated_reaction_species)
1504
+ underspecified_reactions = sbml_dfs_utils.find_underspecified_reactions(
1505
+ updated_reaction_species
2171
1506
  )
1507
+ return underspecified_reactions
2172
1508
 
2173
- # check index
2174
- expected_index_name = table_schema["pk"]
2175
- if table_data.index.name != expected_index_name:
2176
- raise ValueError(
2177
- f"the index name for {table_name} was not the pk: {expected_index_name}"
1509
+ def _get_unused_cspecies(self) -> set[str]:
1510
+ """Returns a set of compartmentalized species
1511
+ that are not part of any reactions"""
1512
+ sc_ids = set(self.compartmentalized_species.index) - set(
1513
+ self.reaction_species[SBML_DFS.SC_ID]
2178
1514
  )
1515
+ return sc_ids # type: ignore
2179
1516
 
2180
- # check that all entries in the index are unique
2181
- if len(set(table_data.index.tolist())) != table_data.shape[0]:
2182
- duplicated_pks = table_data.index.value_counts()
2183
- duplicated_pks = duplicated_pks[duplicated_pks > 1]
2184
-
2185
- example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
2186
- raise ValueError(
2187
- f"{duplicated_pks.shape[0]} primary keys were duplicated "
2188
- f"including {', '.join(example_duplicates)}"
1517
+ def _get_unused_species(self) -> set[str]:
1518
+ """Returns a list of species that are not part of any reactions"""
1519
+ s_ids = set(self.species.index) - set(
1520
+ self.compartmentalized_species[SBML_DFS.S_ID]
2189
1521
  )
1522
+ return s_ids # type: ignore
2190
1523
 
2191
- # check variables
2192
- expected_vars = set(table_schema["vars"])
2193
- table_vars = set(list(table_data.columns))
1524
+ def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
1525
+ """Removes compartmentalized species from the model
2194
1526
 
2195
- extra_vars = table_vars.difference(expected_vars)
2196
- if len(extra_vars) != 0:
2197
- logger.debug(
2198
- f"{len(extra_vars)} extra variables were found for {table_name}: "
2199
- f"{', '.join(extra_vars)}"
2200
- )
1527
+ This should not be directly used by the user, as it can lead to
1528
+ invalid reactions when removing species without a logic to decide
1529
+ if the reaction needs to be removed as well.
2201
1530
 
2202
- missing_vars = expected_vars.difference(table_vars)
2203
- if len(missing_vars) != 0:
2204
- raise ValueError(
2205
- f"Missing {len(missing_vars)} required variables for {table_name}: "
2206
- f"{', '.join(missing_vars)}"
1531
+ Args:
1532
+ sc_ids (Iterable[str]): the compartmentalized species to remove
1533
+ """
1534
+ # Remove compartmentalized species
1535
+ self.compartmentalized_species = self.compartmentalized_species.drop(
1536
+ index=list(sc_ids)
2207
1537
  )
1538
+ # remove corresponding reactions_species
1539
+ self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
2208
1540
 
2209
- # check for empty table
2210
- if table_data.shape[0] == 0:
2211
- raise ValueError(f"{table_name} contained no entries")
1541
+ def _remove_entity_data(self, entity_type: str, label: str) -> None:
1542
+ """
1543
+ Remove data from species_data or reactions_data by table name and label.
2212
1544
 
1545
+ Parameters
1546
+ ----------
1547
+ entity_type : str
1548
+ Name of the table to remove data from ('species' or 'reactions')
1549
+ label : str
1550
+ Label of the data to remove
2213
1551
 
2214
- def _filter_promiscuous_components(
2215
- bqb_has_parts_species: pd.DataFrame, max_promiscuity: int
2216
- ) -> pd.DataFrame:
1552
+ Notes
1553
+ -----
1554
+ If the label does not exist, a warning will be logged that includes the existing labels.
1555
+ """
1556
+ if entity_type not in ENTITIES_W_DATA:
1557
+ raise ValueError("table_name must be either 'species' or 'reactions'")
2217
1558
 
2218
- # number of complexes a species is part of
2219
- n_complexes_involvedin = bqb_has_parts_species.value_counts(
2220
- [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
2221
- )
2222
- promiscuous_component_identifiers_index = n_complexes_involvedin[
2223
- n_complexes_involvedin > max_promiscuity
2224
- ].index
2225
- promiscuous_component_identifiers = pd.Series(
2226
- data=[True] * len(promiscuous_component_identifiers_index),
2227
- index=promiscuous_component_identifiers_index,
2228
- name="is_shared_component",
2229
- dtype=bool,
2230
- )
1559
+ data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
1560
+ if label not in data_dict:
1561
+ existing_labels = list(data_dict.keys())
1562
+ logger.warning(
1563
+ f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
1564
+ f"Existing labels: {existing_labels}"
1565
+ )
1566
+ return
2231
1567
 
2232
- if len(promiscuous_component_identifiers) == 0:
2233
- return bqb_has_parts_species
1568
+ del data_dict[label]
2234
1569
 
2235
- filtered_bqb_has_parts = bqb_has_parts_species.merge(
2236
- promiscuous_component_identifiers,
2237
- left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
2238
- right_index=True,
2239
- how="left",
2240
- )
1570
+ def _remove_species(self, s_ids: Iterable[str]):
1571
+ """Removes species from the model
2241
1572
 
2242
- filtered_bqb_has_parts["is_shared_component"] = (
2243
- filtered_bqb_has_parts["is_shared_component"].astype("boolean").fillna(False)
2244
- )
2245
- # drop identifiers shared as components across many species
2246
- filtered_bqb_has_parts = filtered_bqb_has_parts[
2247
- ~filtered_bqb_has_parts["is_shared_component"]
2248
- ].drop(["is_shared_component"], axis=1)
1573
+ This should not be directly used by the user, as it can lead to
1574
+ invalid reactions when removing species without a logic to decide
1575
+ if the reaction needs to be removed as well.
2249
1576
 
2250
- return filtered_bqb_has_parts
1577
+ This removes the species and corresponding compartmentalized species and
1578
+ reactions_species.
2251
1579
 
1580
+ Args:
1581
+ s_ids (Iterable[str]): the species to remove
1582
+ """
1583
+ sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
1584
+ self._remove_compartmentalized_species(sc_ids)
1585
+ # Remove species
1586
+ self.species = self.species.drop(index=list(s_ids))
1587
+ # remove data
1588
+ for k, data in self.species_data.items():
1589
+ self.species_data[k] = data.drop(index=list(s_ids))
2252
1590
 
2253
- def _edgelist_validate_inputs(
2254
- interaction_edgelist: pd.DataFrame,
2255
- species_df: pd.DataFrame,
2256
- compartments_df: pd.DataFrame,
2257
- ) -> None:
2258
- """
2259
- Validate input DataFrames have required columns.
1591
+ def _remove_unused_cspecies(self):
1592
+ """Removes compartmentalized species that are no
1593
+ longer part of any reactions"""
1594
+ sc_ids = self._get_unused_cspecies()
1595
+ self._remove_compartmentalized_species(sc_ids)
2260
1596
 
2261
- Parameters
2262
- ----------
2263
- interaction_edgelist : pd.DataFrame
2264
- Interaction data to validate
2265
- species_df : pd.DataFrame
2266
- Species data to validate
2267
- compartments_df : pd.DataFrame
2268
- Compartments data to validate
2269
- """
1597
+ def _remove_unused_species(self):
1598
+ """Removes species that are no longer part of any
1599
+ compartmentalized species"""
1600
+ s_ids = self._get_unused_species()
1601
+ self._remove_species(s_ids)
2270
1602
 
2271
- # check compartments
2272
- compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
2273
- compartments_df_columns = set(compartments_df.columns.tolist())
2274
- missing_required_fields = compartments_df_expected_vars.difference(
2275
- compartments_df_columns
2276
- )
2277
- if len(missing_required_fields) > 0:
2278
- raise ValueError(
2279
- f"{', '.join(missing_required_fields)} are required variables"
2280
- ' in "compartments_df" but were not present in the input file.'
1603
+ def _validate_identifiers(self):
1604
+ """
1605
+ Validate identifiers in the model
1606
+
1607
+ Iterates through all tables and checks if the identifier columns are valid.
1608
+
1609
+ Raises:
1610
+ ValueError: missing identifiers in the table
1611
+ """
1612
+
1613
+ SCHEMA = SBML_DFS_SCHEMA.SCHEMA
1614
+ for table in SBML_DFS_SCHEMA.SCHEMA.keys():
1615
+ if "id" not in SCHEMA[table].keys():
1616
+ continue
1617
+ id_series = self.get_table(table)[SCHEMA[table]["id"]]
1618
+ if id_series.isna().sum() > 0:
1619
+ missing_ids = id_series[id_series.isna()].index
1620
+ raise ValueError(
1621
+ f"{table} has {len(missing_ids)} missing ids: {missing_ids}"
1622
+ )
1623
+
1624
+ def _validate_pk_fk_correspondence(self):
1625
+ """
1626
+ Check whether primary keys and foreign keys agree for all tables in the schema.
1627
+ Raises ValueError if any correspondence fails.
1628
+ """
1629
+
1630
+ pk_df = pd.DataFrame(
1631
+ [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
2281
1632
  )
2282
1633
 
2283
- # check species
2284
- species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
2285
- species_df_columns = set(species_df.columns.tolist())
2286
- missing_required_fields = species_df_expected_vars.difference(species_df_columns)
2287
- if len(missing_required_fields) > 0:
2288
- raise ValueError(
2289
- f"{', '.join(missing_required_fields)} are required"
2290
- ' variables in "species_df" but were not present '
2291
- "in the input file."
2292
- )
1634
+ fk_df = (
1635
+ pd.DataFrame(
1636
+ [
1637
+ {"fk_table": k, "fk": v["fk"]}
1638
+ for k, v in self.schema.items()
1639
+ if "fk" in v.keys()
1640
+ ]
1641
+ )
1642
+ .set_index("fk_table")["fk"]
1643
+ .apply(pd.Series)
1644
+ .reset_index()
1645
+ .melt(id_vars="fk_table")
1646
+ .drop(["variable"], axis=1)
1647
+ .rename(columns={"value": "key"})
1648
+ )
1649
+
1650
+ pk_fk_correspondences = pk_df.merge(fk_df)
1651
+
1652
+ for i in range(0, pk_fk_correspondences.shape[0]):
1653
+ pk_table_keys = set(
1654
+ getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
1655
+ )
1656
+ if None in pk_table_keys:
1657
+ raise ValueError(
1658
+ f"{pk_fk_correspondences['pk_table'][i]} had "
1659
+ "missing values in its index"
1660
+ )
1661
+
1662
+ fk_table_keys = set(
1663
+ getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
1664
+ :, pk_fk_correspondences["key"][i]
1665
+ ]
1666
+ )
1667
+ if None in fk_table_keys:
1668
+ raise ValueError(
1669
+ f"{pk_fk_correspondences['fk_table'][i]} included "
1670
+ f"missing {pk_fk_correspondences['key'][i]} values"
1671
+ )
1672
+
1673
+ # all foreign keys need to match a primary key
1674
+ extra_fks = fk_table_keys.difference(pk_table_keys)
1675
+ if len(extra_fks) != 0:
1676
+ raise ValueError(
1677
+ f"{len(extra_fks)} distinct "
1678
+ f"{pk_fk_correspondences['key'][i]} values were"
1679
+ f" found in {pk_fk_correspondences['fk_table'][i]} "
1680
+ f"but missing from {pk_fk_correspondences['pk_table'][i]}."
1681
+ " All foreign keys must have a matching primary key.\n\n"
1682
+ f"Extra key are: {', '.join(extra_fks)}"
1683
+ )
2293
1684
 
2294
- # check interactions
2295
- interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
2296
- missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
2297
- interaction_edgelist_columns
2298
- )
2299
- if len(missing_required_fields) > 0:
2300
- raise ValueError(
2301
- f"{', '.join(missing_required_fields)} are required "
2302
- 'variables in "interaction_edgelist" but were not '
2303
- "present in the input file."
2304
- )
1685
+ def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
2305
1686
 
2306
- return None
1687
+ if isinstance(r_ids, str):
1688
+ r_ids = [r_ids]
2307
1689
 
1690
+ if r_ids is None:
1691
+ return self.reactions.index.tolist()
1692
+ else:
1693
+ if not all(r_id in self.reactions.index for r_id in r_ids):
1694
+ raise ValueError(f"Reaction IDs {r_ids} not found in reactions table")
2308
1695
 
2309
- def _edgelist_identify_extra_columns(
2310
- interaction_edgelist, species_df, keep_reactions_data, keep_species_data
2311
- ):
2312
- """
2313
- Identify extra columns in input data that should be preserved.
1696
+ return r_ids
2314
1697
 
2315
- Parameters
2316
- ----------
2317
- interaction_edgelist : pd.DataFrame
2318
- Interaction data containing potential extra columns
2319
- species_df : pd.DataFrame
2320
- Species data containing potential extra columns
2321
- keep_reactions_data : bool or str
2322
- Whether to keep extra reaction columns
2323
- keep_species_data : bool or str
2324
- Whether to keep extra species columns
1698
+ def _validate_reaction_species(self):
1699
+ if not all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull()):
1700
+ raise ValueError(
1701
+ "All reaction_species[SBML_DFS.STOICHIOMETRY] must be not null"
1702
+ )
2325
1703
 
2326
- Returns
2327
- -------
2328
- dict
2329
- Dictionary with 'reactions' and 'species' keys containing lists of extra column names
2330
- """
2331
- extra_reactions_columns = []
2332
- extra_species_columns = []
2333
-
2334
- if keep_reactions_data is not False:
2335
- extra_reactions_columns = [
2336
- c
2337
- for c in interaction_edgelist.columns
2338
- if c not in INTERACTION_EDGELIST_EXPECTED_VARS
2339
- ]
1704
+ # test for null SBO terms
1705
+ n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
1706
+ if n_null_sbo_terms != 0:
1707
+ raise ValueError(
1708
+ f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
1709
+ )
2340
1710
 
2341
- if keep_species_data is not False:
2342
- extra_species_columns = [
2343
- c
2344
- for c in species_df.columns
2345
- if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
1711
+ # find invalid SBO terms
1712
+ sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
1713
+ invalid_sbo_term_counts = sbo_counts[
1714
+ ~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
2346
1715
  ]
2347
1716
 
2348
- return {"reactions": extra_reactions_columns, "species": extra_species_columns}
2349
-
2350
-
2351
- def _edgelist_process_compartments(compartments_df, interaction_source):
2352
- """
2353
- Format compartments DataFrame with source and ID columns.
1717
+ if invalid_sbo_term_counts.shape[0] != 0:
1718
+ invalid_sbo_counts_str = ", ".join(
1719
+ [f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
1720
+ )
1721
+ raise ValueError(
1722
+ f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
1723
+ f"defined {invalid_sbo_counts_str}"
1724
+ )
2354
1725
 
2355
- Parameters
2356
- ----------
2357
- compartments_df : pd.DataFrame
2358
- Raw compartments data
2359
- interaction_source : source.Source
2360
- Source object to assign to compartments
1726
+ def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
1727
+ """Validates reactions data attribute
2361
1728
 
2362
- Returns
2363
- -------
2364
- pd.DataFrame
2365
- Processed compartments with IDs, indexed by compartment ID
2366
- """
2367
- compartments = compartments_df.copy()
2368
- compartments[SBML_DFS.C_SOURCE] = interaction_source
2369
- compartments[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
2370
- range(compartments.shape[0]), SBML_DFS.C_ID
2371
- )
2372
- return compartments.set_index(SBML_DFS.C_ID)[
2373
- [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
2374
- ]
1729
+ Args:
1730
+ reactions_data_table (pd.DataFrame): a reactions data table
2375
1731
 
1732
+ Raises:
1733
+ ValueError: r_id not index name
1734
+ ValueError: r_id index contains duplicates
1735
+ ValueError: r_id not in reactions table
1736
+ """
1737
+ sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
2376
1738
 
2377
- def _edgelist_process_species(species_df, interaction_source, extra_species_columns):
2378
- """
2379
- Format species DataFrame and extract extra data.
1739
+ def _validate_sources(self):
1740
+ """
1741
+ Validate sources in the model
2380
1742
 
2381
- Parameters
2382
- ----------
2383
- species_df : pd.DataFrame
2384
- Raw species data
2385
- interaction_source : source.Source
2386
- Source object to assign to species
2387
- extra_species_columns : list
2388
- Names of extra columns to preserve separately
1743
+ Iterates through all tables and checks if the source columns are valid.
2389
1744
 
2390
- Returns
2391
- -------
2392
- tuple of pd.DataFrame
2393
- Processed species DataFrame and species extra data DataFrame
2394
- """
2395
- species = species_df.copy()
2396
- species[SBML_DFS.S_SOURCE] = interaction_source
2397
- species[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
2398
- range(species.shape[0]), SBML_DFS.S_ID
2399
- )
1745
+ Raises:
1746
+ ValueError: missing sources in the table
1747
+ """
2400
1748
 
2401
- required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
2402
- species_indexed = species.set_index(SBML_DFS.S_ID)[
2403
- required_cols + extra_species_columns
2404
- ]
1749
+ SCHEMA = SBML_DFS_SCHEMA.SCHEMA
1750
+ for table in SBML_DFS_SCHEMA.SCHEMA.keys():
1751
+ if "source" not in SCHEMA[table].keys():
1752
+ continue
1753
+ source_series = self.get_table(table)[SCHEMA[table]["source"]]
1754
+ if source_series.isna().sum() > 0:
1755
+ missing_sources = source_series[source_series.isna()].index
1756
+ raise ValueError(
1757
+ f"{table} has {len(missing_sources)} missing sources: {missing_sources}"
1758
+ )
2405
1759
 
2406
- # Separate extra data from main species table
2407
- species_data = species_indexed[extra_species_columns]
2408
- processed_species = species_indexed[required_cols]
1760
+ def _validate_species_data(self, species_data_table: pd.DataFrame):
1761
+ """Validates species data attribute
2409
1762
 
2410
- return processed_species, species_data
1763
+ Args:
1764
+ species_data_table (pd.DataFrame): a species data table
2411
1765
 
1766
+ Raises:
1767
+ ValueError: s_id not index name
1768
+ ValueError: s_id index contains duplicates
1769
+ ValueError: s_id not in species table
1770
+ """
1771
+ sbml_dfs_utils._validate_matching_data(species_data_table, self.species)
2412
1772
 
2413
- def _edgelist_create_compartmentalized_species(
2414
- interaction_edgelist, species_df, compartments_df, interaction_source
2415
- ):
2416
- """
2417
- Create compartmentalized species from interactions.
1773
+ def _validate_table(self, table_name: str) -> None:
1774
+ """
1775
+ Validate a table in this SBML_dfs object against its schema.
2418
1776
 
2419
- Parameters
2420
- ----------
2421
- interaction_edgelist : pd.DataFrame
2422
- Interaction data containing species-compartment combinations
2423
- species_df : pd.DataFrame
2424
- Processed species data with IDs
2425
- compartments_df : pd.DataFrame
2426
- Processed compartments data with IDs
2427
- interaction_source : source.Source
2428
- Source object to assign to compartmentalized species
1777
+ This is an internal method that validates a table that is part of this SBML_dfs
1778
+ object against the schema stored in self.schema.
2429
1779
 
2430
- Returns
2431
- -------
2432
- pd.DataFrame
2433
- Compartmentalized species with formatted names and IDs
2434
- """
2435
- # Get all distinct upstream and downstream compartmentalized species
2436
- comp_species = pd.concat(
2437
- [
2438
- interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
2439
- {
2440
- "upstream_name": SBML_DFS.S_NAME,
2441
- "upstream_compartment": SBML_DFS.C_NAME,
2442
- },
2443
- axis=1,
2444
- ),
2445
- interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
2446
- {
2447
- "downstream_name": SBML_DFS.S_NAME,
2448
- "downstream_compartment": SBML_DFS.C_NAME,
2449
- },
2450
- axis=1,
2451
- ),
2452
- ]
2453
- ).drop_duplicates()
1780
+ Parameters
1781
+ ----------
1782
+ table : str
1783
+ Name of the table to validate
2454
1784
 
2455
- # Add species and compartment IDs
2456
- comp_species_w_ids = comp_species.merge(
2457
- species_df[SBML_DFS.S_NAME].reset_index(), how="left", on=SBML_DFS.S_NAME
2458
- ).merge(
2459
- compartments_df[SBML_DFS.C_NAME].reset_index(), how="left", on=SBML_DFS.C_NAME
2460
- )
1785
+ Raises
1786
+ ------
1787
+ ValueError
1788
+ If the table does not conform to its schema
1789
+ """
1790
+ table_data = getattr(self, table_name)
2461
1791
 
2462
- # Validate merge was successful
2463
- _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
1792
+ sbml_dfs_utils.validate_sbml_dfs_table(table_data, table_name)
2464
1793
 
2465
- # Format compartmentalized species with names, source, and IDs
2466
- comp_species_w_ids[SBML_DFS.SC_NAME] = [
2467
- f"{s} [{c}]"
2468
- for s, c in zip(
2469
- comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
2470
- )
2471
- ]
2472
- comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
2473
- comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
2474
- range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
2475
- )
2476
1794
 
2477
- return comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
2478
- [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
2479
- ]
2480
-
2481
-
2482
- def _edgelist_create_reactions_and_species(
2483
- interaction_edgelist,
2484
- comp_species,
2485
- species_df,
2486
- compartments_df,
2487
- interaction_source,
2488
- upstream_stoichiometry,
2489
- downstream_stoichiometry,
2490
- downstream_sbo_name,
2491
- extra_reactions_columns,
2492
- ):
1795
+ def sbml_dfs_from_edgelist(
1796
+ interaction_edgelist: pd.DataFrame,
1797
+ species_df: pd.DataFrame,
1798
+ compartments_df: pd.DataFrame,
1799
+ interaction_source: source.Source,
1800
+ upstream_stoichiometry: int = 0,
1801
+ downstream_stoichiometry: int = 1,
1802
+ downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
1803
+ keep_species_data: bool | str = False,
1804
+ keep_reactions_data: bool | str = False,
1805
+ ) -> SBML_dfs:
2493
1806
  """
2494
- Create reactions and reaction species from interactions.
1807
+ Create SBML_dfs from interaction edgelist.
1808
+
1809
+ Combines a set of molecular interactions into a mechanistic SBML_dfs model
1810
+ by processing interaction data, species information, and compartment definitions.
2495
1811
 
2496
1812
  Parameters
2497
1813
  ----------
2498
1814
  interaction_edgelist : pd.DataFrame
2499
- Original interaction data
2500
- comp_species : pd.DataFrame
2501
- Compartmentalized species with IDs
1815
+ Table containing molecular interactions with columns:
1816
+ - upstream_name : str, matches "s_name" from species_df
1817
+ - downstream_name : str, matches "s_name" from species_df
1818
+ - upstream_compartment : str, matches "c_name" from compartments_df
1819
+ - downstream_compartment : str, matches "c_name" from compartments_df
1820
+ - r_name : str, name for the interaction
1821
+ - sbo_term : str, SBO term defining interaction type
1822
+ - r_Identifiers : identifiers.Identifiers, supporting identifiers
1823
+ - r_isreversible : bool, whether reaction is reversible
2502
1824
  species_df : pd.DataFrame
2503
- Processed species data with IDs
1825
+ Table defining molecular species with columns:
1826
+ - s_name : str, name of molecular species
1827
+ - s_Identifiers : identifiers.Identifiers, species identifiers
2504
1828
  compartments_df : pd.DataFrame
2505
- Processed compartments data with IDs
1829
+ Table defining compartments with columns:
1830
+ - c_name : str, name of compartment
1831
+ - c_Identifiers : identifiers.Identifiers, compartment identifiers
2506
1832
  interaction_source : source.Source
2507
- Source object for reactions
2508
- upstream_stoichiometry : int
2509
- Stoichiometry for upstream species
2510
- downstream_stoichiometry : int
2511
- Stoichiometry for downstream species
2512
- downstream_sbo_name : str
2513
- SBO term name for downstream species
2514
- extra_reactions_columns : list
2515
- Names of extra columns to preserve
1833
+ Source object linking model entities to interaction source
1834
+ upstream_stoichiometry : int, default 0
1835
+ Stoichiometry of upstream species in reactions
1836
+ downstream_stoichiometry : int, default 1
1837
+ Stoichiometry of downstream species in reactions
1838
+ downstream_sbo_name : str, default SBOTERM_NAMES.PRODUCT
1839
+ SBO term for downstream reactant type
1840
+ keep_species_data : bool or str, default False
1841
+ Whether to preserve extra species columns. If True, saves as 'source' label.
1842
+ If string, uses as custom label. If False, discards extra data.
1843
+ keep_reactions_data : bool or str, default False
1844
+ Whether to preserve extra reaction columns. If True, saves as 'source' label.
1845
+ If string, uses as custom label. If False, discards extra data.
2516
1846
 
2517
1847
  Returns
2518
1848
  -------
2519
- tuple
2520
- (reactions_df, reaction_species_df, reactions_data)
1849
+ SBML_dfs
1850
+ Validated SBML data structure containing compartments, species,
1851
+ compartmentalized species, reactions, and reaction species tables.
2521
1852
  """
2522
- # Add compartmentalized species IDs to interactions
2523
- comp_species_w_names = (
2524
- comp_species.reset_index()
2525
- .merge(species_df[SBML_DFS.S_NAME].reset_index())
2526
- .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
1853
+ # 1. Validate inputs
1854
+ sbml_dfs_utils._edgelist_validate_inputs(
1855
+ interaction_edgelist, species_df, compartments_df
2527
1856
  )
2528
1857
 
2529
- interaction_w_cspecies = interaction_edgelist.merge(
2530
- comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
2531
- {
2532
- SBML_DFS.SC_ID: "sc_id_up",
2533
- SBML_DFS.S_NAME: "upstream_name",
2534
- SBML_DFS.C_NAME: "upstream_compartment",
2535
- },
2536
- axis=1,
2537
- ),
2538
- how="left",
2539
- ).merge(
2540
- comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
2541
- {
2542
- SBML_DFS.SC_ID: "sc_id_down",
2543
- SBML_DFS.S_NAME: "downstream_name",
2544
- SBML_DFS.C_NAME: "downstream_compartment",
2545
- },
2546
- axis=1,
2547
- ),
2548
- how="left",
2549
- )[
2550
- REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
2551
- ]
2552
-
2553
- # Validate merge didn't create duplicates
2554
- if interaction_edgelist.shape[0] != interaction_w_cspecies.shape[0]:
2555
- raise ValueError(
2556
- f"Merging compartmentalized species resulted in row count change "
2557
- f"from {interaction_edgelist.shape[0]} to {interaction_w_cspecies.shape[0]}"
2558
- )
1858
+ # 2. Identify which extra columns to preserve
1859
+ extra_columns = sbml_dfs_utils._edgelist_identify_extra_columns(
1860
+ interaction_edgelist, species_df, keep_reactions_data, keep_species_data
1861
+ )
2559
1862
 
2560
- # Create reaction IDs FIRST - before using them
2561
- interaction_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
2562
- range(interaction_w_cspecies.shape[0]), SBML_DFS.R_ID
1863
+ # 3. Process compartments and species tables
1864
+ processed_compartments = sbml_dfs_utils._edgelist_process_compartments(
1865
+ compartments_df, interaction_source
1866
+ )
1867
+ processed_species, species_data = sbml_dfs_utils._edgelist_process_species(
1868
+ species_df, interaction_source, extra_columns["species"]
2563
1869
  )
2564
1870
 
2565
- # Create reactions DataFrame
2566
- interactions_copy = interaction_w_cspecies.copy()
2567
- interactions_copy[SBML_DFS.R_SOURCE] = interaction_source
2568
-
2569
- reactions_columns = [
2570
- SBML_DFS.R_NAME,
2571
- SBML_DFS.R_IDENTIFIERS,
2572
- SBML_DFS.R_SOURCE,
2573
- SBML_DFS.R_ISREVERSIBLE,
2574
- ]
2575
-
2576
- reactions_df = interactions_copy.set_index(SBML_DFS.R_ID)[
2577
- reactions_columns + extra_reactions_columns
2578
- ]
2579
-
2580
- # Separate extra data
2581
- reactions_data = reactions_df[extra_reactions_columns]
2582
- reactions_df = reactions_df[reactions_columns]
2583
-
2584
- # Create reaction species relationships - NOW r_id exists
2585
- reaction_species_df = pd.concat(
2586
- [
2587
- # Upstream species (modifiers/stimulators/inhibitors)
2588
- interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
2589
- .assign(stoichiometry=upstream_stoichiometry)
2590
- .rename({"sc_id_up": "sc_id"}, axis=1),
2591
- # Downstream species (products)
2592
- interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
2593
- .assign(
2594
- stoichiometry=downstream_stoichiometry,
2595
- sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
2596
- )
2597
- .rename({"sc_id_down": "sc_id"}, axis=1),
2598
- ]
1871
+ # 4. Create compartmentalized species
1872
+ comp_species = sbml_dfs_utils._edgelist_create_compartmentalized_species(
1873
+ interaction_edgelist,
1874
+ processed_species,
1875
+ processed_compartments,
1876
+ interaction_source,
2599
1877
  )
2600
1878
 
2601
- reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
2602
- range(reaction_species_df.shape[0]), "rsc_id"
1879
+ # 5. Create reactions and reaction species
1880
+ reactions, reaction_species, reactions_data = (
1881
+ sbml_dfs_utils._edgelist_create_reactions_and_species(
1882
+ interaction_edgelist,
1883
+ comp_species,
1884
+ processed_species,
1885
+ processed_compartments,
1886
+ interaction_source,
1887
+ upstream_stoichiometry,
1888
+ downstream_stoichiometry,
1889
+ downstream_sbo_name,
1890
+ extra_columns["reactions"],
1891
+ )
2603
1892
  )
2604
1893
 
2605
- reaction_species_df = reaction_species_df.set_index("rsc_id")
1894
+ # 6. Assemble final SBML_dfs object
1895
+ sbml_dfs = _edgelist_assemble_sbml_model(
1896
+ processed_compartments,
1897
+ processed_species,
1898
+ comp_species,
1899
+ reactions,
1900
+ reaction_species,
1901
+ species_data,
1902
+ reactions_data,
1903
+ keep_species_data,
1904
+ keep_reactions_data,
1905
+ extra_columns,
1906
+ )
2606
1907
 
2607
- return reactions_df, reaction_species_df, reactions_data
1908
+ return sbml_dfs
2608
1909
 
2609
1910
 
2610
1911
  def _edgelist_assemble_sbml_model(
2611
- compartments,
2612
- species,
2613
- comp_species,
2614
- reactions,
2615
- reaction_species,
1912
+ compartments: pd.DataFrame,
1913
+ species: pd.DataFrame,
1914
+ comp_species: pd.DataFrame,
1915
+ reactions: pd.DataFrame,
1916
+ reaction_species: pd.DataFrame,
2616
1917
  species_data,
2617
1918
  reactions_data,
2618
1919
  keep_species_data,
2619
1920
  keep_reactions_data,
2620
- extra_columns,
2621
- ):
1921
+ extra_columns: dict[str, list[str]],
1922
+ ) -> SBML_dfs:
2622
1923
  """
2623
1924
  Assemble the final SBML_dfs object.
2624
1925
 
@@ -2675,128 +1976,3 @@ def _edgelist_assemble_sbml_model(
2675
1976
  sbml_model.validate()
2676
1977
 
2677
1978
  return sbml_model
2678
-
2679
-
2680
- def _sbml_dfs_from_edgelist_check_cspecies_merge(
2681
- merged_species: pd.DataFrame, original_species: pd.DataFrame
2682
- ) -> None:
2683
- """Check for a mismatch between the provided species data and species implied by the edgelist."""
2684
-
2685
- # check for 1-many merge
2686
- if merged_species.shape[0] != original_species.shape[0]:
2687
- raise ValueError(
2688
- "Merging compartmentalized species to species_df"
2689
- " and compartments_df by names resulted in an "
2690
- f"increase in the tables from {original_species.shape[0]}"
2691
- f" to {merged_species.shape[0]} indicating that names were"
2692
- " not unique"
2693
- )
2694
-
2695
- # check for missing species and compartments
2696
- missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
2697
- SBML_DFS.C_NAME
2698
- ].unique()
2699
- if len(missing_compartments) >= 1:
2700
- raise ValueError(
2701
- f"{len(missing_compartments)} compartments were present in"
2702
- ' "interaction_edgelist" but not "compartments_df":'
2703
- f" {', '.join(missing_compartments)}"
2704
- )
2705
-
2706
- missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
2707
- SBML_DFS.S_NAME
2708
- ].unique()
2709
- if len(missing_species) >= 1:
2710
- raise ValueError(
2711
- f"{len(missing_species)} species were present in "
2712
- '"interaction_edgelist" but not "species_df":'
2713
- f" {', '.join(missing_species)}"
2714
- )
2715
-
2716
- return None
2717
-
2718
-
2719
- def _stub_compartments(
2720
- stubbed_compartment: str = GENERIC_COMPARTMENT,
2721
- ) -> pd.DataFrame:
2722
- """Stub Compartments
2723
-
2724
- Create a compartments table with only a single compartment
2725
-
2726
- Args:
2727
- stubbed_compartment (str): the name of a compartment which should match the
2728
- keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS
2729
-
2730
- Returns:
2731
- compartments_df (pd.DataFrame): compartments dataframe
2732
- """
2733
-
2734
- if stubbed_compartment not in COMPARTMENT_ALIASES.keys():
2735
- raise ValueError(
2736
- f"{stubbed_compartment} is not defined in constants.COMPARTMENTS"
2737
- )
2738
-
2739
- if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
2740
- raise ValueError(
2741
- f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
2742
- )
2743
-
2744
- stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
2745
-
2746
- formatted_uri = identifiers.format_uri(
2747
- uri=identifiers.create_uri_url(
2748
- ontology=ONTOLOGIES.GO,
2749
- identifier=stubbed_compartment_id,
2750
- ),
2751
- biological_qualifier_type=BQB.IS,
2752
- )
2753
-
2754
- compartments_df = pd.DataFrame(
2755
- {
2756
- SBML_DFS.C_NAME: [stubbed_compartment],
2757
- SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
2758
- }
2759
- )
2760
- compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID) # type: ignore
2761
- compartments_df.index.name = SBML_DFS.C_ID
2762
-
2763
- return compartments_df
2764
-
2765
-
2766
- def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
2767
- """Validates a table against a reference
2768
-
2769
- This check if the table has the same index, no duplicates in the index
2770
- and that all values in the index are in the reference table.
2771
-
2772
- Args:
2773
- data_table (pd.DataFrame): a table with data that should
2774
- match the reference
2775
- ref_table (pd.DataFrame): a reference table
2776
-
2777
- Raises:
2778
- ValueError: not same index name
2779
- ValueError: index contains duplicates
2780
- ValueError: index not subset of index of reactions table
2781
- """
2782
- ref_index_name = ref_table.index.name
2783
- if data_table.index.name != ref_index_name:
2784
- raise ValueError(
2785
- "the index name for reaction data table was not"
2786
- f" {ref_index_name}: {data_table.index.name}"
2787
- )
2788
- ids = data_table.index
2789
- if any(ids.duplicated()):
2790
- raise ValueError(
2791
- "the index for reaction data table " "contained duplicate values"
2792
- )
2793
- if not all(ids.isin(ref_table.index)):
2794
- raise ValueError(
2795
- "the index for reaction data table contained values"
2796
- " not found in the reactions table"
2797
- )
2798
- if not isinstance(data_table, pd.DataFrame):
2799
- raise TypeError(
2800
- f"The data table was type {type(data_table).__name__}"
2801
- " but must be a pd.DataFrame"
2802
- )